% Publications co-authored by Zizhong Chen, newest first (2009 back to 2003).
%
% Conventions used in this file:
%   - Names are stored as "Last, First" (suffixes as "Last, Jr, First").
%   - month uses the unquoted three-letter BibTeX macros; entries whose
%     exported month was the "-00" placeholder carry no month field.
%   - Page and issue ranges use "--"; doi holds the bare DOI.
%   - Publication status ("To appear", "Presented") lives in the note field.
%
% NOTE(review): the first entry had an empty citation key in the export;
%   the key chen2009selfhealing was assigned here -- replace it with the
%   canonical icl:NNN id if one exists.
% NOTE(review): several entries carry month = jan straight from the export;
%   for icl:437 this conflicts with issue number 12, so it is likely an
%   export default -- confirm against the publisher record.
% NOTE(review): report numbers UT-CS-05-561, UT-CS-04-539 and UT-CS-04-526
%   were reconstructed from a mangled prefix in the export, following the
%   UT-CS-04-538 / UT-CS-03-499 pattern of the sibling reports -- confirm.

@article{chen2009selfhealing,
  author   = {Chen, Zizhong and Dongarra, Jack},
  title    = {Highly Scalable Self-Healing Algorithms for High Performance Scientific Computing},
  journal  = {IEEE Transactions on Computers},
  volume   = {58},
  year     = {2009},
  month    = nov,
  pages    = {1512--1524},
  doi      = {10.1109/TC.2009.42},
  abstract = {As the number of processors in today's high-performance computers continues to grow, the mean-time-to-failure of these computers is becoming significantly shorter than the execution time of many current high-performance computing applications. Although today's architectures are usually robust enough to survive node failures without suffering complete system failure, most of today's high-performance computing applications cannot survive node failures. Therefore, whenever a node fails, all surviving processes on surviving nodes usually have to be aborted and the whole application has to be restarted. In this paper, we present a framework for building self-healing high-performance numerical computing applications so that they can adapt to node or link failures without aborting themselves. The framework is based on FT-MPI and diskless checkpointing. Our diskless checkpointing uses weighted checksum schemes, a variation of Reed-Solomon erasure codes over floating-point numbers. We introduce several scalable encoding strategies into the existing diskless checkpointing and reduce the overhead to survive $k$ failures in $p$ processes from $2 \lceil \log p \rceil \cdot k ((\beta + 2\gamma) m + \alpha)$ to $(1 + O(\sqrt{p}/\sqrt{m})) \, 2 \cdot k (\beta + 2\gamma) m$, where $\alpha$ is the communication latency, $1/\beta$ is the network bandwidth between processes, $1/\gamma$ is the rate to perform calculations, and $m$ is the size of local checkpoint per process. When additional checkpoint processors are used, the overhead can be reduced to $(1 + O(1/\sqrt{m})) \cdot k (\beta + 2\gamma) m$, which is independent of the total number of computational processors. The introduced self-healing algorithms are scalable in the sense that the overhead to survive $k$ failures in $p$ processes does not increase as the number of processes $p$ increases. We evaluate the performance overhead of our self-healing approach by using a preconditioned conjugate gradient equation solver as an example.},
}

@article{icl:437,
  author   = {Chen, Zizhong and Dongarra, Jack},
  title    = {Algorithm-Based Fault Tolerance for Fail-Stop Failures},
  journal  = {IEEE Transactions on Parallel and Distributed Systems},
  volume   = {19},
  number   = {12},
  year     = {2008},
  month    = jan,
  keywords = {FT-MPI, lapack, scalapack},
}

@incollection{icl:366,
  author    = {Dongarra, Jack and Chen, Zizhong and Bosilca, George and Langou, Julien},
  title     = {Disaster Survival Guide in Petascale Computing: An Algorithmic Approach},
  booktitle = {Petascale Computing: Algorithms and Applications},
  publisher = {Chapman \& Hall - CRC Press},
  year      = {2007},
  note      = {To appear},
}

@article{icl:397,
  author  = {Langou, Julien and Chen, Zizhong and Bosilca, George and Dongarra, Jack},
  title   = {Recovery Patterns for Iterative Methods in a Parallel Unstable Environment},
  journal = {SIAM Journal on Scientific Computing},
  year    = {2007},
  month   = may,
  note    = {To appear},
}

@inproceedings{icl:393,
  author    = {Chen, Zizhong and Yang, Ming and Francia, III, Guillermo and Dongarra, Jack},
  title     = {Self Adapting Application Level Fault Tolerance for Parallel and Distributed Computing},
  booktitle = {Proceedings of Workshop on Self Adapting Application Level Fault Tolerance for Parallel and Distributed Computing at {IPDPS}},
  year      = {2007},
  month     = mar,
  pages     = {1--8},
}

@inproceedings{icl:331,
  author    = {Chen, Zizhong and Dongarra, Jack},
  title     = {Algorithm-Based Checkpoint-Free Fault Tolerance for Parallel Matrix Computations on Volatile Resources},
  booktitle = {{IPDPS} 2006, 20th {IEEE} International Parallel and Distributed Processing Symposium},
  year      = {2006},
  month     = jan,
  address   = {Rhodes Island, Greece},
}

@article{icl:332,
  author   = {Bosilca, George and Chen, Zizhong and Dongarra, Jack and Eijkhout, Victor and Fagg, Graham and Fuentes, Erika and Langou, Julien and Luszczek, Piotr and Pjesivac-Grbovic, Jelena and Seymour, Keith and You, Haihang and Vadhiyar, Sathish},
  title    = {Self Adapting Numerical Software {SANS} Effort},
  journal  = {IBM Journal of Research and Development},
  volume   = {50},
  number   = {2/3},
  year     = {2006},
  month    = jan,
  pages    = {223--238},
  keywords = {gco},
}

@techreport{icl:263,
  author      = {Chen, Zizhong and Dongarra, Jack},
  title       = {Algorithm-Based Checkpoint-Free Fault Tolerance for Parallel Matrix Computations on Volatile Resources},
  institution = {University of Tennessee Computer Science Department},
  number      = {UT-CS-05-561},
  year        = {2005},
  month       = nov,
}

@techreport{icl:303,
  author      = {Chen, Zizhong and Dongarra, Jack},
  title       = {Condition Numbers of {Gaussian} Random Matrices},
  institution = {University of Tennessee Computer Science Department},
  number      = {UT-CS-04-539},
  year        = {2005},
  keywords    = {ft-la},
}

@article{icl:266,
  author   = {Chen, Zizhong and Dongarra, Jack},
  title    = {Condition Numbers of {Gaussian} Random Matrices},
  journal  = {SIAM Journal on Matrix Analysis and Applications},
  year     = {2005},
  month    = jan,
  note     = {To appear},
  keywords = {ftmpi, grads, lacsi, sans},
}

@inproceedings{icl:265,
  author    = {Chen, Zizhong and Fagg, Graham and Gabriel, Edgar and Langou, Julien and Angskun, Thara and Bosilca, George and Dongarra, Jack},
  title     = {Fault Tolerant High Performance Computing by a Coding Approach},
  booktitle = {Proceedings of {ACM} {SIGPLAN} Symposium on Principles and Practice of Parallel Programming},
  year      = {2005},
  month     = jan,
  address   = {Chicago, Illinois},
  note      = {To appear},
  keywords  = {ftmpi, grads, lacsi, sans},
}

@inproceedings{icl:267,
  author    = {Chen, Zizhong and Dongarra, Jack},
  title     = {Numerically Stable Real Number Codes Based on Random Matrices},
  booktitle = {The International Conference on Computational Science},
  series    = {Lecture Notes in Computer Science},
  volume    = {3514},
  publisher = {Springer-Verlag},
  address   = {Atlanta, GA},
  year      = {2005},
  month     = jan,
  keywords  = {ftmpi, grads, lacsi, sans},
}

@techreport{icl:301,
  author      = {Bosilca, George and Chen, Zizhong and Dongarra, Jack and Langou, Julien},
  title       = {Recovery Patterns for Iterative Methods in a Parallel Unstable Environment},
  institution = {University of Tennessee Computer Science Department},
  number      = {UT-CS-04-538},
  year        = {2005},
  keywords    = {ft-la},
}

@inproceedings{icl:230,
  author    = {Fagg, Graham and Gabriel, Edgar and Bosilca, George and Angskun, Thara and Chen, Zizhong and Pjesivac-Grbovic, Jelena and London, Kevin and Dongarra, Jack},
  title     = {Extending the {MPI} Specification for Process Fault Tolerance on High Performance Computing Systems},
  booktitle = {Proceedings of {ISC2004}},
  year      = {2004},
  month     = jun,
  address   = {Heidelberg, Germany},
  note      = {To appear},
  keywords  = {ftmpi, lacsi},
}

@inproceedings{icl:142,
  author    = {Chen, Zizhong and Dongarra, Jack and Luszczek, Piotr and Roche, Kenneth},
  title     = {{LAPACK} for Clusters Project: An Example of Self Adapting Numerical Software},
  booktitle = {Proceedings of the 37th Annual Hawaii International Conference on System Sciences ({HICSS} '04)},
  volume    = {9},
  year      = {2004},
  month     = jan,
  pages     = {90282},
  address   = {Big Island, Hawaii},
  keywords  = {lacsi, lfc},
}

@techreport{icl:234,
  author      = {Chen, Zizhong and Dongarra, Jack},
  title       = {Numerically Stable Real-Number Codes Based on Random Matrices},
  institution = {University of Tennessee Computer Science Department},
  number      = {UT-CS-04-526},
  year        = {2004},
  month       = oct,
  keywords    = {ftmpi},
}

@article{icl:240,
  author   = {Fagg, Graham and Gabriel, Edgar and Chen, Zizhong and Angskun, Thara and Bosilca, George and Pjesivac-Grbovic, Jelena and Dongarra, Jack},
  title    = {Process Fault-Tolerance: Semantics, Design and Applications for High Performance Computing},
  journal  = {International Journal for High Performance Applications and Supercomputing},
  year     = {2004},
  month    = apr,
  note     = {To appear},
  keywords = {ftmpi, lacsi},
}

@techreport{icl:251,
  author      = {Bosilca, George and Chen, Zizhong and Dongarra, Jack and Langou, Julien},
  title       = {Recovery Patterns for Iterative Methods in a Parallel Unstable Environment},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  type        = {{ICL} Technical Report},
  number      = {ICL-UT-04-04},
  year        = {2004},
  month       = jan,
}

@inproceedings{icl:153,
  author    = {Fagg, Graham and Gabriel, Edgar and Chen, Zizhong and Angskun, Thara and Bosilca, George and Bukovsky, Antonin and Dongarra, Jack},
  title     = {Fault Tolerant Communication Library and Applications for High Performance Computing},
  booktitle = {Los Alamos Computer Science Institute ({LACSI}) Symposium 2003},
  year      = {2003},
  month     = oct,
  address   = {Santa Fe, NM},
  note      = {Presented},
  keywords  = {ftmpi, lacsi},
}

@article{icl:136,
  author   = {Chen, Zizhong and Dongarra, Jack and Luszczek, Piotr and Roche, Kenneth},
  title    = {Self Adapting Software for Numerical Linear Algebra and {LAPACK} for Clusters},
  journal  = {Parallel Computing},
  volume   = {29},
  number   = {11--12},
  year     = {2003},
  month    = nov,
  pages    = {1723--1743},
  keywords = {lacsi, lfc, sans},
}

@techreport{icl:209,
  author      = {Chen, Zizhong and Dongarra, Jack and Luszczek, Piotr and Roche, Kenneth},
  title       = {Self Adapting Software for Numerical Linear Algebra and {LAPACK} for Clusters ({LAPACK} Working Note 160)},
  institution = {University of Tennessee Computer Science Department},
  number      = {UT-CS-03-499},
  year        = {2003},
  month       = jan,
  keywords    = {lacsi},
}