% Publications on MPI fault tolerance (User-Level Failure Mitigation and the
% Checkpoint-on-Failure protocol), 2012-2013.
%
% Conventions used in this file:
%   * names are stored as "Last, First" and separated by " and ";
%   * months use the standard three-letter macros (jan ... dec), unquoted;
%   * page ranges use a double hyphen;
%   * conference/proceedings names live in booktitle, report issuers in
%     institution (with the report series in type);
%   * acronyms and proper names in titles are brace-protected so that styles
%     which downcase titles keep their capitalisation.

% Journal article (Computing, volume 95).
@article{869,
  author   = {Bland, Wesley and Bouteiller, Aurelien and Herault, Thomas and Hursey, Joshua and Bosilca, George and Dongarra, Jack},
  title    = {An evaluation of {User-Level Failure Mitigation} support in {MPI}},
  journal  = {Computing},
  volume   = {95},
  pages    = {1171--1184},
  year     = {2013},
  month    = dec,
  doi      = {10.1007/s00607-013-0331-3},
  keywords = {Fault tolerance, MPI, User-level fault mitigation},
  abstract = {As the scale of computing platforms becomes increasingly extreme, the requirements for application fault tolerance are increasing as well. Techniques to address this problem by improving the resilience of algorithms have been developed, but they currently receive no support from the programming model, and without such support, they are bound to fail. This paper discusses the failure-free overhead and recovery impact of the user-level failure mitigation proposal presented in the MPI Forum. Experiments demonstrate that fault-aware MPI has little or no impact on performance for a range of applications, and produces satisfactory recovery times when there are failures.},
}

% Journal article; no volume or page numbers are recorded for it.
@article{691,
  author   = {Bland, Wesley and Du, Peng and Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack},
  title    = {Extending the scope of the {Checkpoint-on-Failure} protocol for forward recovery in standard {MPI}},
  journal  = {Concurrency and Computation: Practice and Experience},
  year     = {2013},
  month    = jul,
  doi      = {10.1002/cpe.3100},
  url      = {http://doi.wiley.com/10.1002/cpe.3100},
  abstract = {Most predictions of exascale machines picture billion ways parallelism, encompassing not only millions of cores but also tens of thousands of nodes. Even considering extremely optimistic advances in hardware reliability, probabilistic amplification entails that failures will be unavoidable. Consequently, software fault tolerance is paramount to maintain future scientific productivity. Two major problems hinder ubiquitous adoption of fault tolerance techniques: (i) traditional checkpoint-based approaches incur a steep overhead on failure free operations and (ii) the dominant programming paradigm for parallel applications (the message passing interface (MPI) Standard) offers extremely limited support of software-level fault tolerance approaches. In this paper, we present an approach that relies exclusively on the features of a high quality implementation, as defined by the current MPI Standard, to enable advanced forward recovery techniques, without incurring the overhead of customary periodic checkpointing. With our approach, when failure strikes, applications regain control to make a checkpoint before quitting execution. This checkpoint is in reaction to the failure occurrence rather than periodic. This checkpoint is reloaded in a new MPI application, which restores a sane environment for the forward, application-based recovery technique to repair the failure-damaged dataset. The validity and performance of this approach are evaluated on large-scale systems, using the QR factorization as an example. Published 2013. This article is a US Government work and is in the public domain in the USA.},
}

% Journal article (IJHPCA, volume 27); no issue number is recorded.
@article{693,
  author   = {Bland, Wesley and Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack},
  title    = {Post-failure recovery of {MPI} communication capability: Design and rationale},
  journal  = {International Journal of High Performance Computing Applications},
  volume   = {27},
  pages    = {244--254},
  year     = {2013},
  month    = jan,
  issn     = {1094-3420},
  doi      = {10.1177/1094342013488238},
  url      = {http://hpc.sagepub.com/cgi/doi/10.1177/1094342013488238},
  abstract = {As supercomputers are entering an era of massive parallelism where the frequency of faults is increasing, the MPI Standard remains distressingly vague on the consequence of failures on MPI communications. Advanced fault-tolerance techniques have the potential to prevent full-scale application restart and therefore lower the cost incurred for each failure, but they demand from MPI the capability to detect failures and resume communications afterward. In this paper, we present a set of extensions to MPI that allow communication capabilities to be restored, while maintaining the extreme level of performance to which MPI users have become accustomed. The motivation behind the design choices are weighted against alternatives, a task that requires simultaneously considering MPI from the viewpoint of both the user and the implementor. The usability of the interfaces for expressing advanced recovery techniques is then discussed, including the difficult issue of enabling separate software layers to coordinate their recovery.},
}

% Conference paper (Euro-Par 2012). The award is kept in note so that it is not
% rendered as part of the proceedings name. The address field holds the
% conference location as recorded.
@inproceedings{icl:679,
  author    = {Bland, Wesley and Du, Peng and Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack},
  title     = {A {Checkpoint-on-Failure} Protocol for Algorithm-Based Recovery in Standard {MPI}},
  booktitle = {18th International European Conference on Parallel and Distributed Computing ({Euro-Par} 2012)},
  editor    = {Kaklamanis, Christos and Papatheodorou, Theodore and Spirakis, Paul},
  year      = {2012},
  month     = aug,
  publisher = {Springer-Verlag},
  address   = {Rhodes, Greece},
  note      = {Best Paper Award},
}

% Conference paper (CCGrid, Ottawa).
% NOTE(review): the 2012 Ottawa edition of this symposium is believed to be the
% 12th, not the 11th -- confirm against the proceedings before citing.
@inproceedings{icl:668,
  author    = {Bland, Wesley},
  title     = {Enabling Application Resilience With and Without the {MPI} Standard},
  booktitle = {11th {IEEE/ACM} International Symposium on Cluster, Cloud and Grid Computing},
  year      = {2012},
  month     = may,
  address   = {Ottawa, Canada},
}

% Conference paper (EuroMPI 2012); shares its title with the journal article
% keyed 869 above.
@inproceedings{icl:680,
  author    = {Bland, Wesley and Bouteiller, Aurelien and Herault, Thomas and Hursey, Joshua and Bosilca, George and Dongarra, Jack},
  title     = {An Evaluation of {User-Level Failure Mitigation} Support in {MPI}},
  booktitle = {Proceedings of Recent Advances in Message Passing Interface -- 19th European {MPI} Users' Group Meeting, {EuroMPI} 2012},
  year      = {2012},
  month     = sep,
  publisher = {Springer},
  address   = {Vienna, Austria},
}

% Technical report; shares its title with the journal article keyed 691 above.
% No month is known for this report (the source record carried only the
% placeholder 2012-00), so the month field is omitted.
@techreport{icl:724,
  author      = {Bland, Wesley and Du, Peng and Bouteiller, Aurelien and Herault, Thomas and Bosilca, George and Dongarra, Jack},
  title       = {Extending the Scope of the {Checkpoint-on-Failure} Protocol for Forward Recovery in Standard {MPI}},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {ut-cs-12-702},
  year        = {2012},
  keywords    = {ftmpi},
}

% Technical report.
@techreport{icl:667,
  author      = {Bland, Wesley and Bosilca, George and Bouteiller, Aurelien and Herault, Thomas and Dongarra, Jack},
  title       = {A Proposal for {User-Level Failure Mitigation} in the {MPI-3} Standard},
  institution = {University of Tennessee},
  type        = {Electrical Engineering and Computer Science Technical Report},
  number      = {ut-cs-12-693},
  year        = {2012},
  month       = feb,
  keywords    = {ftmpi},
}

% Workshop paper in the Euro-Par 2012 workshops volume (volume 7640).
@inproceedings{icl:736,
  author    = {Bland, Wesley},
  title     = {{User Level Failure Mitigation} in {MPI}},
  booktitle = {{Euro-Par} 2012: Parallel Processing Workshops},
  editor    = {Caragiannis, Ioannis and Alexander, Michael and Badia, Rosa M. and Cannataro, Mario and Costan, Alexandru and Danelutto, Marco and Desprez, Frederic and Krammer, Bettina and Sahuquillo, J. and Scott, Stephen L. and Weidendorfer, J.},
  volume    = {7640},
  pages     = {499--504},
  year      = {2012},
  month     = aug,
  publisher = {Springer Berlin Heidelberg},
  address   = {Rhodes Island, Greece},
  keywords  = {ftmpi},
}