@comment{ Normalised 2024 review pass: added missing citation keys; bare DOIs
  (no resolver prefix); month macros; `--` page ranges; `Last, First` author
  form; booktitle (not journal) for proceedings papers; shared @string for the
  recurring journal. Existing keys (1460, 1453, ...) kept so citations resolve. }

@string{IJHPCA = {The International Journal of High Performance Computing Applications}}

@inproceedings{whitlock2022integrating,
  author    = {Whitlock, Matthew and Morales, Nicolas and Bosilca, George and Bouteiller, Aur{\'e}lien and Nicolae, Bogdan and Teranishi, Keita and Giem, Elisabeth and Sarkar, Vivek},
  title     = {Integrating Process, Control-Flow, and Data Resiliency Layers Using a Hybrid {Fenix}/{Kokkos} Approach},
  booktitle = {2022 {IEEE} International Conference on Cluster Computing ({CLUSTER} 2022)},
  year      = {2022},
  month     = sep,
  address   = {Heidelberg, Germany},
  keywords  = {checkpointing, Fault tolerance, Fenix, HPC, Kokkos, MPI-ULFM, resilience},
  url       = {https://hal.archives-ouvertes.fr/hal-03772536},
}

@article{agullo2022resiliency,
  author        = {Agullo, Emmanuel and Altenbernd, Mirco and Anzt, Hartwig and Bautista-Gomez, Leonardo and Benacchio, Tommaso and Bonaventura, Luca and Bungartz, Hans-Joachim and Chatterjee, Sanjay and Ciorba, Florina M and DeBardeleben, Nathan and Drzisga, Daniel and Eibl, Sebastian and Engelmann, Christian and Gansterer, Wilfried N and Giraud, Luc and G{\"o}ddeke, Dominik and Heisig, Marco and J{\'e}z{\'e}quel, Fabienne and Kohl, Nils and Li, Xiaoye Sherry and Lion, Romain and Mehl, Miriam and Mycek, Paul and Obersteiner, Michael and Quintana-Ort{\'\i}, Enrique S and Rizzi, Francesco and R{\"u}de, Ulrich and Schulz, Martin and Fung, Fred and Speck, Robert and Stals, Linda and Teranishi, Keita and Thibault, Samuel and Th{\"o}nnes, Dominik and Wagner, Andreas and Wohlmuth, Barbara},
  title         = {Resiliency in Numerical Algorithm Design for Extreme Scale Simulations},
  journal       = IJHPCA,
  volume        = {36},
  number        = {2},
  year          = {2022},
  month         = mar,
  pages         = {251--285},
  keywords      = {Fault tolerance, Numerical algorithms, parallel computer architecture, resilience},
  issn          = {1094-3420},
  doi           = {10.1177/10943420211055188},
  url           = {https://journals.sagepub.com/doi/10.1177/10943420211055188},
  internal-note = {volume/number reconstructed from a corrupted export value; verify against the published issue},
}

@article{1460,
  author   = {Hori, Atsushi and Yoshinaga, Kazumi and Herault, Thomas and Bouteiller, Aurelien and Bosilca, George and Ishikawa, Yutaka},
  title    = {Overhead of Using Spare Nodes},
  journal  = IJHPCA,
  year     = {2020},
  month    = feb,
  abstract = {With the increasing fault rate on high-end supercomputers, the topic of fault tolerance has been gathering attention. To cope with this situation, various fault-tolerance techniques are under investigation; these include user-level, algorithm-based fault-tolerance techniques and parallel execution environments that enable jobs to continue following node failure. Even with these techniques, some programs with static load balancing, such as stencil computation, may underperform after a failure recovery. Even when spare nodes are present, they are not always substituted for failed nodes in an effective way. This article considers the questions of how spare nodes should be allocated, how to substitute them for faulty nodes, and how much the communication performance is affected by such a substitution. The third question stems from the modification of the rank mapping by node substitutions, which can incur additional message collisions. In a stencil computation, rank mapping is done in a straightforward way on a Cartesian network without incurring any message collisions. However, once a substitution has occurred, the optimal node-rank mapping may be destroyed. Therefore, these questions must be answered in a way that minimizes the degradation of communication performance. In this article, several spare node allocation and failed node substitution methods will be proposed, analyzed, and compared in terms of communication performance following the substitution. The proposed substitution methods are named sliding methods. The sliding methods are analyzed by using our developed simulation program and evaluated by using the K computer, Blue Gene/Q (BG/Q), and TSUBAME 2.5. It will be shown that when failures occur, the stencil communication performance on the K and BG/Q can be slowed around 10 times depending on the number of node failures. The barrier performance on the K can be cut in half. On BG/Q, barrier performance can be slowed by a factor of 10. Further, it will also be shown that almost no such communication performance degradation can be seen on TSUBAME 2.5. This is because TSUBAME 2.5 has an Infiniband network connected with a FatTree topology, while the K computer and BG/Q have dedicated Cartesian networks. Thus, the communication performance degradation depends on network characteristics.},
  keywords = {communication performance, fault mitigation, Fault tolerance, sliding method, spare node},
  issn     = {1094-3420},
  doi      = {10.1177/1094342020901885},
  url      = {https://journals.sagepub.com/doi/10.1177/1094342020901885},
}

@inproceedings{1453,
  author    = {Losada, Nuria and Bouteiller, Aurelien and Bosilca, George},
  title     = {Asynchronous Receiver-Driven Replay for Local Rollback of {MPI} Applications},
  booktitle = {Fault Tolerance for HPC at eXtreme Scale (FTXS) Workshop at The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC{\textquoteright}19)},
  year      = {2019},
  month     = nov,
  abstract  = {With the increase in scale and architectural complexity of supercomputers, the management of failures has become integral to successfully executing a long-running high performance computing application. In many instances, failures have a localized scope, usually impacting a subset of the resources being used, yet widely used failure recovery strategies (like checkpoint/restart) fail to take advantage and rely on global, synchronous recovery actions. Even with local rollback recovery, in which only the fault impacted processes are restarted from a checkpoint, the consistency of further progress in the execution is achieved through the replay of communication from a message log. This theoretically sound approach encounters some practical limitations: the presence of collective operations forces a synchronous recovery that prevents survivor processes from continuing their execution, removing any possibility for overlapping further computation with the recovery; and the amount of resources required at recovering peers can be untenable. In this work, we solved both problems by implementing an asynchronous, receiver-driven replay of point-to-point and collective communications, and by exploiting remote-memory access capabilities to access the message logs. This new protocol is evaluated in an implementation of local rollback over the User Level Failure Mitigation fault tolerant Message Passing Interface (MPI). It reduces the recovery times of the failed processes by an average of 59\%, while the time spent in the recovery by the survivor processes is reduced by 95\% when compared to an equivalent global rollback protocol, thus living to the promise of a truly localized impact of recovery actions.},
  keywords  = {checkpoint/restart, Fault tolerance, Message logging, MPI, ULFM, User Level Fault Mitigation},
  url       = {https://sc19.supercomputing.org/proceedings/workshops/workshop_files/ws_ftxs103s2-file1.pdf},
}

@article{1218,
  author   = {Benoit, Anne and Cavelan, Aurelien and Cappello, Franck and Raghavan, Padma and Robert, Yves and Sun, Hongyang},
  title    = {Coping with Silent and Fail-Stop Errors at Scale by Combining Replication and Checkpointing},
  journal  = {Journal of Parallel and Distributed Computing},
  volume   = {122},
  year     = {2018},
  month    = dec,
  pages    = {209--225},
  abstract = {This paper provides a model and an analytical study of replication as a technique to cope with silent errors, as well as a mixture of both silent and fail-stop errors on large-scale platforms. Compared with fail-stop errors that are immediately detected when they occur, silent errors require a detection mechanism. To detect silent errors, many application-specific techniques are available, either based on algorithms (e.g., ABFT), invariant preservation or data analytics, but replication remains the most transparent and least intrusive technique. We explore the right level (duplication, triplication or more) of replication for two frameworks: (i) when the platform is subject to only silent errors, and (ii) when the platform is subject to both silent and fail-stop errors. A higher level of replication is more expensive in terms of resource usage but enables to tolerate more errors and to even correct some errors, hence there is a trade-off to be found. Replication is combined with checkpointing and comes with two flavors: process replication and group replication. Process replication applies to message-passing applications with communicating processes. Each process is replicated, and the platform is composed of process pairs, or triplets. Group replication applies to black-box applications, whose parallel execution is replicated several times. The platform is partitioned into two halves (or three thirds). In both scenarios, results are compared before each checkpoint, which is taken only when both results (duplication) or two out of three results (triplication) coincide. Otherwise, one or more silent errors have been detected, and the application rolls back to the last checkpoint, as well as when fail-stop errors have struck. We provide a detailed analytical study for all of these scenarios, with formulas to decide, for each scenario, the optimal parameters as a function of the error rate, checkpoint cost, and platform size. We also report a set of extensive simulation results that nicely corroborates the analytical model.},
  keywords = {checkpointing, fail-stop errors, Fault tolerance, High-performance computing, Replication, silent errors},
  doi      = {10.1016/j.jpdc.2018.08.002},
}

@article{1089,
  author   = {Bosilca, George and Bouteiller, Aurelien and Guermouche, Amina and Herault, Thomas and Robert, Yves and Sens, Pierre and Dongarra, Jack},
  title    = {A Failure Detector for {HPC} Platforms},
  journal  = IJHPCA,
  volume   = {32},
  year     = {2018},
  month    = jan,
  pages    = {139--158},
  abstract = {Building an infrastructure for exascale applications requires, in addition to many other key components, a stable and efficient failure detector. This article describes the design and evaluation of a robust failure detector that can maintain and distribute the correct list of alive resources within proven and scalable bounds. The detection and distribution of the fault information follow different overlay topologies that together guarantee minimal disturbance to the applications. A virtual observation ring minimizes the overhead by allowing each node to be observed by another single node, providing an unobtrusive behavior. The propagation stage uses a nonuniform variant of a reliable broadcast over a circulant graph overlay network and guarantees a logarithmic fault propagation. Extensive simulations, together with experiments on the Titan Oak Ridge National Laboratory supercomputer, show that the algorithm performs extremely well and exhibits all the desired properties of an exascale-ready algorithm.},
  keywords = {failure detection, Fault tolerance, MPI},
  doi      = {10.1177/1094342017711505},
}

@article{989,
  author   = {Anzt, Hartwig and Dongarra, Jack and Quintana-Ort{\'\i}, Enrique S.},
  title    = {Fine-grained Bit-Flip Protection for Relaxation Methods},
  journal  = {Journal of Computational Science},
  year     = {2016},
  month    = nov,
  abstract = {Resilience is considered a challenging under-addressed issue that the high performance computing community (HPC) will have to face in order to produce reliable Exascale systems by the beginning of the next decade. As part of a push toward a resilient HPC ecosystem, in this paper we propose an error-resilient iterative solver for sparse linear systems based on stationary component-wise relaxation methods. Starting from a plain implementation of the Jacobi iteration, our approach introduces a low-cost component-wise technique that detects bit-flips, rejecting some component updates, and turning the initial synchronized solver into an asynchronous iteration. Our experimental study with sparse incomplete factorizations from a collection of real-world applications, and a practical GPU implementation, exposes the convergence delay incurred by the fault-tolerant implementation and its practical performance.},
  keywords = {Bit flips, Fault tolerance, High Performance Computing, iterative solvers, Jacobi method, sparse linear systems},
  doi      = {10.1016/j.jocs.2016.11.013},
}

@article{931,
  author   = {Benoit, Anne and Raina, Saurabh K. and Robert, Yves},
  title    = {Efficient Checkpoint/Verification Patterns},
  journal  = IJHPCA,
  year     = {2015},
  month    = jul,
  abstract = {Errors have become a critical problem for high performance computing. Checkpointing protocols are often used for error recovery after fail-stop failures. However, silent errors cannot be ignored, and their peculiarity is that such errors are identified only when the corrupted data is activated. To cope with silent errors, we need a verification mechanism to check whether the application state is correct. Checkpoints should be supplemented with verifications to detect silent errors. When a verification is successful, only the last checkpoint needs to be kept in memory because it is known to be correct. In this paper, we analytically determine the best balance of verifications and checkpoints so as to optimize platform throughput. We introduce a balanced algorithm using a pattern with p checkpoints and q verifications, which regularly interleaves both checkpoints and verifications across same-size computational chunks. We show how to compute the waste of an arbitrary pattern, and we prove that the balanced algorithm is optimal when the platform MTBF (Mean Time Between Failures) is large in front of the other parameters (checkpointing, verification and recovery costs). We conduct several simulations to show the gain achieved by this balanced algorithm for well-chosen values of p and q, compared to the base algorithm that always perform a verification just before taking a checkpoint (p = q = 1), and we exhibit gains of up to 19\%.},
  keywords = {checkpointing, Fault tolerance, High Performance Computing, silent data corruption, silent error, verification},
  doi      = {10.1177/1094342015594531},
}

@article{869,
  author   = {Bland, Wesley and Bouteiller, Aurelien and Herault, Thomas and Hursey, Joshua and Bosilca, George and Dongarra, Jack},
  title    = {An Evaluation of {User-Level Failure Mitigation} Support in {MPI}},
  journal  = {Computing},
  volume   = {95},
  year     = {2013},
  month    = dec,
  pages    = {1171--1184},
  abstract = {As the scale of computing platforms becomes increasingly extreme, the requirements for application fault tolerance are increasing as well. Techniques to address this problem by improving the resilience of algorithms have been developed, but they currently receive no support from the programming model, and without such support, they are bound to fail. This paper discusses the failure-free overhead and recovery impact of the user-level failure mitigation proposal presented in the MPI Forum. Experiments demonstrate that fault-aware MPI has little or no impact on performance for a range of applications, and produces satisfactory recovery times when there are failures.},
  keywords = {Fault tolerance, MPI, User-level fault mitigation},
  doi      = {10.1007/s00607-013-0331-3},
}