% Bibliography database; entries are listed newest first (2021 back to 2017).
%
% Conventions used in this file:
%   - author names are stored as "Last, First" so that the compound surname
%     "Le F{\`e}vre" is parsed as a surname rather than split after "Le";
%   - month uses the predefined three-letter macros (unquoted);
%   - page ranges use a double hyphen;
%   - doi holds the bare identifier, without a resolver prefix;
%   - acronyms inside titles are brace-protected against style recasing.
%
% The numeric citation keys are kept unchanged. The first two entries had an
% empty key in the exported data and were given descriptive keys.

% ---------------------------------------------------------------- 2021 ----

@article{benoit2021resilient,
  author  = {Benoit, Anne and Le F{\`e}vre, Valentin and Raghavan, Padma and Robert, Yves and Sun, Hongyang},
  title   = {Resilient scheduling heuristics for rigid parallel jobs},
  journal = {International Journal of Networking and Computing},
  volume  = {11},
  number  = {1},
  pages   = {2--26},
  year    = {2021},
}

% ---------------------------------------------------------------- 2020 ----

@inproceedings{benoit2020design,
  author    = {Benoit, Anne and Le F{\`e}vre, Valentin and Raghavan, Padma and Robert, Yves and Sun, Hongyang},
  title     = {Design and Comparison of Resilient Scheduling Heuristics for Parallel Jobs},
  booktitle = {22nd Workshop on Advances in Parallel and Distributed Computational Models (APDCM 2020)},
  publisher = {IEEE Computer Society Press},
  address   = {New Orleans, LA},
  year      = {2020},
  month     = may,
}

% ---------------------------------------------------------------- 2019 ----

@article{1312,
  author   = {Benoit, Anne and Cavelan, Aurelien and Ciorba, Florina M. and Le F{\`e}vre, Valentin and Robert, Yves},
  title    = {Combining Checkpointing and Replication for Reliable Execution of Linear Workflows with Fail-Stop and Silent Errors},
  journal  = {International Journal of Networking and Computing},
  volume   = {9},
  number   = {1},
  pages    = {2--27},
  year     = {2019},
  issn     = {2185-2847},
  url      = {http://www.ijnc.org/index.php/ijnc/article/view/194},
  keywords = {checkpoint, fail-stop error, silent error, HPC, linear workflow, Replication},
  abstract = {Large-scale platforms currently experience errors from two different sources, namely fail-stop errors (which interrupt the execution) and silent errors (which strike unnoticed and corrupt data). This work combines checkpointing and replication for the reliable execution of linear workflows on platforms subject to these two error types. While checkpointing and replication have been studied separately, their combination has not yet been investigated despite its promising potential to minimize the execution time of linear workflows in error-prone environments. Moreover, combined checkpointing and replication has not yet been studied in the presence of both fail-stop and silent errors. The combination raises new problems: for each task, we have to decide whether to checkpoint and/or replicate it to ensure its reliable execution. We provide an optimal dynamic programming algorithm of quadratic complexity to solve both problems. This dynamic programming algorithm has been validated through extensive simulations that reveal the conditions in which checkpointing only, replication only, or the combination of both techniques, lead to improved performance.},
}

@article{1301,
  author  = {Le F{\`e}vre, Valentin and Herault, Thomas and Robert, Yves and Bouteiller, Aurelien and Hori, Atsushi and Bosilca, George and Dongarra, Jack},
  title   = {Comparing the Performance of Rigid, Moldable, and Grid-Shaped Applications on Failure-Prone {HPC} Platforms},
  journal = {Parallel Computing},
  volume  = {85},
  pages   = {1--12},
  year    = {2019},
  month   = jul,
  doi     = {10.1016/j.parco.2019.02.002},
}

@article{1314,
  author   = {Han, Li and Le F{\`e}vre, Valentin and Canon, Louis-Claude and Robert, Yves and Vivien, Frederic},
  title    = {A Generic Approach to Scheduling and Checkpointing Workflows},
  journal  = {International Journal of High Performance Computing Applications},
  volume   = {33},
  pages    = {1255--1274},
  year     = {2019},
  month    = nov,
  doi      = {10.1177/1094342019866891},
  keywords = {checkpoint, fail-stop error, resilience, workflow},
}

@inproceedings{1371,
  author    = {Benoit, Anne and Herault, Thomas and Le F{\`e}vre, Valentin and Robert, Yves},
  title     = {Replication is More Efficient Than You Think},
  booktitle = {The IEEE/ACM Conference on High Performance Computing Networking, Storage and Analysis (SC19)},
  publisher = {ACM Press},
  address   = {Denver, CO},
  year      = {2019},
  month     = nov,
}

% ---------------------------------------------------------------- 2018 ----

@techreport{1320,
  author      = {Bosilca, George and Bouteiller, Aurelien and Herault, Thomas and Le F{\`e}vre, Valentin and Robert, Yves and Dongarra, Jack},
  title       = {Distributed Termination Detection for {HPC} Task-Based Environments},
  type        = {Innovative Computing Laboratory Technical Report},
  number      = {ICL-UT-18-14},
  institution = {University of Tennessee},
  year        = {2018},
  month       = jun,
  abstract    = {This paper revisits distributed termination detection algorithms in the context of high-performance computing applications in task systems. We first outline the need to efficiently detect termination in workflows for which the total number of tasks is data dependent and therefore not known statically but only revealed dynamically during execution. We introduce an efficient variant of the Credit Distribution Algorithm (CDA) and compare it to the original algorithm (HCDA) as well as to its two primary competitors: the Four Counters algorithm (4C) and the Efficient Delay-Optimal Distributed algorithm (EDOD). On the theoretical side, we analyze the behavior of each algorithm for some simplified task-based kernels and show the superiority of CDA in terms of the number of control messages. On the practical side, we provide a highly tuned implementation of each termination detection algorithm within PaRSEC and compare their performance for a variety of benchmarks, extracted from scientific applications that exhibit dynamic behaviors.},
}

@inproceedings{1214,
  author    = {Le F{\`e}vre, Valentin and Bosilca, George and Bouteiller, Aurelien and Herault, Thomas and Hori, Atsushi and Robert, Yves and Dongarra, Jack},
  title     = {Do moldable applications perform better on failure-prone {HPC} platforms?},
  booktitle = {11th Workshop on Resiliency in High Performance Computing in Clusters, Clouds, and Grids},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Verlag},
  address   = {Turin, Italy},
  year      = {2018},
  month     = aug,
  abstract  = {This paper compares the performance of different approaches to tolerate failures using checkpoint/restart when executed on large-scale failure-prone platforms. We study (i) Rigid applications, which use a constant number of processors throughout execution; (ii) Moldable applications, which can use a different number of processors after each restart following a fail-stop error; and (iii) GridShaped applications, which are moldable applications restricted to use rectangular processor grids (such as many dense linear algebra kernels). For each application type, we compute the optimal number of failures to tolerate before relinquishing the current allocation and waiting until a new resource can be allocated, and we determine the optimal yield that can be achieved. We instantiate our performance model with a realistic applicative scenario and make it publicly available for further usage.},
}

@inproceedings{1215,
  author    = {Han, Li and Le F{\`e}vre, Valentin and Canon, Louis-Claude and Robert, Yves and Vivien, Frederic},
  title     = {A Generic Approach to Scheduling and Checkpointing Workflows},
  booktitle = {The 47th International Conference on Parallel Processing (ICPP 2018)},
  publisher = {IEEE Computer Society Press},
  address   = {Eugene, OR},
  year      = {2018},
  month     = aug,
  abstract  = {This work deals with scheduling and checkpointing strategies to execute scientific workflows on failure-prone large-scale platforms. To the best of our knowledge, this work is the first to target fail-stop errors for arbitrary workflows. Most previous work addresses soft errors, which corrupt the task being executed by a processor but do not cause the entire memory of that processor to be lost, contrarily to fail-stop errors. We revisit classical mapping heuristics such as HEFT and MinMin and complement them with several checkpointing strategies. The objective is to derive an efficient trade-off between checkpointing every task (CkptAll), which is an overkill when failures are rare events, and checkpointing no task (CkptNone), which induces dramatic re-execution overhead even when only a few failures strike during execution. Contrarily to previous work, our approach applies to arbitrary workflows, not just special classes of dependence graphs such as M-SPGs (Minimal Series-Parallel Graphs). Extensive experiments report significant gain over both CkptAll and CkptNone, for a wide variety of workflows.},
}

% ---------------------------------------------------------------- 2017 ----

@inproceedings{1095,
  author    = {Benoit, Anne and Cavelan, Aurelien and Le F{\`e}vre, Valentin and Robert, Yves},
  title     = {Optimal Checkpointing Period with replicated execution on heterogeneous platforms},
  booktitle = {2017 Workshop on Fault-Tolerance for HPC at Extreme Scale},
  publisher = {IEEE Computer Society Press},
  address   = {Washington, DC},
  year      = {2017},
  month     = jun,
  doi       = {10.1145/3086157.3086165},
  abstract  = {In this paper, we design and analyze strategies to replicate the execution of an application on two different platforms subject to failures, using checkpointing on a shared stable storage. We derive the optimal pattern size~W for a periodic checkpointing strategy where both platforms concurrently try and execute W units of work before checkpointing. The first platform that completes its pattern takes a checkpoint, and the other platform interrupts its execution to synchronize from that checkpoint. We compare this strategy to a simpler on-failure checkpointing strategy, where a checkpoint is taken by one platform only whenever the other platform encounters a failure. We use first or second-order approximations to compute overheads and optimal pattern sizes, and show through extensive simulations that these models are very accurate. The simulations show the usefulness of a secondary platform to reduce execution time, even when the platforms have relatively different speeds: in average, over a wide range of scenarios, the overhead is reduced by 30\%. The simulations also demonstrate that the periodic checkpointing strategy is globally more efficient, unless platform speeds are quite close.},
}

@article{1090,
  author   = {Benoit, Anne and Cavelan, Aurelien and Le F{\`e}vre, Valentin and Robert, Yves and Sun, Hongyang},
  title    = {Towards Optimal Multi-Level Checkpointing},
  journal  = {IEEE Transactions on Computers},
  volume   = {66},
  pages    = {1212--1226},
  year     = {2017},
  month    = jul,
  doi      = {10.1109/TC.2016.2643660},
  keywords = {checkpointing, Dynamic programming, Error analysis, Heuristic algorithms, Optimized production technology, protocols, Shape},
}