@article {, title = {Revisiting I/O bandwidth-sharing strategies for HPC applications}, number = {RR-9502}, year = {2023}, month = {2023-03}, institution = {INRIA}, abstract = {This work revisits I/O bandwidth-sharing strategies for HPC applications. When several applications post concurrent I/O operations, well-known approaches include serializing these operations (First-Come First-Served) or fair-sharing the bandwidth across them (FairShare). Another recent approach, I/O-Sets, assigns priorities to the applications, which are classified into different sets based upon the average length of their iterations. We introduce several new bandwidth-sharing strategies, some of them simple greedy algorithms, and some of them more complicated to implement, and we compare them with existing ones. Our new strategies do not rely on any a priori knowledge of the behavior of the applications, such as the length of work phases, the volume of I/O operations, or some expected periodicity. We introduce a rigorous framework, namely steady-state windows, which enables us to derive bounds on the competitive ratio of all bandwidth-sharing strategies for three different objectives: minimum yield, platform utilization, and global efficiency. To the best of our knowledge, this work is the first to provide a quantitative assessment of the online competitiveness of any bandwidth-sharing strategy. This theory-oriented assessment is complemented by a comprehensive set of simulations, based upon both synthetic and realistic traces. The main conclusion is that our simple and low-complexity greedy strategies significantly outperform First-Come First-Served, FairShare and I/O-Sets, and we recommend that the I/O community implement them for further assessment.}, keywords = {bandwidth sharing, HPC applications, I/O, scheduling strategy}, url = {https://hal.inria.fr/hal-04038011}, author = {Anne Benoit and Thomas Herault and Lucas Perotin and Yves Robert and Frederic Vivien} } @conference {, title = {When to checkpoint at the end of a fixed-length reservation?}, booktitle = {Fault Tolerance for HPC at eXtreme Scales (FTXS) Workshop}, year = {2023}, month = {2023-08}, address = {Denver, United States}, abstract = {This work considers an application executing for a fixed duration, namely the length of the reservation that it has been granted. The checkpoint duration is a stochastic random variable that obeys some well-known probability distribution law. The question is when to take a checkpoint towards the end of the execution, so that the expectation of the work done is maximized. We address two scenarios. In the first scenario, a checkpoint can be taken at any time; despite its simplicity, this natural problem has not been considered yet (to the best of our knowledge). We provide the optimal solution for a variety of probability distribution laws modeling checkpoint duration. The second scenario is more involved: the application is a linear workflow consisting of a chain of tasks with IID stochastic execution times, and a checkpoint can be taken only at the end of a task. First, we introduce a static strategy where, at the beginning of the execution, we compute the optimal number of tasks to execute before the application checkpoints. Then, we design a dynamic strategy that decides whether to checkpoint or to continue executing at the end of each task.
We instantiate this second scenario with several examples of probability distribution laws for task durations.}, url = {https://inria.hal.science/hal-04215554}, author = {Quentin Barbut and Anne Benoit and Thomas Herault and Yves Robert and Frederic Vivien} } @inproceedings {, title = {Checkpointing {\`a} la Young/Daly: An Overview}, journal = {IC3-2022: Proceedings of the 2022 Fourteenth International Conference on Contemporary Computing}, year = {2022}, month = {2022-08}, pages = {701-710}, publisher = {ACM Press}, address = {Noida, India}, abstract = {The Young/Daly formula provides an approximation of the optimal checkpoint period for a parallel application executing on a supercomputing platform. The Young/Daly formula was originally designed for preemptible tightly-coupled applications. We provide some background and survey various application scenarios to assess the usefulness and limitations of the formula.}, isbn = {9781450396752}, doi = {10.1145/3549206}, url = {https://dl.acm.org/doi/fullHtml/10.1145/3549206.3549328}, author = {Anne Benoit and Yishu Du and Thomas Herault and Loris Marchal and Guillaume Pallez and Lucas Perotin and Yves Robert and Hongyang Sun and Frederic Vivien} } @article {, title = {Optimal Checkpointing Strategies for Iterative Applications}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {33}, year = {2022}, month = {2022-03}, pages = {507-522}, doi = {10.1109/TPDS.2021.3099440}, url = {https://ieeexplore.ieee.org/document/9495174}, author = {Yishu Du and Guillaume Pallez and Loris Marchal and Yves Robert} } @article {, title = {Budget-aware scheduling algorithms for scientific workflows with stochastic task weights on IaaS Cloud platforms}, journal = {Concurrency and Computation: Practice and Experience}, volume = {33}, number = {17}, year = {2021}, pages = {e6065}, doi = {https://doi.org/10.1002/cpe.6065}, author = {Eddy Caron and Yves Caniou and Aur{\'e}lie Kong Win Chang and Yves Robert} } @conference {, title = {Distributed-Memory Multi-GPU Block-Sparse Tensor Contraction for Electronic Structure}, booktitle = {35th IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2021)}, year = {2021}, month = {2021-05}, publisher = {IEEE}, organization = {IEEE}, address = {Portland, OR}, abstract = {Many domains of scientific simulation (chemistry, condensed matter physics, data science) increasingly eschew dense tensors for block-sparse tensors, sometimes with additional structure (recursive hierarchy, rank sparsity, etc.). Distributed-memory parallel computation with block-sparse tensorial data is paramount to minimize the time-to-solution (e.g., to study dynamical problems or for real-time analysis) and to accommodate problems of realistic size that are too large to fit into the host/device memory of a single node equipped with accelerators. Unfortunately, computation with such irregular data structures is a poor match to the dominant imperative, bulk-synchronous parallel programming model. In this paper, we focus on the critical element of block-sparse tensor algebra, namely binary tensor contraction, and report on an efficient and scalable implementation using the task-focused PaRSEC runtime.
High performance of the block-sparse tensor contraction on the Summit supercomputer is demonstrated for synthetic data as well as for real data involved in electronic structure simulations of unprecedented size.}, keywords = {block-sparse matrix multiplication, distributed-memory, Electronic structure, multi-GPU node, parsec, tensor contraction}, url = {https://hal.inria.fr/hal-02970659/document}, author = {Thomas Herault and Yves Robert and George Bosilca and Robert Harrison and Cannada Lewis and Edward Valeev and Jack Dongarra} } @article {, title = {{Dynamic DAG scheduling under memory constraints for shared-memory platforms}}, journal = {Int. J. of Networking and Computing}, volume = {11}, number = {1}, year = {2021}, pages = {27-49}, author = {Gabriel Bathie and Loris Marchal and Yves Robert and Samuel Thibault} } @inproceedings {, title = {Evaluating Task Dropping Strategies for Overloaded Real-Time Systems (Work-In-Progress)}, journal = {42nd Real Time Systems Symposium (RTSS)}, year = {2021}, publisher = {IEEE Computer Society Press}, author = {Yiqin Gao and Guillaume Pallez and Yves Robert and Frederic Vivien} } @inproceedings {, title = {Max-Stretch Minimization on an Edge-Cloud Platform}, journal = {IPDPS{\textquoteright}2021, the 34th IEEE International Parallel and Distributed Processing Symposium}, year = {2021}, publisher = {IEEE Computer Society Press}, author = {Anne Benoit and Redouane Elghazi and Yves Robert} } @article {, title = {{Resilient scheduling heuristics for rigid parallel jobs}}, journal = {Int. J. of Networking and Computing}, volume = {11}, number = {1}, year = {2021}, pages = {2-26}, author = {Anne Benoit and Valentin Le F{\`e}vre and Padma Raghavan and Yves Robert and Hongyang Sun} } @conference {, title = {Design and Comparison of Resilient Scheduling Heuristics for Parallel Jobs}, booktitle = {22nd Workshop on Advances in Parallel and Distributed Computational Models (APDCM 2020)}, year = {2020}, month = {2020-05}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {New Orleans, LA}, author = {Anne Benoit and Valentin Le F{\`e}vre and Padma Raghavan and Yves Robert and Hongyang Sun} } @conference {, title = {Energy-Aware Strategies for Reliability-Oriented Real-Time Task Allocation on Heterogeneous Platforms}, booktitle = {49th International Conference on Parallel Processing (ICPP 2020)}, year = {2020}, publisher = {ACM Press}, organization = {ACM Press}, address = {Edmonton, AB, Canada}, author = {Li Han and Yiqin Gao and Jing Liu and Yves Robert and Frederic Vivien} } @conference {1372, title = {Improved Energy-Aware Strategies for Periodic Real-Time Tasks under Reliability Constraints}, booktitle = {40th IEEE Real-Time Systems Symposium (RTSS 2019)}, year = {2020}, month = {2020-02}, publisher = {IEEE Press}, organization = {IEEE Press}, address = {York, UK}, author = {Li Han and Louis-Claude Canon and Jing Liu and Yves Robert and Frederic Vivien} } @conference {, title = {Reservation and Checkpointing Strategies for Stochastic Jobs}, booktitle = {34th IEEE International Parallel and Distributed Processing Symposium (IPDPS 2020)}, year = {2020}, month = {2020-05}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {New Orleans, LA}, author = {Ana Gainaru and Brice Goglin and Valentin Honor{\'e} and Guillaume Pallez and Padma Raghavan and Yves Robert and Hongyang Sun} } @conference {, title = {Revisiting Dynamic DAG Scheduling under Memory
Constraints for Shared-Memory Platforms}, booktitle = {22nd Workshop on Advances in Parallel and Distributed Computational Models (APDCM 2020)}, year = {2020}, month = {2020-05}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {New Orleans, LA}, author = {Gabriel Bathie and Loris Marchal and Yves Robert and Samuel Thibault} } @conference {, title = {Robustness of the Young/Daly Formula for Stochastic Iterative Applications}, booktitle = {49th International Conference on Parallel Processing (ICPP 2020)}, year = {2020}, month = {2020-08}, publisher = {ACM Press}, organization = {ACM Press}, address = {Edmonton, AB, Canada}, author = {Yishu Du and Loris Marchal and Guillaume Pallez and Yves Robert} } @article {1305, title = {Checkpointing Strategies for Shared High-Performance Computing Platforms}, journal = {International Journal of Networking and Computing}, volume = {9}, number = {1}, year = {2019}, pages = {28{\textendash}52}, abstract = {Input/output (I/O) from various sources often contend for scarcely available bandwidth. For example, checkpoint/restart (CR) protocols can help to ensure application progress in failure-prone environments. However, CR I/O alongside an application{\textquoteright}s normal, requisite I/O can increase I/O contention and might negatively impact performance. In this work, we consider different aspects (system-level scheduling policies and hardware) that optimize the overall performance of concurrently executing CR-based applications that share I/O resources. We provide a theoretical model and derive a set of necessary constraints to minimize the global waste on a given platform. Our results demonstrate that Young/Daly{\textquoteright}s optimal checkpoint interval, despite providing a sensible metric for a single, undisturbed application, is not sufficient to optimally address resource contention at scale. We show that by combining optimal checkpointing periods with contention-aware system-level I/O scheduling strategies, we can significantly improve overall application performance and maximize the platform throughput. Finally, we evaluate how specialized hardware, namely burst buffers, may help to mitigate the I/O contention problem. Overall, these results provide critical analysis and direct guidance on how to design efficient, CR-ready, large-scale platforms without a large investment in the I/O subsystem.}, issn = {2185-2847}, url = {http://www.ijnc.org/index.php/ijnc/article/view/195}, author = {Thomas Herault and Yves Robert and Aurelien Bouteiller and Dorian Arnold and Kurt Ferreira and George Bosilca and Jack Dongarra} } @article {1312, title = {Combining Checkpointing and Replication for Reliable Execution of Linear Workflows with Fail-Stop and Silent Errors}, journal = {International Journal of Networking and Computing}, volume = {9}, number = {1}, year = {2019}, month = {2019}, pages = {2-27}, abstract = {Large-scale platforms currently experience errors from two different sources, namely fail-stop errors (which interrupt the execution) and silent errors (which strike unnoticed and corrupt data). This work combines checkpointing and replication for the reliable execution of linear workflows on platforms subject to these two error types. While checkpointing and replication have been studied separately, their combination has not yet been investigated despite its promising potential to minimize the execution time of linear workflows in error-prone environments.
Moreover, combined checkpointing and replication has not yet been studied in the presence of both fail-stop and silent errors. The combination raises new problems: for each task, we have to decide whether to checkpoint and/or replicate it to ensure its reliable execution. We provide an optimal dynamic programming algorithm of quadratic complexity to solve both problems. This dynamic programming algorithm has been validated through extensive simulations that reveal the conditions in which checkpointing only, replication only, or the combination of both techniques, leads to improved performance.}, keywords = {checkpoint, fail-stop error, silent error, HPC, linear workflow, Replication}, issn = {2185-2847}, url = {http://www.ijnc.org/index.php/ijnc/article/view/194}, author = {Anne Benoit and Aurelien Cavelan and Florina M. Ciorba and Valentin Le F{\`e}vre and Yves Robert} } @article {1301, title = {Comparing the Performance of Rigid, Moldable, and Grid-Shaped Applications on Failure-Prone HPC Platforms}, journal = {Parallel Computing}, volume = {85}, year = {2019}, month = {2019-07}, pages = {1{\textendash}12}, doi = {https://doi.org/10.1016/j.parco.2019.02.002}, author = {Valentin Le F{\`e}vre and Thomas Herault and Yves Robert and Aurelien Bouteiller and Atsushi Hori and George Bosilca and Jack Dongarra} } @article {1238, title = {Computing Dense Tensor Decompositions with Optimal Dimension Trees}, journal = {Algorithmica}, volume = {81}, year = {2019}, month = {2019-05}, pages = {2092{\textendash}2121}, keywords = {CP decomposition, Dimension tree, Tensor computations, Tucker decomposition}, issn = {1432-0541}, doi = {https://doi.org/10.1007/s00453-018-0525-3}, author = {Oguz Kaya and Yves Robert} } @article {1313, title = {Co-Scheduling HPC Workloads on Cache-Partitioned CMP Platforms}, journal = {International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = {2019-11}, pages = {1221-1239}, abstract = {With the recent advent of many-core architectures such as chip multiprocessors (CMPs), the number of processing units accessing a global shared memory is constantly increasing. Co-scheduling techniques are used to improve application throughput on such architectures, but sharing resources often generates critical interferences. In this article, we focus on the interferences in the last level of cache (LLC) and use the Cache Allocation Technology (CAT) recently provided by Intel to partition the LLC and give each co-scheduled application its own cache area. We consider m iterative HPC applications running concurrently and answer the following questions: (i) How to precisely model the behavior of these applications on the cache-partitioned platform? and (ii) how many cores and cache fractions should be assigned to each application to maximize the platform efficiency? Here, platform efficiency is defined as maximizing the performance either globally, or as guaranteeing a fixed ratio of iterations per second for each application.
Through extensive experiments using CAT, we demonstrate the impact of cache partitioning when multiple HPC applications are co-scheduled onto CMP platforms.}, keywords = {cache partitioning, chip multiprocessor, co-scheduling, HPC application}, doi = {https://doi.org/10.1177/1094342019846956}, author = {Guillaume Aupy and Anne Benoit and Brice Goglin and Lo{\"\i}c Pottier and Yves Robert} } @article {1314, title = {A Generic Approach to Scheduling and Checkpointing Workflows}, journal = {International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = {2019-11}, pages = {1255-1274}, keywords = {checkpoint, fail-stop error, resilience, workflow}, doi = {https://doi.org/10.1177/1094342019866891}, author = {Li Han and Valentin Le F{\`e}vre and Louis-Claude Canon and Yves Robert and Frederic Vivien} } @conference {1410, title = {Generic Matrix Multiplication for Multi-GPU Accelerated Distributed-Memory Platforms over PaRSEC}, booktitle = {ScalA{\textquoteright}19: 10th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, year = {2019}, month = {2019-11}, publisher = {IEEE}, organization = {IEEE}, address = {Denver, CO}, author = {Thomas Herault and Yves Robert and George Bosilca and Jack Dongarra} } @conference {1371, title = {Replication is More Efficient Than You Think}, booktitle = {The IEEE/ACM Conference on High Performance Computing Networking, Storage and Analysis (SC19)}, year = {2019}, month = {2019-11}, publisher = {ACM Press}, organization = {ACM Press}, address = {Denver, CO}, author = {Anne Benoit and Thomas Herault and Valentin Le F{\`e}vre and Yves Robert} } @conference {1316, title = {Reservation Strategies for Stochastic Jobs}, booktitle = {33rd IEEE International Parallel and Distributed Processing Symposium (IPDPS 2019)}, year = {2019}, month = {2019-05}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Rio de Janeiro, Brazil}, author = {Guillaume Aupy and Ana Gainaru and Valentin Honor{\'e} and Padma Raghavan and Yves Robert and Hongyang Sun} } @conference {1339, title = {Scheduling Independent Stochastic Tasks on Heterogeneous Cloud Platforms}, booktitle = {IEEE Cluster 2019}, year = {2019}, month = {2019-09}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Albuquerque, New Mexico}, author = {Yiqin Gao and Louis-Claude Canon and Yves Robert and Frederic Vivien} } @article {1315, title = {Scheduling Independent Stochastic Tasks under Deadline and Budget Constraints}, journal = {International Journal of High Performance Computing Applications}, volume = {34}, year = {2019}, month = {2019-06}, pages = {246-264}, abstract = {This article discusses scheduling strategies for the problem of maximizing the expected number of tasks that can be executed on a cloud platform within a given budget and under a deadline constraint. The execution times of tasks follow independent and identically distributed probability laws. The main questions are how many processors to enroll and whether and when to interrupt tasks that have been executing for some time. We provide complexity results and an asymptotically optimal strategy for the problem instance with discrete probability distributions and without deadline. 
We extend the latter strategy to the general case with continuous distributions and a deadline, and we design an efficient heuristic which is shown to outperform standard approaches when running simulations for a variety of useful distribution laws.}, doi = {https://doi.org/10.1177/1094342019852135}, author = {Louis-Claude Canon and Aur{\'e}lie Kong Win Chang and Yves Robert and Frederic Vivien} } @conference {1197, title = {Budget-Aware Scheduling Algorithms for Scientific Workflows with Stochastic Task Weights on Heterogeneous IaaS Cloud Platforms}, booktitle = {2018 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2018}, month = {2018-05}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {This paper introduces several budget-aware algorithms to deploy scientific workflows on IaaS cloud platforms, where users can request Virtual Machines (VMs) of different types, each with specific cost and speed parameters. We use a realistic application/platform model with stochastic task weights, and VMs communicating through a datacenter. We extend two well-known algorithms, MinMin and HEFT, and make scheduling decisions based upon machine availability and available budget. During the mapping process, the budget-aware algorithms make conservative assumptions to avoid exceeding the initial budget; we further improve our results with refined versions that aim at re-scheduling some tasks onto faster VMs, thereby spending any budget fraction left over by the first allocation. These refined variants are much more time-consuming than the former algorithms, so there is a trade-off to be found in terms of scalability. We report an extensive set of simulations with workflows from the Pegasus benchmark suite. Most of the time, our budget-aware algorithms succeed in achieving efficient makespans while enforcing the given budget, despite (i) the uncertainty in task weights and (ii) the heterogeneity of VMs in both cost and speed values.}, keywords = {budget aware algorithm, multi criteria scheduling, workflow}, doi = {10.1109/IPDPSW.2018.00014}, author = {Yves Caniou and Eddy Caron and Aur{\'e}lie Kong Win Chang and Yves Robert} } @article {1187, title = {Checkpointing Workflows for Fail-Stop Errors}, journal = {IEEE Transactions on Computers}, volume = {67}, year = {2018}, month = {2018-08}, pages = {1105{\textendash}1120}, abstract = {We consider the problem of orchestrating the execution of workflow applications structured as Directed Acyclic Graphs (DAGs) on parallel computing platforms that are subject to fail-stop failures. The objective is to minimize expected overall execution time, or makespan. A solution to this problem consists of a schedule of the workflow tasks on the available processors and of a decision of which application data to checkpoint to stable storage, so as to mitigate the impact of processor failures. To address this challenge, we consider a restricted class of graphs, Minimal Series-Parallel Graphs (M-SPGS), which is relevant to many real-world workflow applications. For this class of graphs, we propose a recursive list-scheduling algorithm that exploits the M-SPG structure to assign sub-graphs to individual processors, and uses dynamic programming to decide how to checkpoint these sub-graphs.
We assess the performance of our algorithm for production workflow configurations, comparing it to an approach in which all application data is checkpointed and an approach in which no application data is checkpointed. Results demonstrate that our algorithm outperforms both the former approach, because of lower checkpointing overhead, and the latter approach, because of better resilience to failures.}, keywords = {checkpoint, fail-stop error, resilience, workflow}, url = {http://ieeexplore.ieee.org/document/8279499/}, author = {Li Han and Louis-Claude Canon and Henri Casanova and Yves Robert and Frederic Vivien} } @article {1193, title = {Computing the Expected Makespan of Task Graphs in the Presence of Silent Errors}, journal = {Parallel Computing}, volume = {75}, year = {2018}, month = {2018-07}, pages = {41{\textendash}60}, abstract = {Applications structured as Directed Acyclic Graphs (DAGs) of tasks occur in many domains, including popular scientific workflows. DAG scheduling has thus received an enormous amount of attention. Many of the popular DAG scheduling heuristics make scheduling decisions based on path lengths. At large scale, compute platforms are subject to various types of failures with non-negligible probabilities of occurrence. Failures that have recently received increased attention are {\textquotedblleft}silent errors,{\textquotedblright} which cause data corruption. Tolerating silent errors is done by checking the validity of computed results and possibly re-executing a task from scratch. The execution time of a task then becomes a random variable, and so do path lengths in a DAG. Unfortunately, computing the expected makespan of a DAG (and equivalently computing expected path lengths in a DAG) is a computationally difficult problem. Consequently, designing effective scheduling heuristics in this context is challenging. In this work, we propose an algorithm that computes a first-order approximation of the expected makespan of a DAG when tasks are subject to silent errors. We find that our proposed approximation outperforms previously proposed approaches both in terms of approximation error and of speed.}, keywords = {Expected makespan, fault-tolerance, scheduling, Scientific workflows, silent errors, Task graphs}, doi = {https://doi.org/10.1016/j.parco.2018.03.004}, author = {Henri Casanova and Julien Herrmann and Yves Robert} } @article {1218, title = {Coping with Silent and Fail-Stop Errors at Scale by Combining Replication and Checkpointing}, journal = {Journal of Parallel and Distributed Computing}, volume = {122}, year = {2018}, month = {2018-12}, pages = {209{\textendash}225}, abstract = {This paper provides a model and an analytical study of replication as a technique to cope with silent errors, as well as a mixture of both silent and fail-stop errors on large-scale platforms. Compared with fail-stop errors that are immediately detected when they occur, silent errors require a detection mechanism. To detect silent errors, many application-specific techniques are available, either based on algorithms (e.g., ABFT), invariant preservation or data analytics, but replication remains the most transparent and least intrusive technique. We explore the right level (duplication, triplication or more) of replication for two frameworks: (i) when the platform is subject to only silent errors, and (ii) when the platform is subject to both silent and fail-stop errors.
A higher level of replication is more expensive in terms of resource usage but makes it possible to tolerate more errors and even to correct some errors, hence there is a trade-off to be found. Replication is combined with checkpointing and comes with two flavors: process replication and group replication. Process replication applies to message-passing applications with communicating processes. Each process is replicated, and the platform is composed of process pairs, or triplets. Group replication applies to black-box applications, whose parallel execution is replicated several times. The platform is partitioned into two halves (or three thirds). In both scenarios, results are compared before each checkpoint, which is taken only when both results (duplication) or two out of three results (triplication) coincide. Otherwise, one or more silent errors have been detected, and the application rolls back to the last checkpoint, as well as when fail-stop errors have struck. We provide a detailed analytical study for all of these scenarios, with formulas to decide, for each scenario, the optimal parameters as a function of the error rate, checkpoint cost, and platform size. We also report a set of extensive simulation results that nicely corroborates the analytical model.}, keywords = {checkpointing, fail-stop errors, Fault tolerance, High-performance computing, Replication, silent errors}, doi = {https://doi.org/10.1016/j.jpdc.2018.08.002}, author = {Anne Benoit and Aurelien Cavelan and Franck Cappello and Padma Raghavan and Yves Robert and Hongyang Sun} } @article {1198, title = {Co-Scheduling Amdahl Applications on Cache-Partitioned Systems}, journal = {International Journal of High Performance Computing Applications}, volume = {32}, year = {2018}, month = {2018-01}, pages = {123{\textendash}138}, abstract = {Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are as follows: (i) which proportion of cache and (ii) how many processors should be given to each application? In this article, we provide answers to (i) and (ii) for Amdahl applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for Amdahl applications.
Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.}, keywords = {cache partitioning, co-scheduling, complexity results}, doi = {https://doi.org/10.1177/1094342017710806}, author = {Guillaume Aupy and Anne Benoit and Sicheng Dai and Lo{\"\i}c Pottier and Padma Raghavan and Yves Robert and Manu Shantharam} } @conference {1217, title = {Co-Scheduling HPC Workloads on Cache-Partitioned CMP Platforms}, booktitle = {Cluster 2018}, year = {2018}, month = {2018-09}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Belfast, UK}, author = {Guillaume Aupy and Anne Benoit and Brice Goglin and Lo{\"\i}c Pottier and Yves Robert} } @techreport {1320, title = {Distributed Termination Detection for HPC Task-Based Environments}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-14}, year = {2018}, month = {2018-06}, publisher = {University of Tennessee}, abstract = {This paper revisits distributed termination detection algorithms in the context of high-performance computing applications in task systems. We first outline the need to efficiently detect termination in workflows for which the total number of tasks is data dependent and therefore not known statically but only revealed dynamically during execution. We introduce an efficient variant of the Credit Distribution Algorithm (CDA) and compare it to the original algorithm (HCDA) as well as to its two primary competitors: the Four Counters algorithm (4C) and the Efficient Delay-Optimal Distributed algorithm (EDOD). On the theoretical side, we analyze the behavior of each algorithm for some simplified task-based kernels and show the superiority of CDA in terms of the number of control messages. On the practical side, we provide a highly tuned implementation of each termination detection algorithm within PaRSEC and compare their performance for a variety of benchmarks, extracted from scientific applications that exhibit dynamic behaviors.}, author = {George Bosilca and Aurelien Bouteiller and Thomas Herault and Valentin Le F{\`e}vre and Yves Robert and Jack Dongarra} } @conference {1214, title = {Do moldable applications perform better on failure-prone HPC platforms?}, booktitle = {11th Workshop on Resiliency in High Performance Computing in Clusters, Clouds, and Grids}, series = {LNCS}, year = {2018}, month = {2018-08}, publisher = {Springer Verlag}, organization = {Springer Verlag}, address = {Turin, Italy}, abstract = {This paper compares the performance of different approaches to tolerate failures using checkpoint/restart when executed on large-scale failure-prone platforms. We study (i) Rigid applications, which use a constant number of processors throughout execution; (ii) Moldable applications, which can use a different number of processors after each restart following a fail-stop error; and (iii) GridShaped applications, which are moldable applications restricted to use rectangular processor grids (such as many dense linear algebra kernels). For each application type, we compute the optimal number of failures to tolerate before relinquishing the current allocation and waiting until a new resource can be allocated, and we determine the optimal yield that can be achieved. 
We instantiate our performance model with a realistic application scenario and make it publicly available for further usage.}, author = {Valentin Le F{\`e}vre and George Bosilca and Aurelien Bouteiller and Thomas Herault and Atsushi Hori and Yves Robert and Jack Dongarra} } @article {1089, title = {A Failure Detector for HPC Platforms}, journal = {The International Journal of High Performance Computing Applications}, volume = {32}, year = {2018}, month = {2018-01}, pages = {139{\textendash}158}, abstract = {Building an infrastructure for exascale applications requires, in addition to many other key components, a stable and efficient failure detector. This article describes the design and evaluation of a robust failure detector that can maintain and distribute the correct list of alive resources within proven and scalable bounds. The detection and distribution of the fault information follow different overlay topologies that together guarantee minimal disturbance to the applications. A virtual observation ring minimizes the overhead by allowing each node to be observed by another single node, providing an unobtrusive behavior. The propagation stage uses a nonuniform variant of a reliable broadcast over a circulant graph overlay network and guarantees a logarithmic fault propagation. Extensive simulations, together with experiments on the Titan supercomputer at Oak Ridge National Laboratory, show that the algorithm performs extremely well and exhibits all the desired properties of an exascale-ready algorithm.}, keywords = {failure detection, Fault tolerance, MPI}, doi = {https://doi.org/10.1177/1094342017711505}, author = {George Bosilca and Aurelien Bouteiller and Amina Guermouche and Thomas Herault and Yves Robert and Pierre Sens and Jack Dongarra} } @conference {1215, title = {A Generic Approach to Scheduling and Checkpointing Workflows}, booktitle = {The 47th International Conference on Parallel Processing (ICPP 2018)}, year = {2018}, month = {2018-08}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Eugene, OR}, abstract = {This work deals with scheduling and checkpointing strategies to execute scientific workflows on failure-prone large-scale platforms. To the best of our knowledge, this work is the first to target fail-stop errors for arbitrary workflows. Most previous work addresses soft errors, which corrupt the task being executed by a processor but do not cause the entire memory of that processor to be lost, contrary to fail-stop errors. We revisit classical mapping heuristics such as HEFT and MinMin and complement them with several checkpointing strategies. The objective is to derive an efficient trade-off between checkpointing every task (CkptAll), which is an overkill when failures are rare events, and checkpointing no task (CkptNone), which induces dramatic re-execution overhead even when only a few failures strike during execution. Contrary to previous work, our approach applies to arbitrary workflows, not just special classes of dependence graphs such as M-SPGs (Minimal Series-Parallel Graphs).
Extensive experiments report significant gains over both CkptAll and CkptNone, for a wide variety of workflows.}, author = {Li Han and Valentin Le F{\`e}vre and Louis-Claude Canon and Yves Robert and Frederic Vivien} } @article {1239, title = {Multi-Level Checkpointing and Silent Error Detection for Linear Workflows}, journal = {Journal of Computational Science}, volume = {28}, year = {2018}, month = {2018-09}, pages = {398{\textendash}415}, abstract = {We focus on High Performance Computing (HPC) workflows whose dependency graph forms a linear chain, and we extend single-level checkpointing in two important directions. Our first contribution targets silent errors, and combines in-memory checkpoints with both partial and guaranteed verifications. Our second contribution deals with multi-level checkpointing for fail-stop errors. We present sophisticated dynamic programming algorithms that return the optimal solution for each problem in polynomial time. We also show how to combine all these techniques and solve the problem with both fail-stop and silent errors. Simulation results demonstrate that these extensions lead to significantly improved performance compared to the standard single-level checkpointing algorithm.}, author = {Anne Benoit and Aurelien Cavelan and Yves Robert and Hongyang Sun} } @conference {1196, title = {Optimal Cooperative Checkpointing for Shared High-Performance Computing Platforms}, booktitle = {2018 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW), Best Paper Award}, year = {2018}, month = {2018-05}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {In high-performance computing environments, input/output (I/O) from various sources often contend for scarce available bandwidth. Adding to the I/O operations inherent to the failure-free execution of an application, I/O from checkpoint/restart (CR) operations (used to ensure progress in the presence of failures) places an additional burden as it increases I/O contention, leading to degraded performance. In this work, we consider a cooperative scheduling policy that optimizes the overall performance of concurrently executing CR-based applications which share valuable I/O resources. First, we provide a theoretical model and then derive a set of necessary constraints needed to minimize the global waste on the platform. Our results demonstrate that the optimal checkpoint interval, as defined by Young/Daly, despite providing a sensible metric for a single application, is not sufficient to optimally address resource contention at the platform scale. We therefore show that combining optimal checkpointing periods with I/O scheduling strategies can provide a significant improvement on the overall application performance, thereby maximizing platform throughput.
Overall, these results provide critical analysis and direct guidance on checkpointing large-scale workloads in the presence of competing I/O while minimizing the impact on application performance.}, doi = {10.1109/IPDPSW.2018.00127}, author = {Thomas Herault and Yves Robert and Aurelien Bouteiller and Dorian Arnold and Kurt Ferreira and George Bosilca and Jack Dongarra} } @conference {1216, title = {A Performance Model to Execute Workflows on High-Bandwidth Memory Architectures}, booktitle = {The 47th International Conference on Parallel Processing (ICPP 2018)}, year = {2018}, month = {2018-08}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Eugene, OR}, abstract = {This work presents a realistic performance model to execute scientific workflows on high-bandwidth memory architectures such as the Intel Knights Landing. We provide a detailed analysis of the execution time on such platforms, taking into account transfers from both fast and slow memory and their overlap with computations. We discuss several scheduling and mapping strategies: not only must tasks be assigned to computing resources, but one also has to decide which fraction of input and output data will reside in fast memory, and which will have to stay in slow memory. Extensive simulations allow us to assess the impact of the mapping strategies on performance. We also conduct actual experiments for a simple 1D Gauss-Seidel kernel, which assess the accuracy of the model and further demonstrate the importance of a tuned memory management. Altogether, our model and results lay the foundations for further studies and experiments on dual-memory systems.}, author = {Anne Benoit and Swann Perarnau and Lo{\"\i}c Pottier and Yves Robert} } @inbook {1261, title = {Scheduling for Fault-Tolerance: An Introduction}, booktitle = {Topics in Parallel and Distributed Computing}, year = {2018}, pages = {143{\textendash}170}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, isbn = {978-3-319-93108-1}, doi = {10.1007/978-3-319-93109-8}, author = {Guillaume Aupy and Yves Robert} } @conference {1099, title = {Assuming failure independence: are we right to be wrong?}, booktitle = {The 3rd International Workshop on Fault Tolerant Systems (FTS)}, year = {2017}, month = {2017-09}, publisher = {IEEE}, organization = {IEEE}, address = {Honolulu, Hawaii}, abstract = {This paper revisits the failure temporal independence hypothesis, which is omnipresent in the analysis of resilience methods for HPC. We explain why a previous approach is incorrect, and we propose a new method to detect failure cascades, i.e., series of non-independent consecutive failures. We use this new method to assess whether public archive failure logs contain failure cascades. Then we design and compare several cascade-aware checkpointing algorithms to quantify the maximum gain that could be obtained, and we report extensive simulation results with archive and synthetic failure logs. Altogether, there are a few logs that contain cascades, but we show that the gain that can be achieved from this knowledge is not significant.
The conclusion is that we can wrongly, but safely, assume failure independence!}, author = {Guillaume Aupy and Yves Robert and Frederic Vivien} } @conference {1093, title = {Bidiagonalization and R-Bidiagonalization: Parallel Tiled Algorithms, Critical Paths and Distributed-Memory Implementation}, booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2017}, month = {2017-05}, publisher = {IEEE}, organization = {IEEE}, address = {Orlando, FL}, abstract = {We study tiled algorithms for going from a "full" matrix to a condensed "band bidiagonal" form using orthogonal transformations: (i) the tiled bidiagonalization algorithm BIDIAG, which is a tiled version of the standard scalar bidiagonalization algorithm; and (ii) the R-bidiagonalization algorithm R-BIDIAG, which is a tiled version of the algorithm which consists in first performing the QR factorization of the initial matrix, then performing the band-bidiagonalization of the R-factor. For both BIDIAG and R-BIDIAG, we use four main types of reduction trees, namely FLATTS, FLATTT, GREEDY, and a newly introduced auto-adaptive tree, AUTO. We provide a study of critical path lengths for these tiled algorithms, which shows that (i) R-BIDIAG has a shorter critical path length than BIDIAG for tall and skinny matrices, and (ii) GREEDY-based schemes are much better than earlier proposed algorithms with unbounded resources. We provide experiments on a single multicore node, and on a few multicore nodes of a parallel distributed shared-memory system, to show the superiority of the new algorithms on a variety of matrix sizes, matrix shapes and core counts.}, keywords = {Algorithm design and analysis, Approximation algorithms, Kernel, Multicore processing, Shape, Software algorithms, Transforms}, doi = {10.1109/IPDPS.2017.46}, author = {Mathieu Faverge and Julien Langou and Yves Robert and Jack Dongarra} } @conference {1098, title = {Checkpointing Workflows for Fail-Stop Errors}, booktitle = {IEEE Cluster}, year = {2017}, month = {2017-09}, publisher = {IEEE}, organization = {IEEE}, address = {Honolulu, Hawaii}, abstract = {We consider the problem of orchestrating the execution of workflow applications structured as Directed Acyclic Graphs (DAGs) on parallel computing platforms that are subject to fail-stop failures. The objective is to minimize expected overall execution time, or makespan. A solution to this problem consists of a schedule of the workflow tasks on the available processors and of a decision of which application data to checkpoint to stable storage, so as to mitigate the impact of processor failures. For general DAGs this problem is hopelessly intractable. In fact, given a solution, computing its expected makespan is still a difficult problem. To address this challenge, we consider a restricted class of graphs, Minimal Series-Parallel Graphs (M-SPGS). It turns out that many real-world workflow applications are naturally structured as M-SPGS. For this class of graphs, we propose a recursive list-scheduling algorithm that exploits the M-SPG structure to assign sub-graphs to individual processors, and uses dynamic programming to decide which tasks in these sub-graphs should be checkpointed. Furthermore, it is possible to efficiently compute the expected makespan for the solution produced by this algorithm, using a first-order approximation of task weights and existing evaluation algorithms for 2-state probabilistic DAGs.
We assess the performance of our algorithm for production workflow configurations, comparing it to (i) an approach in which all application data is checkpointed, which corresponds to the standard way in which most production workflows are executed today; and (ii) an approach in which no application data is checkpointed. Our results demonstrate that our algorithm strikes a good compromise between these two approaches, leading to lower checkpointing overhead than the former and to better resilience to failure than the latter.}, author = {Li Han and Louis-Claude Canon and Henri Casanova and Yves Robert and Frederic Vivien} } @conference {1094, title = {Co-Scheduling Algorithms for Cache-Partitioned Systems}, booktitle = {19th Workshop on Advances in Parallel and Distributed Computational Models}, year = {2017}, month = {2017-05}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Orlando, FL}, abstract = {Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are: (i) which proportion of cache and (ii) how many processors should be given to each application? Here, we assign rational numbers of processors to each application, since they can be shared across applications through multi-threading. In this paper, we provide answers to (i) and (ii) for perfectly parallel applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for general applications. Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.}, keywords = {Computational modeling, Degradation, Interference, Mathematical model, Program processors, Supercomputers, Throughput}, doi = {10.1109/IPDPSW.2017.60}, author = {Guillaume Aupy and Anne Benoit and Lo{\"\i}c Pottier and Padma Raghavan and Yves Robert and Manu Shantharam} } @article {1165, title = {Design and Implementation of the PULSAR Programming System for Large Scale Computing}, journal = {Supercomputing Frontiers and Innovations}, volume = {4}, year = {2017}, abstract = {The objective of the PULSAR project was to design a programming model suitable for large scale machines with complex memory hierarchies, and to deliver a prototype implementation of a runtime system supporting that model. PULSAR tackled the challenge by proposing a programming model based on systolic processing and virtualization. The PULSAR programming model is quite simple, with point-to-point channels as the main communication abstraction. The runtime implementation is very lightweight and fully distributed, and provides multithreading, message-passing and multi-GPU offload capabilities. 
Performance evaluation shows good scalability up to one thousand nodes with one thousand GPU accelerators.}, doi = {10.14529/jsfi170101}, url = {http://superfri.org/superfri/article/view/121/210}, author = {Jakub Kurzak and Piotr Luszczek and Ichitaro Yamazaki and Yves Robert and Jack Dongarra} } @conference {1096, title = {Identifying the Right Replication Level to Detect and Correct Silent Errors at Scale}, booktitle = {2017 Workshop on Fault-Tolerance for HPC at Extreme Scale}, year = {2017}, month = {2017-06}, publisher = {ACM}, organization = {ACM}, address = {Washington, DC}, abstract = {This paper provides a model and an analytical study of replication as a technique to detect and correct silent errors. Although other detection techniques exist for HPC applications, based on algorithms (ABFT), invariant preservation or data analytics, replication remains the most transparent and least intrusive technique. We explore the right level (duplication, triplication or more) of replication needed to efficiently detect and correct silent errors. Replication is combined with checkpointing and comes with two flavors: process replication and group replication. Process replication applies to message-passing applications with communicating processes. Each process is replicated, and the platform is composed of process pairs, or triplets. Group replication applies to black-box applications, whose parallel execution is replicated several times. The platform is partitioned into two halves (or three thirds). In both scenarios, results are compared before each checkpoint, which is taken only when both results (duplication) or two out of three results (triplication) coincide. If not, one or more silent errors have been detected, and the application rolls back to the last checkpoint. We provide a detailed analytical study of both scenarios, with formulas to decide, for each scenario, the optimal parameters as a function of the error rate, checkpoint cost, and platform size. We also report a set of extensive simulation results that corroborates the analytical model.}, doi = {10.1145/3086157.3086162}, author = {Anne Benoit and Franck Cappello and Aurelien Cavelan and Yves Robert and Hongyang Sun} } @conference {1095, title = {Optimal Checkpointing Period with replicated execution on heterogeneous platforms}, booktitle = {2017 Workshop on Fault-Tolerance for HPC at Extreme Scale}, year = {2017}, month = {2017-06}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Washington, DC}, abstract = {In this paper, we design and analyze strategies to replicate the execution of an application on two different platforms subject to failures, using checkpointing on a shared stable storage. We derive the optimal pattern size~W for a periodic checkpointing strategy where both platforms concurrently try and execute W units of work before checkpointing. The first platform that completes its pattern takes a checkpoint, and the other platform interrupts its execution to synchronize from that checkpoint. We compare this strategy to a simpler on-failure checkpointing strategy, where a checkpoint is taken by one platform only whenever the other platform encounters a failure. We use first or second-order approximations to compute overheads and optimal pattern sizes, and show through extensive simulations that these models are very accurate. 
The simulations show the usefulness of a secondary platform to reduce execution time, even when the platforms have relatively different speeds: on average, over a wide range of scenarios, the overhead is reduced by 30\%. The simulations also demonstrate that the periodic checkpointing strategy is globally more efficient, unless platform speeds are quite close.}, doi = {10.1145/3086157.3086165}, author = {Anne Benoit and Aurelien Cavelan and Valentin Le F{\`e}vre and Yves Robert} } @conference {1097, title = {Resilience for Stencil Computations with Latent Errors}, booktitle = {International Conference on Parallel Processing (ICPP)}, year = {2017}, month = {2017-08}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Bristol, UK}, abstract = {Projections and measurements of error rates in near-exascale and exascale systems suggest a dramatic growth, due to extreme scale ($10^9$ cores), concurrency, software complexity, and deep submicron transistor scaling. Such a growth makes resilience a critical concern, and may increase the incidence of errors that "escape," silently corrupting application state. Such errors can often be revealed by application software tests but with long latencies, and thus are known as latent errors. We explore how to efficiently recover from latent errors, with an approach called application-based focused recovery (ABFR). Specifically, we present a case study of stencil computations, a widely useful computational structure, showing how ABFR focuses recovery effort where needed, using intelligent testing and pruning to reduce recovery effort, and enables recovery effort to be overlapped with application computation. We analyze and characterize the ABFR approach on stencils, creating a performance model parameterized by error rate and detection interval (latency). We compare projections from the model to experimental results with the Chombo stencil application, validating the model and showing that ABFR on stencils can achieve significant reductions in error recovery cost (up to 400x) and recovery latency (up to 4x). Such reductions enable efficient execution at scale with high latent error rates.}, author = {Aiman Fang and Aurelien Cavelan and Yves Robert and Andrew Chien} } @article {1091, title = {Resilient Co-Scheduling of Malleable Applications}, journal = {International Journal of High Performance Computing Applications (IJHPCA)}, year = {2017}, month = {2017-05}, abstract = {Recently, the benefits of co-scheduling several applications have been demonstrated in a fault-free context, both in terms of performance and energy savings. However, large-scale computer systems are confronted by frequent failures, and resilience techniques must be employed for large applications to execute efficiently. Indeed, failures may create severe imbalance between applications and significantly degrade performance. In this article, we aim at minimizing the expected completion time of a set of co-scheduled applications. We propose to redistribute the resources assigned to each application upon the occurrence of failures, and upon the completion of some applications, in order to achieve this goal. First, we introduce a formal model and establish complexity results. The problem is NP-complete for malleable applications, even in a fault-free context. Therefore, we design polynomial-time heuristics that perform redistributions and account for processor failures.
A fault simulator is used to perform extensive simulations that demonstrate the usefulness of redistribution and the performance of the proposed heuristics.}, keywords = {co-scheduling, complexity results, heuristics, Redistribution, resilience, simulations}, doi = {10.1177/1094342017704979}, author = {Anne Benoit and Lo{\"\i}c Pottier and Yves Robert} } @article {1090, title = {Towards Optimal Multi-Level Checkpointing}, journal = {IEEE Transactions on Computers}, volume = {66}, year = {2017}, month = {2017-07}, pages = {1212{\textendash}1226}, keywords = {checkpointing, Dynamic programming, Error analysis, Heuristic algorithms, Optimized production technology, protocols, Shape}, doi = {10.1109/TC.2016.2643660}, author = {Anne Benoit and Aurelien Cavelan and Valentin Le F{\`e}vre and Yves Robert and Hongyang Sun} } @article {933, title = {Assessing General-purpose Algorithms to Cope with Fail-stop and Silent Errors}, journal = {ACM Transactions on Parallel Computing}, year = {2016}, month = {2016-08}, abstract = {In this paper, we combine the traditional checkpointing and rollback recovery strategies with verification mechanisms to cope with both fail-stop and silent errors. The objective is to minimize makespan and/or energy consumption. For divisible load applications, we use first-order approximations to find the optimal checkpointing period to minimize execution time, with an additional verification mechanism to detect silent errors before each checkpoint, hence extending the classical formula by Young and Daly for fail-stop errors only. We further extend the approach to include intermediate verifications, and to consider a bi-criteria problem involving both time and energy (linear combination of execution time and energy consumption). Then, we focus on application workflows whose dependence graph is a linear chain of tasks. Here, we determine the optimal checkpointing and verification locations, with or without intermediate verifications, for the bicriteria problem. Rather than using a single speed during the whole execution, we further introduce a new execution scenario, which allows for changing the execution speed via dynamic voltage and frequency scaling (DVFS). We determine in this scenario the optimal checkpointing and verification locations, as well as the optimal speed pairs. Finally, we conduct an extensive set of simulations to support the theoretical study, and to assess the performance of each algorithm, showing that the best overall performance is achieved under the most flexible scenario using intermediate verifications and different speeds.}, keywords = {checkpoint, fail-stop error, failure, HPC, resilience, silent data corruption, silent error, verification}, doi = {10.1145/2897189}, author = {Anne Benoit and Aurelien Cavelan and Yves Robert and Hongyang Sun} } @article {924, title = {Assessing the Cost of Redistribution followed by a Computational Kernel: Complexity and Performance Results}, journal = {Parallel Computing}, volume = {52}, year = {2016}, month = {2016-02}, pages = {22-41}, abstract = {The classical redistribution problem aims at optimally scheduling communications when reshuffling from an initial data distribution to a target data distribution. This target data distribution is usually chosen to optimize some objective for the algorithmic kernel under study (good computational balance or low communication volume or cost), and therefore to provide high efficiency for that kernel. 
However, the choice of a distribution minimizing the target objective is not unique. This leads to generalizing the redistribution problem as follows: find a re-mapping of data items onto processors such that the data redistribution cost is minimal, and the operation remains just as efficient. This paper studies the complexity of this generalized problem. We compute optimal solutions and evaluate, through simulations, their gain over classical redistribution. We also show the NP-hardness of the problem to find the optimal data partition and processor permutation (defined by new subsets) that minimize the cost of redistribution followed by a simple computational kernel. Finally, experimental validation of the new redistribution algorithms is conducted on a multicore cluster, for both a 1D-stencil kernel and a more compute-intensive dense linear algebra routine.}, keywords = {Data partition, linear algebra, parsec, QR factorization, Redistribution, Stencil}, doi = {10.1016/j.parco.2015.09.005}, author = {Julien Herrmann and George Bosilca and Thomas Herault and Loris Marchal and Yves Robert and Jack Dongarra} } @inproceedings {979, title = {Failure Detection and Propagation in HPC Systems}, journal = {Proceedings of The International Conference for High Performance Computing, Networking, Storage and Analysis (SC{\textquoteright}16)}, year = {2016}, month = {2016-11}, pages = {27:1-27:11}, publisher = {IEEE Press}, address = {Salt Lake City, Utah}, keywords = {failure detection, fault-tolerance, MPI}, isbn = {978-1-4673-8815-3}, url = {http://dl.acm.org/citation.cfm?id=3014904.3014941}, author = {George Bosilca and Aurelien Bouteiller and Amina Guermouche and Thomas Herault and Yves Robert and Pierre Sens and Jack Dongarra} } @conference {930, title = {Optimal Resilience Patterns to Cope with Fail-stop and Silent Errors}, booktitle = {2016 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This work focuses on resilience techniques at extreme scale. Many papers deal with fail-stop errors. Many others deal with silent errors (or silent data corruptions). But very few papers deal with fail-stop and silent errors simultaneously. However, HPC applications will obviously have to cope with both error sources. This paper presents a unified framework and optimal algorithmic solutions to this double challenge. Silent errors are handled via verification mechanisms (either partially or fully accurate) and in-memory checkpoints. Fail-stop errors are processed via disk checkpoints. All verification and checkpoint types are combined into computational patterns. We provide a unified model, and a full characterization of the optimal pattern. Our results nicely extend several published solutions and demonstrate how to make use of different techniques to solve the double threat of fail-stop and silent errors. 
Extensive simulations based on real data confirm the accuracy of the model, and show that patterns that combine all resilience mechanisms are required to provide acceptable overheads.}, keywords = {fail-stop errors, multilevel checkpoint, optimal pattern, resilience, silent errors, verification}, doi = {10.1109/IPDPS.2016.39}, author = {Anne Benoit and Aurelien Cavelan and Yves Robert and Hongyang Sun} } @article {932, title = {Scheduling Computational Workflows on Failure-prone Platforms}, journal = {International Journal of Networking and Computing}, volume = {6}, number = {1}, year = {2016}, pages = {2-26}, abstract = {We study the scheduling of computational workflows on compute resources that experience exponentially distributed failures. When a failure occurs, rollback and recovery is used to resume the execution from the last checkpointed state. The scheduling problem is to minimize the expected execution time by deciding in which order to execute the tasks in the workflow and deciding for each task whether to checkpoint it or not after it completes. We give a polynomial-time optimal algorithm for fork DAGs (Directed Acyclic Graphs) and show that the problem is NP-complete with join DAGs. We also investigate the complexity of the simple case in which no task is checkpointed. Our main result is a polynomial-time algorithm to compute the expected execution time of a workflow, with a given task execution order and specified to-be-checkpointed tasks. Using this algorithm as a basis, we propose several heuristics for solving the scheduling problem. We evaluate these heuristics for representative workflow configurations.}, keywords = {checkpointing, fault-tolerance, reliability, scheduling, workflow}, issn = {2185-2847}, author = {Guillaume Aupy and Anne Benoit and Henri Casanova and Yves Robert} } @article {852, title = {Composing Resilience Techniques: ABFT, Periodic, and Incremental Checkpointing}, journal = {International Journal of Networking and Computing}, volume = {5}, number = {1}, year = {2015}, month = {2015-01}, pages = {2-15}, abstract = {Algorithm Based Fault Tolerant (ABFT) approaches promise unparalleled scalability and performance in failure-prone environments. Thanks to recent advances in the understanding of the involved mechanisms, a growing number of important algorithms (including all widely used factorizations) have been proven ABFT-capable. In the context of larger applications, these algorithms provide a temporal section of the execution, where the data is protected by its own intrinsic properties, and can therefore be algorithmically recomputed without the need of checkpoints. However, while typical scientific applications spend a significant fraction of their execution time in library calls that can be ABFT-protected, they interleave sections that are difficult or even impossible to protect with ABFT. As a consequence, the only practical fault-tolerance approach for these applications is checkpoint/restart. In this paper we propose a model to investigate the efficiency of a composite protocol that alternates between ABFT and checkpoint/restart for the effective protection of an iterative application composed of ABFT-aware and ABFT-unaware sections. We also consider an incremental checkpointing composite approach in which the algorithmic knowledge is leveraged by a novel optimal dynamic programming to compute checkpoint dates. We validate these models using a simulator. 
The model and simulator show that the composite approach drastically increases the performance delivered by an execution platform, especially at scale, by providing the means to increase the interval between checkpoints while simultaneously decreasing the volume of each checkpoint.}, keywords = {ABFT, checkpoint, fault-tolerance, High-performance computing, model, performance evaluation, resilience}, issn = {2185-2839}, author = {George Bosilca and Aurelien Bouteiller and Thomas Herault and Yves Robert and Jack Dongarra} } @article {931, title = {Efficient Checkpoint/Verification Patterns}, journal = {International Journal on High Performance Computing Applications}, year = {2015}, month = {2015-07}, abstract = {Errors have become a critical problem for high performance computing. Checkpointing protocols are often used for error recovery after fail-stop failures. However, silent errors cannot be ignored, and their peculiarity is that such errors are identified only when the corrupted data is activated. To cope with silent errors, we need a verification mechanism to check whether the application state is correct. Checkpoints should be supplemented with verifications to detect silent errors. When a verification is successful, only the last checkpoint needs to be kept in memory because it is known to be correct. In this paper, we analytically determine the best balance of verifications and checkpoints so as to optimize platform throughput. We introduce a balanced algorithm using a pattern with p checkpoints and q verifications, which regularly interleaves both checkpoints and verifications across same-size computational chunks. We show how to compute the waste of an arbitrary pattern, and we prove that the balanced algorithm is optimal when the platform MTBF (Mean Time Between Failures) is large compared to the other parameters (checkpointing, verification and recovery costs). We conduct several simulations to show the gain achieved by this balanced algorithm for well-chosen values of p and q, compared to the base algorithm that always performs a verification just before taking a checkpoint (p = q = 1), and we exhibit gains of up to 19\%.}, keywords = {checkpointing, Fault tolerance, High Performance Computing, silent data corruption, silent error, verification}, doi = {10.1177/1094342015594531}, author = {Anne Benoit and Saurabh K. Raina and Yves Robert} } @techreport {919, title = {Fault Tolerance Techniques for High-performance Computing}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 289)}, number = {UT-EECS-15-734}, year = {2015}, month = {2015-05}, publisher = {University of Tennessee}, abstract = {This report provides an introduction to resilience methods. The emphasis is on checkpointing, the de-facto standard technique for resilience in High Performance Computing. We present the two main protocols, namely coordinated checkpointing and hierarchical checkpointing. Then we introduce performance models and use them to assess the performance of these protocols. We cover the Young/Daly formula for the optimal period and much more! Next we explain how the efficiency of checkpointing can be improved via fault prediction or replication. Then we move to application-specific methods, such as ABFT. 
We conclude the report by discussing techniques to cope with silent errors (or silent data corruption).}, url = {http://www.netlib.org/lapack/lawnspdf/lawn289.pdf}, author = {Jack Dongarra and Thomas Herault and Yves Robert} } @article {917, title = {Mixing LU-QR Factorization Algorithms to Design High-Performance Dense Linear Algebra Solvers}, journal = {Journal of Parallel and Distributed Computing}, volume = {85}, year = {2015}, month = {2015-11}, pages = {32-46}, abstract = {This paper introduces hybrid LU{\textendash}QR algorithms for solving dense linear systems of the form Ax=b. Throughout a matrix factorization, these algorithms dynamically alternate LU with local pivoting and QR elimination steps based upon some robustness criterion. LU elimination steps can be very efficiently parallelized, and are twice as cheap in terms of floating-point operations as QR steps. However, LU steps are not necessarily stable, while QR steps are always stable. The hybrid algorithms execute a QR step when a robustness criterion detects some risk for instability, and they execute an LU step otherwise. The choice between LU and QR steps must have a small computational overhead and must provide a satisfactory level of stability with as few QR steps as possible. In this paper, we introduce several robustness criteria and we establish upper bounds on the growth factor of the norm of the updated matrix incurred by each of these criteria. In addition, we describe the implementation of the hybrid algorithms through an extension of the PaRSEC software to allow for dynamic choices during execution. Finally, we analyze both stability and performance results compared to state-of-the-art linear solvers on parallel distributed multicore platforms. A comprehensive set of experiments shows that hybrid LU{\textendash}QR algorithms provide a continuous range of trade-offs between stability and performance.}, keywords = {lu factorization, Numerical algorithms, QR factorization, Stability, Performance}, doi = {10.1016/j.jpdc.2015.06.007}, author = {Mathieu Faverge and Julien Herrmann and Julien Langou and Bradley Lowery and Yves Robert and Jack Dongarra} } @techreport {874, title = {Scheduling for fault-tolerance: an introduction}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-15-02}, year = {2015}, month = {2015-01}, publisher = {University of Tennessee}, author = {Guillaume Aupy and Yves Robert} } @conference {708, title = {Assessing the Impact of ABFT and Checkpoint Composite Strategies}, booktitle = {16th Workshop on Advances in Parallel and Distributed Computational Models, IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Algorithm-specific fault-tolerant approaches promise unparalleled scalability and performance in failure-prone environments. With the advances in the theoretical and practical understanding of algorithmic traits enabling such approaches, a growing number of frequently used algorithms (including all widely used factorization kernels) have been proven capable of such properties. These algorithms provide a temporal section of the execution when the data is protected by its own intrinsic properties, and can be algorithmically recomputed without the need of checkpoints. 
However, while typical scientific applications spend a significant fraction of their execution time in library calls that can be ABFT-protected, they interleave sections that are difficult or even impossible to protect with ABFT. As a consequence, the only fault-tolerance approach that is currently used for these applications is checkpoint/restart. In this paper we propose a model and a simulator to investigate the behavior of a composite protocol that alternates between ABFT and checkpoint/restart protection for effective protection of each phase of an iterative application composed of ABFT-aware and ABFT-unaware sections. We highlight that this approach drastically increases the performance delivered by the system, especially at scale, by providing means to rarefy the checkpoints while simultaneously decreasing the volume of data needed to be checkpointed.}, keywords = {ABFT, checkpoint, fault-tolerance, High-performance computing, resilience}, author = {George Bosilca and Aurelien Bouteiller and Thomas Herault and Yves Robert and Jack Dongarra} } @conference {813, title = {Designing LU-QR Hybrid Solvers for Performance and Stability}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {This paper introduces hybrid LU-QR algorithms for solving dense linear systems of the form Ax = b. Throughout a matrix factorization, these algorithms dynamically alternate LU with local pivoting and QR elimination steps, based upon some robustness criterion. LU elimination steps can be very efficiently parallelized, and are twice as cheap in terms of operations as QR steps. However, LU steps are not necessarily stable, while QR steps are always stable. The hybrid algorithms execute a QR step when a robustness criterion detects some risk for instability, and they execute an LU step otherwise. Ideally, the choice between LU and QR steps must have a small computational overhead and must provide a satisfactory level of stability with as few QR steps as possible. In this paper, we introduce several robustness criteria and we establish upper bounds on the growth factor of the norm of the updated matrix incurred by each of these criteria. In addition, we describe the implementation of the hybrid algorithms through an extension of the Parsec software to allow for dynamic choices during execution. Finally, we analyze both stability and performance results compared to state-of-the-art linear solvers on parallel distributed multicore platforms.}, keywords = {plasma}, isbn = {978-1-4799-3800-1}, doi = {10.1109/IPDPS.2014.108}, author = {Mathieu Faverge and Julien Herrmann and Julien Langou and Bradley Lowery and Yves Robert and Jack Dongarra} } @techreport {808, title = {Efficient checkpoint/verification patterns for silent error detection}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-14-03}, year = {2014}, month = {2014-05}, publisher = {University of Tennessee}, type = {LAWN 287}, abstract = {Resilience has become a critical problem for high performance computing. Checkpointing protocols are often used for error recovery after fail-stop failures. However, silent errors cannot be ignored, and their peculiarity is that such errors are identified only when the corrupted data is activated. To cope with silent errors, we need a verification mechanism to check whether the application state is correct. Checkpoints should be supplemented with verifications to detect silent errors. 
When a verification is successful, only the last checkpoint needs to be kept in memory because it is known to be correct. In this paper, we analytically determine the best balance of verifications and checkpoints so as to optimize platform throughput. We introduce a balanced algorithm using a pattern with p checkpoints and q verifications, which regularly interleaves both checkpoints and verifications across same-size computational chunks. We show how to compute the waste of an arbitrary pattern, and we prove that the balanced algorithm is optimal when the platform MTBF (Mean Time Between Failures) is large compared to the other parameters (checkpointing, verification and recovery costs). We conduct several simulations to show the gain achieved by this balanced algorithm for well-chosen values of p and q, compared to the base algorithm that always performs a verification just before taking a checkpoint (p = q = 1), and we exhibit gains of up to 19\%.}, author = {Anne Benoit and Yves Robert and Saurabh K. Raina} } @article {814, title = {Performance and Reliability Trade-offs for the Double Checkpointing Algorithm}, journal = {International Journal of Networking and Computing}, volume = {4}, number = {1}, year = {2014}, month = {2014}, pages = {32-41}, chapter = {32}, abstract = {Fast checkpointing algorithms require distributed access to stable storage. This paper revisits the approach based upon double checkpointing, and compares the blocking algorithm of Zheng, Shi and Kal{\'e} [23], with the non-blocking algorithm of Ni, Meneses and Kal{\'e} [15] in terms of both performance and risk. We also extend the model proposed in [23, 15] to assess the impact of the overhead associated with non-blocking communications. In addition, we deal with arbitrary failure distributions (as opposed to uniform distributions in [23]). We then provide a new peer-to-peer checkpointing algorithm, called the triple checkpointing algorithm, that can work without additional memory, and achieves both higher efficiency and better risk handling than the double checkpointing algorithm. We provide performance and risk models for all the evaluated protocols, and compare them through comprehensive simulations.}, keywords = {communication contention, in-memory checkpoint, performance, resilience, risk}, issn = {2185-2847}, author = {Jack Dongarra and Thomas Herault and Yves Robert} } @techreport {694, title = {Assessing the impact of ABFT and Checkpoint composite strategies}, journal = {University of Tennessee Computer Science Technical Report}, number = {ICL-UT-13-03}, year = {2013}, abstract = {Algorithm-specific fault-tolerant approaches promise unparalleled scalability and performance in failure-prone environments. With the advances in the theoretical and practical understanding of algorithmic traits enabling such approaches, a growing number of frequently used algorithms (including all widely used factorization kernels) have been proven capable of such properties. These algorithms provide a temporal section of the execution when the data is protected by its own intrinsic properties, and can be algorithmically recomputed without the need of checkpoints. However, while typical scientific applications spend a significant fraction of their execution time in library calls that can be ABFT-protected, they interleave sections that are difficult or even impossible to protect with ABFT. 
As a consequence, the only fault-tolerance approach that is currently used for these applications is checkpoint/restart. In this paper we propose a model and a simulator to investigate the behavior of a composite protocol that alternates between ABFT and checkpoint/restart protection for effective protection of each phase of an iterative application composed of ABFT-aware and ABFT-unaware sections. We highlight that this approach drastically increases the performance delivered by the system, especially at scale, by providing means to rarefy the checkpoints while simultaneously decreasing the volume of data needed to be checkpointed.}, keywords = {ABFT, checkpoint, fault-tolerance, High-performance computing, resilience}, author = {George Bosilca and Aurelien Bouteiller and Thomas Herault and Yves Robert and Jack Dongarra} } @techreport {684, title = {On the Combination of Silent Error Detection and Checkpointing}, journal = {UT-CS-13-710}, year = {2013}, month = {2013-06}, publisher = {University of Tennessee Computer Science Technical Report}, abstract = {In this paper, we revisit traditional checkpointing and rollback recovery strategies, with a focus on silent data corruption errors. Contrary to fail-stop failures, such latent errors cannot be detected immediately, and a mechanism to detect them must be provided. We consider two models: (i) errors are detected after some delays following a probability distribution (typically, an Exponential distribution); (ii) errors are detected through some verification mechanism. In both cases, we compute the optimal period in order to minimize the waste, i.e., the fraction of time where nodes do not perform useful computations. In practice, only a fixed number of checkpoints can be kept in memory, and the first model may lead to an irrecoverable failure. In this case, we compute the minimum period required for an acceptable risk. For the second model, there is no risk of irrecoverable failure, owing to the verification mechanism, but the corresponding overhead is included in the waste. Finally, both models are instantiated using realistic scenarios and application/architecture parameters.}, keywords = {checkpointing, error recovery, High-performance computing, silent data corruption, verification}, url = {http://www.netlib.org/lapack/lawnspdf/lawn278.pdf}, author = {Guillaume Aupy and Anne Benoit and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @techreport {703, title = {Designing LU-QR hybrid solvers for performance and stability}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 282)}, number = {ut-eecs-13-719}, year = {2013}, month = {2013-10}, publisher = {University of Tennessee}, author = {Mathieu Faverge and Julien Herrmann and Julien Langou and Bradley Lowery and Yves Robert and Jack Dongarra} } @article {752, title = {Hierarchical QR Factorization Algorithms for Multi-core Cluster Systems}, journal = {Parallel Computing}, volume = {39}, year = {2013}, month = {2013-05}, pages = {212-232}, abstract = {This paper describes a new QR factorization algorithm which is especially designed for massively parallel platforms combining parallel distributed nodes, where a node is a multi-core processor. These platforms represent the present and the foreseeable future of high-performance computing. 
Our new QR factorization algorithm falls into the category of tile algorithms, which naturally enable good data locality for the sequential kernels executed by the cores (high sequential performance), a low number of messages in a parallel distributed setting (small latency term), and fine granularity (high parallelism). Each tile algorithm is uniquely characterized by its sequence of reduction trees. In the context of a cluster of nodes, in order to minimize the number of inter-processor communications (aka, {\textquoteleft}{\textquoteleft}communication-avoiding{\textquoteright}{\textquoteright}), it is natural to consider hierarchical trees composed of an {\textquoteleft}{\textquoteleft}inter-node{\textquoteright}{\textquoteright} tree which acts on top of {\textquoteleft}{\textquoteleft}intra-node{\textquoteright}{\textquoteright} trees. At the intra-node level, we propose a hierarchical tree made of three levels: (0) {\textquoteleft}{\textquoteleft}TS level{\textquoteright}{\textquoteright} for cache-friendliness, (1) {\textquoteleft}{\textquoteleft}low-level{\textquoteright}{\textquoteright} for decoupled highly parallel inter-node reductions, (2) {\textquoteleft}{\textquoteleft}domino level{\textquoteright}{\textquoteright} to efficiently resolve interactions between local reductions and global reductions. Our hierarchical algorithm and its implementation are flexible and modular, and can accommodate several kernel types, different distribution layouts, and a variety of reduction trees at all levels, both inter-node and intra-node. Numerical experiments on a cluster of multi-core nodes (i) confirm that each of the four levels of our hierarchical tree contributes to building up performance and (ii) build insights on how these levels influence performance and interact with each other. Our implementation of the new algorithm with the DAGUE scheduling tool significantly outperforms currently available QR factorization software for all matrix shapes, thereby bringing a new advance in numerical linear algebra for petascale and exascale platforms.}, keywords = {Cluster, Distributed memory, Hierarchical architecture, multi-core, numerical linear algebra, QR factorization}, author = {Jack Dongarra and Mathieu Faverge and Thomas Herault and Mathias Jacquelin and Julien Langou and Yves Robert} } @techreport {682, title = {Implementing a systolic algorithm for QR factorization on multicore clusters with PaRSEC}, journal = {Lawn 277}, number = {UT-CS-13-709}, year = {2013}, month = {2013-05}, abstract = {This article introduces a new systolic algorithm for QR factorization, and its implementation on a supercomputing cluster of multicore nodes. The algorithm targets a virtual 3D-array and requires only local communications. The implementation of the algorithm uses threads at the node level, and MPI for inter-node communications. The complexity of the implementation is addressed with the PaRSEC software, which takes as input a parametrized dependence graph, which is derived from the algorithm, and only requires the user to decide, at a high level, the allocation of tasks to nodes. 
We show that the new algorithm exhibits competitive performance with state-of-the-art QR routines on a supercomputer called Kraken, which shows that high-level programming environments, such as PaRSEC, provide a viable alternative to enhance the production of quality software on complex and hierarchical architectures.}, author = {Guillaume Aupy and Mathieu Faverge and Yves Robert and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @techreport {icl:733, title = {Multi-criteria checkpointing strategies: optimizing response-time versus resource utilization}, journal = {University of Tennessee Computer Science Technical Report}, number = {ICL-UT-13-01}, year = {2013}, month = {2013-02}, abstract = {Failures are increasingly threatening the efficiency of HPC systems, and current projections of Exascale platforms indicate that rollback recovery, the most convenient method for providing fault tolerance to general-purpose applications, reaches its own limits at such scales. One of the reasons explaining this unnerving situation comes from the focus that has been given to per-application completion time, rather than to platform efficiency. In this paper, we discuss the case of uncoordinated rollback recovery where the idle time spent waiting for recovering processors is used to progress a different, independent application from the system batch queue. We then propose an extended model of uncoordinated checkpointing that can discriminate between idle time and wasted computation. We instantiate this model in a simulator to demonstrate that, with this strategy, the per-application completion time under uncoordinated checkpointing is unchanged, while it delivers near-perfect platform efficiency.}, author = {Aurelien Bouteiller and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert} } @conference {868, title = {Multi-criteria Checkpointing Strategies: Response-Time versus Resource Utilization}, booktitle = {Euro-Par 2013}, year = {2013}, month = {2013-08}, publisher = {Springer}, organization = {Springer}, address = {Aachen, Germany}, abstract = {Failures are increasingly threatening the efficiency of HPC systems, and current projections of Exascale platforms indicate that rollback recovery, the most convenient method for providing fault tolerance to general-purpose applications, reaches its own limits at such scales. One of the reasons explaining this unnerving situation comes from the focus that has been given to per-application completion time, rather than to platform efficiency. In this paper, we discuss the case of uncoordinated rollback recovery where the idle time spent waiting for recovering processors is used to progress a different, independent application from the system batch queue. We then propose an extended model of uncoordinated checkpointing that can discriminate between idle time and wasted computation. We instantiate this model in a simulator to demonstrate that, with this strategy, the per-application completion time under uncoordinated checkpointing is unchanged, while it delivers near-perfect platform efficiency.}, author = {Aurelien Bouteiller and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert} } @techreport {702, title = {Optimal Checkpointing Period: Time vs. 
Energy}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 281)}, number = {ut-eecs-13-718}, year = {2013}, month = {2013-10}, publisher = {University of Tennessee}, author = {Guillaume Aupy and Anne Benoit and Thomas Herault and Yves Robert and Jack Dongarra} } @techreport {icl:735, title = {Revisiting the Double Checkpointing Algorithm}, journal = {University of Tennessee Computer Science Technical Report (LAWN 274)}, number = {ut-cs-13-705}, year = {2013}, month = {2013-01}, abstract = {Fast checkpointing algorithms require distributed access to stable storage. This paper revisits the approach based upon double checkpointing, and compares the blocking algorithm of Zheng, Shi and Kal{\'e}, with the non-blocking algorithm of Ni, Meneses and Kal{\'e} in terms of both performance and risk. We also extend the model that they have proposed to assess the impact of the overhead associated with non-blocking communications. We then provide a new peer-to-peer checkpointing algorithm, called the triple checkpointing algorithm, that can work with constant memory, and achieves both higher efficiency and better risk handling than the double checkpointing algorithm. We provide performance and risk models for all the evaluated protocols, and compare them through comprehensive simulations.}, keywords = {checkpoint algorithm, communication overlap, fault-tolerance, performance model, resilience}, author = {Jack Dongarra and Thomas Herault and Yves Robert} } @conference {717, title = {Revisiting the Double Checkpointing Algorithm}, booktitle = {15th Workshop on Advances in Parallel and Distributed Computational Models, at the IEEE International Parallel \& Distributed Processing Symposium}, year = {2013}, month = {2013-05}, address = {Boston, MA}, abstract = {Fast checkpointing algorithms require distributed access to stable storage. This paper revisits the approach based upon double checkpointing, and compares the blocking algorithm of Zheng, Shi and Kal{\'e} [1], with the non-blocking algorithm of Ni, Meneses and Kal{\'e} [2] in terms of both performance and risk. We also extend the model proposed in [1], [2] to assess the impact of the overhead associated with non-blocking communications. We then provide a new peer-to-peer checkpointing algorithm, called the triple checkpointing algorithm, that can work with constant memory, and achieves both higher efficiency and better risk handling than the double checkpointing algorithm. We provide performance and risk models for all the evaluated protocols, and compare them through comprehensive simulations.}, author = {Jack Dongarra and Thomas Herault and Yves Robert} } @article {748, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {Concurrency and Computation: Practice and Experience}, year = {2013}, month = {2013-11}, abstract = {In this paper, we present a unified model for several well-known checkpoint/restart protocols. The proposed model is generic enough to encompass both extremes of the checkpoint/restart space, from coordinated approaches to a variety of uncoordinated checkpoint strategies (with message logging). We identify a set of crucial parameters, instantiate them, and compare the expected efficiency of the fault-tolerant protocols, for a given application/platform pair. We then propose a detailed analysis of several scenarios, including some of the most powerful currently available high performance computing platforms, as well as anticipated Exascale designs. 
The results of this analytical comparison are corroborated by a comprehensive set of simulations. Altogether, they outline comparative behaviors of checkpoint strategies at very large scale, thereby providing insight that is hardly accessible to direct experimentation.}, doi = {10.1002/cpe.3173}, author = {George Bosilca and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @inproceedings {icl:687, title = {Hierarchical QR Factorization Algorithms for Multi-Core Cluster Systems}, journal = {IPDPS 2012, the 26th IEEE International Parallel and Distributed Processing Symposium}, year = {2012}, month = {2012-05}, publisher = {IEEE Computer Society Press}, address = {Shanghai, China}, author = {Jack Dongarra and Mathieu Faverge and Thomas Herault and Julien Langou and Yves Robert} } @article {icl:705, title = {Looking Back at Dense Linear Algebra Software}, journal = {Perspectives on Parallel and Distributed Processing: Looking Back and What{\textquoteright}s Ahead (to appear)}, year = {2012}, month = {2012-00}, author = {Piotr Luszczek and Jakub Kurzak and Jack Dongarra}, editor = {Viktor K. Prasanna and Yves Robert and Per Stenstr{\"o}m} } @techreport {icl:716, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 269)}, number = {UT-CS-12-697}, year = {2012}, month = {2012-06}, author = {George Bosilca and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @techreport {icl:645, title = {Hierarchical QR Factorization Algorithms for Multi-Core Cluster Systems}, journal = {University of Tennessee Computer Science Technical Report (also Lawn 257)}, number = {UT-CS-11-684}, year = {2011}, month = {2011-10}, keywords = {magma, plasma}, author = {Jack Dongarra and Mathieu Faverge and Thomas Herault and Julien Langou and Yves Robert} } @inproceedings {icl:455, title = {Matrix Product on Heterogeneous Master Worker Platforms}, journal = {2008 PPoPP Conference}, year = {2008}, month = {2008-01}, address = {Salt Lake City, Utah}, author = {Jack Dongarra and Jean-Francois Pineau and Yves Robert and Frederic Vivien} } @article {icl:504, title = {Revisiting Matrix Product on Master-Worker Platforms}, journal = {International Journal of Foundations of Computer Science (IJFCS)}, volume = {19}, number = {6}, year = {2008}, month = {2008-12}, pages = {1317-1336}, author = {Jack Dongarra and Jean-Francois Pineau and Yves Robert and Zhiao Shi and Frederic Vivien} } @article {icl:371, title = {Revisiting Matrix Product on Master-Worker Platforms}, journal = {International Journal of Foundations of Computer Science (IJFCS) (accepted)}, year = {2007}, month = {2007-00}, author = {Jack Dongarra and Jean-Francois Pineau and Yves Robert and Zhiao Shi and Frederic Vivien} } @article {icl:294, title = {Recent Developments in GridSolve}, journal = {International Journal of High Performance Computing Applications (Special Issue: Scheduling for Large-Scale Heterogeneous Platforms)}, volume = {20}, number = {1}, year = {2006}, month = {2006-00}, publisher = {Sage Science Press}, keywords = {netsolve}, author = {Asim YarKhan and Keith Seymour and Kiran Sagi and Zhiao Shi and Jack Dongarra}, editor = {Yves Robert} } @article {icl:57, title = {Algorithmic Issues on 
Heterogeneous Computing Platforms}, journal = {Parallel Processing Letters}, volume = {9}, number = {2}, year = {1999}, month = {1999-01}, pages = {197-213}, author = {Pierre Boulet and Jack Dongarra and Fabrice Rastello and Yves Robert and Frederic Vivien} } @article {icl:229, title = {A Numerical Linear Algebra Problem Solving Environment Designer{\textquoteright}s Perspective (LAPACK Working Note 139)}, journal = {SIAM Annual Meeting}, year = {1999}, month = {1999-05}, address = {Atlanta, GA}, author = {Antoine Petitet and Henri Casanova and Clint Whaley and Jack Dongarra and Yves Robert} } @article {icl:75, title = {Parallel and Distributed Scientific Computing: A Numerical Linear Algebra Problem Solving Environment Designer{\textquoteright}s Perspective}, journal = {Handbook on Parallel and Distributed Processing}, year = {1999}, month = {1999-01}, author = {Antoine Petitet and Henri Casanova and Jack Dongarra and Yves Robert and Clint Whaley} } @article {icl:58, title = {Static Tiling for Heterogeneous Computing Platforms}, journal = {Parallel Computing}, volume = {25}, number = {5}, year = {1999}, month = {1999-01}, pages = {547-568}, author = {Pierre Boulet and Jack Dongarra and Yves Robert and Frederic Vivien} } @article {icl:62, title = {Tiling on Systems with Communication/Computation Overlap}, journal = {Concurrency: Practice and Experience}, volume = {11}, number = {3}, year = {1999}, month = {1999-01}, pages = {139-153}, author = {Pierre-Yves Calland and Jack Dongarra and Yves Robert} }