@article{1313,
  title    = {Co-Scheduling {HPC} Workloads on Cache-Partitioned {CMP} Platforms},
  journal  = {International Journal of High Performance Computing Applications},
  volume   = {33},
  year     = {2019},
  month    = nov,
  pages    = {1221--1239},
  abstract = {With the recent advent of many-core architectures such as chip multiprocessors (CMPs), the number of processing units accessing a global shared memory is constantly increasing. Co-scheduling techniques are used to improve application throughput on such architectures, but sharing resources often generates critical interferences. In this article, we focus on the interferences in the last level of cache (LLC) and use the Cache Allocation Technology (CAT) recently provided by Intel to partition the LLC and give each co-scheduled application their own cache area. We consider m iterative HPC applications running concurrently and answer to the following questions: (i) How to precisely model the behavior of these applications on the cache-partitioned platform? and (ii) how many cores and cache fractions should be assigned to each application to maximize the platform efficiency? Here, platform efficiency is defined as maximizing the performance either globally, or as guaranteeing a fixed ratio of iterations per second for each application. Through extensive experiments using CAT, we demonstrate the impact of cache partitioning when multiple HPC applications are co-scheduled onto CMP platforms.},
  keywords = {cache partitioning, chip multiprocessor, co-scheduling, HPC application},
  doi      = {10.1177/1094342019846956},
  author   = {Aupy, Guillaume and Benoit, Anne and Goglin, Brice and Pottier, Lo{\"\i}c and Robert, Yves},
}

@inproceedings{1316,
  title     = {Reservation Strategies for Stochastic Jobs},
  booktitle = {33rd {IEEE} International Parallel and Distributed Processing Symposium ({IPDPS} 2019)},
  year      = {2019},
  month     = may,
  publisher = {IEEE Computer Society Press},
  address   = {Rio de Janeiro, Brazil},
  author    = {Aupy, Guillaume and Gainaru, Ana and Honor{\'e}, Valentin and Raghavan, Padma and Robert, Yves and Sun, Hongyang},
}

@article{1198,
  title    = {Co-Scheduling {Amdahl} Applications on Cache-Partitioned Systems},
  journal  = {International Journal of High Performance Computing Applications},
  volume   = {32},
  year     = {2018},
  month    = jan,
  pages    = {123--138},
  abstract = {Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are as follows: (i) which proportion of cache and (ii) how many processors should be given to each application? In this article, we provide answers to (i) and (ii) for Amdahl applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for Amdahl applications. Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.},
  keywords = {cache partitioning, co-scheduling, complexity results},
  doi      = {10.1177/1094342017710806},
  author   = {Aupy, Guillaume and Benoit, Anne and Dai, Sicheng and Pottier, Lo{\"\i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},
}

@inproceedings{1217,
  title     = {Co-Scheduling {HPC} Workloads on Cache-Partitioned {CMP} Platforms},
  booktitle = {Cluster 2018},
  year      = {2018},
  month     = sep,
  publisher = {IEEE Computer Society Press},
  address   = {Belfast, UK},
  author    = {Aupy, Guillaume and Benoit, Anne and Goglin, Brice and Pottier, Lo{\"\i}c and Robert, Yves},
}

@incollection{1261,
  title     = {Scheduling for Fault-Tolerance: An Introduction},
  booktitle = {Topics in Parallel and Distributed Computing},
  year      = {2018},
  pages     = {143--170},
  publisher = {Springer International Publishing},
  isbn      = {978-3-319-93108-1},
  doi       = {10.1007/978-3-319-93109-8},
  author    = {Aupy, Guillaume and Robert, Yves},
}

@inproceedings{1099,
  title     = {Assuming Failure Independence: Are We Right to Be Wrong?},
  booktitle = {The 3rd International Workshop on Fault Tolerant Systems ({FTS})},
  year      = {2017},
  month     = sep,
  publisher = {IEEE},
  address   = {Honolulu, Hawaii},
  abstract  = {This paper revisits the failure temporal independence hypothesis which is omnipresent in the analysis of resilience methods for HPC. We explain why a previous approach is incorrect, and we propose a new method to detect failure cascades, i.e., series of non-independent consecutive failures. We use this new method to assess whether public archive failure logs contain failure cascades. Then we design and compare several cascade-aware checkpointing algorithms to quantify the maximum gain that could be obtained, and we report extensive simulation results with archive and synthetic failure logs. Altogether, there are a few logs that contain cascades, but we show that the gain that can be achieved from this knowledge is not significant. The conclusion is that we can wrongly, but safely, assume failure independence!},
  author    = {Aupy, Guillaume and Robert, Yves and Vivien, Frederic},
}

@inproceedings{1094,
  title     = {Co-Scheduling Algorithms for Cache-Partitioned Systems},
  booktitle = {19th Workshop on Advances in Parallel and Distributed Computational Models},
  year      = {2017},
  month     = may,
  publisher = {IEEE Computer Society Press},
  address   = {Orlando, FL},
  abstract  = {Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are: (i) which proportion of cache and (ii) how many processors should be given to each application? Here, we assign rational numbers of processors to each application, since they can be shared across applications through multi-threading. In this paper, we provide answers to (i) and (ii) for perfectly parallel applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for general applications. Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.},
  keywords  = {Computational modeling, Degradation, Interference, Mathematical model, Program processors, Supercomputers, Throughput},
  doi       = {10.1109/IPDPSW.2017.60},
  author    = {Aupy, Guillaume and Benoit, Anne and Pottier, Lo{\"\i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},
}

@article{932,
  title    = {Scheduling Computational Workflows on Failure-Prone Platforms},
  journal  = {International Journal of Networking and Computing},
  volume   = {6},
  number   = {1},
  year     = {2016},
  pages    = {2--26},
  abstract = {We study the scheduling of computational workflows on compute resources that experience exponentially distributed failures. When a failure occurs, rollback and recovery is used to resume the execution from the last checkpointed state. The scheduling problem is to minimize the expected execution time by deciding in which order to execute the tasks in the workflow and deciding for each task whether to checkpoint it or not after it completes. We give a polynomial-time optimal algorithm for fork DAGs (Directed Acyclic Graphs) and show that the problem is NP-complete with join DAGs. We also investigate the complexity of the simple case in which no task is checkpointed. Our main result is a polynomial-time algorithm to compute the expected execution time of a workflow, with a given task execution order and specified to-be-checkpointed tasks. Using this algorithm as a basis, we propose several heuristics for solving the scheduling problem. We evaluate these heuristics for representative workflow configurations.},
  keywords = {checkpointing, fault-tolerance, reliability, scheduling, workflow},
  issn     = {2185-2847},
  author   = {Aupy, Guillaume and Benoit, Anne and Casanova, Henri and Robert, Yves},
}

@techreport{874,
  title       = {Scheduling for Fault-Tolerance: An Introduction},
  type        = {Innovative Computing Laboratory Technical Report},
  number      = {ICL-UT-15-02},
  year        = {2015},
  month       = jan,
  institution = {University of Tennessee},
  author      = {Aupy, Guillaume and Robert, Yves},
}

@techreport{684,
  title       = {On the Combination of Silent Error Detection and Checkpointing},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-13-710},
  year        = {2013},
  month       = jun,
  institution = {University of Tennessee},
  abstract    = {In this paper, we revisit traditional checkpointing and rollback recovery strategies, with a focus on silent data corruption errors. Contrarily to fail-stop failures, such latent errors cannot be detected immediately, and a mechanism to detect them must be provided. We consider two models: (i) errors are detected after some delays following a probability distribution (typically, an Exponential distribution); (ii) errors are detected through some verification mechanism. In both cases, we compute the optimal period in order to minimize the waste, i.e., the fraction of time where nodes do not perform useful computations. In practice, only a fixed number of checkpoints can be kept in memory, and the first model may lead to an irrecoverable failure. In this case, we compute the minimum period required for an acceptable risk. For the second model, there is no risk of irrecoverable failure, owing to the verification mechanism, but the corresponding overhead is included in the waste. Finally, both models are instantiated using realistic scenarios and application/architecture parameters.},
  keywords    = {checkpointing, error recovery, High-performance computing, silent data corruption, verification},
  url         = {http://www.netlib.org/lapack/lawnspdf/lawn278.pdf},
  author      = {Aupy, Guillaume and Benoit, Anne and Herault, Thomas and Robert, Yves and Vivien, Frederic and Zaidouni, Dounia},
}

@techreport{682,
  title       = {Implementing a Systolic Algorithm for {QR} Factorization on Multicore Clusters with {PaRSEC}},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-13-709},
  year        = {2013},
  month       = may,
  institution = {University of Tennessee},
  note        = {Also available as LAPACK Working Note 277},
  abstract    = {This article introduces a new systolic algorithm for QR factorization, and its implementation on a supercomputing cluster of multicore nodes. The algorithm targets a virtual 3D-array and requires only local communications. The implementation of the algorithm uses threads at the node level, and MPI for inter-node communications. The complexity of the implementation is addressed with the PaRSEC software, which takes as input a parametrized dependence graph, which is derived from the algorithm, and only requires the user to decide, at the high-level, the allocation of tasks to nodes. We show that the new algorithm exhibits competitive performance with state-of-the-art QR routines on a supercomputer called Kraken, which shows that high-level programming environments, such as PaRSEC, provide a viable alternative to enhance the production of quality software on complex and hierarchical architectures.},
  author      = {Aupy, Guillaume and Faverge, Mathieu and Robert, Yves and Kurzak, Jakub and Luszczek, Piotr and Dongarra, Jack},
}

@techreport{702,
  title       = {Optimal Checkpointing Period: Time vs. Energy},
  type        = {Computer Science Technical Report},
  number      = {ut-eecs-13-718},
  year        = {2013},
  month       = oct,
  institution = {University of Tennessee},
  note        = {Also available as LAPACK Working Note 281},
  author      = {Aupy, Guillaume and Benoit, Anne and Herault, Thomas and Robert, Yves and Dongarra, Jack},
}