% Bibliography entries on co-scheduling and cache/memory-aware scheduling.
% Conventions used below:
%   - one field per line; authors in "Last, First" form separated by " and ";
%   - DOIs are stored bare (no resolver prefix); page ranges use "--";
%   - months use the predefined three-letter macros, unquoted;
%   - acronyms and proper nouns in titles are brace-protected against recasing;
%   - the journal name shared by several entries is defined once as a macro.

@string{IJHPCA = {International Journal of High Performance Computing Applications}}

% Journal article; shares its title and author list with entry 1217 below.
@article{1313,
  author   = {Aupy, Guillaume and Benoit, Anne and Goglin, Brice and Pottier, Lo{\"\i}c and Robert, Yves},
  title    = {Co-Scheduling {HPC} Workloads on Cache-Partitioned {CMP} Platforms},
  journal  = IJHPCA,
  volume   = {33},
  pages    = {1221--1239},
  year     = {2019},
  month    = nov,
  doi      = {10.1177/1094342019846956},
  keywords = {cache partitioning, chip multiprocessor, co-scheduling, HPC application},
  abstract = {With the recent advent of many-core architectures such as chip multiprocessors (CMPs), the number of processing units accessing a global shared memory is constantly increasing. Co-scheduling techniques are used to improve application throughput on such architectures, but sharing resources often generates critical interferences. In this article, we focus on the interferences in the last level of cache (LLC) and use the Cache Allocation Technology (CAT) recently provided by Intel to partition the LLC and give each co-scheduled application their own cache area. We consider m iterative HPC applications running concurrently and answer to the following questions: (i) How to precisely model the behavior of these applications on the cache-partitioned platform? and (ii) how many cores and cache fractions should be assigned to each application to maximize the platform efficiency? Here, platform efficiency is defined as maximizing the performance either globally, or as guaranteeing a fixed ratio of iterations per second for each application. Through extensive experiments using CAT, we demonstrate the impact of cache partitioning when multiple HPC applications are co-scheduled onto CMP platforms.},
}

% Journal article; the title spelling "Amdahl" follows the entry's own abstract.
@article{1198,
  author   = {Aupy, Guillaume and Benoit, Anne and Dai, Sicheng and Pottier, Lo{\"\i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},
  title    = {Co-Scheduling {Amdahl} Applications on Cache-Partitioned Systems},
  journal  = IJHPCA,
  volume   = {32},
  pages    = {123--138},
  year     = {2018},
  month    = jan,
  doi      = {10.1177/1094342017710806},
  keywords = {cache partitioning, co-scheduling, complexity results},
  abstract = {Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are as follows: (i) which proportion of cache and (ii) how many processors should be given to each application? In this article, we provide answers to (i) and (ii) for Amdahl applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for Amdahl applications. Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.},
}

% Conference paper; shares its title and author list with entry 1313 above.
@inproceedings{1217,
  author       = {Aupy, Guillaume and Benoit, Anne and Goglin, Brice and Pottier, Lo{\"\i}c and Robert, Yves},
  title        = {Co-Scheduling {HPC} Workloads on Cache-Partitioned {CMP} Platforms},
  booktitle    = {Cluster 2018},
  year         = {2018},
  month        = sep,
  publisher    = {IEEE Computer Society Press},
  organization = {IEEE Computer Society Press},
  address      = {Belfast, UK},
}

% Conference paper (ICPP 2018).
@inproceedings{1216,
  author       = {Benoit, Anne and Perarnau, Swann and Pottier, Lo{\"\i}c and Robert, Yves},
  title        = {A Performance Model to Execute Workflows on High-Bandwidth Memory Architectures},
  booktitle    = {The 47th International Conference on Parallel Processing ({ICPP} 2018)},
  year         = {2018},
  month        = aug,
  publisher    = {IEEE Computer Society Press},
  organization = {IEEE Computer Society Press},
  address      = {Eugene, OR},
  abstract     = {This work presents a realistic performance model to execute scientific workflows on high-bandwidth memory architectures such as the Intel Knights Landing. We provide a detailed analysis of the execution time on such platforms, taking into account transfers from both fast and slow memory and their overlap with computations. We discuss several scheduling and mapping strategies: not only tasks must be assigned to computing resource, but also one has to decide which fraction of input and output data will reside in fast memory, and which will have to stay in slow memory. Extensive simulations allow us to assess the impact of the mapping strategies on performance. We also conduct actual experiments for a simple 1D Gauss-Seidel kernel, which assess the accuracy of the model and further demonstrate the importance of a tuned memory management. Altogether, our model and results lay the foundations for further studies and experiments on dual-memory systems.},
}

% Workshop paper.
@inproceedings{1094,
  author       = {Aupy, Guillaume and Benoit, Anne and Pottier, Lo{\"\i}c and Raghavan, Padma and Robert, Yves and Shantharam, Manu},
  title        = {Co-Scheduling Algorithms for Cache-Partitioned Systems},
  booktitle    = {19th Workshop on Advances in Parallel and Distributed Computational Models},
  year         = {2017},
  month        = may,
  publisher    = {IEEE Computer Society Press},
  organization = {IEEE Computer Society Press},
  address      = {Orlando, FL},
  doi          = {10.1109/IPDPSW.2017.60},
  keywords     = {Computational modeling, Degradation, Interference, Mathematical model, Program processors, Supercomputers, Throughput},
  abstract     = {Cache-partitioned architectures allow subsections of the shared last-level cache (LLC) to be exclusively reserved for some applications. This technique dramatically limits interactions between applications that are concurrently executing on a multicore machine. Consider n applications that execute concurrently, with the objective to minimize the makespan, defined as the maximum completion time of the n applications. Key scheduling questions are: (i) which proportion of cache and (ii) how many processors should be given to each application? Here, we assign rational numbers of processors to each application, since they can be shared across applications through multi-threading. In this paper, we provide answers to (i) and (ii) for perfectly parallel applications. Even though the problem is shown to be NP-complete, we give key elements to determine the subset of applications that should share the LLC (while remaining ones only use their smaller private cache). Building upon these results, we design efficient heuristics for general applications. Extensive simulations demonstrate the usefulness of co-scheduling when our efficient cache partitioning strategies are deployed.},
}

% Journal article; the source record carries no volume or page numbers.
@article{1091,
  author   = {Benoit, Anne and Pottier, Lo{\"\i}c and Robert, Yves},
  title    = {Resilient Co-Scheduling of Malleable Applications},
  journal  = IJHPCA,
  year     = {2017},
  month    = may,
  doi      = {10.1177/1094342017704979},
  keywords = {co-scheduling, complexity results, heuristics, Redistribution, resilience, simulations},
  abstract = {Recently, the benefits of co-scheduling several applications have been demonstrated in a fault-free context, both in terms of performance and energy savings. However, large-scale computer systems are confronted by frequent failures, and resilience techniques must be employed for large applications to execute efficiently. Indeed, failures may create severe imbalance between applications and significantly degrade performance. In this article, we aim at minimizing the expected completion time of a set of co-scheduled applications. We propose to redistribute the resources assigned to each application upon the occurrence of failures, and upon the completion of some applications, in order to achieve this goal. First, we introduce a formal model and establish complexity results. The problem is NP-complete for malleable applications, even in a fault-free context. Therefore, we design polynomial-time heuristics that perform redistributions and account for processor failures. A fault simulator is used to perform extensive simulations that demonstrate the usefulness of redistribution and the performance of the proposed heuristics.},
}