@booklet {hoefler2024xaas, title = {XaaS: Acceleration as a Service to Enable Productive High-Performance Cloud Computing}, year = {2024}, month = {2024-01}, publisher = {arXiv}, abstract = {HPC and Cloud have evolved independently, specializing their innovations into performance or productivity. Acceleration as a Service (XaaS) is a recipe to empower both fields with a shared execution platform that provides transparent access to computing resources, regardless of the underlying cloud or HPC service provider. Bridging HPC and cloud advancements, XaaS presents a unified architecture built on performance-portable containers. Our converged model concentrates on low-overhead, high-performance communication and computing, targeting resource-intensive workloads from climate simulations to machine learning. XaaS lifts the restricted allocation model of Function-as-a-Service (FaaS), allowing users to benefit from the flexibility and efficient resource utilization of serverless while supporting long-running and performance-sensitive workloads from HPC.}, url = {https://arxiv.org/abs/2401.04552}, author = {Torsten Hoefler and Marcin Copik and Pete Beckman and Andrew Jones and Ian Foster and Manish Parashar and Daniel Reed and Matthias Troyer and Thomas Schulthess and Dan Ernst and Jack Dongarra} } @inproceedings {sidlakhdar2023paqr, title = {PAQR: Pivoting Avoiding QR factorization}, booktitle = {2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2023}, publisher = {IEEE}, organization = {IEEE}, address = {St.
Petersburg, FL, USA}, doi = {10.1109/IPDPS54959.2023.00040}, url = {https://ieeexplore.ieee.org/document/10177407/}, author = {Sid-Lakhdar, Wissam and Cayrols, Sebastien and Bielich, Daniel and Abdelfattah, Ahmad and Luszczek, Piotr and Gates, Mark and Tomov, Stanimire and Johansen, Hans and Williams-Young, David and Davis, Timothy and Dongarra, Jack and Anzt, Hartwig} } @conference {, title = {Performance Insights into Device-initiated RMA Using Kokkos Remote Spaces}, booktitle = {2023 IEEE International Conference on Cluster Computing Workshops (CLUSTER Workshops)}, year = {2023}, month = {2023-11}, publisher = {IEEE}, organization = {IEEE}, address = {Santa Fe, NM, USA}, abstract = {Achieving scalable performance on supercomputers requires careful coordination of communication and computation. Often, MPI applications rely on buffering, packing, and sorting techniques to accommodate a two-sided API, minimize communication overhead, and achieve performance goals. As interconnects between accelerators become more performant and scalable, programming models such as SHMEM may have the opportunity to enable bandwidth maximization along with ease of programming. In this work, we take a closer look at device-initiated PGAS programming models using NVIDIA Corp{\textquoteright}s NVSHMEM communication library and our interface through the Kokkos Remote Spaces project. We show that benchmarks can benefit from this programming model in terms of performance and programmability. 
We anticipate similar results for miniapplications.}, doi = {10.1109/CLUSTERWorkshops61457.2023.00028}, url = {https://ieeexplore.ieee.org/document/10321871/}, author = {Mishler, Daniel and Ciesko, Jan and Olivier, Stephen and Bosilca, George} } @conference {, title = {Reducing Data Motion and Energy Consumption of Geospatial Modeling Applications Using Automated Precision Conversion}, booktitle = {2023 IEEE International Conference on Cluster Computing (CLUSTER)}, year = {2023}, month = {2023-11}, publisher = {IEEE}, organization = {IEEE}, address = {Santa Fe, NM, USA}, abstract = {The burgeoning interest in large-scale geospatial modeling, particularly within the domains of climate and weather prediction, underscores the concomitant critical importance of accuracy, scalability, and computational speed. Harnessing these complex simulations{\textquoteright} potential, however, necessitates innovative computational strategies, especially considering the increasing volume of data involved. Recent advancements in Graphics Processing Units (GPUs) have opened up new avenues for accelerating these modeling processes. In particular, their efficient utilization necessitates new strategies, such as mixed-precision arithmetic, that can balance the trade-off between computational speed and model accuracy. This paper leverages PaRSEC runtime system and delves into the opportunities provided by mixed-precision arithmetic to expedite large-scale geospatial modeling in heterogeneous environments. By using an automated conversion strategy, our mixed-precision approach significantly improves computational performance (up to 3X) on Summit supercomputer and reduces the associated energy consumption on various Nvidia GPU generations. Importantly, this implementation ensures the requisite accuracy in environmental applications, a critical factor in their operational viability. 
The findings of this study bear significant implications for future research and development in high-performance computing, underscoring the transformative potential of mixed-precision arithmetic on GPUs in addressing the computational demands of large-scale geospatial modeling and making a stride toward a more sustainable, efficient, and accurate future in large-scale environmental applications.}, doi = {10.1109/CLUSTER52292.2023.00035}, url = {https://ieeexplore.ieee.org/document/10319946/}, author = {Cao, Qinglei and Abdulah, Sameh and Ltaief, Hatem and Genton, Marc G. and Keyes, David and Bosilca, George} } @article {, title = {Accelerating Geostatistical Modeling and Prediction With Mixed-Precision Computations: A High-Productivity Approach With PaRSEC}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {33}, year = {2022}, month = {2022-04}, pages = {964 - 976}, abstract = {Geostatistical modeling, one of the prime motivating applications for exascale computing, is a technique for predicting desired quantities from geographically distributed data, based on statistical models and optimization of parameters. Spatial data are assumed to possess properties of stationarity or non-stationarity via a kernel fitted to a covariance matrix. A primary workhorse of stationary spatial statistics is Gaussian maximum log-likelihood estimation (MLE), whose central data structure is a dense, symmetric positive definite covariance matrix of the dimension of the number of correlated observations. Two essential operations in MLE are the application of the inverse and evaluation of the determinant of the covariance matrix. These can be rendered through the Cholesky decomposition and triangular solution. In this contribution, we reduce the precision of weakly correlated locations to single- or half- precision based on distance. 
We thus exploit mathematical structure to migrate MLE to a three-precision approximation that takes advantage of contemporary architectures offering BLAS3-like operations in a single instruction that are extremely fast for reduced precision. We illustrate application-expected accuracy worthy of double-precision from a majority half-precision computation, in a context where uniform single-precision is by itself insufficient. In tackling the complexity and imbalance caused by the mixing of three precisions, we deploy the PaRSEC runtime system. PaRSEC delivers on-demand casting of precisions while orchestrating tasks and data movement in a multi-GPU distributed-memory environment within a tile-based Cholesky factorization. Application-expected accuracy is maintained while achieving up to 1.59X by mixing FP64/FP32 operations on 1536 nodes of HAWK or 4096 nodes of Shaheen II , and up to 2.64X by mixing FP64/FP32/FP16 operations on 128 nodes of Summit , relative to FP64-only operations. This translates into up to 4.5, 4.7, ...}, keywords = {Computational modeling, Covariance matrices, Data models, Maximum likelihood estimation, Predictive models, runtime, Task analysis}, issn = {1045-9219}, doi = {10.1109/TPDS.2021.3084071}, url = {https://ieeexplore.ieee.org/document/9442267/}, author = {Abdulah, Sameh and Qinglei Cao and Pei, Yu and George Bosilca and Jack Dongarra and Genton, Marc G. and Keyes, David E.
and Ltaief, Hatem and Sun, Ying} } @techreport {, title = {Analysis of the Communication and Computation Cost of FFT Libraries towards Exascale}, journal = {ICL Technical Report}, number = {ICL-UT-22-07}, year = {2022}, month = {2022-07}, publisher = {Innovative Computing Laboratory}, author = {Alan Ayala and Stanimire Tomov and Piotr Luszczek and Sebastien Cayrols and Gerald Ragghianti and Jack Dongarra} } @inbook {, title = {Approximate Computing for Scientific Applications}, booktitle = {Approximate Computing Techniques}, year = {2022}, month = {2022-01}, pages = {415 - 465}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, edition = {322}, abstract = {This chapter reviews the performance benefits that result from applying (software) approximate computing to scientific applications. For this purpose, we target two particular areas, linear algebra and deep learning, with the first one selected for being ubiquitous in scientific problems and the second one for its considerable and growing number of important applications both in industry and science. The review of linear algebra in scientific computing is focused on the iterative solution of sparse linear systems, exposing the prevalent costs of memory accesses in these methods, and demonstrating how approximate computing can help to reduce these overheads, for example, in the case of stationary solvers themselves or the application of preconditioners for the solution of sparse linear systems via Krylov subspace methods. The discussion of deep learning is focused on the use of approximate data transfer for cutting costs of host-to-device operations, as well as the use of adaptive precision for accelerating training of classical CNN architectures. 
Additionally we discuss model optimization and architecture search in presence of constraints for edge devices applications.}, isbn = {978-3-030-94704-0}, doi = {10.1007/978-3-030-94705-7_14}, url = {https://link.springer.com/chapter/10.1007/978-3-030-94705-7_14}, author = {Anzt, Hartwig and Casas, Marc and Malossi, ~Cristiano I. and Quintana-Ort{\'\i}, Enrique S and Scheidegger, Florian and Zhuang, Sicong}, editor = {Bosio, Alberto and M{\'e}nard, Daniel and Sentieys, Olivier} } @techreport {, title = {Communication Avoiding LU with Tournament Pivoting in SLATE}, journal = {SLATE Working Notes}, number = {18, ICL-UT-22-01}, year = {2022}, month = {2022-01}, author = {Rabab Alomairy and Mark Gates and Sebastien Cayrols and Dalal Sukkari and Kadir Akbudak and Asim YarKhan and Paul Bagwell and Jack Dongarra} } @article {, title = {Evaluating Data Redistribution in PaRSEC}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {33}, number = {8}, year = {2022}, month = {2022-08}, pages = {1856-1872}, doi = {10.1109/TPDS.2021.3131657}, author = {Qinglei Cao and George Bosilca and Losada, Nuria and Wu, Wei and Zhong, Dong and Jack Dongarra} } @techreport {, title = {FFT Benchmark Performance Experiments on Systems Targeting Exascale}, journal = {ICL Technical Report}, number = {ICL-UT-22-02}, year = {2022}, month = {2022-03}, author = {Alan Ayala and Stanimire Tomov and Piotr Luszczek and Sebastien Cayrols and Gerald Ragghianti and Jack Dongarra} } @conference {, title = {A Framework to Exploit Data Sparsity in Tile Low-Rank Cholesky Factorization}, booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2022}, month = {2022-07}, doi = {10.1109/IPDPS53621.2022.00047}, url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=9820680\&isnumber=9820610}, author = {Qinglei Cao and Rabab Alomairy and Yu Pei and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} } @article {, title = 
{Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing}, journal = {ACM Transactions on Mathematical Software}, volume = {48}, year = {2022}, month = {2022-03}, pages = {1 - 33}, abstract = {In this article, we present Ginkgo, a modern C++ math library for scientific high performance computing. While classical linear algebra libraries act on matrix and vector objects, Ginkgo{\textquoteright}s design principle abstracts all functionality as {\textquotedblleft}linear operators,{\textquotedblright} motivating the notation of a {\textquotedblleft}linear operator algebra library.{\textquotedblright} Ginkgo{\textquoteright}s current focus is oriented toward providing sparse linear algebra functionality for high performance graphics processing unit (GPU) architectures, but given the library design, this focus can be easily extended to accommodate other algorithms and hardware architectures. We introduce this sophisticated software architecture that separates core algorithms from architecture-specific backends and provide details on extensibility and sustainability measures. We also demonstrate Ginkgo{\textquoteright}s usability by providing examples on how to use its functionality inside the MFEM and deal.ii finite element ecosystems. 
Finally, we offer a practical demonstration of Ginkgo{\textquoteright}s high performance on state-of-the-art GPU architectures.}, issn = {0098-3500}, doi = {10.1145/3480935}, url = {https://dl.acm.org/doi/10.1145/3480935}, author = {Anzt, Hartwig and Cojean, Terry and Flegar, Goran and G{\"o}bel, Fritz and Gr{\"u}tzmacher, Thomas and Nayak, Pratik and Ribizel, Tobias and Tsai, Yuhsiang Mike and Quintana-Ort{\'\i}, Enrique S} } @article {, title = {Ginkgo{\textemdash}A math library designed for platform portability}, journal = {Parallel Computing}, volume = {111}, year = {2022}, month = {2022-02}, pages = {102902}, abstract = {In an era of increasing computer system diversity, the portability of software from one system to another plays a central role. Software portability is important for the software developers as many software projects have a lifetime longer than a specific system, e.g., a supercomputer, and it is important for the domain scientists that realize their scientific application in a software framework and want to be able to run on one or another system. On a high level, there exist two approaches for realizing platform portability: (1) implementing software using a portability layer leveraging any technique which always generates specific kernels from another language or through an interface for running on different architectures; and (2) providing backends for different hardware architectures, with the backends typically differing in how and in which programming language functionality is realized due to using the language of choice for each hardware (e.g., CUDA kernels for NVIDIA GPUs, SYCL (DPC++) kernels to targeting Intel GPUs and other supported hardware, {\textellipsis}). In practice, these two approaches can be combined in applications to leverage their respective strengths. 
In this paper, we present how we realize portability across different hardware architectures for the Ginkgo library by following the second strategy and the goal to not only port to new hardware architectures but also achieve good performance. We present the Ginkgo library design, separating algorithms from hardware-specific kernels forming the distinct hardware executors, and report our experience when adding execution backends for NVIDIA, AMD, and Intel GPUs. We also present the performance we achieve with this approach for distinct hardware backends.}, keywords = {AMD, Intel, nVidia, performance portability, Platform Portability, Porting to GPU accelerators}, issn = {0167-8191}, doi = {https://doi.org/10.1016/j.parco.2022.102902}, url = {https://www.sciencedirect.com/science/article/pii/S0167819122000096}, author = {Terry Cojean and Yu-Hsiang Mike Tsai and Hartwig Anzt} } @inproceedings {, title = {Lossy all-to-all exchange for accelerating parallel 3-D FFTs on hybrid architectures with GPUs}, journal = {2022 IEEE International Conference on Cluster Computing (CLUSTER)}, year = {2022}, month = {2022-09}, pages = {152-160}, abstract = {In the context of parallel applications, communication is a critical part of the infrastructure and a potential bottleneck. The traditional approach to tackle communication challenges consists of redesigning algorithms so that the complexity or the communication volume is reduced. However, there are algorithms like the Fast Fourier Transform (FFT) where reducing the volume of communication is very challenging yet can reap large benefit in terms of time-to-completion. In this paper, we revisit the implementation of the MPI all-to-all routine at the core of 3D FFTs by using advanced MPI features, such as One-Sided Communication, and integrate data compression during communication to reduce the volume of data exchanged. 
Since some compression techniques are {\textquoteleft}lossy{\textquoteright} in the sense that they involve a loss of accuracy, we study the impact of lossy compression in heFFTe, the state-of-the-art FFT library for large scale 3D FFTs on hybrid architectures with GPUs. Consequently, we design an approximate FFT algorithm that trades off user-controlled accuracy for speed. We show that we speedup the 3D FFTs proportionally to the compression rate. In terms of accuracy, comparing our approach with a reduced precision execution, where both the data and the computation are in reduced precision, we show that when the volume of communication is compressed to the size of the reduced precision data, the approximate FFT algorithm is as fast as the one in reduced precision while the accuracy is one order of magnitude better.}, doi = {10.1109/CLUSTER51413.2022.00029}, author = {Cayrols, Sebastien and Li, Jiali and George Bosilca and Stanimire Tomov and Ayala, Alan and Dongarra, Jack} } @techreport {, title = {Mixed precision and approximate 3D FFTs: Speed for accuracy trade-off with GPU-aware MPI and run-time data compression}, journal = {ICL Technical Report}, number = {ICL-UT-22-04}, year = {2022}, month = {2022-05}, keywords = {All-to-all, Approximate FFTs, ECP, heFFTe, Lossy compression, mixed-precision algorithms, MPI}, author = {Sebastien Cayrols and Jiali Li and George Bosilca and Stanimire Tomov and Alan Ayala and Jack Dongarra} } @article {, title = {OpenMP application experiences: Porting to accelerated nodes}, journal = {Parallel Computing}, volume = {109}, year = {2022}, month = {2022-03}, abstract = {As recent enhancements to the OpenMP specification become available in its implementations, there is a need to share the results of experimentation in order to better understand the OpenMP implementation{\textquoteright}s behavior in practice, to identify pitfalls, and to learn how the implementations can be effectively deployed in scientific codes. 
We report on experiences gained and practices adopted when using OpenMP to port a variety of ECP applications, mini-apps and libraries based on different computational motifs to accelerator-based leadership-class high-performance supercomputer systems at the United States Department of Energy. Additionally, we identify important challenges and open problems related to the deployment of OpenMP. Through our report of experiences, we find that OpenMP implementations are successful on current supercomputing platforms and that OpenMP is a promising programming model to use for applications to be run on emerging and future platforms with accelerated nodes.}, issn = {01678191}, doi = {10.1016/j.parco.2021.102856}, url = {https://www.sciencedirect.com/science/article/pii/S0167819121001009}, author = {Bak, Seonmyeong and Bertoni, Colleen and Boehm, Swen and Budiardja, Reuben and Chapman, Barbara M. and Doerfert, Johannes and Eisenbach, Markus and Finkel, Hal and Hernandez, Oscar and Huber, Joseph and Iwasaki, Shintaro and Kale, Vivek and Kent, Paul R.C. and Kwack, JaeHyuk and Lin, Meifeng and Luszczek, Piotr and Luo, Ye and Pham, Buu and Pophale, Swaroop and Ravikumar, Kiran and Sarkar, Vivek and Scogland, Thomas and Tian, Shilei and Yeung, P.K.} } @techreport {, title = {PAQR: Pivoting Avoiding QR factorization}, journal = {ICL Technical Report}, number = {ICL-UT-22-06}, year = {2022}, month = {2022-06}, abstract = {The solution of linear least-squares problems is at the heart of many scientific and engineering applications. While any method able to minimize the backward error of such problems is considered numerically stable, the theory states that the forward error depends on the condition number of the matrix in the system of equations. On the one hand, the QR factorization is an efficient method to solve such problems, but the solutions it produces may have large forward errors when the matrix is deficient. 
On the other hand, QR with column pivoting (QRCP) is able to produce smaller forward errors on deficient matrices, but its cost is prohibitive compared to QR. The aim of this paper is to propose PAQR, an alternative solution method with the same cost (or smaller) as QR and as accurate as QRCP in practical cases, for the solution of rank-deficient linear least-squares problems. After presenting the algorithm and its implementations on different architectures, we compare its accuracy and performance results on a variety of application problems. }, author = {Wissam M. Sid-Lakhdar and Sebastien Cayrols and Daniel Bielich and Ahmad Abdelfattah and Piotr Luszczek and Mark Gates and Stanimire Tomov and Hans Johansen and David Williams-Young and Timothy A. Davis and Jack Dongarra} } @inproceedings {, title = {Porting Sparse Linear Algebra to~Intel GPUs}, journal = {Euro-Par 2021: Parallel Processing Workshops}, volume = {13098}, year = {2022}, month = {2022-06}, pages = {57 - 68}, publisher = {Springer International Publishing}, address = {Lisbon, Portugal}, abstract = {With discrete Intel GPUs entering the high performance computing landscape, there is an urgent need for production-ready software stacks for these platforms. In this paper, we report how we prepare the Ginkgo math library for Intel GPUs by developing a kernel backed based on the DPC++ programming environment. We discuss conceptual differences to the CUDA and HIP programming models and describe workflows for simplified code conversion. We benchmark advanced sparse linear algebra routines utilizing the converted kernels to assess the efficiency of the DPC++ backend in the hardware-specific performance bounds. 
We compare the performance of basic building blocks against routines providing the same functionality that ship with Intel{\textquoteright}s oneMKL vendor library.}, keywords = {Ginkgo, Intel GPUs, math library, oneAPI, SpMV}, isbn = {978-3-031-06155-4}, doi = {10.1007/978-3-031-06156-1_5}, url = {https://link.springer.com/chapter/10.1007/978-3-031-06156-1_5}, author = {Tsai, Yuhsiang M. and Cojean, Terry and Anzt, Hartwig}, editor = {Chaves, Ricardo and B. Heras, Dora and Ilic, Aleksandar and Unat, Didem and Badia, Rosa M. and Bracciali, Andrea and Diehl, Patrick and Dubey, Anshu and Sangyoon, Oh and L. Scott, Stephen and Ricci, Laura} } @article {, title = {Providing performance portable numerics for Intel GPUs}, journal = {Concurrency and Computation: Practice and Experience}, volume = {17}, year = {2022}, month = {2022-10}, abstract = {With discrete Intel GPUs entering the high-performance computing landscape, there is an urgent need for production-ready software stacks for these platforms. In this article, we report how we enable the Ginkgo math library to execute on Intel GPUs by developing a kernel backed based on the DPC++ programming environment. We discuss conceptual differences between the CUDA and DPC++ programming models and describe workflows for simplified code conversion. We evaluate the performance of basic and advanced sparse linear algebra routines available in Ginkgo{\textquoteright}s DPC++ backend in the hardware-specific performance bounds and compare against routines providing the same functionality that ship with Intel{\textquoteright}s oneMKL vendor library.}, keywords = {Ginkgo, Intel GPUs, math library, oneAPI, SpMV}, issn = {1532-0626}, doi = {10.1002/cpe.7400}, url = {https://onlinelibrary.wiley.com/doi/full/10.1002/cpe.7400}, author = {Tsai, Yu-Hsiang M. 
and Cojean, Terry and Anzt, Hartwig} } @inproceedings {, title = {Reshaping Geostatistical Modeling and Prediction for Extreme-Scale Environmental Applications}, journal = {2022 International Conference for High Performance Computing, Networking, Storage and Analysis (SC22)}, year = {2022}, month = {2022-11}, publisher = {IEEE Press}, address = {Dallas, TX}, abstract = {We extend the capability of space-time geostatistical modeling using algebraic approximations, illustrating application-expected accuracy worthy of double precision from majority low-precision computations and low-rank matrix approximations. We exploit the mathematical structure of the dense covariance matrix whose inverse action and determinant are repeatedly required in Gaussian log-likelihood optimization. Geostatistics augments first-principles modeling approaches for the prediction of environmental phenomena given the availability of measurements at a large number of locations; however, traditional Cholesky-based approaches grow cubically in complexity, gating practical extension to continental and global datasets now available. We combine the linear algebraic contributions of mixed-precision and low-rank computations within a tilebased Cholesky solver with on-demand casting of precisions and dynamic runtime support from PaRSEC to orchestrate tasks and data movement. 
Our adaptive approach scales on various systems and leverages the Fujitsu A64FX nodes of Fugaku to achieve up to 12X performance speedup against the highly optimized dense Cholesky implementation.}, keywords = {climate/weather prediction, dynamic runtime systems, high performance computing, low-rank matrix approximations, mixed-precision computations, space-time geospatial statistics, Task-based programming models}, isbn = {9784665454445}, url = {https://dl.acm.org/doi/abs/10.5555/3571885.3571888}, author = {Cao, Qinglei and Abdulah, Sameh and Rabab Alomairy and Pei, Yu and Pratik Nag and George Bosilca and Dongarra, Jack and Genton, Marc G. and Keyes, David and Ltaief, Hatem and Sun, Ying} } @article {agullo2022resiliency, title = {Resiliency in numerical algorithm design for extreme scale simulations}, journal = {The International Journal of High Performance Computing Applications}, volume = {36}, number = {2}, year = {2022}, month = {2022-03}, pages = {251--285}, keywords = {Fault tolerance, Numerical algorithms, parallel computer architecture, resilience}, issn = {1094-3420}, doi = {10.1177/10943420211055188}, url = {http://journals.sagepub.com/doi/10.1177/10943420211055188}, author = {Agullo, Emmanuel and Altenbernd, Mirco and Anzt, Hartwig and Bautista-Gomez, Leonardo and Benacchio, Tommaso and Bonaventura, Luca and Bungartz, Hans-Joachim and Chatterjee, Sanjay and Ciorba, Florina M and DeBardeleben, Nathan and Drzisga, Daniel and Eibl, Sebastian and Engelmann, Christian and Gansterer, Wilfried N and Giraud, Luc and G{\"o}ddeke, Dominik and Heisig, Marco and J{\'e}z{\'e}quel, Fabienne and Kohl, Nils and Li, Xiaoye Sherry and Lion, Romain and Mehl, Miriam and Mycek, Paul and Obersteiner, Michael and Quintana-Ort{\'\i}, Enrique S and Rizzi, Francesco and R{\"u}de, Ulrich and Schulz, Martin
and Fung, Fred and Speck, Robert and Stals, Linda and Teranishi, Keita and Thibault, Samuel and Th{\"o}nnes, Dominik and Wagner, Andreas and Wohlmuth, Barbara} } @article {, title = {Using long vector extensions for MPI reductions}, journal = {Parallel Computing}, volume = {109}, year = {2022}, month = {2022-03}, pages = {102871}, abstract = {The modern CPU{\textquoteright}s design, including the deep memory hierarchies and SIMD/vectorization capability have a more significant impact on algorithms{\textquoteright} efficiency than the modest frequency increase observed recently. The current introduction of wide vector instruction set extensions (AVX and SVE) motivated vectorization to become a critical software component to increase efficiency and close the gap to peak performance. In this paper, we investigate the impact of the vectorization of MPI reduction operations. We propose an implementation of predefined MPI reduction operations using vector intrinsics (AVX and SVE) to improve the time-to-solution of the predefined MPI reduction operations. The evaluation of the resulting software stack under different scenarios demonstrates that the approach is not only efficient but also generalizable to many vector architectures. Experiments conducted on varied architectures (Intel Xeon Gold, AMD Zen 2, and Arm A64FX), show that the proposed vector extension optimized reduction operations significantly reduce completion time for collective communication reductions. 
With these optimizations, we achieve higher memory bandwidth and an increased efficiency for local computations, which directly benefit the overall cost of collective reductions and applications based on them.}, issn = {01678191}, doi = {10.1016/j.parco.2021.102871}, url = {https://www.sciencedirect.com/science/article/pii/S0167819121001137}, author = {Zhong, Dong and Cao, Qinglei and George Bosilca and Dongarra, Jack} } @article {ayala2021fft, title = {Accelerating FFT towards Exascale Computing}, year = {2021}, publisher = {NVIDIA GPU Technology Conference (GTC2021)}, author = {Alan Ayala and Stanimire Tomov and Haidar, Azzam and Stoyanov, M. and Cayrols, Sebastien and Li, Jiali and George Bosilca and Jack Dongarra} } @article {caron2021budget, title = {Budget-aware scheduling algorithms for scientific workflows with stochastic task weights on IaaS Cloud platforms}, journal = {Concurrency and Computation: Practice and Experience}, volume = {33}, number = {17}, year = {2021}, pages = {e6065}, doi = {10.1002/cpe.6065}, author = {Eddy Caron and Yves Caniou and Aur{\'e}lie Kong Win Chang and Yves Robert} } @article {anzt2021ginkgo, title = {Ginkgo: A Sparse Linear Algebra Library for HPC}, year = {2021}, month = {2021-04}, publisher = {2021 ECP Annual Meeting}, author = {Hartwig Anzt and Natalie Beams and Terry Cojean and Fritz G{\"o}bel and Thomas Gr{\"u}tzmacher and Aditya Kashi and Pratik Nayak and Tobias Ribizel and Yuhsiang M. Tsai} } @article {abdelfattah2021gpu, title = {GPU algorithms for Efficient Exascale Discretizations}, journal = {Parallel Computing}, volume = {108}, year = {2021}, pages = {102841}, abstract = {In this paper we describe the research and development activities in the Center for Efficient Exascale Discretization within the US Exascale Computing Project, targeting state-of-the-art high-order finite-element algorithms for high-order applications on GPU-accelerated platforms.
We discuss the GPU developments in several components of the CEED software stack, including the libCEED, MAGMA, MFEM, libParanumal, and Nek projects. We report performance and capability improvements in several CEED-enabled applications on both NVIDIA and AMD GPU systems.}, keywords = {Exascale applications, Finite element methods, GPU acceleration, high-order discretizations, High-performance computing}, doi = {10.1016/j.parco.2021.102841}, author = {Abdelfattah, Ahmad and Valeria Barra and Natalie Beams and Bleile, Ryan and Brown, Jed and Camier, Jean-Sylvain and Carson, Robert and Chalmers, Noel and Dobrev, Veselin and Dudouit, Yohann and others} } @techreport {, title = {Interim Report on Benchmarking FFT Libraries on High Performance Systems}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-21-03}, year = {2021}, month = {2021-07}, publisher = {University of Tennessee}, type = {ICL Tech Report}, abstract = {The Fast Fourier Transform (FFT) is used in many applications such as molecular dynamics, spectrum estimation, fast convolution and correlation, signal modulation, and many wireless multimedia applications. FFTs are also heavily used in ECP applications, such as EXAALT, Copa, ExaSky-HACC, ExaWind, WarpX, and many others. As these applications{\textquoteright} accuracy and speed depend on the performance of the FFTs, we designed an FFT benchmark to mea- sure performance and scalability of currently available FFT packages and present the results from a pre-Exascale platform. Our benchmarking also stresses the overall capacity of system interconnect; thus, it may be considered as an indicator of the bisection bandwidth, communication contention noise, and the software overheads in MPI collectives that are of interest to many other ECP applications and libraries. This FFT benchmarking project aims to show the strengths and weaknesses of multiple FFT libraries and to indicate what can be done to improve their performance. 
In particular, we believe that the benchmarking results could help design and implement a fast and robust FFT library for 2D and 3D inputs, while targeting large-scale heterogeneous systems with multicore processors and hardware accelerators that are co-designed in tandem with ECP applications. Our work involves studying and analyzing state-of-the-art FFT software both from vendors and available as open-source codes to better understand their performance.}, author = {Alan Ayala and Stanimire Tomov and Piotr Luszczek and Cayrols, Sebastien and Ragghianti, Gerald and Jack Dongarra} } @conference {, title = {Leveraging PaRSEC Runtime Support to Tackle Challenging 3D Data-Sparse Matrix Problems}, booktitle = {35th IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2021)}, year = {2021}, month = {2021-05}, publisher = {IEEE}, organization = {IEEE}, address = {Portland, OR}, abstract = {The task-based programming model associated with dynamic runtime systems has gained popularity for challenging problems because of workload imbalance, heterogeneous resources, or extreme concurrency. During the last decade, low-rank matrix approximations, where the main idea consists of exploiting data sparsity typically by compressing off-diagonal tiles up to an application-specific accuracy threshold, have been adopted to address the curse of dimensionality at extreme scale. In this paper, we create a bridge between the runtime and the linear algebra by communicating knowledge of the data sparsity to the runtime. We design and implement this synergistic approach with high user productivity in mind, in the context of the PaRSEC runtime system and the HiCMA numerical library. This requires extending PaRSEC with new features to integrate rank information into the dataflow so that proper decisions can be taken at runtime. 
We focus on the tile low-rank (TLR) Cholesky factorization for solving 3D data-sparse covariance matrix problems arising in environmental applications. In particular, we employ the 3D exponential model of Mat{\'e}rn matrix kernel, which exhibits challenging nonuniform high ranks in off-diagonal tiles. We first provide a dynamic data structure management driven by a performance model to reduce extra floating-point operations. Next, we optimize the memory footprint of the application by relying on a dynamic memory allocator, and supported by a rank-aware data distribution to cope with the workload imbalance. Finally, we expose further parallelism using kernel recursive formulations to shorten the critical path. Our resulting high-performance implementation outperforms existing data-sparse TLR Cholesky factorization by up to 7-fold on a large-scale distributed-memory system, while minimizing the memory footprint up to a 44-fold factor. This multidisciplinary work highlights the need to empower runtime systems beyond their original duty of task scheduling for servicing next-generation low-rank matrix algebra libraries.}, keywords = {asynchronous executions and load balancing, dynamic runtime system, environmental applications, High-performance computing, low-rank matrix computations, task-based programming model, user productivity}, author = {Qinglei Cao and Yu Pei and Kadir Akbudak and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} } @article {, title = {libCEED: Fast algebra for high-order element-based discretizations}, journal = {Journal of Open Source Software}, volume = {6}, number = {63}, year = {2021}, pages = {2945}, abstract = {Finite element methods are widely used to solve partial differential equations (PDE) in science and engineering, but their standard implementation (Arndt et al., 2020; Kirk et al., 2006; Logg et al., 2012) relies on assembling sparse matrices. 
Sparse matrix multiplication and triangular operations perform a scalar multiply and add for each nonzero entry, just 2 floating point operations (flops) per scalar that must be loaded from memory (Williams et al., 2009). Modern hardware is capable of nearly 100 flops per scalar streamed from memory (Rupp, 2020) so sparse matrix operations cannot achieve more than about 2\% utilization of arithmetic units. Matrix assembly becomes even more problematic when the polynomial degree p of the basis functions is increased, resulting in $O(p^d)$ storage and $O(p^{2d})$ compute per degree of freedom (DoF) in d dimensions. Methods pioneered by the spectral element community (Deville et al., 2002; Orszag, 1980) exploit problem structure to reduce costs to $O(1)$ storage and $O(p)$ compute per DoF, with very high utilization of modern CPUs and GPUs. Unfortunately, high-quality implementations have been relegated to applications and intrusive frameworks that are often difficult to extend to new problems or incorporate into legacy applications, especially when strong preconditioners are required. libCEED, the Code for Efficient Extensible Discretization (Abdelfattah et al., 2021), is a lightweight library that provides a purely algebraic interface for linear and nonlinear operators and preconditioners with element-based discretizations. libCEED provides portable performance via run-time selection of implementations optimized for CPUs and GPUs, including support for just-in-time (JIT) compilation. It is designed for convenient use in new and legacy software, and offers interfaces in C99 (International Standards Organisation, 1999), Fortran77 (ANSI, 1978), Python (Python, 2021), Julia (Bezanson et al., 2017), and Rust (Rust, 2021). Users and library developers can integrate libCEED at a low level into existing applications in place of existing matrix-vector products without significant refactoring of their own discretization infrastructure. 
Alternatively, users can utilize integrated libCEED support in MFEM (Anderson et al., 2020; MFEM, 2021). In addition to supporting applications and discretization libraries, libCEED provides a platform for performance engineering and co-design, as well as an algebraic interface for solvers research like adaptive p-multigrid, much like how sparse matrix libraries enable development and deployment of algebraic multigrid solvers.}, keywords = {finite elements, high-order methods, High-performance computing, matrix-free, spectral elements}, doi = {10.21105/joss.02945}, url = {https://doi.org/10.21105/joss.02945}, author = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jean-Sylvain Camier and Veselin Dobrev and Yohann Dudouit and Leila Ghaffari and Tzanio Kolev and David Medina and Will Pazner and Thilina Ratnayaka and Jeremy Thompson and Stanimire Tomov} } @techreport {, title = {P1673R3: A Free Function Linear Algebra Interface Based on the BLAS}, journal = {ISO JTC1 SC22 WG21}, number = {P1673R3}, year = {2021}, month = {2021-04}, publisher = {ISO}, type = {standard}, abstract = {We believe this proposal is complementary to P1385, a proposal for a C++ Standard linear algebra library that introduces matrix and vector classes and overloaded arithmetic operators. In fact, we think that our proposal would make a natural foundation for a library like what P1385 proposes. However, a free function interface -- which clearly separates algorithms from data structures -- more naturally allows for a richer set of operations such as what the BLAS provides. A natural extension of the present proposal would include accepting P1385{\textquoteright}s matrix and vector objects as input for the algorithms proposed here. 
A straightforward way to do that would be for P1385{\textquoteright}s matrix and vector objects to make views of their data available as basic_mdspan.}, keywords = {C++, linear algebra}, url = {http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1673r3.pdf}, author = {Mark Hoemmen and Daisy Hollman and Christian Trott and Daniel Sunderland and Nevin Liber and Li-Ta Lo and Damien Lebrun-Grandie and Graham Lopez and Peter Caday and Sarah Knepper and Piotr Luszczek and Timothy Costa} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms and LAPACK Routines}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {47}, number = {3}, year = {2021}, pages = {1{\textendash}23}, abstract = {This article describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. 
In particular, half precision is used in many very large scale applications, such as those associated with machine learning.}, keywords = {Computations on matrices, Mathematical analysis, Mathematics of computing, Numerical analysis}, doi = {10.1145/3431921}, author = {Abdelfattah, Ahmad and Costa, Timothy and Jack Dongarra and Mark Gates and Haidar, Azzam and Hammarling, Sven and Higham, Nicholas J and Kurzak, Jakub and Piotr Luszczek and Stanimire Tomov and others} } @techreport {, title = {SLATE Performance Improvements: QR and Eigenvalues}, journal = {SLATE Working Notes}, number = {17, ICL-UT-21-02}, year = {2021}, month = {2021-04}, author = {Kadir Akbudak and Paul Bagwell and Sebastien Cayrols and Mark Gates and Dalal Sukkari and Asim YarKhan and Jack Dongarra} } @article {, title = {A survey of numerical linear algebra methods utilizing mixed-precision arithmetic}, journal = {The International Journal of High Performance Computing Applications}, volume = {35}, number = {4}, year = {2021}, pages = {344{\textendash}369}, abstract = {The efficient utilization of mixed-precision numerical linear algebra algorithms can offer attractive acceleration to scientific computing applications. Especially with the hardware integration of low-precision special-function units designed for machine learning applications, the traditional numerical algorithms community urgently needs to reconsider the floating point formats used in the distinct operations to efficiently leverage the available compute power. 
In this work, we provide a comprehensive survey of mixed-precision numerical linear algebra routines, including the underlying concepts, theoretical background, and experimental results for both dense and sparse linear algebra problems.}, keywords = {GPUs, High-performance computing, linear algebra, Mixed-precision arithmetic, numerical mathematics}, doi = {10.1177/10943420211003313}, author = {Abdelfattah, Ahmad and Anzt, Hartwig and Boman, Erik G and Carson, Erin and Cojean, Terry and Jack Dongarra and Fox, Alyson and Mark Gates and Higham, Nicholas J and Li, Xiaoye S and others} } @techreport {, title = {ASCR@40: Four Decades of Department of Energy Leadership in Advanced Scientific Computing Research}, year = {2020}, month = {2020-08}, publisher = {Advanced Scientific Computing Advisory Committee (ASCAC), US Department of Energy}, url = {https://computing.llnl.gov/misc/ASCR@40-Highlights.pdf}, author = {Bruce Hendrickson and Paul Messina and Buddy Bland and Jackie Chen and Phil Colella and Eli Dart and Jack Dongarra and Thom Dunning and Ian Foster and Richard Gerber and Rachel Harken and Wendy Huntoon and Bill Johnston and John Sarrao and Jeff Vetter} } @techreport {, title = {ASCR@40: Highlights and Impacts of ASCR{\textquoteright}s Programs}, year = {2020}, month = {2020-06}, publisher = {US Department of Energy{\textquoteright}s Office of Advanced Scientific Computing Research}, abstract = {The Office of Advanced Scientific Computing Research (ASCR) sits within the Office of Science in the Department of Energy (DOE). 
Per their web pages, {\textquotedblleft}the mission of the ASCR program is to discover, develop, and deploy computational and networking capabilities to analyze, model, simulate, and predict complex phenomena important to the DOE.{\textquotedblright} This succinct statement encompasses a wide range of responsibilities for computing and networking facilities; for procuring, deploying, and operating high performance computing, networking, and storage resources; for basic research in mathematics and computer science; for developing and sustaining a large body of software; and for partnering with organizations across the Office of Science and beyond. While its mission statement may seem very contemporary, the roots of ASCR are quite deep{\textemdash}long predating the creation of DOE. Applied mathematics and advanced computing were both elements of the Theoretical Division of the Manhattan Project. In the early 1950s, the Manhattan Project scientist and mathematician John von Neumann, then a commissioner for the AEC (Atomic Energy Commission), advocated for the creation of a Mathematics program to support the continued development and applications of digital computing. Los Alamos National Laboratory (LANL) scientist John Pasta created such a program to fund researchers at universities and AEC laboratories. 
Under several organizational name changes, this program has persisted ever since, and would eventually grow to become ASCR.}, doi = {https://doi.org/10.2172/1631812}, url = {https://www.osti.gov/servlets/purl/1631812}, author = {Bruce Hendrickson and Paul Messina and Buddy Bland and Jackie Chen and Phil Colella and Eli Dart and Jack Dongarra and Thom Dunning and Ian Foster and Richard Gerber and Rachel Harken and Wendy Huntoon and Bill Johnston and John Sarrao and Jeff Vetter} } @techreport {1465, title = {Asynchronous SGD for DNN Training on Shared-Memory Parallel Architectures}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-04}, year = {2020}, month = {2020-03}, publisher = {University of Tennessee, Knoxville}, abstract = {We present a parallel asynchronous Stochastic Gradient Descent algorithm for shared memory architectures. Different from previous asynchronous algorithms, we consider the case where the gradient updates are not particularly sparse. In the context of the MagmaDNN framework, we compare the parallel efficiency of the asynchronous implementation with that of the traditional synchronous implementation. 
Tests are performed for training deep neural networks on multicore CPUs and GPU devices.}, keywords = {Asynchronous iterative methods, Deep learning, gpu, multicore CPU, Stochastic Gradient Descent}, author = {Florent Lopez and Edmond Chow and Stanimire Tomov and Jack Dongarra} } @conference {1485, title = {Asynchronous SGD for DNN Training on Shared-Memory Parallel Architectures}, booktitle = {Workshop on Scalable Deep Learning over Parallel And Distributed Infrastructures (ScaDL 2020)}, year = {2020}, month = {2020-05}, author = {Florent Lopez and Edmond Chow and Stanimire Tomov and Jack Dongarra} } @techreport {, title = {CEED ECP Milestone Report: Improve Performance and Capabilities of CEED-Enabled ECP Applications on Summit/Sierra}, journal = {ECP Milestone Reports}, year = {2020}, month = {2020-05}, publisher = {Zenodo}, doi = {https://doi.org/10.5281/zenodo.3860804}, url = {https://doi.org/10.5281/zenodo.3860804}, author = {Kolev, Tzanio and Fischer, Paul and Abdelfattah, Ahmad and Ananthan, Shreyas and Valeria Barra and Natalie Beams and Bleile, Ryan and Brown, Jed and Carson, Robert and Camier, Jean-Sylvain and Churchfield, Matthew and Dobrev, Veselin and Jack Dongarra and Dudouit, Yohann and Karakus, Ali and Kerkemeier, Stefan and Lan, YuHsiang and Medina, David and Merzari, Elia and Min, Misun and Parker, Scott and Ratnayaka, Thilina and Smith, Cameron and Sprague, Michael and Stitt, Thomas and Thompson, Jeremy and Tomboulides, Ananias and Stanimire Tomov and Tomov, Vladimir and Vargas, Arturo and Warburton, Tim and Weiss, Kenneth} } @conference {1478, title = {Communication Avoiding 2D Stencil Implementations over PaRSEC Task-Based Runtime}, booktitle = {2020 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2020}, month = {2020-05}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, abstract = {Stencil computation or general sparse matrix-vector product (SpMV) are key components in many 
algorithms like geometric multigrid or Krylov solvers. But their low arithmetic intensity means that memory bandwidth and network latency will be the performance limiting factors. The current architectural trend favors computations over bandwidth, worsening the already unfavorable imbalance. Previous work approached stencil kernel optimization either by improving memory bandwidth usage or by providing a Communication Avoiding (CA) scheme to minimize network latency in repeated sparse vector multiplication by replicating remote work in order to delay communications on the critical path. Focusing on minimizing communication bottleneck in distributed stencil computation, in this study we combine a CA scheme with the computation and communication overlapping that is inherent in a dataflow task-based runtime system such as PaRSEC to demonstrate their combined benefits. We implemented the 2D five point stencil (Jacobi iteration) in PETSc, and over PaRSEC in two flavors, full communications (base-PaRSEC) and CA-PaRSEC which operate directly on a 2D compute grid. 
Our results running on two clusters, NaCL and Stampede2 indicate that we can achieve 2{\texttimes} speedup over the standard SpMV solution implemented in PETSc, and in certain cases when kernel execution is not dominating the execution time, the CA-PaRSEC version achieved up to 57\% and 33\% speedup over base-PaRSEC implementation on NaCL and Stampede2 respectively.}, doi = {https://doi.org/10.1109/IPDPSW50202.2020.00127}, author = {Yu Pei and Qinglei Cao and George Bosilca and Piotr Luszczek and Victor Eijkhout and Jack Dongarra} } @conference {, title = {DeepFreeze: Towards Scalable Asynchronous Checkpointing of Deep Learning Models}, booktitle = {20th IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)}, year = {2020}, month = {2020-05}, publisher = {IEEE}, organization = {IEEE}, address = {Melbourne, VIC, Australia}, abstract = {In the age of big data, deep learning has emerged as a powerful tool to extract insight and exploit its value, both in industry and scientific applications. One common pattern emerging in such applications is frequent checkpointing of the state of the learning model during training, needed in a variety of scenarios: analysis of intermediate states to explain features and correlations with training data, exploration strategies involving alternative models that share a common ancestor, knowledge transfer, resilience, etc. However, with increasing size of the learning models and popularity of distributed data-parallel training approaches, simple checkpointing techniques used so far face several limitations: low serialization performance, blocking I/O, stragglers due to the fact that only a single process is involved in checkpointing. This paper proposes a checkpointing technique specifically designed to address the aforementioned limitations, introducing efficient asynchronous techniques to hide the overhead of serialization and I/O, and distribute the load over all participating processes. 
Experiments with two deep learning applications (CANDLE and ResNet) on a pre-Exascale HPC platform (Theta) shows significant improvement over state-of-art, both in terms of checkpointing duration and runtime overhead.}, doi = {https://doi.org/10.1109/CCGrid49817.2020.00-76}, author = {Bogdan Nicolae and Jiali Li and Justin M. Wozniak and George Bosilca and Matthieu Dorier and Franck Cappello} } @article {, title = {Evaluating Asynchronous Schwarz Solvers on GPUs}, journal = {International Journal of High Performance Computing Applications}, year = {2020}, month = {2020-08}, abstract = {With the commencement of the exascale computing era, we realize that the majority of the leadership supercomputers are heterogeneous and massively parallel. Even a single node can contain multiple co-processors such as GPUs and multiple CPU cores. For example, ORNL{\textquoteright}s Summit accumulates six NVIDIA Tesla V100 GPUs and 42 IBM Power9 cores on each node. Synchronizing across compute resources of multiple nodes can be prohibitively expensive. Hence, it is necessary to develop and study asynchronous algorithms that circumvent this issue of bulk-synchronous computing. In this study, we examine the asynchronous version of the abstract Restricted Additive Schwarz method as a solver. We do not explicitly synchronize, but allow the communication between the sub-domains to be completely asynchronous, thereby removing the bulk synchronous nature of the algorithm. We accomplish this by using the one-sided Remote Memory Access (RMA) functions of the MPI standard. We study the benefits of using such an asynchronous solver over its synchronous counterpart. We also study the communication patterns governed by the partitioning and the overlap between the sub-domains on the global solver. 
Finally, we show that this concept can render attractive performance benefits over the synchronous counterparts even for a well-balanced problem.}, keywords = {abstract Schwarz methods, Asynchronous solvers, exascale, GPUs, multicore processors, parallel numerical linear algebra}, doi = {https://doi.org/10.1177/1094342020946814}, author = {Pratik Nayak and Terry Cojean and Hartwig Anzt} } @conference {, title = {Evaluating the Performance of NVIDIA{\textquoteright}s A100 Ampere GPU for Sparse and Batched Computations}, booktitle = {2020 IEEE/ACM Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)}, year = {2020}, month = {2020-11}, publisher = {IEEE}, organization = {IEEE}, abstract = {GPU accelerators have become an important backbone for scientific high performance-computing, and the performance advances obtained from adopting new GPU hardware are significant. In this paper we take a first look at NVIDIA{\textquoteright}s newest server-line GPU, the A100 architecture, part of the Ampere generation. Specifically, we assess its performance for sparse and batch computations, as these routines are relied upon in many scientific applications, and compare to the p}, keywords = {Batched linear algebra, NVIDIA A100 GPU, sparse linear algebra, Sparse Matrix Vector Product}, author = {Hartwig Anzt and Yuhsiang M. Tsai and Ahmad Abdelfattah and Terry Cojean and Jack Dongarra} } @conference {, title = {Extreme-Scale Task-Based Cholesky Factorization Toward Climate and Weather Prediction Applications}, booktitle = {Platform for Advanced Scientific Computing Conference (PASC20)}, year = {2020}, month = {2020-06}, publisher = {ACM}, organization = {ACM}, address = {Geneva, Switzerland}, abstract = {Climate and weather can be predicted statistically via geospatial Maximum Likelihood Estimates (MLE), as an alternative to running large ensembles of forward models. 
The MLE-based iterative optimization procedure requires the solving of large-scale linear systems that performs a Cholesky factorization on a symmetric positive-definite covariance matrix---a demanding dense factorization in terms of memory footprint and computation. We propose a novel solution to this problem: at the mathematical level, we reduce the computational requirement by exploiting the data sparsity structure of the matrix off-diagonal tiles by means of low-rank approximations; and, at the programming-paradigm level, we integrate PaRSEC, a dynamic, task-based runtime to reach unparalleled levels of efficiency for solving extreme-scale linear algebra matrix operations. The resulting solution leverages fine-grained computations to facilitate asynchronous execution while providing a flexible data distribution to mitigate load imbalance. Performance results are reported using 3D synthetic datasets up to 42M geospatial locations on 130, 000 cores, which represent a cornerstone toward fast and accurate predictions of environmental applications.}, doi = {https://doi.org/10.1145/3394277.3401846}, author = {Qinglei Cao and Yu Pei and Kadir Akbudak and Aleksandr Mikhalev and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} } @conference {, title = {Flexible Data Redistribution in a Task-Based Runtime System}, booktitle = {IEEE International Conference on Cluster Computing (Cluster 2020)}, year = {2020}, month = {2020-09}, publisher = {IEEE}, organization = {IEEE}, address = {Kobe, Japan}, abstract = {Data redistribution aims to reshuffle data to optimize some objective for an algorithm. The objective can be multi-dimensional, such as improving computational load balance or decreasing communication volume or cost, with the ultimate goal to increase the efficiency and therefore decrease the time-to-solution for the algorithm. 
The classical redistribution problem focuses on optimally scheduling communications when reshuffling data between two regular, usually block-cyclic, data distributions. Recently, task-based runtime systems have gained popularity as a potential candidate to address the programming complexity on the way to exascale. In addition to an increase in portability against complex hardware and software systems, task-based runtime systems have the potential to be able to more easily cope with less-regular data distribution, providing a more balanced computational load during the lifetime of the execution. In this scenario, it becomes paramount to develop a general redistribution algorithm for task-based runtime systems, which could support all types of regular and irregular data distributions. In this paper, we detail a flexible redistribution algorithm, capable of dealing with redistribution problems without constraints of data distribution and data size and implement it in a task-based runtime system, PaRSEC. Performance results show great capability compared to ScaLAPACK, and applications highlight an increased efficiency with little overhead in terms of data distribution and data size.}, doi = {https://doi.org/10.1109/CLUSTER49012.2020.00032}, author = {Qinglei Cao and George Bosilca and Wei Wu and Dong Zhong and Aurelien Bouteiller and Jack Dongarra} } @article {, title = {Ginkgo: A High Performance Numerical Linear Algebra Library}, journal = {Journal of Open Source Software}, volume = {5}, year = {2020}, month = {2020-08}, abstract = {Ginkgo is a production-ready sparse linear algebra library for high performance computing on GPU-centric architectures with a high level of performance portability and focuses on software sustainability. 
The library focuses on solving sparse linear systems and accommodates a large variety of matrix formats, state-of-the-art iterative (Krylov) solvers and preconditioners, which make the library suitable for a variety of scientific applications. Ginkgo supports many architectures such as multi-threaded CPU, NVIDIA GPUs, and AMD GPUs. The heavy use of modern C++ features simplifies the addition of new executor paradigms and algorithmic functionality without introducing significant performance overhead. Solving linear systems is usually one of the most computationally and memory intensive aspects of any application. Hence there has been a significant amount of effort in this direction with software libraries such as UMFPACK (Davis, 2004) and CHOLMOD (Chen, Davis, Hager, \& Rajamanickam, 2008) for solving linear systems with direct methods and PETSc (Balay et al., 2020), Trilinos ({\textquotedblleft}The Trilinos Project Website,{\textquotedblright} 2020), Eigen (Guennebaud, Jacob, \& others, 2010) and many more to solve linear systems with iterative methods. With Ginkgo, we aim to ensure high performance while not compromising portability. Hence, we provide very efficient low level kernels optimized for different architectures and separate these kernels from the algorithms thereby ensuring extensibility and ease of use. Ginkgo is also a part of the xSDK effort (Bartlett et al., 2017) and available as a Spack (Gamblin et al., 2015) package. xSDK aims to provide infrastructure for and interoperability between a collection of related and complementary software elements to foster rapid and efficient development of scientific applications using High Performance Computing. Within this effort, we provide interoperability with application libraries such as deal.ii (Arndt et al., 2019) and mfem (Anderson et al., 2020). 
Ginkgo provides wrappers within these two libraries so that they can take advantage of the features of Ginkgo.}, doi = {https://doi.org/10.21105/joss.02260}, author = {Hartwig Anzt and Terry Cojean and Yen-Chen Chen and Fritz Goebel and Thomas Gruetzmacher and Pratik Nayak and Tobias Ribizel and Yu-Hsiang Tsai} } @article {, title = {Ginkgo: A Node-Level Sparse Linear Algebra Library for HPC (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Hartwig Anzt and Terry Cojean and Yen-Chen Chen and Fritz Goebel and Thomas Gruetzmacher and Pratik Nayak and Tobias Ribizel and Yu-Hsiang Tsai and Jack Dongarra} } @conference {, title = {HAN: A Hierarchical AutotuNed Collective Communication Framework}, booktitle = {IEEE Cluster Conference}, year = {2020}, month = {2020-09}, publisher = {Best Paper Award, IEEE Computer Society Press}, organization = {Best Paper Award, IEEE Computer Society Press}, address = {Kobe, Japan}, abstract = {High-performance computing (HPC) systems keep growing in scale and heterogeneity to satisfy the increasing computational need, and this brings new challenges to the design of MPI libraries, especially with regard to collective operations. To address these challenges, we present "HAN," a new hierarchical autotuned collective communication framework in Open MPI, which selects suitable homogeneous collective communication modules as submodules for each hardware level, uses collective operations from the submodules as tasks, and organizes these tasks to perform efficient hierarchical collective operations. With a task-based design, HAN can easily swap out submodules, while keeping tasks intact, to adapt to new hardware. This makes HAN suitable for the current platform and provides a strong and flexible support for future HPC systems. 
To provide a fast and accurate autotuning mechanism, we present a novel cost model based on benchmarking the tasks instead of a whole collective operation. This method drastically reduces tuning time, as the cost of tasks can be reused across different message sizes, and is more accurate than existing cost models. Our cost analysis suggests the autotuning component can find the optimal configuration in most cases. The evaluation of the HAN framework suggests our design significantly improves the default Open MPI and achieves decent speedups against state-of-the-art MPI implementations on tested applications.}, author = {Xi Luo and Wei Wu and George Bosilca and Yu Pei and Qinglei Cao and Thananon Patinyasakdikul and Dong Zhong and Jack Dongarra} } @conference {1372, title = {Improved Energy-Aware Strategies for Periodic Real-Time Tasks under Reliability Constraints}, booktitle = {40th IEEE Real-Time Systems Symposium (RTSS 2019)}, year = {2020}, month = {2020-02}, publisher = {IEEE Press}, organization = {IEEE Press}, address = {York, UK}, author = {Li Han and Louis-Claude Canon and Jing Liu and Yves Robert and Frederic Vivien} } @article {, title = {Integrating Deep Learning in Domain Science at Exascale (MagmaDNN)}, year = {2020}, month = {2020-12}, publisher = {DOD HPCMP seminar}, address = {virtual}, abstract = {We will present some of the current challenges in the design and integration of deep learning AI with traditional HPC simulations. We evaluate existing packages for readiness to run efficiently deep learning models and applications on large scale HPC systems, identify challenges, and propose new asynchronous parallelization and optimization techniques for current large-scale heterogeneous systems and up-coming exascale systems. These developments, along with existing HPC AI software capabilities, have been integrated in MagmaDNN, an open source HPC deep learning framework. 
Many deep learning frameworks are targeted towards data scientists and fall short in providing quality integration into existing HPC workflows. This paper discusses the necessities of an HPC deep learning framework and how these can be provided, e.g., as in MagmaDNN, through a deep integration with existing HPC libraries such as MAGMA and its modular memory management, MPI, CuBLAS, CuDNN, MKL, and HIP. Advancements are also illustrated through the use of algorithmic enhancements in reduced and mixed-precision and asynchronous optimization methods. Finally, we present illustrations and potential solutions on enhancing traditional compute and data intensive applications at ORNL and UTK with AI. The approaches and future challenges are illustrated on materials science, imaging, and climate applications.}, author = {Stanimire Tomov and Kwai Wong and Jack Dongarra and Rick Archibald and Edmond Chow and Eduardo D{\textquoteright}Azevedo and Markus Eisenbach and Rocco Febbo and Florent Lopez and Daniel Nichols and Junqi Yin} } @techreport {, title = {Integrating Deep Learning in Domain Sciences at Exascale}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-10}, year = {2020}, month = {2020-08}, publisher = {University of Tennessee}, abstract = {This paper presents some of the current challenges in designing deep learning artificial intelligence (AI) and integrating it with traditional high-performance computing (HPC) simulations. We evaluate existing packages for their ability to run deep learning models and applications on large-scale HPC systems e ciently, identify challenges, and propose new asynchronous parallelization and optimization techniques for current large-scale heterogeneous systems and upcoming exascale systems. These developments, along with existing HPC AI software capabilities, have been integrated into MagmaDNN, an open-source HPC deep learning framework. 
Many deep learning frameworks are targeted at data scientists and fall short in providing quality integration into existing HPC workflows. This paper discusses the necessities of an HPC deep learning framework and how those needs can be provided (e.g., as in MagmaDNN) through a deep integration with existing HPC libraries, such as MAGMA and its modular memory management, MPI, CuBLAS, CuDNN, MKL, and HIP. Advancements are also illustrated through the use of algorithmic enhancements in reduced- and mixed-precision, as well as asynchronous optimization methods. Finally, we present illustrations and potential solutions for enhancing traditional compute- and data-intensive applications at ORNL and UTK with AI. The approaches and future challenges are illustrated in materials science, imaging, and climate applications.}, author = {Rick Archibald and Edmond Chow and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Markus Eisenbach and Rocco Febbo and Florent Lopez and Daniel Nichols and Stanimire Tomov and Kwai Wong and Junqi Yin} } @conference {, title = {Integrating Deep Learning in Domain Sciences at Exascale}, booktitle = {2020 Smoky Mountains Computational Sciences and Engineering Conference (SMC 2020)}, year = {2020}, month = {2020-08}, abstract = {This paper presents some of the current challenges in designing deep learning artificial intelligence (AI) and integrating it with traditional high-performance computing (HPC) simulations. We evaluate existing packages for their ability to run deep learning models and applications on large-scale HPC systems e ciently, identify challenges, and propose new asynchronous parallelization and optimization techniques for current large-scale heterogeneous systems and upcoming exascale systems. These developments, along with existing HPC AI software capabilities, have been integrated into MagmaDNN, an open-source HPC deep learning framework. 
Many deep learning frameworks are targeted at data scientists and fall short in providing quality integration into existing HPC workflows. This paper discusses the necessities of an HPC deep learning framework and how those needs can be provided (e.g., as in MagmaDNN) through a deep integration with existing HPC libraries, such as MAGMA and its modular memory management, MPI, CuBLAS, CuDNN, MKL, and HIP. Advancements are also illustrated through the use of algorithmic enhancements in reduced- and mixed-precision, as well as asynchronous optimization methods. Finally, we present illustrations and potential solutions for enhancing traditional compute- and data-intensive applications at ORNL and UTK with AI. The approaches and future challenges are illustrated in materials science, imaging, and climate applications.}, author = {Rick Archibald and Edmond Chow and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Markus Eisenbach and Rocco Febbo and Florent Lopez and Daniel Nichols and Stanimire Tomov and Kwai Wong and Junqi Yin} } @article {, title = {Load-Balancing Sparse Matrix Vector Product Kernels on GPUs}, journal = {ACM Transactions on Parallel Computing}, volume = {7}, year = {2020}, month = {2020-03}, abstract = {Efficient processing of Irregular Matrices on Single Instruction, Multiple Data (SIMD)-type architectures is a persistent challenge. Resolving it requires innovations in the development of data formats, computational techniques, and implementations that strike a balance between thread divergence, which is inherent for Irregular Matrices, and padding, which alleviates the performance-detrimental thread divergence but introduces artificial overheads. To this end, in this article, we address the challenge of designing high performance sparse matrix-vector product (SpMV) kernels designed for Nvidia Graphics Processing Units (GPUs). We present a compressed sparse row (CSR) format suitable for unbalanced matrices. 
We also provide a load-balancing kernel for the coordinate (COO) matrix format and extend it to a hybrid algorithm that stores part of the matrix in SIMD-friendly Ellpack format (ELL) format. The ratio between the ELL- and the COO-part is determined using a theoretical analysis of the nonzeros-per-row distribution. For the over 2,800 test matrices available in the Suite Sparse matrix collection, we compare the performance against SpMV kernels provided by NVIDIA{\textquoteright}s cuSPARSE library and a heavily-tuned sliced ELL (SELL-P) kernel that prevents unnecessary padding by considering the irregular matrices as a combination of matrix blocks stored in ELL format.}, doi = {https://doi.org/10.1145/3380930}, author = {Hartwig Anzt and Terry Cojean and Chen Yen-Chen and Jack Dongarra and Goran Flegar and Pratik Nayak and Stanimire Tomov and Yuhsiang M. Tsai and Weichung Wang} } @conference {, title = {Multiprecision Block-Jacobi for Iterative Triangular Solves}, booktitle = {European Conference on Parallel Processing (Euro-Par 2020)}, year = {2020}, month = {2020-08}, publisher = {Springer}, organization = {Springer}, abstract = {Recent research efforts have shown that Jacobi and block-Jacobi relaxation methods can be used as an effective and highly parallel approach for the solution of sparse triangular linear systems arising in the application of ILU-type preconditioners. Simultaneously, a few independent works have focused on designing efficient high performance adaptive-precision block-Jacobi preconditioning (block-diagonal scaling), in the context of the iterative solution of sparse linear systems, on manycore architectures. In this paper, we bridge the gap between relaxation methods based on regular splittings and preconditioners by demonstrating that iterative refinement can be leveraged to construct a relaxation method from the preconditioner. 
In addition, we exploit this insight to construct a highly-efficient sparse triangular system solver for graphics processors that combines iterative refinement with the block-Jacobi preconditioner available in the Ginkgo library.}, keywords = {Block-Jacobi, graphics processing units (GPUs), incomplete factorization preconditioning, multiprecision, sparse linear algebra}, doi = {https://doi.org/10.1007/978-3-030-57675-2_34}, author = {Fritz Goebel and Hartwig Anzt and Terry Cojean and Goran Flegar and Enrique S. Quintana-Orti} } @techreport {1454, title = {Performance Tuning SLATE}, journal = {SLATE Working Notes}, number = {14, ICL-UT-20-01}, year = {2020}, month = {2020-01}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Mark Gates and Ali Charara and Asim YarKhan and Dalal Sukkari and Mohammed Al Farhan and Jack Dongarra} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms}, journal = {ACM Transactions on Mathematical Software}, year = {2020}, month = {2020-10}, abstract = {This paper describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. 
In particular, half precision is used in many very large scale applications
Specifically, we optimize SpMV kernels for the CSR, COO, ELL, and HYB format taking the hardware characteristics of the latest GPU technologies into account. We compare for 2,800 test matrices the performance of our kernels against AMD{\textquoteright}s hipSPARSE library and NVIDIA{\textquoteright}s cuSPARSE library, and ultimately assess how the GPU technologies from AMD and NVIDIA compare in terms of SpMV performance.}, keywords = {AMD, GPUs, nVidia, sparse matrix vector product (SpMV)}, doi = {https://doi.org/10.1007/978-3-030-50743-5_16}, author = {Yuhsiang M. Tsai and Terry Cojean and Hartwig Anzt} } @techreport {, title = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic}, journal = {SLATE Working Notes}, number = {15, ICL-UT-20-08}, year = {2020}, month = {2020-07}, publisher = {University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Erik Boman and Erin Carson and Terry Cojean and Jack Dongarra and Mark Gates and Thomas Gruetzmacher and Nicholas J. Higham and Sherry Li and Neil Lindquist and Yang Liu and Jennifer Loe and Piotr Luszczek and Pratik Nayak and Sri Pranesh and Siva Rajamanickam and Tobias Ribizel and Barry Smith and Kasia Swirydowicz and Stephen Thomas and Stanimire Tomov and Yaohung Tsai and Ichitaro Yamazaki and Urike Meier Yang} } @conference {, title = {Task Bench: A Parameterized Benchmark for Evaluating Parallel Runtime Performance}, booktitle = {International Conference for High Performance Computing Networking, Storage, and Analysis (SC20)}, year = {2020}, month = {2020-11}, publisher = {ACM}, organization = {ACM}, abstract = {We present Task Bench, a parameterized benchmark designed to explore the performance of distributed programming systems under a variety of application scenarios. 
Task Bench dramatically lowers the barrier to benchmarking and comparing multiple programming systems by making the implementation for a given system orthogonal to the benchmarks themselves: every benchmark constructed with Task Bench runs on every Task Bench implementation. Furthermore, Task Bench{\textquoteright}s parameterization enables a wide variety of benchmark scenarios that distill the key characteristics of larger applications. To assess the effectiveness and overheads of the tested systems, we introduce a novel metric, minimum effective task granularity (METG). We conduct a comprehensive study with 15 programming systems on up to 256 Haswell nodes of the Cori supercomputer. Running at scale, 100μs-long tasks are the finest granularity that any system runs efficiently with current technologies. We also study each system{\textquoteright}s scalability, ability to hide communication and mitigate load imbalance.}, url = {https://dl.acm.org/doi/10.5555/3433701.3433783}, author = {Elliott Slaughter and Wei Wu and Yuankun Fu and Legend Brandenburg and Nicolai Garcia and Wilhem Kautz and Emily Marx and Kaleb S. Morris and Qinglei Cao and George Bosilca and Seema Mirchandaney and Wonchan Lee and Sean Treichler and Patrick McCormick and Alex Aiken} } @conference {, title = {Using Advanced Vector Extensions AVX-512 for MPI Reduction}, booktitle = {EuroMPI/USA {\textquoteright}20: 27th European MPI Users{\textquoteright} Group Meeting}, year = {2020}, month = {2020-09}, address = {Austin, TX}, abstract = {As the scale of high-performance computing (HPC) systems continues to grow, researchers are devoted themselves to explore increasing levels of parallelism to achieve optimal performance. The modern CPU{\textquoteright}s design, including its features of hierarchical memory and SIMD/vectorization capability, governs algorithms{\textquoteright} efficiency. 
The recent introduction of wide vector instruction set extensions (AVX and SVE) motivated vectorization to become of critical importance to increase efficiency and close the gap to peak performance. In this paper, we propose an implementation of predefined MPI reduction operations utilizing AVX, AVX2 and AVX-512 intrinsics to provide vector-based reduction operation and to improve the timeto- solution of these predefined MPI reduction operations. With these optimizations, we achieve higher efficiency for local computations, which directly benefit the overall cost of collective reductions. The evaluation of the resulting software stack under different scenarios demonstrates that the solution is at the same time generic and efficient. Experiments are conducted on an Intel Xeon Gold cluster, which shows our AVX-512 optimized reduction operations achieve 10X performance benefits than Open MPI default for MPI local reduction.}, keywords = {Instruction level parallelism, Intel AVX2/AVX-512, Long vector extension, MPI reduction operation, Single instruction multiple data, Vector operation}, doi = {https://doi.org/10.1145/3416315.3416316}, author = {Dong Zhong and Qinglei Cao and George Bosilca and Jack Dongarra} } @article {, title = {Using Advanced Vector Extensions AVX-512 for MPI Reduction (Poster)}, year = {2020}, month = {2020-09}, publisher = {EuroMPI/USA {\textquoteright}20: 27th European MPI Users{\textquoteright} Group Meeting}, address = {Austin, TX}, author = {Dong Zhong and George Bosilca and Qinglei Cao and Jack Dongarra} } @conference {1484, title = {Using Arm Scalable Vector Extension to Optimize Open MPI}, booktitle = {20th IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID 2020)}, year = {2020}, month = {2020-05}, publisher = {IEEE/ACM}, organization = {IEEE/ACM}, address = {Melbourne, Australia}, abstract = {As the scale of high-performance computing (HPC) systems continues to grow, increasing levels of parallelism must be 
employed to achieve optimal performance. Recently, as processors have begun to support wide vector extensions, vectorization has become much more important for exploiting the potential peak performance of the target architecture.
The evaluation of the resulting software stack under different scenarios with both simulator and Fujitsu{\textquoteright}s A64FX processor demonstrates that the solution is at the same time generic and efficient.}, keywords = {ARMIE, datatype pack and unpack, local reduction, non-contiguous accesses, SVE, Vector Length Agnostic}, doi = {https://doi.org/10.1109/CCGrid49817.2020.00-71}, author = {Dong Zhong and Pavel Shamis and Qinglei Cao and George Bosilca and Jack Dongarra} } @techreport {1433, title = {CEED ECP Milestone Report: Performance Tuning of CEED Software and 1st and 2nd Wave Apps}, year = {2019}, month = {2019-10}, publisher = {Zenodo}, doi = {https://doi.org/10.5281/zenodo.3477618}, author = {Stanimire Tomov and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jed Brown and Jean-Sylvain Camier and Veselin Dobrev and Jack Dongarra and Yohann Dudouit and Paul Fischer and Ali Karakus and Stefan Kerkemeier and Tzanio Kolev and YuHsiang Lan and Elia Merzari and Misun Min and Aleks Obabko and Scott Parker and Thilina Ratnayaka and Jeremy Thompson and Ananias Tomboulides and Vladimir Tomov and Tim Warburton} } @conference {1449, title = {Characterization of Power Usage and Performance in Data-Intensive Applications using MapReduce over MPI}, booktitle = {2019 International Conference on Parallel Computing (ParCo2019)}, year = {2019}, month = {2019-09}, address = {Prague, Czech Republic}, author = {Joshua Davis and Tao Gao and Sunita Chandrasekaran and Heike Jagode and Anthony Danalis and Pavan Balaji and Jack Dongarra and Michela Taufer} } @techreport {1398, title = {A Collection of Presentations from the BDEC2 Workshop in Kobe, Japan}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-09}, year = {2019}, month = {2019-02}, publisher = {University of Tennessee, Knoxville}, author = {Rosa M. 
Badia and Micah Beck and Fran{\c c}ois Bodin and Taisuke Boku and Franck Cappello and Alok Choudhary and Carlos Costa and Ewa Deelman and Nicola Ferrier and Katsuki Fujisawa and Kohei Fujita and Maria Girone and Geoffrey Fox and Shantenu Jha and Yoshinari Kameda and Christian Kniep and William Kramer and James Lin and Kengo Nakajima and Yiwei Qiu and Kishore Ramachandran and Glenn Ricart and Kim Serradell and Dan Stanzione and Lin Gan and Martin Swany and Christine Sweeney and Alex Szalay and Christine Kirkpatrick and Kenton McHenry and Alainna White and Steve Tuecke and Ian Foster and Joe Mambretti and William. M Tang and Michela Taufer and Miguel V{\'a}zquez} } @techreport {1399, title = {A Collection of White Papers from the BDEC2 Workshop in Poznan, Poland}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-10}, year = {2019}, month = {2019-05}, publisher = {University of Tennessee, Knoxville}, author = {Gabriel Antoniu and Alexandru Costan and Ovidiu Marcu and Maria S. P{\'e}rez and Nenad Stojanovic and Rosa M. Badia and Miguel V{\'a}zquez and Sergi Girona and Micah Beck and Terry Moore and Piotr Luszczek and Ezra Kissel and Martin Swany and Geoffrey Fox and Vibhatha Abeykoon and Selahattin Akkas and Kannan Govindarajan and Gurhan Gunduz and Supun Kamburugamuve and Niranda Perera and Ahmet Uyar and Pulasthi Wickramasinghe and Chathura Widanage and Maria Girone and Toshihiro Hanawa and Richard Moreno and Ariel Oleksiak and Martin Swany and Ryousei Takano and M.P. van Haarlem and J. van Leeuwen and J.B.R. Oonk and T. Shimwell and L.V.E. 
Koopmans} } @techreport {1408, title = {A Collection of White Papers from the BDEC2 Workshop in San Diego, CA}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-13}, year = {2019}, month = {2019-10}, publisher = {University of Tennessee}, author = {Ilkay Altintas and Kyle Marcus and Volkan Vural and Shweta Purawat and Daniel Crawl and Gabriel Antoniu and Alexandru Costan and Ovidiu Marcu and Prasanna Balaprakash and Rongqiang Cao and Yangang Wang and Franck Cappello and Robert Underwood and Sheng Di and Justin M. Wozniak and Jon C. Calhoun and Cong Xu and Antonio Lain and Paolo Faraboschi and Nic Dube and Dejan Milojicic and Balazs Gerofi and Maria Girone and Viktor Khristenko and Tony Hey and Erza Kissel and Yu Liu and Richard Loft and Pekka Manninen and Sebastian von Alfthan and Takemasa Miyoshi and Bruno Raffin and Olivier Richard and Denis Trystram and Maryam Rahnemoonfar and Robin Murphy and Joel Saltz and Kentaro Sano and Rupak Roy and Kento Sato and Jian Guo and Jen s Domke and Weikuan Yu and Takaki Hatsui and Yasumasa Joti and Alex Szalay and William M. Tang and Michael R. Wyatt II and Michela Taufer and Todd Gamblin and Stephen Herbein and Adam Moody and Dong H. Ahn and Rich Wolski and Chandra Krintz and Fatih Bakir and Wei-tsung Lin and Gareth George} } @article {1312, title = {Combining Checkpointing and Replication for Reliable Execution of Linear Workflows with Fail-Stop and Silent Errors}, journal = {International Journal of Networking and Computing}, volume = {9}, number = {1}, year = {2019}, month = {2019}, pages = {2-27}, abstract = {Large-scale platforms currently experience errors from two di?erent sources, namely fail-stop errors (which interrupt the execution) and silent errors (which strike unnoticed and corrupt data). This work combines checkpointing and replication for the reliable execution of linear work?ows on platforms subject to these two error types. 
While checkpointing and replication have been studied separately, their combination has not yet been investigated despite its promising potential to minimize the execution time of linear work?ows in error-prone environments. Moreover, combined checkpointing and replication has not yet been studied in the presence of both fail-stop and silent errors. The combination raises new problems: for each task, we have to decide whether to checkpoint and/or replicate it to ensure its reliable execution. We provide an optimal dynamic programming algorithm of quadratic complexity to solve both problems. This dynamic programming algorithm has been validated through extensive simulations that reveal the conditions in which checkpointing only, replication only, or the combination of both techniques, lead to improved performance.}, keywords = {checkpoint, fail-stop error; silent error, HPC, linear workflow, Replication}, issn = {2185-2847}, url = {http://www.ijnc.org/index.php/ijnc/article/view/194}, author = {Anne Benoit and Aurelien Cavelan and Florina M. 
Ciorba and Valentin Le F{\`e}vre and Yves Robert} } @article {1369, title = {A Customized Precision Format Based on Mantissa Segmentation for Accelerating Sparse Linear Algebra}, journal = {Concurrency and Computation: Practice and Experience}, volume = {40319}, year = {2019}, month = {2019-01}, issn = {1532-0626}, doi = {https://doi.org/10.1002/cpe.5418}, author = {Thomas Gruetzmacher and Terry Cojean and Goran Flegar and Fritz G{\"o}bel and Hartwig Anzt} } @article {1314, title = {A Generic Approach to Scheduling and Checkpointing Workflows}, journal = {International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = {2019-11}, pages = {1255-1274}, keywords = {checkpoint, fail-stop error, resilience, workflow}, doi = {https://doi.org/10.1177/1094342019866891}, author = {Li Han and Valentin Le F{\`e}vre and Louis-Claude Canon and Yves Robert and Frederic Vivien} } @article {, title = {{A Generic Approach to Scheduling and Checkpointing Workflows}}, journal = {Int. 
Journal of High Performance Computing Applications}, volume = {33}, number = {6}, year = {2019}, pages = {1255-1274}, author = {Han, Li and Le F{\`e}vre, Valentin and Canon, Louis-Claude and Robert, Yves and Vivien, Fr{\'e}d{\'e}ric} } @inproceedings {1404, title = {Least Squares Solvers for Distributed-Memory Machines with GPU Accelerators}, journal = {ACM International Conference on Supercomputing (ICS {\textquoteright}19)}, year = {2019}, month = {2019-06}, pages = {117{\textendash}126}, publisher = {ACM}, address = {Phoenix, Arizona}, isbn = {9781450360791}, doi = {https://dl.acm.org/doi/abs/10.1145/3330345.3330356}, author = {Jakub Kurzak and Mark Gates and Ali Charara and Asim YarKhan and Jack Dongarra} } @inproceedings {1405, title = {Linear Systems Solvers for Distributed-Memory Machines with GPU Accelerators}, journal = {Euro-Par 2019: Parallel Processing}, volume = {11725}, year = {2019}, month = {2019-08}, pages = {495{\textendash}506}, publisher = {Springer}, isbn = {978-3-030-29399-4}, doi = {https://doi.org/10.1007/978-3-030-29400-7_35}, url = {https://link.springer.com/chapter/10.1007/978-3-030-29400-7_35}, author = {Kurzak, Jakub and Mark Gates and Charara, Ali and Asim YarKhan and Yamazaki, Ichitaro and Jack Dongarra}, editor = {Yahyapour, Ramin} } @article {1366, title = {MagmaDNN 0.2 High-Performance Data Analytics for Manycore GPUs and CPUs}, year = {2019}, month = {2019-01}, publisher = {University of Tennessee}, doi = {10.13140/RG.2.2.14906.64961}, author = {Lucien Ng and Sihan Chen and Alex Gessinger and Daniel Nichols and Sophia Cheng and Anu Meenasorna and Kwai Wong and Stanimire Tomov and Azzam Haidar and Eduardo D{\textquoteright}Azevedo and Jack Dongarra} } @conference {1326, title = {MagmaDNN: Accelerated Deep Learning Using MAGMA}, booktitle = {Practice and Experience in Advanced Research Computing (PEARC {\textquoteright}19)}, year = {2019}, month = {2019-07}, publisher = {ACM}, organization = {ACM}, address = {Chicago, IL}, author = 
{Daniel Nichols and Kwai Wong and Stanimire Tomov and Lucien Ng and Sihan Chen and Alex Gessinger} } @conference {1436, title = {ParILUT {\textendash} A Parallel Threshold ILU for GPUs}, booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2019}, month = {2019-05}, publisher = {IEEE}, organization = {IEEE}, address = {Rio de Janeiro, Brazil}, abstract = {In this paper, we present the first algorithm for computing threshold ILU factorizations on GPU architectures. The proposed ParILUT-GPU algorithm is based on interleaving parallel fixed-point iterations that approximate the incomplete factors for an existing nonzero pattern with a strategy that dynamically adapts the nonzero pattern to the problem characteristics. This requires the efficient selection of thresholds that separate the values to be dropped from the incomplete factors, and we design a novel selection algorithm tailored towards GPUs. All components of the ParILUT-GPU algorithm make heavy use of the features available in the latest NVIDIA GPU generations, and outperform existing multithreaded CPU implementations.}, doi = {https://doi.org/10.1109/IPDPS.2019.00033}, author = {Hartwig Anzt and Tobias Ribizel and Goran Flegar and Edmond Chow and Jack Dongarra} } @conference {1452, title = {Performance Analysis of Tile Low-Rank Cholesky Factorization Using PaRSEC Instrumentation Tools}, booktitle = {Workshop on Programming and Performance Visualization Tools (ProTools 19) at SC19}, year = {2019}, month = {2019-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, author = {Qinglei Cao and Yu Pei and Thomas Herault and Kadir Akbudak and Aleksandr Mikhalev and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} } @article {1311, title = {Performance of Asynchronous Optimized Schwarz with One-sided Communication}, journal = {Parallel Computing}, volume = {86}, year = {2019}, month = {2019-08}, pages = {66-81}, abstract = {In asynchronous 
iterative methods on distributed-memory computers, processes update their local solutions using data from other processes without an implicit or explicit global synchronization that corresponds to advancing the global iteration counter. In this work, we test the asynchronous optimized Schwarz domain-decomposition iterative method using various one-sided (remote direct memory access) communication schemes with passive target completion. The results show that when one-sided communication is well-supported, the asynchronous version of optimized Schwarz can outperform the synchronous version even for perfectly balanced partitionings of the problem on a supercomputer with uniform nodes.}, issn = {0167-8191}, doi = {https://doi.org/10.1016/j.parco.2019.05.004}, url = {http://www.sciencedirect.com/science/article/pii/S0167819118301261}, author = {Ichitaro Yamazaki and Edmond Chow and Aurelien Bouteiller and Jack Dongarra} } @conference {1339, title = {Scheduling Independent Stochastic Tasks on Heterogeneous Cloud Platforms}, booktitle = {IEEE Cluster 2019}, year = {2019}, month = {2019-09}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Albuquerque, New Mexico}, author = {Yiqin Gao and Louis-Claude Canon and Yves Robert and Frederic Vivien} } @article {1315, title = {Scheduling Independent Stochastic Tasks under Deadline and Budget Constraints}, journal = {International Journal of High Performance Computing Applications}, volume = {34}, year = {2019}, month = {2019-06}, pages = {246-264}, abstract = {This article discusses scheduling strategies for the problem of maximizing the expected number of tasks that can be executed on a cloud platform within a given budget and under a deadline constraint. The execution times of tasks follow independent and identically distributed probability laws. The main questions are how many processors to enroll and whether and when to interrupt tasks that have been executing for some time. 
We provide complexity results and an asymptotically optimal strategy for the problem instance with discrete probability distributions and without deadline. We extend the latter strategy for the general case with continuous distributions and a deadline and we design an efficient heuristic which is shown to outperform standard approaches when running simulations for a variety of useful distribution laws.}, doi = {https://doi.org/10.1177/1094342019852135}, author = {Louis-Claude Canon and Aur{\'e}lie Kong Win Chang and Yves Robert and Frederic Vivien} } @article {1463, title = {SLATE: Design of a Modern Distributed and Accelerated Linear Algebra Library}, year = {2019}, month = {2019-11}, publisher = {International Conference for High Performance Computing, Networking, Storage and Analysis (SC19)}, address = {Denver, CO}, author = {Mark Gates and Jakub Kurzak and Ali Charara and Asim YarKhan and Jack Dongarra} } @conference {1450, title = {SLATE: Design of a Modern Distributed and Accelerated Linear Algebra Library}, booktitle = {International Conference for High Performance Computing, Networking, Storage and Analysis (SC19)}, year = {2019}, month = {2019-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, abstract = {The SLATE (Software for Linear Algebra Targeting Exascale) library is being developed to provide fundamental dense linear algebra capabilities for current and upcoming distributed high-performance systems, both accelerated CPU-GPU based and CPU based. SLATE will provide coverage of existing ScaLAPACK functionality, including the parallel BLAS; linear systems using LU and Cholesky; least squares problems using QR; and eigenvalue and singular value problems. In this respect, it will serve as a replacement for ScaLAPACK, which after two decades of operation, cannot adequately be retrofitted for modern accelerated architectures. 
SLATE uses modern techniques such as communication-avoiding algorithms, lookahead panels to overlap communication and computation, and task-based scheduling, along with a modern C++ framework. Here we present the design of SLATE and initial reports of several of its components.}, doi = {https://doi.org/10.1145/3295500.3356223}, author = {Mark Gates and Jakub Kurzak and Ali Charara and Asim YarKhan and Jack Dongarra} } @techreport {1279, title = {SLATE Developers{\textquoteright} Guide}, journal = {SLATE Working Notes}, number = {11, ICL-UT-19-02}, year = {2019}, month = {2019-12}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ali Charara and Mark Gates and Jakub Kurzak and Asim YarKhan and Jack Dongarra} } @techreport {1304, title = {SLATE Mixed Precision Performance Report}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-03}, year = {2019}, month = {2019-04}, publisher = {University of Tennessee}, author = {Ali Charara and Jack Dongarra and Mark Gates and Jakub Kurzak and Asim YarKhan} } @techreport {1321, title = {SLATE Working Note 12: Implementing Matrix Inversions}, journal = {SLATE Working Notes}, number = {12, ICL-UT-19-04}, year = {2019}, month = {2019-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Jakub Kurzak and Mark Gates and Ali Charara and Asim YarKhan and Jack Dongarra} } @techreport {1394, title = {SLATE Working Note 13: Implementing Singular Value and Symmetric/Hermitian Eigenvalue Solvers}, journal = {SLATE Working Notes}, number = {13, ICL-UT-19-07}, year = {2019}, note = {revision 06-2023}, month = {2019-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Mark Gates and Mohammed Al Farhan and Ali Charara and Jakub Kurzak and Dalal Sukkari and Asim YarKhan and Jack Dongarra} } @article {1438, title = {Towards a New Peer Review Concept for 
Scientific Computing ensuring Technical Quality, Software Sustainability, and Result Reproducibility}, journal = {Proceedings in Applied Mathematics and Mechanics}, volume = {19}, year = {2019}, month = {2019-11}, abstract = {In this position paper we argue for implementing an alternative peer review process for scientific computing contributions that promotes high quality scientific software developments as fully-recognized conference submission. The idea is based on leveraging the code reviewers{\textquoteright} feedback on scientific software contributions to community software developments as a third-party review involvement. Providing open access to this technical review would complement the scientific review of the contribution, efficiently reduce the workload of the undisclosed reviewers, improve the algorithm implementation quality and software sustainability, and ensure full reproducibility of the reported results. Using this process creates incentives to publish scientific algorithms in open source software {\textendash} instead of designing prototype algorithms with the unique purpose of publishing a paper. In addition, the comments and suggestions of the community being archived in the versioning control systems ensure that also community reviewers are receiving credit for the review contributions {\textendash} unlike reviewers in the traditional peer review process. 
Finally, it reflects the particularity of the scientific computing community using conferences rather than journals as the main publication venue.}, issn = {1617-7061}, doi = {https://doi.org/10.1002/pamm.201900490}, author = {Hartwig Anzt and Terry Cojean and Eileen Kuhn} } @conference {1318, title = {Towards Continuous Benchmarking}, booktitle = {Platform for Advanced Scientific Computing Conference (PASC 2019)}, year = {2019}, month = {2019-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Zurich, Switzerland}, abstract = {We present an automated performance evaluation framework that enables an automated workflow for testing and performance evaluation of software libraries. Integrating this component into an ecosystem enables sustainable software development, as a community effort, via a web application for interactively evaluating the performance of individual software components. The performance evaluation tool is based exclusively on web technologies, which removes the burden of downloading performance data or installing additional software. We employ this framework for the Ginkgo software ecosystem, but the framework can be used with essentially any software project, including the comparison between different software libraries. The Continuous Integration (CI) framework of Ginkgo is also extended to automatically run a benchmark suite on predetermined HPC systems, store the state of the machine and the environment along with the compiled binaries, and collect results in a publicly accessible performance data repository based on Git. The Ginkgo performance explorer (GPE) can be used to retrieve the performance data from the repository, and visualizes it in a web browser. GPE also implements an interface that allows users to write scripts, archived in a Git repository, to extract particular data, compute particular metrics, and visualize them in many different formats (as specified by the script). 
The combination of these approaches creates a workflow which enables performance reproducibility and software sustainability of scientific software. In this paper, we present example scripts that extract and visualize performance data for Ginkgo{\textquoteright}s SpMV kernels that allow users to identify the optimal kernel for specific problem characteristics.}, isbn = {9781450367707}, doi = {https://doi.org/10.1145/3324989.3325719}, author = {Hartwig Anzt and Yen Chen Chen and Terry Cojean and Jack Dongarra and Goran Flegar and Pratik Nayak and Enrique S. Quintana-Orti and Yuhsiang M. Tsai and Weichung Wang} } @conference {, title = {Towards Portable Online Prediction of Network Utilization Using MPI-Level Monitoring}, booktitle = {2019 European Conference on Parallel Processing (Euro-Par 2019)}, year = {2019}, month = {2019-08}, publisher = {Springer}, organization = {Springer}, address = {G{\"o}ttingen, Germany}, abstract = {Stealing network bandwidth helps a variety of HPC runtimes and services to run additional operations in the background without negatively affecting the applications. A key ingredient to make this possible is an accurate prediction of the future network utilization, enabling the runtime to plan the background operations in advance, such as to avoid competing with the application for network bandwidth. In this paper, we propose a portable deep learning predictor that only uses the information available through MPI introspection to construct a recurrent sequence-to-sequence neural network capable of forecasting network utilization. We leverage the fact that most HPC applications exhibit periodic behaviors to enable predictions far into the future (at least the length of a period). Our online approach does not have an initial training phase, it continuously improves itself during application execution without incurring significant computational overhead. 
Experimental results show better accuracy and lower computational overhead compared with the state-of-the-art on two representative applications.}, doi = {https://doi.org/10.1007/978-3-030-29400-7_4}, author = {Shu-Mei Tseng and Bogdan Nicolae and George Bosilca and Emmanuel Jeannot and Aparna Chandramowlishwaran and Franck Cappello} } @article {1331, title = {Accelerating 2D FFT: Exploit GPU Tensor Cores through Mixed-Precision}, year = {2018}, month = {2018-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC18), ACM Student Research Poster}, address = {Dallas, TX}, author = {Xiaohe Cheng and Anumeena Soma and Eduardo D{\textquoteright}Azevedo and Kwai Wong and Stanimire Tomov} } @article {1300, title = {Batched BLAS (Basic Linear Algebra Subprograms) 2018 Specification}, year = {2018}, month = {2018-07}, abstract = {This document describes an API for Batch Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). We focus on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The extensions beyond the original BLAS standard are considered that specify a programming interface not only for routines with uniformly-sized matrices and/or vectors but also for the situation where the sizes vary. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance manycore platforms. These include multicore and many-core CPU processors; GPUs and coprocessors; as well as other hardware accelerators with floating-point compute facility.}, author = {Jack Dongarra and Iain Duff and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jonathan Hogg and Pedro Valero Lara and Piotr Luszczek and Mawussi Zounon and Samuel D. 
Relton and Stanimire Tomov and Timothy Costa and Sarah Knepper} } @article {1211, title = {Big Data and Extreme-Scale Computing: Pathways to Convergence - Toward a Shaping Strategy for a Future Software and Data Ecosystem for Scientific Inquiry}, journal = {The International Journal of High Performance Computing Applications}, volume = {32}, year = {2018}, month = {2018-07}, pages = {435{\textendash}479}, abstract = {Over the past four years, the Big Data and Exascale Computing (BDEC) project organized a series of five international workshops that aimed to explore the ways in which the new forms of data-centric discovery introduced by the ongoing revolution in high-end data analysis (HDA) might be integrated with the established, simulation-centric paradigm of the high-performance computing (HPC) community. Based on those meetings, we argue that the rapid proliferation of digital data generators, the unprecedented growth in the volume and diversity of the data they generate, and the intense evolution of the methods for analyzing and using that data are radically reshaping the landscape of scientific computing. The most critical problems involve the logistics of wide-area, multistage workflows that will move back and forth across the computing continuum, between the multitude of distributed sensors, instruments and other devices at the networks edge, and the centralized resources of commercial clouds and HPC centers. We suggest that the prospects for the future integration of technological infrastructures and research ecosystems need to be considered at three different levels. First, we discuss the convergence of research applications and workflows that establish a research paradigm that combines both HPC and HDA, where ongoing progress is already motivating efforts at the other two levels. 
Second, we offer an account of some of the problems involved with creating a converged infrastructure for peripheral environments, that is, a shared infrastructure that can be deployed throughout the network in a scalable manner to meet the highly diverse requirements for processing, communication, and buffering/storage of massive data workflows of many different scientific domains. Third, we focus on some opportunities for software ecosystem convergence in big, logically centralized facilities that execute large-scale simulations and models and/or perform large-scale data analytics. We close by offering some conclusions and recommendations for future investment and policy review.}, doi = {https://doi.org/10.1177/1094342018778123}, author = {Mark Asch and Terry Moore and Rosa M. Badia and Micah Beck and Pete Beckman and Thierry Bidot and Fran{\c c}ois Bodin and Franck Cappello and Alok Choudhary and Bronis R. de Supinski and Ewa Deelman and Jack Dongarra and Anshu Dubey and Geoffrey Fox and Haohuan Fu and Sergi Girona and Michael Heroux and Yutaka Ishikawa and Kate Keahey and David Keyes and William T. Kramer and Jean-Fran{\c c}ois Lavignon and Yutong Lu and Satoshi Matsuoka and Bernd Mohr and St{\'e}phane Requena and Joel Saltz and Thomas Schulthess and Rick Stevens and Martin Swany and Alexander Szalay and William Tang and Ga{\"e}l Varoquaux and Jean-Pierre Vilotte and Robert W. 
Wisniewski and Zhiwei Xu and Igor Zacharov} } @conference {1197, title = {Budget-Aware Scheduling Algorithms for Scientific Workflows with Stochastic Task Weights on Heterogeneous IaaS Cloud Platforms}, booktitle = {2018 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2018}, month = {2018-05}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {This paper introduces several budget-aware algorithms to deploy scientific workflows on IaaS cloud platforms, where users can request Virtual Machines (VMs) of different types, each with specific cost and speed parameters. We use a realistic application/platform model with stochastic task weights, and VMs communicating through a datacenter. We extend two well-known algorithms, MinMin and HEFT, and make scheduling decisions based upon machine availability and available budget. During the mapping process, the budget-aware algorithms make conservative assumptions to avoid exceeding the initial budget; we further improve our results with refined versions that aim at re-scheduling some tasks onto faster VMs, thereby spending any budget fraction leftover by the first allocation. These refined variants are much more time-consuming than the former algorithms, so there is a trade-off to find in terms of scalability. We report an extensive set of simulations with workflows from the Pegasus benchmark suite. 
Most of the time our budget-aware algorithms succeed in achieving efficient makespans while enforcing the given budget, despite (i) the uncertainty in task weights and (ii) the heterogeneity of VMs in both cost and speed values.}, keywords = {budget aware algorithm, multi criteria scheduling, workflow}, doi = {10.1109/IPDPSW.2018.00014}, author = {Yves Caniou and Eddy Caron and Aur{\'e}lie Kong Win Chang and Yves Robert} } @article {1187, title = {Checkpointing Workflows for Fail-Stop Errors}, journal = {IEEE Transactions on Computers}, volume = {67}, year = {2018}, month = {2018-08}, pages = {1105{\textendash}1120}, abstract = {We consider the problem of orchestrating the execution of workflow applications structured as Directed Acyclic Graphs (DAGs) on parallel computing platforms that are subject to fail-stop failures. The objective is to minimize expected overall execution time, or makespan. A solution to this problem consists of a schedule of the workflow tasks on the available processors and of a decision of which application data to checkpoint to stable storage, so as to mitigate the impact of processor failures. To address this challenge, we consider a restricted class of graphs, Minimal Series-Parallel Graphs (M-SPGS), which is relevant to many real-world workflow applications. For this class of graphs, we propose a recursive list-scheduling algorithm that exploits the M-SPG structure to assign sub-graphs to individual processors, and uses dynamic programming to decide how to checkpoint these sub-graphs. We assess the performance of our algorithm for production workflow configurations, comparing it to an approach in which all application data is checkpointed and an approach in which no application data is checkpointed. 
Results demonstrate that our algorithm outperforms both the former approach, because of lower checkpointing overhead, and the latter approach, because of better resilience to failures.}, keywords = {checkpoint, fail-stop error, resilience, workflow}, url = {http://ieeexplore.ieee.org/document/8279499/}, author = {Li Han and Louis-Claude Canon and Henri Casanova and Yves Robert and Frederic Vivien} } @techreport {1397, title = {A Collection of White Papers from the BDEC2 Workshop in Bloomington, IN}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-15}, year = {2018}, month = {2018-11}, publisher = {University of Tennessee, Knoxville}, author = {James Ahrens and Christopher M. Biwer and Alexandru Costan and Gabriel Antoniu and Maria S. P{\'e}rez and Nenad Stojanovic and Rosa Badia and Oliver Beckstein and Geoffrey Fox and Shantenu Jha and Micah Beck and Terry Moore and Sunita Chandrasekaran and Carlos Costa and Thierry Deutsch and Luigi Genovese and Tarek El-Ghazawi and Ian Foster and Dennis Gannon and Toshihiro Hanawa and Tevfik Kosar and William Kramer and Madhav V. Marathe and Christopher L. Barrett and Takemasa Miyoshi and Alex Pothen and Ariful Azad and Judy Qiu and Bo Peng and Ravi Teja and Sahil Tyagi and Chathura Widanage and Jon Koskey and Maryam Rahnemoonfar and Umakishore Ramachandran and Miles Deegan and William Tang and Osamu Tatebe and Michela Taufer and Michel Cuende and Ewa Deelman and Trilce Estrada and Rafael Ferreira Da Silva and Harrel Weinstein and Rodrigo Vargas and Miwako Tsuji and Kevin G. 
Yager and Wanling Gao and Jianfeng Zhan and Lei Wang and Chunjie Luo and Daoyi Zheng and Xu Wen and Rui Ren and Chen Zheng and Xiwen He and Hainan Ye and Haoning Tang and Zheng Cao and Shujie Zhang and Jiahui Dai} } @article {1193, title = {Computing the Expected Makespan of Task Graphs in the Presence of Silent Errors}, journal = {Parallel Computing}, volume = {75}, year = {2018}, month = {2018-07}, pages = {41{\textendash}60}, abstract = {Applications structured as Directed Acyclic Graphs (DAGs) of tasks occur in many domains, including popular scientific workflows. DAG scheduling has thus received an enormous amount of attention. Many of the popular DAG scheduling heuristics make scheduling decisions based on path lengths. At large scale compute platforms are subject to various types of failures with non-negligible probabilities of occurrence. Failures that have recently received increased attention are {\textquotedblleft}silent errors,{\textquotedblright} which cause data corruption. Tolerating silent errors is done by checking the validity of computed results and possibly re-executing a task from scratch. The execution time of a task then becomes a random variable, and so do path lengths in a DAG. Unfortunately, computing the expected makespan of a DAG (and equivalently computing expected path lengths in a DAG) is a computationally difficult problem. Consequently, designing effective scheduling heuristics in this context is challenging. In this work, we propose an algorithm that computes a first order approximation of the expected makespan of a DAG when tasks are subject to silent errors. 
We find that our proposed approximation outperforms previously proposed approaches both in terms of approximation error and of speed.}, keywords = {Expected makespan, fault-tolerance, scheduling, Scientific workflows, silent errors, Task graphs}, doi = {https://doi.org/10.1016/j.parco.2018.03.004}, author = {Henri Casanova and Julien Herrmann and Yves Robert} } @article {1218, title = {Coping with Silent and Fail-Stop Errors at Scale by Combining Replication and Checkpointing}, journal = {Journal of Parallel and Distributed Computing}, volume = {122}, year = {2018}, month = {2018-12}, pages = {209{\textendash}225}, abstract = {This paper provides a model and an analytical study of replication as a technique to cope with silent errors, as well as a mixture of both silent and fail-stop errors on large-scale platforms. Compared with fail-stop errors that are immediately detected when they occur, silent errors require a detection mechanism. To detect silent errors, many application-specific techniques are available, either based on algorithms (e.g., ABFT), invariant preservation or data analytics, but replication remains the most transparent and least intrusive technique. We explore the right level (duplication, triplication or more) of replication for two frameworks: (i) when the platform is subject to only silent errors, and (ii) when the platform is subject to both silent and fail-stop errors. A higher level of replication is more expensive in terms of resource usage but enables to tolerate more errors and to even correct some errors, hence there is a trade-off to be found. Replication is combined with checkpointing and comes with two flavors: process replication and group replication. Process replication applies to message-passing applications with communicating processes. Each process is replicated, and the platform is composed of process pairs, or triplets. Group replication applies to black-box applications, whose parallel execution is replicated several times. 
The platform is partitioned into two halves (or three thirds). In both scenarios, results are compared before each checkpoint, which is taken only when both results (duplication) or two out of three results (triplication) coincide. Otherwise, one or more silent errors have been detected, and the application rolls back to the last checkpoint, as well as when fail-stop errors have struck. We provide a detailed analytical study for all of these scenarios, with formulas to decide, for each scenario, the optimal parameters as a function of the error rate, checkpoint cost, and platform size. We also report a set of extensive simulation results that nicely corroborates the analytical model.}, keywords = {checkpointing, fail-stop errors, Fault tolerance, High-performance computing, Replication, silent errors}, doi = {https://doi.org/10.1016/j.jpdc.2018.08.002}, author = {Anne Benoit and Aurelien Cavelan and Franck Cappello and Padma Raghavan and Yves Robert and Hongyang Sun} } @conference {1215, title = {A Generic Approach to Scheduling and Checkpointing Workflows}, booktitle = { The 47th International Conference on Parallel Processing (ICPP 2018)}, year = {2018}, month = {2018-08}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Eugene, OR}, abstract = {This work deals with scheduling and checkpointing strategies to execute scientific workflows on failure-prone large-scale platforms. To the best of our knowledge, this work is the first to target failstop errors for arbitrary workflows. Most previous work addresses soft errors, which corrupt the task being executed by a processor but do not cause the entire memory of that processor to be lost, contrarily to fail-stop errors. We revisit classical mapping heuristics such as HEFT and MinMin and complement them with several checkpointing strategies. 
The objective is to derive an efficient trade-off between checkpointing every task (CkptAll), which is an overkill when failures are rare events, and checkpointing no task (CkptNone), which induces dramatic re-execution overhead even when only a few failures strike during execution. Contrarily to previous work, our approach applies to arbitrary workflows, not just special classes of dependence graphs such as M-SPGs (Minimal Series-Parallel Graphs). Extensive experiments report significant gain over both CkptAll and CkptNone, for a wide variety of workflows.}, author = {Li Han and Valentin Le F{\`e}vre and Louis-Claude Canon and Yves Robert and Frederic Vivien} } @techreport {1207, title = {Initial Integration and Evaluation of SLATE Parallel BLAS in LATTE}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-07}, year = {2018}, month = {2018-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Asim YarKhan and Gerald Ragghianti and Jack Dongarra and Marc Cawkwell and Danny Perez and Arthur Voter} } @techreport {1273, title = {Least Squares Performance Report}, journal = {SLATE Working Notes}, number = {09, ICL-UT-18-10}, year = {2018}, month = {2018-12}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Mark Gates and Ali Charara and Jakub Kurzak and Asim YarKhan and Ichitaro Yamazaki and Jack Dongarra} } @techreport {1228, title = {Linear Systems Performance Report}, journal = {SLATE Working Notes}, number = {08, ICL-UT-18-08}, year = {2018}, month = {2018-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Ali Charara and Asim YarKhan and Jamie Finney and Gerald Ragghianti and Piotr Luszczek and Jack Dongarra} } @article {1239, title = {Multi-Level Checkpointing and Silent Error Detection for Linear Workflows}, journal = 
{Journal of Computational Science}, volume = {28}, year = {2018}, month = {2018-09}, pages = {398{\textendash}415}, abstract = {We focus on High Performance Computing (HPC) workflows whose dependency graph forms a linear chain, and we extend single-level checkpointing in two important directions. Our first contribution targets silent errors, and combines in-memory checkpoints with both partial and guaranteed verifications. Our second contribution deals with multi-level checkpointing for fail-stop errors. We present sophisticated dynamic programming algorithms that return the optimal solution for each problem in polynomial time. We also show how to combine all these techniques and solve the problem with both fail-stop and silent errors. Simulation results demonstrate that these extensions lead to significantly improved performance compared to the standard single-level checkpointing algorithm.}, author = {Anne Benoit and Aurelien Cavelan and Yves Robert and Hongyang Sun} } @article {1190, title = {ParILUT - A New Parallel Threshold ILU}, journal = {SIAM Journal on Scientific Computing}, volume = {40}, year = {2018}, month = {2018-07}, pages = {C503{\textendash}C519}, publisher = {SIAM}, abstract = {We propose a parallel algorithm for computing a threshold incomplete LU (ILU) factorization. The main idea is to interleave a parallel fixed-point iteration that approximates an incomplete factorization for a given sparsity pattern with a procedure that adjusts the pattern. We describe and test a strategy for identifying nonzeros to be added and nonzeros to be removed from the sparsity pattern. The resulting pattern may be different and more effective than that of existing threshold ILU algorithms. 
Also in contrast to other parallel threshold ILU algorithms, much of the new algorithm has fine-grained parallelism.}, doi = {https://doi.org/10.1137/16M1079506}, author = {Hartwig Anzt and Edmond Chow and Jack Dongarra} } @article {1227, title = {PMIx: Process Management for Exascale Environments}, journal = {Parallel Computing}, volume = {79}, year = {2018}, month = {2018-01}, pages = {9{\textendash}29}, issn = {0167-8191}, doi = {10.1016/j.parco.2018.08.002}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0167819118302424}, author = {Ralph Castain and Joshua Hursey and Aurelien Bouteiller and David Solt} } @article {1221, title = {Using Jacobi Iterations and Blocking for Solving Sparse Triangular Systems in Incomplete Factorization Preconditioning}, journal = {Journal of Parallel and Distributed Computing}, volume = {119}, year = {2018}, month = {2018-11}, pages = {219{\textendash}230}, abstract = {When using incomplete factorization preconditioners with an iterative method to solve large sparse linear systems, each application of the preconditioner involves solving two sparse triangular systems. These triangular systems are challenging to solve efficiently on computers with high levels of concurrency. On such computers, it has recently been proposed to use Jacobi iterations, which are highly parallel, to approximately solve the triangular systems from incomplete factorizations. The effectiveness of this approach, however, is problem-dependent: the Jacobi iterations may not always converge quickly enough for all problems. Thus, as a necessary and important step to evaluate this approach, we experimentally test the approach on a large number of realistic symmetric positive definite problems. 
We also show that by using block Jacobi iterations, we can extend the range of problems for which such an approach can be effective. For block Jacobi iterations, it is essential for the blocking to be cognizant of the matrix structure.}, doi = {https://doi.org/10.1016/j.jpdc.2018.04.017}, author = {Edmond Chow and Hartwig Anzt and Jennifer Scott and Jack Dongarra} } @article {1176, title = {Argobots: A Lightweight Low-Level Threading and Tasking Framework}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {2017}, month = {2017-10}, abstract = {In the past few decades, a number of user-level threading and tasking models have been proposed in the literature to address the shortcomings of OS-level threads, primarily with respect to cost and flexibility. Current state-of-the-art user-level threading and tasking models, however, are either too specific to applications or architectures or are not as powerful or flexible. In this paper, we present Argobots, a lightweight, low-level threading and tasking framework that is designed as a portable and performant substrate for high-level programming models or runtime systems. Argobots offers a carefully designed execution model that balances generality of functionality with providing a rich set of controls to allow specialization by the user or high-level programming model. We describe the design, implementation, and optimization of Argobots and present integrations with three example high-level models: OpenMP, MPI, and co-located I/O service. 
Evaluations show that (1) Argobots outperforms existing generic threading runtimes; (2) our OpenMP runtime offers more efficient interoperability capabilities than production OpenMP runtimes do; (3) when MPI interoperates with Argobots instead of Pthreads, it enjoys reduced synchronization costs and better latency hiding capabilities; and (4) I/O service with Argobots reduces interference with co-located applications, achieving performance competitive with that of the Pthreads version.}, keywords = {Argobots, context switch, I/O, interoperability, lightweight, MPI, OpenMP, stackable scheduler, tasklet, user-level thread}, doi = {10.1109/TPDS.2017.2766062}, url = {http://ieeexplore.ieee.org/document/8082139/}, author = {Sangmin Seo and Abdelhalim Amer and Pavan Balaji and Cyril Bordage and George Bosilca and Alex Brooks and Philip Carns and Adrian Castello and Damien Genet and Thomas Herault and Shintaro Iwasaki and Prateek Jindal and Sanjay Kale and Sriram Krishnamoorthy and Jonathan Lifflander and Huiwei Lu and Esteban Meneses and Marc Snir and Yanhua Sun and Kenjiro Taura and Pete Beckman} } @techreport {1175, title = {C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {04, ICL-UT-17-12}, year = {2017}, month = {2017-12}, publisher = {University of Tennessee}, author = {Ahmad Abdelfattah and Konstantin Arturov and Cris Cecka and Jack Dongarra and Chip Freitag and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Panruo Wu} } @techreport {1081, title = {C++ API for BLAS and LAPACK}, journal = {SLATE Working Notes}, number = {02, ICL-UT-17-03}, year = {2017}, note = {Revision 02-21-2018}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Mark Gates and Piotr Luszczek and Ahmad Abdelfattah and Jakub Kurzak and Jack Dongarra and Konstantin Arturov and Cris Cecka and Chip Freitag} } @conference {1098, title = {Checkpointing Workflows for Fail-Stop Errors}, booktitle 
= {IEEE Cluster}, year = {2017}, month = {2017-09}, publisher = {IEEE}, organization = {IEEE}, address = {Honolulu, Hawaii}, abstract = {We consider the problem of orchestrating the execution of workflow applications structured as Directed Acyclic Graphs (DAGs) on parallel computing platforms that are subject to fail-stop failures. The objective is to minimize expected overall execution time, or makespan. A solution to this problem consists of a schedule of the workflow tasks on the available processors and of a decision of which application data to checkpoint to stable storage, so as to mitigate the impact of processor failures. For general DAGs this problem is hopelessly intractable. In fact, given a solution, computing its expected makespan is still a difficult problem. To address this challenge, we consider a restricted class of graphs, Minimal Series-Parallel Graphs (M-SPGS). It turns out that many real-world workflow applications are naturally structured as M-SPGS. For this class of graphs, we propose a recursive list-scheduling algorithm that exploits the M-SPG structure to assign sub-graphs to individual processors, and uses dynamic programming to decide which tasks in these sub-graphs should be checkpointed. Furthermore, it is possible to efficiently compute the expected makespan for the solution produced by this algorithm, using a first-order approximation of task weights and existing evaluation algorithms for 2-state probabilistic DAGs. We assess the performance of our algorithm for production workflow configurations, comparing it to (i) an approach in which all application data is checkpointed, which corresponds to the standard way in which most production workflows are executed today; and (ii) an approach in which no application data is checkpointed. 
Our results demonstrate that our algorithm strikes a good compromise between these two approaches, leading to lower checkpointing overhead than the former and to better resilience to failure than the latter.}, author = {Li Han and Louis-Claude Canon and Henri Casanova and Yves Robert and Frederic Vivien} } @article {1163, title = {Flexible Batched Sparse Matrix Vector Product on GPUs}, year = {2017}, month = {2017-11}, publisher = {ScalA{\textquoteright}17: 8th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, address = {Denver, Colorado}, author = {Hartwig Anzt and Collins, Gary and Jack Dongarra and Goran Flegar and Enrique S. Quintana-Orti} } @conference {, title = {Flexible Batched Sparse Matrix-Vector Product on GPUs}, booktitle = {8th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA {\textquoteright}17)}, year = {2017}, month = {2017-11}, publisher = {ACM Press}, organization = {ACM Press}, address = {Denver, CO}, abstract = { We propose a variety of batched routines for concurrently processing a large collection of small-size, independent sparse matrix-vector products (SpMV) on graphics processing units (GPUs). These batched SpMV kernels are designed to be flexible in order to handle a batch of matrices which differ in size, nonzero count, and nonzero distribution. Furthermore, they support three most commonly used sparse storage formats: CSR, COO and ELL. Our experimental results on a state-of-the-art GPU reveal performance improvements of up to 25X compared to non-batched SpMV routines.}, doi = {http://dx.doi.org/10.1145/3148226.3148230}, author = {Hartwig Anzt and Gary Collins and Jack Dongarra and Goran Flegar and Enrique S. 
Quintana-Orti} } @conference {1096, title = {Identifying the Right Replication Level to Detect and Correct Silent Errors at Scale}, booktitle = {2017 Workshop on Fault-Tolerance for HPC at Extreme Scale}, year = {2017}, month = {2017-06}, publisher = {ACM}, organization = {ACM}, address = {Washington, DC}, abstract = {This paper provides a model and an analytical study of replication as a technique to detect and correct silent errors. Although other detection techniques exist for HPC applications, based on algorithms (ABFT), invariant preservation or data analytics, replication remains the most transparent and least intrusive technique. We explore the right level (duplication, triplication or more) of replication needed to efficiently detect and correct silent errors. Replication is combined with checkpointing and comes with two flavors: process replication and group replication. Process replication applies to message-passing applications with communicating processes. Each process is replicated, and the platform is composed of process pairs, or triplets. Group replication applies to black-box applications, whose parallel execution is replicated several times. The platform is partitioned into two halves (or three thirds). In both scenarios, results are compared before each checkpoint, which is taken only when both results (duplication) or two out of three results (triplication) coincide. If not, one or more silent errors have been detected, and the application rolls back to the last checkpoint. We provide a detailed analytical study of both scenarios, with formulas to decide, for each scenario, the optimal parameters as a function of the error rate, checkpoint cost, and platform size. 
We also report a set of extensive simulation results that corroborates the analytical model.}, doi = {10.1145/3086157.3086162}, author = {Anne Benoit and Franck Cappello and Aurelien Cavelan and Yves Robert and Hongyang Sun} } @conference {1095, title = {Optimal Checkpointing Period with replicated execution on heterogeneous platforms}, booktitle = {2017 Workshop on Fault-Tolerance for HPC at Extreme Scale}, year = {2017}, month = {2017-06}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Washington, DC}, abstract = {In this paper, we design and analyze strategies to replicate the execution of an application on two different platforms subject to failures, using checkpointing on a shared stable storage. We derive the optimal pattern size~W for a periodic checkpointing strategy where both platforms concurrently try and execute W units of work before checkpointing. The first platform that completes its pattern takes a checkpoint, and the other platform interrupts its execution to synchronize from that checkpoint. We compare this strategy to a simpler on-failure checkpointing strategy, where a checkpoint is taken by one platform only whenever the other platform encounters a failure. We use first or second-order approximations to compute overheads and optimal pattern sizes, and show through extensive simulations that these models are very accurate. The simulations show the usefulness of a secondary platform to reduce execution time, even when the platforms have relatively different speeds: in average, over a wide range of scenarios, the overhead is reduced by 30\%. 
The simulations also demonstrate that the periodic checkpointing strategy is globally more efficient, unless platform speeds are quite close.}, doi = {10.1145/3086157.3086165}, author = {Anne Benoit and Aurelien Cavelan and Valentin Le F{\`e}vre and Yves Robert} } @inbook {1384, title = {Performance Analysis and Debugging Tools at Scale}, booktitle = {Exascale Scientific Applications: Scalability and Performance Portability}, year = {2017}, month = {2017-11}, pages = {17--50}, publisher = {Chapman \& Hall / CRC Press}, organization = {Chapman \& Hall / CRC Press}, chapter = {2}, abstract = {This chapter explores present-day challenges and those likely to arise as new hardware and software technologies are introduced on the path to exascale. It covers some of the underlying hardware, software, and techniques that enable tools and debuggers. Performance tools and debuggers are critical components that enable computational scientists to fully exploit the computing power of high-performance computing systems. Instrumentation is the insertion of code to perform measurement in a program. It is a vital step in performance analysis, especially for parallel programs. The essence of a debugging tool is enabling observation, exploration, and control of program state, such that a developer can, for example, verify that what is currently occurring correlates to what is intended. The increased complexity and volume of performance and debugging data likely to be seen on exascale systems risks overwhelming tool users. Tools and debuggers may need to develop advanced techniques such as automated filtering and analysis to reduce the complexity seen by the user.}, isbn = {9781315277400}, doi = {10.1201/b21930}, author = {Scott Parker and John Mellor-Crummey and Dong H. Ahn and Heike Jagode and Holger Brunst and Sameer Shende and Allen D. 
Malony and David DelSignore and Ronny Tschuter and Ralph Castain and Kevin Harms and Philip Carns and Ray Loy and Kalyan Kumaran} } @inproceedings {1307, title = {PMIx: Process Management for Exascale Environments}, journal = {Proceedings of the 24th European MPI Users{\textquoteright} Group Meeting}, year = {2017}, pages = {14:1{\textendash}14:10}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {High-Performance Computing (HPC) applications have historically executed in static resource allocations, using programming models that ran independently from the resident system management stack (SMS). Achieving exascale performance that is both cost-effective and fits within site-level environmental constraints will, however, require that the application and SMS collaboratively orchestrate the flow of work to optimize resource utilization and compensate for on-the-fly faults. The Process Management Interface - Exascale (PMIx) community is committed to establishing scalable workflow orchestration by defining an abstract set of interfaces by which not only applications and tools can interact with the resident SMS, but also the various SMS components can interact with each other. This paper presents a high-level overview of the goals and current state of the PMIx standard, and lays out a roadmap for future directions. }, isbn = {978-1-4503-4849-2}, doi = {10.1145/3127024.3127027}, url = {http://doi.acm.org/10.1145/3127024.3127027}, author = {Castain, Ralph H. 
and David Solt and Joshua Hursey and Aurelien Bouteiller} } @conference {1097, title = {Resilience for Stencil Computations with Latent Errors}, booktitle = {International Conference on Parallel Processing (ICPP)}, year = {2017}, month = {2017-08}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Bristol, UK}, abstract = {Projections and measurements of error rates in near-exascale and exascale systems suggest a dramatic growth, due to extreme scale ({$10^9$} cores), concurrency, software complexity, and deep submicron transistor scaling. Such a growth makes resilience a critical concern, and may increase the incidence of errors that "escape," silently corrupting application state. Such errors can often be revealed by application software tests but with long latencies, and thus are known as latent errors. We explore how to efficiently recover from latent errors, with an approach called application-based focused recovery (ABFR). Specifically we present a case study of stencil computations, a widely useful computational structure, showing how ABFR focuses recovery effort where needed, using intelligent testing and pruning to reduce recovery effort, and enables recovery effort to be overlapped with application computation. We analyze and characterize the ABFR approach on stencils, creating a performance model parameterized by error rate and detection interval (latency). We compare projections from the model to experimental results with the Chombo stencil application, validating the model and showing that ABFR on stencil can achieve significant reductions in error recovery cost (up to 400x) and recovery latency (up to 4x). 
Such reductions enable efficient execution at scale with high latent error rates.}, author = {Aiman Fang and Aurelien Cavelan and Yves Robert and Andrew Chien} } @article {1090, title = {Towards Optimal Multi-Level Checkpointing}, journal = {IEEE Transactions on Computers}, volume = {66}, year = {2017}, month = {2017-07}, pages = {1212{\textendash}1226}, keywords = {checkpointing, Dynamic programming, Error analysis, Heuristic algorithms, Optimized production technology, protocols, Shape}, doi = {10.1109/TC.2016.2643660}, author = {Anne Benoit and Aurelien Cavelan and Valentin Le F{\`e}vre and Yves Robert and Hongyang Sun} } @article {933, title = {Assessing General-purpose Algorithms to Cope with Fail-stop and Silent Errors}, journal = {ACM Transactions on Parallel Computing}, year = {2016}, month = {2016-08}, abstract = {In this paper, we combine the traditional checkpointing and rollback recovery strategies with verification mechanisms to cope with both fail-stop and silent errors. The objective is to minimize makespan and/or energy consumption. For divisible load applications, we use first-order approximations to find the optimal checkpointing period to minimize execution time, with an additional verification mechanism to detect silent errors before each checkpoint, hence extending the classical formula by Young and Daly for fail-stop errors only. We further extend the approach to include intermediate verifications, and to consider a bi-criteria problem involving both time and energy (linear combination of execution time and energy consumption). Then, we focus on application workflows whose dependence graph is a linear chain of tasks. Here, we determine the optimal checkpointing and verification locations, with or without intermediate verifications, for the bicriteria problem. 
Rather than using a single speed during the whole execution, we further introduce a new execution scenario, which allows for changing the execution speed via dynamic voltage and frequency scaling (DVFS). We determine in this scenario the optimal checkpointing and verification locations, as well as the optimal speed pairs. Finally, we conduct an extensive set of simulations to support the theoretical study, and to assess the performance of each algorithm, showing that the best overall performance is achieved under the most flexible scenario using intermediate verifications and different speeds.}, keywords = {checkpoint, fail-stop error, failure, HPC, resilience, silent data corruption, silent error, verification}, doi = {10.1145/2897189}, author = {Anne Benoit and Aurelien Cavelan and Yves Robert and Hongyang Sun} } @inproceedings {991, title = {Batched Generation of Incomplete Sparse Approximate Inverses on GPUs}, journal = {Proceedings of the 7th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, year = {2016}, month = {2016-11}, pages = {49{\textendash}56}, abstract = {Incomplete Sparse Approximate Inverses (ISAI) have recently been shown to be an attractive alternative to exact sparse triangular solves in the context of incomplete factorization preconditioning. In this paper we propose a batched GPU-kernel for the efficient generation of ISAI matrices. Utilizing only thread-local memory allows for computing the ISAI matrix with very small memory footprint. 
We demonstrate that this strategy is faster than the existing strategy for generating ISAI matrices, and use a large number of test matrices to assess the algorithm{\textquoteright}s efficiency in an iterative solver setting.}, isbn = {978-1-5090-5222-6}, doi = {10.1109/ScalA.2016.11}, author = {Hartwig Anzt and Edmond Chow and Thomas Huckle and Jack Dongarra} } @techreport {988, title = {On block-asynchronous execution on GPUs}, journal = {LAPACK Working Note}, number = {291}, year = {2016}, month = {2016-11}, abstract = {This paper experimentally investigates how GPUs execute instructions when used for general purpose computing (GPGPU). We use a light-weight realizing a vector operation to analyze which vector entries are updated subsequently, and identify regions where parallel execution can be expected. The results help us to understand how GPUs operate, and map this operation mode to the mathematical concept of asynchronism. In particular it helps to understand the effects that can occur when implementing a fixed-point method using in-place updates on GPU hardware.}, url = {http://www.netlib.org/lapack/lawnspdf/lawn291.pdf}, author = {Hartwig Anzt and Edmond Chow and Jack Dongarra} } @inproceedings {996, title = {Domain Overlap for Iterative Sparse Triangular Solves on GPUs}, journal = {Software for Exascale Computing - SPPEXA}, volume = {113}, year = {2016}, month = {2016-09}, pages = {527{\textendash}545}, publisher = {Springer International Publishing}, abstract = {Iterative methods for solving sparse triangular systems are an attractive alternative to exact forward and backward substitution if an approximation of the solution is acceptable. On modern hardware, performance benefits are available as iterative methods allow for better parallelization. In this paper, we investigate how block-iterative triangular solves can benefit from using overlap. 
Because the matrices are triangular, we use {\textquotedblleft}directed{\textquotedblright} overlap, depending on whether the matrix is upper or lower triangular. We enhance a GPU implementation of the block-asynchronous Jacobi method with directed overlap. For GPUs and other cases where the problem must be overdecomposed, i.e., more subdomains and threads than cores, there is a preference in processing or scheduling the subdomains in a specific order, following the dependencies specified by the sparse triangular matrix. For sparse triangular factors from incomplete factorizations, we demonstrate that moderate directed overlap with subdomain scheduling can improve convergence and time-to-solution.}, doi = {10.1007/978-3-319-40528-5_24}, author = {Hartwig Anzt and Edmond Chow and Daniel Szyld and Jack Dongarra}, editor = {Hans-Joachim Bungartz and Philipp Neumann and Wolfgang E. Nagel} } @conference {939, title = {Heterogeneous Streaming}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper introduces a new heterogeneous streaming library called hetero Streams (hStreams). We show how a simple FIFO streaming model can be applied to heterogeneous systems that include manycore coprocessors and multicore CPUs. This model supports concurrency across nodes, among tasks within a node, and between data transfers and computation. We give examples for different approaches, show how the implementation can be layered, analyze overheads among layers, and apply those models to parallelize applications using simple, intuitive interfaces. We compare the features and versatility of hStreams, OpenMP, CUDA Streams1 and OmpSs. 
We show how the use of hStreams makes it easier for scientists to identify tasks and easily expose concurrency among them, and how it enables tuning experts and runtime systems to tailor execution for different heterogeneous targets. Practical application examples are taken from the field of numerical linear algebra, commercial structural simulation software, and a seismic processing application.}, keywords = {plasma}, author = {Chris J. Newburn and Gaurav Bansal and Michael Wood and Luis Crivelli and Judit Planas and Alejandro Duran and Paulo Souza and Leonardo Borges and Piotr Luszczek and Stanimire Tomov and Jack Dongarra and Hartwig Anzt and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Ichitaro Yamazaki and Jesus Labarta} } @conference {930, title = {Optimal Resilience Patterns to Cope with Fail-stop and Silent Errors}, booktitle = {2016 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This work focuses on resilience techniques at extreme scale. Many papers deal with fail-stop errors. Many others deal with silent errors (or silent data corruptions). But very few papers deal with fail-stop and silent errors simultaneously. However, HPC applications will obviously have to cope with both error sources. This paper presents a unified framework and optimal algorithmic solutions to this double challenge. Silent errors are handled via verification mechanisms (either partially or fully accurate) and in-memory checkpoints. Fail-stop errors are processed via disk checkpoints. All verification and checkpoint types are combined into computational patterns. We provide a unified model, and a full characterization of the optimal pattern. Our results nicely extend several published solutions and demonstrate how to make use of different techniques to solve the double threat of fail-stop and silent errors. 
Extensive simulations based on real data confirm the accuracy of the model, and show that patterns that combine all resilience mechanisms are required to provide acceptable overheads.}, keywords = {fail-stop errors, multilevel checkpoint, optimal pattern, resilience, silent errors, verification}, doi = {10.1109/IPDPS.2016.39}, author = {Anne Benoit and Aurelien Cavelan and Yves Robert and Hongyang Sun} } @article {932, title = {Scheduling Computational Workflows on Failure-prone Platforms}, journal = {International Journal of Networking and Computing}, volume = {6}, number = {1}, year = {2016}, pages = {2-26}, abstract = {We study the scheduling of computational workflows on compute resources that experience exponentially distributed failures. When a failure occurs, rollback and recovery is used to resume the execution from the last checkpointed state. The scheduling problem is to minimize the expected execution time by deciding in which order to execute the tasks in the workflow and deciding for each task whether to checkpoint it or not after it completes. We give a polynomialtime optimal algorithm for fork DAGs (Directed Acyclic Graphs) and show that the problem is NP-complete with join DAGs. We also investigate the complexity of the simple case in which no task is checkpointed. Our main result is a polynomial-time algorithm to compute the expected execution time of a workflow, with a given task execution order and specified to-be-checkpointed tasks. Using this algorithm as a basis, we propose several heuristics for solving the scheduling problem. 
We evaluate these heuristics for representative workflow configurations.}, keywords = {checkpointing, fault-tolerance, reliability, scheduling, workflow}, issn = { 2185-2847}, author = {Guillaume Aupy and Anne Benoit and Henri Casanova and Yves Robert} } @article {995, title = {Updating Incomplete Factorization Preconditioners for Model Order Reduction}, journal = {Numerical Algorithms}, volume = {73}, number = {3}, year = {2016}, month = {2016-02}, pages = {611{\textendash}630}, abstract = {When solving a sequence of related linear systems by iterative methods, it is common to reuse the preconditioner for several systems, and then to recompute the preconditioner when the matrix has changed significantly. Rather than recomputing the preconditioner from scratch, it is potentially more efficient to update the previous preconditioner. Unfortunately, it is not always known how to update a preconditioner, for example, when the preconditioner is an incomplete factorization. A recently proposed iterative algorithm for computing incomplete factorizations, however, is able to exploit an initial guess, unlike existing algorithms for incomplete factorizations. By treating a previous factorization as an initial guess to this algorithm, an incomplete factorization may thus be updated. We use a sequence of problems from model order reduction. 
Experimental results using an optimized GPU implementation show that updating a previous factorization can be inexpensive and effective, making solving sequences of linear systems a potential niche problem for the iterative incomplete factorization algorithm.}, keywords = {key publication}, doi = {10.1007/s11075-016-0110-2}, author = {Hartwig Anzt and Edmond Chow and Jens Saak and Jack Dongarra} } @conference {865, title = {Asynchronous Iterative Algorithm for Computing Incomplete Factorizations on GPUs}, booktitle = {International Supercomputing Conference (ISC 2015)}, year = {2015}, month = {2015-07}, address = {Frankfurt, Germany}, author = {Edmond Chow and Hartwig Anzt and Jack Dongarra} } @conference {928, title = {Cholesky Across Accelerators}, booktitle = {17th IEEE International Conference on High Performance Computing and Communications (HPCC 2015)}, year = {2015}, month = {2015-08}, publisher = {IEEE}, organization = {IEEE}, address = {Elizabeth, NJ}, author = {Asim YarKhan and Azzam Haidar and Chongxiao Cao and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {840, title = {Design for a Soft Error Resilient Dynamic Task-based Runtime}, booktitle = {29th IEEE International Parallel \& Distributed Processing Symposium (IPDPS)}, year = {2015}, month = {2015-05}, publisher = {IEEE}, organization = {IEEE}, address = {Hyderabad, India}, abstract = {As the scale of modern computing systems grows, failures will happen more frequently. On the way to Exascale a generic, low-overhead, resilient extension becomes a desired aptitude of any programming paradigm. In this paper we explore three additions to a dynamic task-based runtime to build a generic framework providing soft error resilience to task-based programming paradigms. 
The first recovers the application by re-executing the minimum required sub-DAG, the second takes critical checkpoints of the data flowing between tasks to minimize the necessary re-execution, while the last one takes advantage of algorithmic properties to recover the data without re-execution. These mechanisms have been implemented in the PaRSEC task-based runtime framework. Experimental results validate our approach and quantify the overhead introduced by such mechanisms.}, author = {Chongxiao Cao and George Bosilca and Thomas Herault and Jack Dongarra} } @conference {887, title = {Flexible Linear Algebra Development and Scheduling with Cholesky Factorization}, booktitle = {17th IEEE International Conference on High Performance Computing and Communications}, year = {2015}, month = {2015-08}, address = {Newark, NJ}, abstract = {Modern high performance computing environments are composed of networks of compute nodes that often contain a variety of heterogeneous compute resources, such as multicore-CPUs, GPUs, and coprocessors. One challenge faced by domain scientists is how to efficiently use all these distributed, heterogeneous resources. In order to use the GPUs effectively, the workload parallelism needs to be much greater than the parallelism for a multicore-CPU. On the other hand, a Xeon Phi coprocessor will work most effectively with degree of parallelism between GPUs and multicore-CPUs. Additionally, effectively using distributed memory nodes brings out another level of complexity where the workload must be carefully partitioned over the nodes. In this work we are using a lightweight runtime environment to handle many of the complexities in such distributed, heterogeneous systems. The runtime environment uses task-superscalar concepts to enable the developer to write serial code while providing parallel execution. 
The task-programming model allows the developer to write resource-specialization code, so that each resource gets the appropriate sized workload-grain. Our task programming abstraction enables the developer to write a single algorithm that will execute efficiently across the distributed heterogeneous machine. We demonstrate the effectiveness of our approach with performance results for dense linear algebra applications, specifically the Cholesky factorization.}, author = {Azzam Haidar and Asim YarKhan and Chongxiao Cao and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {878, title = {Iterative Sparse Triangular Solves for Preconditioning}, booktitle = {EuroPar 2015}, year = {2015}, month = {2015-08}, publisher = {Springer Berlin}, organization = {Springer Berlin}, address = {Vienna, Austria}, abstract = {Sparse triangular solvers are typically parallelized using level scheduling techniques, but parallel efficiency is poor on high-throughput architectures like GPUs. We propose using an iterative approach for solving sparse triangular systems when an approximation is suitable. This approach will not work for all problems, but can be successful for sparse triangular matrices arising from incomplete factorizations, where an approximate solution is acceptable. We demonstrate the performance gains that this approach can have on GPUs in the context of solving sparse linear systems with a preconditioned Krylov subspace method. 
We also illustrate the effect of using asynchronous iterations.}, doi = {10.1007/978-3-662-48096-0_50}, url = {http://dx.doi.org/10.1007/978-3-662-48096-0_50}, author = {Hartwig Anzt and Edmond Chow and Jack Dongarra} } @conference {893, title = {Random-Order Alternating Schwarz for Sparse Triangular Solves}, booktitle = {2015 SIAM Conference on Applied Linear Algebra (SIAM LA)}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {Block-asynchronous Jacobi is an iteration method where a locally synchronous iteration is embedded in an asynchronous global iteration. The unknowns are partitioned into small subsets, and while the components within the same subset are iterated in Jacobi fashion, no update order in-between the subsets is enforced. The values of the non-local entries remain constant during the local iterations, which can result in slow inter-subset information propagation and slow convergence. Interpreting of the subsets as subdomains allows to transfer the concept of domain overlap typically enhancing the information propagation to block-asynchronous solvers. In this talk we explore the impact of overlapping domains to convergence and performance of block-asynchronous Jacobi iterations, and present results obtained by running this solver class on state-of-the-art HPC systems.}, author = {Hartwig Anzt and Edmond Chow and Daniel Szyld and Jack Dongarra} } @conference {836, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL }, booktitle = {International Workshop on OpenCL}, year = {2014}, month = {2014-05}, address = {Bristol University, England}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. 
Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance OpenCL BLAS, hardware and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {822, title = {Design for a Soft Error Resilient Dynamic Task-based Runtime}, journal = {ICL Technical Report}, number = {ICL-UT-14-04}, year = {2014}, month = {2014-11}, publisher = {University of Tennessee}, abstract = {Abstract{\textemdash}As the scale of modern computing systems grows, failures will happen more frequently. On the way to Exascale a generic, low-overhead, resilient extension becomes a desired aptitude of any programming paradigm. In this paper we explore three additions to a dynamic task-based runtime to build a generic framework providing soft error resilience to task-based programming paradigms. The first recovers the application by re-executing the minimum required sub-DAG, the second takes critical checkpoints of the data flowing between tasks to minimize the necessary re-execution, while the last one takes advantage of algorithmic properties to recover the data without re-execution. These mechanisms have been implemented in the PaRSEC task-based runtime framework. 
Experimental results validate our approach and quantify the overhead introduced by such mechanisms.}, author = {Chongxiao Cao and Thomas Herault and George Bosilca and Jack Dongarra} } @conference {828, title = {Performance and Portability with OpenCL for Throughput-Oriented HPC Workloads Across Accelerators, Coprocessors, and Multicore Processors}, booktitle = {5th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA {\textquoteright}14)}, year = {2014}, month = {2014-11}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, abstract = {Ever since accelerators and coprocessors became the mainstream hardware for throughput-oriented HPC workloads, various programming techniques have been proposed to increase productivity in terms of both the performance and ease-of-use. We evaluate these aspects of OpenCL on a number of hardware platforms for an important subset of dense linear algebra operations that are relevant to a wide range of scientific applications. Our findings indicate that OpenCL portability has improved since our previous publication and many new and surprising usage scenarios are possible that rival those available after decades of software development on the CPUs. 
The combined performance-portability metric, even though not promised by the OpenCL standard, reflects the need for tuning performance-critical operations during the porting process and we show how a large portion of the available efficiency is lost if the tuning is not done correctly.}, doi = {10.1109/ScalA.2014.8}, author = {Azzam Haidar and Chongxiao Cao and Ichitaro Yamazaki and Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @conference {863, title = {Task-Based Programming for Seismic Imaging: Preliminary Results}, booktitle = {2014 IEEE International Conference on High Performance Computing and Communications (HPCC)}, year = {2014}, month = {2014-08}, publisher = {IEEE}, organization = {IEEE}, address = {Paris, France}, abstract = {The level of hardware complexity of current supercomputers is forcing the High Performance Computing (HPC) community to reconsider parallel programming paradigms and standards. The high-level of hardware abstraction provided by task-based paradigms make them excellent candidates for writing portable codes that can consistently deliver high performance across a wide range of platforms. While this paradigm has proved efficient for achieving such goals for dense and sparse linear solvers, it is yet to be demonstrated that industrial parallel codes{\textemdash}relying on the classical Message Passing Interface (MPI) standard and that accumulate dozens of years of expertise (and countless lines of code){\textemdash}may be revisited to turn them into efficient task-based programs. In this paper, we study the applicability of task-based programming in the case of a Reverse Time Migration (RTM) application for Seismic Imaging. The initial MPI-based application is turned into a task-based code executed on top of the PaRSEC runtime system. 
Preliminary results show that the approach is competitive with (and even potentially superior to) the original MPI code on a homogeneous multicore node, and can more efficiently exploit complex hardware such as a cache coherent Non Uniform Memory Access (ccNUMA) node or an Intel Xeon Phi accelerator.}, keywords = {plasma}, author = {Lionel Boillot and George Bosilca and Emmanuel Agullo and Henri Calandra} } @conference {809, title = {Unified Development for Mixed Multi-GPU and Multi-Coprocessor Environments using a Lightweight Runtime Environment}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Many of the heterogeneous resources available to modern computers are designed for different workloads. In order to efficiently use GPU resources, the workload must have a greater degree of parallelism than a workload designed for multicore-CPUs. And conceptually, the Intel Xeon Phi coprocessors are capable of handling workloads somewhere in between the two. This multitude of applicable workloads will likely lead to mixing multicore-CPUs, GPUs, and Intel coprocessors in multi-user environments that must offer adequate computing facilities for a wide range of workloads. In this work, we are using a lightweight runtime environment to manage the resourcespecific workload, and to control the dataflow and parallel execution in two-way hybrid systems. The lightweight runtime environment uses task superscalar concepts to enable the developer to write serial code while providing parallel execution. In addition, our task abstractions enable unified algorithmic development across all the heterogeneous resources. 
We provide performance results for dense linear algebra applications, demonstrating the effectiveness of our approach and full utilization of a wide variety of accelerator hardware.}, keywords = {algorithms, Computer science, CUDA, Heterogeneous systems, Intel Xeon Phi, linear algebra, nVidia, Tesla K20, Tesla M2090}, author = {Azzam Haidar and Chongxiao Cao and Jack Dongarra and Piotr Luszczek and Stanimire Tomov} } @article {826, title = {Unveiling the Performance-energy Trade-off in Iterative Linear System Solvers for Multithreaded Processors}, journal = {Concurrency and Computation: Practice and Experience}, volume = {27}, year = {2014}, month = {2014-09}, pages = {885-904}, chapter = {885}, abstract = {In this paper, we analyze the interactions occurring in the triangle performance-power-energy for the execution of a pivotal numerical algorithm, the iterative conjugate gradient (CG) method, on a diverse collection of parallel multithreaded architectures. This analysis is especially timely in a decade where the power wall has arisen as a major obstacle to build faster processors. Moreover, the CG method has recently been proposed as a complement to the LINPACK benchmark, as this iterative method is argued to be more archetypical of the performance of today{\textquoteright}s scientific and engineering applications. To gain insights about the benefits of hands-on optimizations we include runtime and energy efficiency results for both out-of-the-box usage relying exclusively on compiler optimizations, and implementations manually optimized for target architectures, that range from general-purpose and digital signal multicore processors to manycore graphics processing units, all representative of current multithreaded systems.}, keywords = {CG, CPUs, energy efficiency, GPUs, low-power architectures}, doi = {10.1002/cpe.3341}, url = {http://dx.doi.org/10.1002/cpe.3341}, author = {Jos{\'e} I. Aliaga and Hartwig Anzt and Maribel Castillo and Juan C. 
Fern{\'a}ndez and Germ{\'a}n Le{\'o}n and Joaqu{\'\i}n P{\'e}rez and Enrique S. Quintana-Orti} } @techreport {681, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL}, journal = {University of Tennessee Technical Report (Lawn 275)}, number = {UT-CS-13-706}, year = {2013}, month = {2013-03}, publisher = {University of Tennessee}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance OpenCL BLAS, hardware and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. 
Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {icl:733, title = {Multi-criteria checkpointing strategies: optimizing response-time versus resource utilization}, journal = {University of Tennessee Computer Science Technical Report}, number = {ICL-UT-13-01}, year = {2013}, month = {2013-02}, abstract = {Failures are increasingly threatening the efficiency of HPC systems, and current projections of Exascale platforms indicate that rollback recovery, the most convenient method for providing fault tolerance to general-purpose applications, reaches its own limits at such scales. One of the reasons explaining this unnerving situation comes from the focus that has been given to per-application completion time, rather than to platform efficiency. In this paper, we discuss the case of uncoordinated rollback recovery where the idle time spent waiting recovering processors is used to progress a different, independent application from the system batch queue. We then propose an extended model of uncoordinated checkpointing that can discriminate between idle time and wasted computation. 
We instantiate this model in a simulator to demonstrate that, with this strategy, uncoordinated checkpointing per application completion time is unchanged, while it delivers near-perfect platform efficiency.}, author = {Aurelien Bouteiller and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert} } @conference {868, title = {Multi-criteria Checkpointing Strategies: Response-Time versus Resource Utilization}, booktitle = {Euro-Par 2013}, year = {2013}, month = {2013-08}, publisher = {Springer}, organization = {Springer}, address = {Aachen, Germany}, abstract = {Failures are increasingly threatening the efficiency of HPC systems, and current projections of Exascale platforms indicate that rollback recovery, the most convenient method for providing fault tolerance to general-purpose applications, reaches its own limits at such scales. One of the reasons explaining this unnerving situation comes from the focus that has been given to per-application completion time, rather than to platform efficiency. In this paper, we discuss the case of uncoordinated rollback recovery where the idle time spent waiting recovering processors is used to progress a different, independent application from the system batch queue. We then propose an extended model of uncoordinated checkpointing that can discriminate between idle time and wasted computation. 
We instantiate this model in a simulator to demonstrate that, with this strategy, uncoordinated checkpointing per application completion time is unchanged, while it delivers near-perfect platform efficiency.}, author = {Aurelien Bouteiller and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert} } @article {748, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {Concurrency and Computation: Practice and Experience}, year = {2013}, month = {2013-11}, abstract = {In this paper, we present a unified model for several well-known checkpoint/restart protocols. The proposed model is generic enough to encompass both extremes of the checkpoint/restart space, from coordinated approaches to a variety of uncoordinated checkpoint strategies (with message logging). We identify a set of crucial parameters, instantiate them, and compare the expected efficiency of the fault tolerant protocols, for a given application/platform pair. We then propose a detailed analysis of several scenarios, including some of the most powerful currently available high performance computing platforms, as well as anticipated Exascale designs. The results of this analytical comparison are corroborated by a comprehensive set of simulations. 
Altogether, they outline comparative behaviors of checkpoint strategies at very large scale, thereby providing insight that is hardly accessible to direct experimentation.}, doi = {10.1002/cpe.3173}, author = {George Bosilca and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @article {icl:730, title = {Matrices Over Runtime Systems at Exascale}, journal = {Supercomputing {\textquoteright}12 (poster)}, year = {2012}, month = {2012-11}, address = {Salt Lake City, Utah}, author = {Emmanuel Agullo and George Bosilca and Cedric Castagn{\`e}de and Jack Dongarra and Hatem Ltaeif and Stanimire Tomov} } @techreport {icl:716, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 269)}, number = {UT-CS-12-697}, year = {2012}, month = {2012-06}, author = {George Bosilca and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @inproceedings {icl:736, title = {User Level Failure Mitigation in MPI}, journal = {Euro-Par 2012: Parallel Processing Workshops}, volume = {7640}, year = {2012}, month = {2012-08}, pages = {499-504}, publisher = {Springer Berlin Heidelberg}, address = {Rhodes Island, Greece}, keywords = {ftmpi}, author = {Wesley Bland}, editor = {Ioannis Caragiannis and Michael Alexander and Rosa M. Badia and Mario Cannataro and Alexandru Costan and Marco Danelutto and Frederic Desprez and Bettina Krammer and Sahuquillo, J. and Stephen L. Scott and J. 
Weidendorfer} } @article {icl:623, title = {Energy and performance characteristics of different parallel implementations of scientific applications on multicore systems}, journal = {International Journal of High Performance Computing Applications}, volume = {25}, number = {3}, year = {2011}, month = {2011-00}, pages = {342-350}, keywords = {mumi}, author = {Charles Lively and Xingfu Wu and Valerie Taylor and Shirley Moore and Hung-Ching Chang and Kirk Cameron} } @article {icl:646, title = {Impact of Kernel-Assisted MPI Communication over Scientific Applications: CPMD and FFTW}, journal = {18th EuroMPI}, year = {2011}, month = {2011-09}, pages = {247-254}, publisher = {Springer}, address = {Santorini, Greece}, keywords = {dague}, author = {Teng Ma and Aurelien Bouteiller and George Bosilca and Jack Dongarra}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @article {icl:643, title = {The International Exascale Software Project Roadmap}, journal = {International Journal of High Performance Computing}, volume = {25}, number = {1}, year = {2011}, month = {2011-01}, pages = {3-60}, abstract = {Over the last 20 years, the open-source community has provided more and more software on which the world{\textquoteright}s high-performance computing systems depend for performance and productivity. The community has invested millions of dollars and years of effort to build key components. However, although the investments in these separate software elements have been tremendously valuable, a great deal of productivity has also been lost because of the lack of planning, coordination, and key integration of technologies necessary to make them work together smoothly and efficiently, both within individual petascale systems and between different systems. 
It seems clear that this completely uncoordinated development model will not provide the software needed to support the unprecedented parallelism required for peta/exascale computation on millions of cores, or the flexibility required to exploit new hardware models and features, such as transactional memory, speculative execution, and graphics processing units. This report describes the work of the community to prepare for the challenges of exascale computing, ultimately combining their efforts in a coordinated International Exascale Software Project.}, doi = {10.1177/1094342010391989}, author = {Jack Dongarra and Pete Beckman and Terry Moore and Patrick Aerts and Giovanni Aloisio and Jean-Claude Andre and David Barkai and Jean-Yves Berthou and Taisuke Boku and Bertrand Braunschweig and Franck Cappello and Barbara Chapman and Xuebin Chi and Alok Choudhary and Sudip Dosanjh and Thom Dunning and Sandro Fiore and Al Geist and Bill Gropp and Robert Harrison and Mark Hereld and Michael Heroux and Adolfy Hoisie and Koh Hotta and Zhong Jin and Yutaka Ishikawa and Fred Johnson and Sanjay Kale and Richard Kenway and David Keyes and Bill Kramer and Jesus Labarta and Alain Lichnewsky and Thomas Lippert and Bob Lucas and Barney MacCabe and Satoshi Matsuoka and Paul Messina and Peter Michielse and Bernd Mohr and Matthias S. Mueller and Wolfgang E. Nagel and Hiroshi Nakashima and Michael E. 
Papka and Dan Reed and Mitsuhisa Sato and Ed Seidel and John Shalf and David Skinner and Marc Snir and Thomas Sterling and Rick Stevens and Fred Streitz and Bob Sugar and Shinji Sumimoto and William Tang and John Taylor and Rajeev Thakur and Anne Trefethen and Mateo Valero and Aad van der Steen and Jeffrey Vetter and Peg Williams and Robert Wisniewski and Kathy Yelick} } @article {icl:647, title = {OMPIO: A Modular Software Architecture for MPI I/O}, journal = {18th EuroMPI}, year = {2011}, month = {2011-09}, pages = {81-89}, publisher = {Springer}, address = {Santorini, Greece}, author = {Mohamad Chaarawi and Edgar Gabriel and Rainer Keller and Richard L. Graham and George Bosilca and Jack Dongarra}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @inproceedings {icl:659, title = {An open-source tool-chain for performance analysis}, journal = {Parallel Tools Workshop}, year = {2011}, month = {2011-09}, address = {Dresden, Germany}, author = {Kevin Coulomb and Augustin Degomme and Mathieu Faverge and Francois Trahay} } @inproceedings {icl:619, title = {Power-Aware Prediction Models of Hybrid (MPI/OpenMP) Scientific Applications}, journal = {International Conference on Energy-Aware High Performance Computing (EnA-HPC 2011)}, year = {2011}, month = {2011-09}, address = {Hamburg, Germany}, keywords = {mumi}, author = {Charles Lively and Xingfu Wu and Valerie Taylor and Shirley Moore and Hung-Ching Chang and Chun-Yi Su and Kirk Cameron} } @article {icl:677, title = {QCG-OMPI: MPI Applications on Grids.}, journal = {Future Generation Computer Systems}, volume = {27}, number = {4}, year = {2011}, month = {2011-01}, pages = {357-369}, author = {Emmanuel Agullo and Camille Coti and Thomas Herault and Julien Langou and Sylvain Peyronnet and A. 
Rezmerita and Franck Cappello and Jack Dongarra} } @inproceedings {icl:674, title = {Scalable Runtime for MPI: Efficiently Building the Communication Infrastructure}, journal = {Proceedings of Recent Advances in the Message Passing Interface - 18th European MPI Users{\textquoteright} Group Meeting, EuroMPI 2011}, volume = {6960}, year = {2011}, month = {2011-09}, pages = {342-344}, publisher = {Springer}, address = {Santorini, Greece}, keywords = {ftmpi}, author = {George Bosilca and Thomas Herault and Pierre Lemariner and Jack Dongarra and A. Rezmerita}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @article {icl:555, title = {Constructing Resiliant Communication Infrastructure for Runtime Environments in Advances in Parallel Computing}, journal = {Advances in Parallel Computing - Parallel Computing: From Multicores and GPU{\textquoteright}s to Petascale}, volume = {19}, year = {2010}, pages = {441-451}, doi = {10.3233/978-1-60750-530-3-441}, author = {George Bosilca and Camille Coti and Thomas Herault and Pierre Lemariner and Jack Dongarra}, editor = {Barbara Chapman and Frederic Desprez and Gerhard R. Joubert and Alain Lichnewsky and Frans Peters and T. Priol} } @article {icl:574, title = {QCG-OMPI: MPI Applications on Grids}, journal = {Future Generation Computer Systems}, volume = {27}, number = {4}, year = {2010}, month = {2010-03}, pages = {357-369}, author = {Emmanuel Agullo and Camille Coti and Thomas Herault and Julien Langou and Sylvain Peyronnet and A. 
Rezmerita and Franck Cappello and Jack Dongarra} } @inproceedings {icl:532, title = {QR Factorization of Tall and Skinny Matrices in a Grid Computing Environment}, journal = {24th IEEE International Parallel and Distributed Processing Symposium (also LAWN 224)}, year = {2010}, month = {2010-04}, address = {Atlanta, GA}, author = {Emmanuel Agullo and Camille Coti and Jack Dongarra and Thomas Herault and Julien Langou} } @techreport {icl:484, title = {Constructing resiliant communication infrastructure for runtime environments}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-09-02}, year = {2009}, month = {2009-07}, author = {George Bosilca and Camille Coti and Thomas Herault and Pierre Lemariner and Jack Dongarra} } @article {icl:517, title = {Constructing Resilient Communication Infrastructure for Runtime Environments}, journal = {ParCo 2009}, year = {2009}, month = {2009-09}, address = {Lyon France}, author = {Pierre Lemariner and George Bosilca and Camille Coti and Thomas Herault and Jack Dongarra} } @inproceedings {icl:520, title = {Grid Computing applied to the Boundary Element Method}, journal = {Proceedings of the First International Conference on Parallel, Distributed and Grid Computing for Engineering}, volume = {27}, number = {:104203/9027}, year = {2009}, month = {2009-00}, publisher = {Civil-Comp Press}, address = {Stirlingshire, UK}, keywords = {netsolve}, author = {Manoel Cunha and Jose Telles and Asim YarKhan and Jack Dongarra}, editor = {B. H. V. 
Topping and Peter Iv{\'a}nyi} } @article {, title = {Highly Scalable Self-Healing Algorithms for High Performance Scientific Computing}, journal = {IEEE Transactions on Computers}, volume = {58}, year = {2009}, month = {2009-11}, pages = {1512-1524}, abstract = {As the number of processors in today{\textquoteright}s high-performance computers continues to grow, the mean-time-to-failure of these computers is becoming significantly shorter than the execution time of many current high-performance computing applications. Although today{\textquoteright}s architectures are usually robust enough to survive node failures without suffering complete system failure, most of today{\textquoteright}s high-performance computing applications cannot survive node failures. Therefore, whenever a node fails, all surviving processes on surviving nodes usually have to be aborted and the whole application has to be restarted. In this paper, we present a framework for building self-healing high-performance numerical computing applications so that they can adapt to node or link failures without aborting themselves. The framework is based on FT-MPI and diskless checkpointing. Our diskless checkpointing uses weighted checksum schemes, a variation of Reed-Solomon erasure codes over floating-point numbers. We introduce several scalable encoding strategies into the existing diskless checkpointing and reduce the overhead to survive k failures in p processes from 2[log p]. k ((beta + 2gamma) m + alpha) to (1 + O (radic(p)/radic(m))) 2 . k (beta + 2gamma)m, where alpha is the communication latency, 1/beta is the network bandwidth between processes, {1\over \gamma } is the rate to perform calculations, and m is the size of local checkpoint per process. When additional checkpoint processors are used, the overhead can be reduced to (1 + O (1/radic(m))). k (beta + 2gamma)m, which is independent of the total number of computational processors. 
The introduced self-healing algorithms are scalable in the sense that the overhead to survive k failures in p processes does not increase as the number of processes p increases. We evaluate the performance overhead of our self-healing approach by using a preconditioned conjugate gradient equation solver as an example.}, doi = {https://doi.org/10.1109/TC.2009.42}, author = {Zizhong Chen and Jack Dongarra} } @article {icl:481, title = {The International Exascale Software Project: A Call to Cooperative Action by the Global High Performance Community}, journal = {International Journal of High Performance Computing Applications (to appear)}, year = {2009}, month = {2009-07}, author = {Jack Dongarra and Pete Beckman and Patrick Aerts and Franck Cappello and Thomas Lippert and Satoshi Matsuoka and Paul Messina and Terry Moore and Rick Stevens and Anne Trefethen and Mateo Valero} } @inproceedings {icl:499, title = {Making Performance Analysis and Tuning Part of the Software Development Cycle}, journal = {Proceedings of DoD HPCMP UGC 2009}, year = {2009}, month = {2009-06}, publisher = {IEEE}, address = {San Diego, CA}, author = {Ricardo Portillo and Patricia J. Teller and David Cronk and Shirley Moore} } @inproceedings {icl:602, title = {Modeling the Office of Science Ten Year Facilities Plan: The PERI Architecture Tiger Team}, journal = {SciDAC 2009, Journal of Physics: Conference Series}, volume = {180(2009)012039}, year = {2009}, month = {2009-07}, publisher = {IOP Publishing}, address = {San Diego, California}, keywords = {test}, author = {Bronis R. de Supinski and Sadaf Alam and David Bailey and Laura Carrington and Chris Daley and Anshu Dubey and Todd Gamblin and Dan Gunter and Paul D. Hovland and Heike Jagode and Karen Karavanic and Gabriel Marin and John Mellor-Crummey and Shirley Moore and Boyana Norris and Leonid Oliker and Catherine Olschanowsky and Philip C. 
Roth and Martin Schulz and Sameer Shende and Allan Snavely} } @inproceedings {icl:503, title = {MPI-aware Compiler Optimizations for Improving Communication-Computation Overlap}, journal = {Proceedings of the 23rd annual International Conference on Supercomputing (ICS {\textquoteright}09)}, year = {2009}, month = {2009-06}, pages = {316-325}, publisher = {ACM}, address = {Yorktown Heights, NY, USA}, author = {Anthony Danalis and Lori Pollock and Martin Swany and John Cavazos} } @article {icl:505, title = {Parallel Dense Linear Algebra Software in the Multicore Era}, journal = {in Cyberinfrastructure Technologies and Applications}, year = {2009}, month = {2009-00}, pages = {9-24}, publisher = {Nova Science Publishers, Inc.}, keywords = {plasma}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou}, editor = {Junwei Cao} } @article {icl:437, title = {Algorithm-Based Fault Tolerance for Fail-Stop Failures}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {19}, number = {12}, year = {2008}, month = {2008-01}, keywords = {FT-MPI, lapack, scalapack}, author = {Zizhong Chen and Jack Dongarra} } @article {icl:451, title = {DARPA{\textquoteright}s HPCS Program: History, Models, Tools, Languages}, journal = {in Advances in Computers}, volume = {72}, year = {2008}, month = {2008-01}, publisher = {Elsevier}, author = {Jack Dongarra and Robert Graybill and William Harrod and Robert Lucas and Ewing Lusk and Piotr Luszczek and Janice McMahon and Allan Snavely and Jeffrey Vetter and Katherine Yelick and Sadaf Alam and Roy Campbell and Laura Carrington and Tzu-Yi Chen and Omid Khalili and Jeremy Meredith and Mustafa Tikir}, editor = {M. 
Zelkowitz} } @article {icl:409, title = {High Performance GridRPC Middleware}, journal = {Recent developments in Grid Technology and Applications}, year = {2008}, month = {2008-00}, publisher = {Nova Science Publishers}, keywords = {netsolve}, author = {Yves Caniou and Eddy Caron and Frederic Desprez and Hidemoto Nakada and Yoshio Tanaka and Keith Seymour}, editor = {George A. Gravvanis and John P. Morrison and Hamid R. Arabnia and D. A. Power} } @inproceedings {icl:416, title = {Interior State Computation of Nano Structures}, journal = {PARA 2008, 9th International Workshop on State-of-the-Art in Scientific and Parallel Computing}, year = {2008}, month = {2008-05}, address = {Trondheim, Norway}, author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang} } @article {icl:417, title = {Performance Instrumentation and Compiler Optimizations for MPI/OpenMP Applications}, journal = {Lecture Notes in Computer Science, OpenMP Shared Memory Parallel Programming}, volume = {4315}, year = {2008}, month = {2008-00}, publisher = {Springer Berlin / Heidelberg}, author = {Oscar Hernandez and Fengguang Song and Barbara Chapman and Jack Dongarra and Bernd Mohr and Shirley Moore and Felix Wolf} } @article {icl:462, title = {PERI Auto-tuning}, journal = {Proc. SciDAC 2008}, volume = {125}, year = {2008}, month = {2008-01}, publisher = {Journal of Physics}, address = {Seatlle, Washington}, keywords = {gco}, author = {David Bailey and Jacqueline Chame and Chun Chen and Jack Dongarra and Mary Hall and Jeffrey K. Hollingsworth and Paul D. 
Hovland and Shirley Moore and Keith Seymour and Jaewook Shin and Ananta Tiwari and Sam Williams and Haihang You} } @article {icl:447, title = {State-of-the-Art Eigensolvers for Electronic Structure Calculations of Large Scale Nano-Systems}, journal = {Journal of Computational Physics}, volume = {227}, number = {15}, year = {2008}, month = {2008-01}, pages = {7113-7124}, author = {Christof Voemel and Stanimire Tomov and Osni Marques and Andrew Canning and Lin-Wang Wang and Jack Dongarra} } @article {icl:366, title = {Disaster Survival Guide in Petascale Computing: An Algorithmic Approach}, journal = {in Petascale Computing: Algorithms and Applications (to appear)}, year = {2007}, month = {2007-00}, publisher = {Chapman \& Hall - CRC Press}, author = {Jack Dongarra and Zizhong Chen and George Bosilca and Julien Langou} } @inproceedings {icl:390, title = {Memory Leak Detection in Fortran Applications using TAU}, journal = {Proc. DoD HPCMP Users Group Conference (HPCMP-UGC{\textquoteright}07)}, year = {2007}, month = {2007-01}, publisher = {IEEE Computer Society}, address = {Pittsburgh, PA}, author = {Sameer Shende and Allen D. 
Malony and Shirley Moore and David Cronk} } @article {icl:397, title = {Recovery Patterns for Iterative Methods in a Parallel Unstable Environment}, journal = {SIAM SISC (to appear)}, year = {2007}, month = {2007-05}, author = {Julien Langou and Zizhong Chen and George Bosilca and Jack Dongarra} } @inproceedings {icl:393, title = {Self Adapting Application Level Fault Tolerance for Parallel and Distributed Computing}, journal = {Proceedings of Workshop on Self Adapting Application Level Fault Tolerance for Parallel and Distributed Computing at IPDPS}, year = {2007}, month = {2007-03}, pages = {1-8}, author = {Zizhong Chen and Ming Yang and Guillermo Francia III and Jack Dongarra} } @inproceedings {icl:331, title = {Algorithm-Based Checkpoint-Free Fault Tolerance for Parallel Matrix Computations on Volatile Resources}, journal = {IPDPS 2006, 20th IEEE International Parallel and Distributed Processing Symposium}, year = {2006}, month = {2006-01}, address = {Rhodes Island, Greece}, author = {Zizhong Chen and Jack Dongarra} } @article {icl:402, title = {Conjugate-Gradient Eigenvalue Solvers in Computing Electronic Properties of Nanostructure Architectures}, journal = {International Journal of Computational Science and Engineering}, volume = {2}, number = {3/4}, year = {2006}, month = {2006-00}, pages = {205-212}, author = {Stanimire Tomov and Julien Langou and Jack Dongarra and Andrew Canning and Lin-Wang Wang} } @article {icl:652, title = {A High-Performance, Heterogeneous MPI}, journal = {HeteroPar 2006}, year = {2006}, month = {2006-09}, address = {Barcelona, Spain}, author = {Richard L. Graham and Galen M. 
Shipman and Brian Barrett and Ralph Castain and George Bosilca and Andrew Lumsdaine} } @inproceedings {icl:325, title = {Performance evaluation of eigensolvers in nano-structure computations}, journal = {IEEE/ACM Proceedings of HPCNano SC06 (to appear)}, year = {2006}, month = {2006-01}, keywords = {doe-nano}, author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang} } @inproceedings {icl:319, title = {Performance Instrumentation and Compiler Optimizations for MPI/OpenMP Applications}, journal = {Second International Workshop on OpenMP}, year = {2006}, month = {2006-01}, address = {Reims, France}, keywords = {kojak}, author = {Oscar Hernandez and Fengguang Song and Barbara Chapman and Jack Dongarra and Bernd Mohr and Shirley Moore and Felix Wolf} } @article {icl:327, title = {Predicting the electronic properties of 3D, million-atom semiconductor nanostructure architectures}, journal = {J. Phys.: Conf. Ser. 46}, volume = {:101088/1742-6596/46/1/040}, year = {2006}, month = {2006-01}, pages = {292-298}, keywords = {DOE_NANO}, author = {Alex Zunger and Alberto Franceschetti and Gabriel Bester and Wesley B. Jones and Kwiseon Kim and Peter A. 
Graf and Lin-Wang Wang and Andrew Canning and Osni Marques and Christof Voemel and Jack Dongarra and Julien Langou and Stanimire Tomov} } @article {icl:332, title = {Self Adapting Numerical Software SANS Effort}, journal = {IBM Journal of Research and Development}, volume = {50}, number = {2/3}, year = {2006}, month = {2006-01}, pages = {223-238}, keywords = {gco}, author = {George Bosilca and Zizhong Chen and Jack Dongarra and Victor Eijkhout and Graham Fagg and Erika Fuentes and Julien Langou and Piotr Luszczek and Jelena Pjesivac{\textendash}Grbovic and Keith Seymour and Haihang You and Sathish Vadhiyar} } @inproceedings {icl:324, title = {Towards bulk based preconditioning for quantum dot computations}, journal = {IEEE/ACM Proceedings of HPCNano SC06 (to appear)}, year = {2006}, month = {2006-01}, keywords = {doe-nano}, author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang} } @techreport {icl:263, title = {Algorithm-Based Checkpoint-Free Fault Tolerance for Parallel Matrix Computations on Volatile Resources}, journal = {University of Tennessee Computer Science Department Technical Report}, volume = {{\textendash}05-561}, year = {2005}, month = {2005-11}, author = {Zizhong Chen and Jack Dongarra} } @inproceedings {icl:284, title = {Comparison of Nonlinear Conjugate-Gradient methods for computing the Electronic Properties of Nanostructure Architectures}, journal = {Proceedings of 5th International Conference on Computational Science (ICCS)}, year = {2005}, month = {2005-01}, pages = {317-325}, publisher = {Springer{\textquoteright}s Lecture Notes in Computer Science}, address = {Atlanta, GA, USA}, keywords = {doe-nano}, author = {Stanimire Tomov and Julien Langou and Andrew Canning and Lin-Wang Wang and Jack Dongarra}, editor = {V. S. Sunderman and Geert Dick van Albada and Peter M. 
Sloot and Jack Dongarra} } @techreport {icl:303, title = {Condition Numbers of Gaussian Random Matrices}, journal = {University of Tennessee Computer Science Department Technical Report}, volume = {UT-CS-04-539}, year = {2005}, month = {2005-00}, keywords = {ft-la}, author = {Zizhong Chen and Jack Dongarra} } @article {icl:266, title = {Condition Numbers of Gaussian Random Matrices}, journal = {SIAM Journal on Matrix Analysis and Applications (to appear)}, year = {2005}, month = {2005-01}, keywords = {ftmpi, grads, lacsi, sans}, author = {Zizhong Chen and Jack Dongarra} } @article {icl:292, title = {Conjugate-Gradient Eigenvalue Solvers in Computing Electronic Properties of Nanostructure Architectures}, journal = {International Journal of Computational Science and Engineering (to appear)}, year = {2005}, month = {2005-01}, author = {Stanimire Tomov and Julien Langou and Andrew Canning and Lin-Wang Wang and Jack Dongarra} } @inproceedings {icl:293, title = {Dynamic Process Management for Pipelined Applications}, journal = {Proceedings of DoD HPCMP UGC 2005 (to appear)}, year = {2005}, month = {2005-01}, publisher = {IEEE}, address = {Nashville, TN}, author = {David Cronk and Graham Fagg and Susan Emeny and Scott Tucker} } @inproceedings {icl:265, title = {Fault Tolerant High Performance Computing by a Coding Approach}, journal = {Proceedings of ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (to appear)}, year = {2005}, month = {2005-01}, address = {Chicago, Illinois}, keywords = {ftmpi, grads, lacsi, sans}, author = {Zizhong Chen and Graham Fagg and Edgar Gabriel and Julien Langou and Thara Angskun and George Bosilca and Jack Dongarra} } @article {, title = {NanoPSE: A Nanoscience Problem Solving Environment for Atomistic Electronic Structure of Semiconductor Nanostructures}, journal = {Journal of Physics: Conference Series}, year = {2005}, month = {2005-06}, pages = {277--282}, abstract = {Researchers at the National Renewable Energy 
Laboratory and their collaborators have developed over the past {\textasciitilde}10 years a set of algorithms for an atomistic description of the electronic structure of nanostructures, based on plane-wave pseudopotentials and configuration interaction. The present contribution describes the first step in assembling these various codes into a single, portable, integrated set of software packages. This package is part of an ongoing research project in the development stage. Components of NanoPSE include codes for atomistic nanostructure generation and passivation, valence force field model for atomic relaxation, code for potential field generation, empirical pseudopotential method solver, strained linear combination of bulk bands method solver, configuration interaction solver for excited states, selection of linear algebra methods, and several inverse band structure solvers. Although not available for general distribution at this time as it is being developed and tested, the design goal of the NanoPSE software is to provide a software context for collaboration. The software package is enabled by fcdev, an integrated collection of best practice GNU software for open source development and distribution augmented to better support FORTRAN.}, doi = {10.1088/1742-6596/16/1/038}, url = {https://iopscience.iop.org/article/10.1088/1742-6596/16/1/038/meta}, author = {Wesley B. Jones and Gabriel Bester and Andrew Canning and Alberto Franceschetti and Peter A. 
Graf and Kwiseon Kim and Julien Langou and Lin-Wang Wang and Jack Dongarra and Alex Zunger} } @article {icl:278, title = {New Grid Scheduling and Rescheduling Methods in the GrADS Project}, journal = {International Journal of Parallel Programming}, volume = {33}, number = {2}, year = {2005}, month = {2005-06}, pages = {209--229}, publisher = {Springer}, keywords = {grads}, author = {Francine Berman and Henri Casanova and Andrew Chien and Keith Cooper and Holly Dail and Anshuman Dasgupta and Wei Deng and Jack Dongarra and Lennart Johnsson and Ken Kennedy and Charles Koelbel and Bo Liu and Xu Liu and Anirban Mandal and Gabriel Marin and Mark Mazina and John Mellor-Crummey and Celso Mendes and A. Olugbile and Jignesh M. Patel and Dan Reed and Zhiao Shi and Otto Sievert and H. Xia and Asim YarKhan} } @inproceedings {icl:267, title = {Numerically Stable Real Number Codes Based on Random Matrices}, journal = {The International Conference on Computational Science}, year = {2005}, month = {2005-01}, publisher = {LNCS 3514, Springer-Verlag}, address = {Atlanta, GA}, keywords = {ftmpi, grads, lacsi, sans}, author = {Zizhong Chen and Jack Dongarra} } @inproceedings {icl:287, title = {Performance Analysis of GYRO: A Tool Evaluation}, journal = {In Proceedings of the 2005 SciDAC Conference}, year = {2005}, month = {2005-06}, address = {San Francisco, CA}, keywords = {kojak}, author = {Patrick H. Worley and Jeff Candy and Laura Carrington and Kevin Huck and Timothy Kaiser and Kumar Mahinthakumar and Allen D. Malony and Shirley Moore and Dan Reed and Philip C. Roth and H. Shan and Sameer Shende and Allan Snavely and S. Sreepathi and Felix Wolf and Y. 
Zhang} } @conference {icl:298, title = {Performance Profiling and Analysis of DoD Applications using PAPI and TAU}, booktitle = {Proceedings of DoD HPCMP UGC 2005}, year = {2005}, month = {2005-06}, publisher = {IEEE}, organization = {IEEE}, address = {Nashville, TN}, keywords = {papi}, author = {Shirley Moore and David Cronk and Felix Wolf and Avi Purkayastha and Patricia J. Teller and Robert Araiza and Gabriela Aguilera and Jamie Nava} } @techreport {icl:301, title = {Recovery Patterns for Iterative Methods in a Parallel Unstable Environment}, journal = {University of Tennessee Computer Science Department Technical Report, UT-CS-04-538}, year = {2005}, month = {2005-00}, keywords = {ft-la}, author = {George Bosilca and Zizhong Chen and Jack Dongarra and Julien Langou} } @article {icl:236, title = {Cray X1 Evaluation Status Report}, journal = {Oak Ridge National Laboratory Report}, volume = {ORNL/TM-2004/13}, year = {2004}, month = {2004-01}, author = {Pratul Agarwal and R. A. Alexander and E. Apra and Satish Balay and Arthur S. Bland and James Colgan and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Tom Dunigan and Mark Fahey and Al Geist and M. Gordon and Robert Harrison and Dinesh Kaushik and M. Krishnakumar and Piotr Luszczek and Tony Mezzacapa and Jeff Nichols and Jarek Nieplocha and Leonid Oliker and T. Packwood and M. Pindzola and Thomas C. Schulthess and Jeffrey Vetter and James B White and T. Windus and Patrick H. 
Worley and Thomas Zacharia} } @inproceedings {icl:230, title = {Extending the MPI Specification for Process Fault Tolerance on High Performance Computing Systems}, journal = {Proceedings of ISC2004 (to appear)}, year = {2004}, month = {2004-06}, address = {Heidelberg, Germany}, keywords = {ftmpi, lacsi}, author = {Graham Fagg and Edgar Gabriel and George Bosilca and Thara Angskun and Zizhong Chen and Jelena Pjesivac-Grbovic and Kevin London and Jack Dongarra} } @inproceedings {icl:142, title = {LAPACK for Clusters Project: An Example of Self Adapting Numerical Software}, journal = {Proceedings of the 37th Annual Hawaii International Conference on System Sciences (HICSS{\textquoteright}04)}, volume = {9}, year = {2004}, month = {2004-01}, pages = {90282}, address = {Big Island, Hawaii}, keywords = {lacsi, lfc}, author = {Zizhong Chen and Jack Dongarra and Piotr Luszczek and Kenneth Roche} } @techreport {icl:234, title = {Numerically Stable Real-Number Codes Based on Random Matrices}, journal = {University of Tennessee Computer Science Department Technical Report}, volume = {UT-CS-04-526}, year = {2004}, month = {2004-10}, keywords = {ftmpi}, author = {Zizhong Chen and Jack Dongarra} } @article {icl:240, title = {Process Fault-Tolerance: Semantics, Design and Applications for High Performance Computing}, journal = {International Journal for High Performance Applications and Supercomputing (to appear)}, year = {2004}, month = {2004-04}, keywords = {ftmpi, lacsi}, author = {Graham Fagg and Edgar Gabriel and Zizhong Chen and Thara Angskun and George Bosilca and Jelena Pjesivac-Grbovic and Jack Dongarra} } @techreport {icl:251, title = {Recovery Patterns for Iterative Methods in a Parallel Unstable Environment}, journal = {ICL Technical Report}, number = {ICL-UT-04-04}, year = {2004}, month = {2004-01}, author = {George Bosilca and Zizhong Chen and Jack Dongarra and Julien Langou} } @article {icl:202, title = {The Virtual Instrument: 
Support for Grid-enabled Scientific Simulations}, journal = {International Journal of High Performance Computing Applications}, volume = {18}, number = {1}, year = {2004}, month = {2004-01}, pages = {3--17}, author = {Henri Casanova and Thomas Bartol and Francine Berman and Adam Birnbaum and Jack Dongarra and Mark Ellisman and Marcio Faerman and Erhan Gockay and Michelle Miller and Graziano Obertelli and Stuart Pomerantz and Terry Sejnowski and Joel Stiles and Rich Wolski} } @article {icl:195, title = {Automatic performance analysis of hybrid MPI/OpenMP applications}, journal = {Journal of Systems Architecture, Special Issue {\textquoteleft}Evolutions in parallel distributed and network-based processing{\textquoteright}}, volume = {49}, number = {10-11}, year = {2003}, month = {2003-11}, pages = {421--439}, publisher = {Elsevier}, keywords = {kojak}, author = {Felix Wolf and Bernd Mohr}, editor = {Andrea Clematis and Daniele D{\textquoteright}Agostino} } @inproceedings {icl:153, title = {Fault Tolerant Communication Library and Applications for High Performance Computing}, journal = {Los Alamos Computer Science Institute (LACSI) Symposium 2003 (presented)}, year = {2003}, month = {2003-10}, address = {Santa Fe, NM}, keywords = {ftmpi, lacsi}, author = {Graham Fagg and Edgar Gabriel and Zizhong Chen and Thara Angskun and George Bosilca and Antonin Bukovsky and Jack Dongarra} } @inproceedings {icl:171, title = {Optimizing Performance and Reliability in Distributed Computing Systems Through Wide Spectrum Storage}, journal = {Proceedings of the IPDPS 2003, NGS Workshop}, year = {2003}, month = {2003-01}, pages = {209}, address = {Nice, France}, author = {James Plank and Micah Beck and Jack Dongarra and Rich Wolski and Henri Casanova} } @article {icl:138, title = {Scheduling in the Grid Application Development Software Project}, journal = {Resource Management in the Grid}, year = {2003}, month = {2003-03}, publisher = {Kluwer Publishers}, keywords = {grads}, author = {Holly Dail 
and Otto Sievert and Francine Berman and Henri Casanova and Asim YarKhan and Sathish Vadhiyar and Jack Dongarra and Chuang Liu and Lingyun Yang and Dave Angulo and Ian Foster} } @article {icl:136, title = {Self Adapting Software for Numerical Linear Algebra and LAPACK for Clusters}, journal = {Parallel Computing}, volume = {29}, number = {11-12}, year = {2003}, month = {2003-11}, pages = {1723--1743}, keywords = {lacsi, lfc, sans}, author = {Zizhong Chen and Jack Dongarra and Piotr Luszczek and Kenneth Roche} } @techreport {icl:209, title = {Self Adapting Software for Numerical Linear Algebra and LAPACK for Clusters (LAPACK Working Note 160)}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-03-499}, year = {2003}, month = {2003-01}, keywords = {lacsi}, author = {Zizhong Chen and Jack Dongarra and Piotr Luszczek and Kenneth Roche} } @article {icl:119, title = {Adaptive Scheduling for Task Farming with Grid Middleware}, journal = {International Journal of Supercomputer Applications and High-Performance Computing}, volume = {13}, number = {3}, year = {2002}, month = {2002-10}, pages = {231--240}, author = {Henri Casanova and Myung Ho Kim and James Plank and Jack Dongarra} } @article {icl:80, title = {Automatic Optimisation of Parallel Linear Algebra Routines in Systems with Variable Load}, journal = {EuroPar 2002}, year = {2002}, month = {2002-08}, address = {Paderborn, Germany}, author = {Javier Cuenca and Domingo Giminez and Jos{\'e} Gonz{\'a}lez and Jack Dongarra and Kenneth Roche} } @article {icl:113, title = {A Comparison of Parallel Solvers for General Narrow Banded Linear Systems}, journal = {Parallel and Distributed Computing Practices}, volume = {2}, year = {2002}, month = {2002-10}, pages = {385--400}, author = {Peter Arbenz and Andrew Cleary and Jack Dongarra and Markus Hegland} } @techreport {icl:97, title = {GridRPC: A Remote Procedure Call API for Grid Computing}, journal = {ICL Technical Report}, number = {ICL-UT-02-06}, year = 
{2002}, month = {2002-11}, author = {Keith Seymour and Hidemoto Nakada and Satoshi Matsuoka and Jack Dongarra and Craig Lee and Henri Casanova} } @article {icl:207, title = {Innovations of the NetSolve Grid Computing System}, journal = {Concurrency: Practice and Experience}, volume = {14}, number = {13-15}, year = {2002}, month = {2002-01}, pages = {1457--1479}, keywords = {netsolve}, author = {Dorian Arnold and Henri Casanova and Jack Dongarra} } @article {icl:101, title = {Middleware for the Use of Storage in Communication}, journal = {Parallel Computing}, volume = {28}, number = {12}, year = {2002}, month = {2002-08}, pages = {1773--1788}, keywords = {netsolve}, author = {Micah Beck and Dorian Arnold and Alessandro Bassi and Francine Berman and Henri Casanova and Jack Dongarra and Terry Moore and Graziano Obertelli and James Plank and Martin Swany and Sathish Vadhiyar and Rich Wolski} } @inproceedings {icl:187, title = {Overview of GridRPC: A Remote Procedure Call API for Grid Computing}, journal = {Proceedings of the Third International Workshop on Grid Computing}, year = {2002}, month = {2002-01}, pages = {274--278}, author = {Keith Seymour and Hidemoto Nakada and Satoshi Matsuoka and Jack Dongarra and Craig Lee and Henri Casanova}, editor = {Manish Parashar} } @article {icl:120, title = {Stochastic Performance Prediction for Iterative Algorithms in Distributed Environments}, journal = {Journal of Parallel and Distributed Computing}, volume = {98}, number = {1}, year = {2002}, month = {2002-10}, pages = {68--91}, author = {Henri Casanova and Michael G. 
Thomason and Jack Dongarra} } @inproceedings {icl:79, title = {Toward a Framework for Preparing and Executing Adaptive Grid Programs}, journal = {International Parallel and Distributed Processing Symposium: IPDPS 2002 Workshops}, year = {2002}, month = {2002-04}, pages = {0171}, address = {Fort Lauderdale, FL}, keywords = {grads}, author = {Ken Kennedy and John Mellor-Crummey and Keith Cooper and Linda Torczon and Francine Berman and Andrew Chien and Dave Angulo and Ian Foster and Dennis Gannon and Lennart Johnsson and Carl Kesselman and Jack Dongarra and Sathish Vadhiyar} } @article {icl:95, title = {The Virtual Instrument: Support for Grid-enabled Scientific Simulations}, journal = {Journal of Parallel and Distributed Computing (submitted)}, year = {2002}, month = {2002-10}, author = {Henri Casanova and Thomas Bartol and Francine Berman and Adam Birnbaum and Jack Dongarra and Mark Ellisman and Marcio Faerman and Erhan Gockay and Michelle Miller and Graziano Obertelli and Stuart Pomerantz and Terry Sejnowski and Joel Stiles and Rich Wolski} } @article {icl:90, title = {The GrADS Project: Software Support for High-Level Grid Application Development}, journal = {International Journal of High Performance Applications and Supercomputing}, volume = {15}, number = {4}, year = {2001}, month = {2001-01}, pages = {327--344}, keywords = {grads}, author = {Francine Berman and Andrew Chien and Keith Cooper and Jack Dongarra and Ian Foster and Dennis Gannon and Lennart Johnsson and Ken Kennedy and Carl Kesselman and John Mellor-Crummey and Dan Reed and Linda Torczon and Rich Wolski} } @article {icl:4, title = {Logistical Computing and Internetworking: Middleware for the Use of Storage in Communication}, journal = {submitted to SC2001}, year = {2001}, month = {2001-11}, address = {Denver, Colorado}, keywords = {netsolve}, author = {Micah Beck and Dorian Arnold and Alessandro Bassi and Francine Berman and Henri Casanova and Jack Dongarra and Terry Moore and Graziano Obertelli and 
James Plank and Martin Swany and Sathish Vadhiyar and Rich Wolski} } @inproceedings {icl:19, title = {Metacomputing Support for the SARA3D Structural Acoustics Application}, journal = {Department of Defense Users{\textquoteright} Group Conference (to appear)}, year = {2001}, month = {2001-06}, address = {Biloxi, Mississippi}, keywords = {netsolve}, author = {Shirley Moore and Dorian Arnold and David Cronk} } @inproceedings {icl:7, title = {Network-Enabled Server Systems: Deploying Scientific Simulations on the Grid}, journal = {2001 High Performance Computing Symposium (HPC{\textquoteright}01), part of the Advance Simulation Technologies Conference}, year = {2001}, month = {2001-04}, address = {Seattle, Washington}, author = {Henri Casanova and Satoshi Matsuoka and Jack Dongarra} } @inproceedings {icl:8, title = {Parallel I/O for EQM Applications}, journal = {Department of Defense Users{\textquoteright} Group Conference Proceedings (to appear)}, year = {2001}, month = {2001-06}, address = {Biloxi, Mississippi}, keywords = {ftmpi}, author = {David Cronk and Graham Fagg and Shirley Moore} } @article {icl:20, title = {Review of Performance Analysis Tools for MPI Parallel Programs}, journal = {European Parallel Virtual Machine / Message Passing Interface Users{\textquoteright} Group Meeting, Lecture Notes in Computer Science 2131}, year = {2001}, month = {2001-09}, pages = {241--248}, publisher = {Springer Verlag, Berlin}, address = {Greece}, abstract = {In order to produce MPI applications that perform well on today{\textquoteright}s parallel architectures, programmers need effective tools for collecting and analyzing performance data. A variety of such tools, both commercial and research, are becoming available. 
This paper reviews and evaluates the available cross-platform MPI performance analysis tools.}, keywords = {papi}, doi = {10.1007/3-540-45417-9_34}, author = {Shirley Moore and David Cronk and Kevin London and Jack Dongarra} } @article {icl:82, title = {Telescoping Languages: A Strategy for Automatic Generation of Scientific Problem-Solving Systems from Annotated Libraries}, journal = {Journal of Parallel and Distributed Computing}, volume = {61}, number = {12}, year = {2001}, month = {2001-12}, pages = {1803--1826}, author = {Ken Kennedy and Bradley Broom and Keith Cooper and Jack Dongarra and Rob Fowler and Dennis Gannon and Lennart Johnsson and John Mellor-Crummey and Linda Torczon} } @techreport {icl:30, title = {The GrADS Project: Software Support for High-Level Grid Application Development}, journal = {Technical Report}, year = {2000}, month = {2000-02}, keywords = {grads}, author = {Francine Berman and Andrew Chien and Keith Cooper and Jack Dongarra and Ian Foster and Dennis Gannon and Lennart Johnsson and Ken Kennedy and Carl Kesselman and Dan Reed and Linda Torczon and Rich Wolski} } @techreport {icl:34, title = {Metacomputing: An Evaluation of Emerging Systems}, journal = {University of Tennessee Computer Science Department Technical Report}, number = {UT-CS-00-445}, year = {2000}, month = {2000-07}, author = {David Cronk and Brett Ellis and Graham Fagg} } @techreport {icl:51, title = {A Comparison of Parallel Solvers for Diagonally Dominant and General Narrow Banded Linear Systems II (LAPACK Working Note 143)}, journal = {University of Tennessee Computer Science Department Technical Report}, number = {UT-CS-99-415}, year = {1999}, month = {1999-01}, author = {Peter Arbenz and Andrew Cleary and Jack Dongarra and Markus Hegland} } @techreport {icl:52, title = {A Comparison of Parallel Solvers for General Narrow Banded Linear Systems (LAPACK Working Note 142)}, journal = {University of Tennessee Computer Science Technical Report}, number = 
{UT-CS-99-414}, year = {1999}, month = {1999-01}, author = {Peter Arbenz and Andrew Cleary and Jack Dongarra and Markus Hegland} } @article {icl:33, title = {Deploying Fault-tolerance and Task Migration with NetSolve}, journal = {Future Generation Computer Systems}, volume = {15}, number = {5-6}, year = {1999}, month = {1999-10}, pages = {745--755}, publisher = {Elsevier}, keywords = {netsolve}, author = {Henri Casanova and James Plank and Micah Beck and Jack Dongarra} } @article {icl:53, title = {Logistical Quality of Service in NetSolve}, journal = {Computer Communications}, volume = {22}, number = {11}, year = {1999}, month = {1999-01}, pages = {1034--1044}, keywords = {netsolve}, author = {Micah Beck and Henri Casanova and Jack Dongarra and Terry Moore and James Plank and Francine Berman and Rich Wolski} } @article {icl:229, title = {A Numerical Linear Algebra Problem Solving Environment Designer{\textquoteright}s Perspective (LAPACK Working Note 139)}, journal = {SIAM Annual Meeting}, year = {1999}, month = {1999-05}, address = {Atlanta, GA}, author = {Antoine Petitet and Henri Casanova and Clint Whaley and Jack Dongarra and Yves Robert} } @article {icl:75, title = {Parallel and Distributed Scientific Computing: A Numerical Linear Algebra Problem Solving Environment Designer{\textquoteright}s Perspective}, journal = {Handbook on Parallel and Distributed Processing}, year = {1999}, month = {1999-01}, author = {Antoine Petitet and Henri Casanova and Jack Dongarra and Yves Robert and Clint Whaley} } @inproceedings {icl:54, title = {Portable Representation of Internet Content Channels in I2-DSI}, journal = {4th Intl. 
Web Caching Workshop}, year = {1999}, month = {1999-03}, address = {San Diego, CA}, author = {Micah Beck and Rajeev Chawla and Bert Dempsey and Terry Moore} } @article {icl:63, title = {Stochastic Performance Prediction for Iterative Algorithms in Distributed Environments}, journal = {Journal of Parallel and Distributed Computing}, volume = {98}, number = {1}, year = {1999}, month = {1999-01}, pages = {68--91}, author = {Henri Casanova and Myung Ho Kim and James Plank and Jack Dongarra} } @article {icl:62, title = {Tiling on Systems with Communication/Computation Overlap}, journal = {Concurrency: Practice and Experience}, volume = {11}, number = {3}, year = {1999}, month = {1999-01}, pages = {139--153}, author = {Pierre-Yves Calland and Jack Dongarra and Yves Robert} } @article {1467, title = {ScaLAPACK: A Portable Linear Algebra Library for Distributed Memory Computers - Design Issues and Performance}, journal = {Computer Physics Communications}, volume = {97}, year = {1996}, month = {1996-08}, pages = {1--15}, abstract = {This paper outlines the content and performance of ScaLAPACK, a collection of mathematical software for linear algebra computations on distributed memory computers. The importance of developing standards for computational and message passing interfaces is discussed. We present the different components and building blocks of ScaLAPACK. This paper outlines the difficulties inherent in producing correct codes for networks of heterogeneous processors. We define a theoretical model of parallel computers dedicated to linear algebra applications: the Distributed Linear Algebra Machine (DLAM). This model provides a convenient framework for developing parallel algorithms and investigating their scalability, performance and programmability. Extensive performance results on various platforms are presented and analyzed with the help of the DLAM. 
Finally, this paper briefly describes future directions for the ScaLAPACK library and concludes by suggesting alternative approaches to mathematical libraries, explaining how ScaLAPACK could be integrated into efficient and user-friendly distributed systems.}, doi = {10.1016/0010-4655(96)00017-3}, author = {Jaeyoung Choi and Jim Demmel and Inderjit Dhillon and Jack Dongarra and Susan Ostrouchov and Antoine Petitet and Kendall Stanley and David Walker and Clint Whaley} }