@conference {, title = {AI Benchmarking for Science: Efforts from the MLCommons Science Working Group}, booktitle = {Lecture Notes in Computer Science}, volume = {13387}, year = {2023}, month = {2023-01}, pages = {47 - 64}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {With machine learning (ML) becoming a transformative tool for science, the scientific community needs a clear catalogue of ML techniques, and their relative benefits on various scientific problems, if they were to make significant advances in science using AI. Although this comes under the purview of benchmarking, conventional benchmarking initiatives are focused on performance, and as such, science, often becomes a secondary criteria. In this paper, we describe a community effort from a working group, namely, MLCommons Science Working Group, in developing science-specific AI benchmarking for the international scientific community. Since the inception of the working group in 2020, the group has worked very collaboratively with a number of national laboratories, academic institutions and industries, across the world, and has developed four science-specific AI benchmarks. We will describe the overall process, the resulting benchmarks along with some initial results. We foresee that this initiative is likely to be very transformative for the AI for Science, and for performance-focused communities.}, isbn = {978-3-031-23219-0}, doi = {10.1007/978-3-031-23220-6_4}, url = {https://link.springer.com/chapter/10.1007/978-3-031-23220-6_4}, author = {Thiyagalingam, Jeyan and von Laszewski, Gregor and Yin, Junqi and Emani, Murali and Papay, Juri and Barrett, Gregg and Luszczek, Piotr and Tsaris, Aristeidis and Kirkpatrick, Christine and Wang, Feiyi and Gibbs, Tom and Vishwanath, Venkatram and Shankar, Mallikarjun and Fox, Geoffrey and Hey, Tony}, editor = {Anzt, Hartwig and Bienz, Amanda and Luszczek, Piotr and Baboulin, Marc} } @booklet {, title = {Earth Virtualization Engines - A Technical Perspective}, year = {2023}, month = {2023-09}, abstract = {Participants of the Berlin Summit on Earth Virtualization Engines (EVEs) discussed ideas and concepts to improve our ability to cope with climate change. EVEs aim to provide interactive and accessible climate simulations and data for a wide range of users. They combine high-resolution physics-based models with machine learning techniques to improve the fidelity, efficiency, and interpretability of climate projections. At their core, EVEs offer a federated data layer that enables simple and fast access to exabyte-sized climate data through simple interfaces. In this article, we summarize the technical challenges and opportunities for developing EVEs, and argue that they are essential for addressing the consequences of climate change.}, url = {https://arxiv.org/abs/2309.09002}, author = {Torsten Hoefler and Bjorn Stevens and Andreas F. Prein and Johanna Baehr and Thomas Schulthess and Thomas F. Stocker and John Taylor and Daniel Klocke and Pekka Manninen and Piers M. Forster and Tobias K{\"o}lling and Nicolas Gruber and Hartwig Anzt and Claudia Frauen and Florian Ziemen and Milan Kl{\"o}wer and Karthik Kashinath and Christoph Sch{\"a}r and Oliver Fuhrer and Bryan N. 
Lawrence} } @article {, title = {HPC Forecast: Cloudy and Uncertain}, journal = {Communications of the ACM}, volume = {66}, year = {2023}, month = {2023-01}, pages = {82 - 90}, abstract = {An examination of how the technology landscape has changed and possible future directions for HPC operations and innovation.}, issn = {0001-0782}, doi = {10.1145/3552309}, url = {https://dl.acm.org/doi/pdf/10.1145/3552309}, author = {Reed, Daniel and Gannon, Dennis and Dongarra, Jack} } @conference {, title = {PAQR: Pivoting Avoiding QR factorization}, booktitle = {2023 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2023}, publisher = {IEEE}, organization = {IEEE}, address = {St. Petersburg, FL, USA}, doi = {10.1109/IPDPS54959.2023.00040}, url = {https://ieeexplore.ieee.org/document/10177407/}, author = {Sid-Lakhdar, Wissam and Cayrols, Sebastien and Bielich, Daniel and Abdelfattah, Ahmad and Luszczek, Piotr and Gates, Mark and Tomov, Stanimire and Johansen, Hans and Williams-Young, David and Davis, Timothy and Dongarra, Jack and Anzt, Hartwig} } @conference {, title = {Reducing Data Motion and Energy Consumption of Geospatial Modeling Applications Using Automated Precision Conversion}, booktitle = {2023 IEEE International Conference on Cluster Computing (CLUSTER)}, year = {2023}, month = {2023-11}, publisher = {IEEE}, organization = {IEEE}, address = {Santa Fe, NM, USA}, abstract = {The burgeoning interest in large-scale geospatial modeling, particularly within the domains of climate and weather prediction, underscores the concomitant critical importance of accuracy, scalability, and computational speed. Harnessing these complex simulations{\textquoteright} potential, however, necessitates innovative computational strategies, especially considering the increasing volume of data involved. Recent advancements in Graphics Processing Units (GPUs) have opened up new avenues for accelerating these modeling processes. In particular, their efficient utilization necessitates new strategies, such as mixed-precision arithmetic, that can balance the trade-off between computational speed and model accuracy. This paper leverages PaRSEC runtime system and delves into the opportunities provided by mixed-precision arithmetic to expedite large-scale geospatial modeling in heterogeneous environments. By using an automated conversion strategy, our mixed-precision approach significantly improves computational performance (up to 3X) on Summit supercomputer and reduces the associated energy consumption on various Nvidia GPU generations. Importantly, this implementation ensures the requisite accuracy in environmental applications, a critical factor in their operational viability. The findings of this study bear significant implications for future research and development in high-performance computing, underscoring the transformative potential of mixed-precision arithmetic on GPUs in addressing the computational demands of large-scale geospatial modeling and making a stride toward a more sustainable, efficient, and accurate future in large-scale environmental applications.}, doi = {10.1109/CLUSTER52292.2023.00035}, url = {https://ieeexplore.ieee.org/document/10319946/}, author = {Cao, Qinglei and Abdulah, Sameh and Ltaief, Hatem and Genton, Marc G. 
and Keyes, David and Bosilca, George} } @conference {, title = {Task-Based Polar Decomposition Using SLATE on Massively Parallel Systems with Hardware Accelerators}, booktitle = {SC-W {\textquoteright}23: Proceedings of the SC {\textquoteright}23 Workshops of The International Conference on High Performance Computing, Network, Storage, and Analysis}, year = {2023}, month = {2023-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, abstract = {We investigate a new task-based implementation of the polar decomposition on massively parallel systems augmented with multiple GPUs using SLATE. We implement the iterative QR Dynamically-Weighted Halley (QDWH) algorithm, whose building blocks mainly consist of compute-bound matrix operations, allowing for high levels of parallelism to be exploited on various hardware architectures, such as NVIDIA, AMD, and Intel GPU-based systems. To achieve both performance and portability, we implement our QDWH-based polar decomposition in the SLATE library, which uses efficient techniques in dense linear algebra, such as 2D block cyclic data distribution and communication-avoiding algorithms, as well as modern parallel programming approaches, such as dynamic scheduling and communication overlapping, and uses OpenMP tasks to track data dependencies. We report numerical accuracy and performance results. The benchmarking campaign reveals up to an 18-fold performance speedup of the GPU accelerated implementation compared to the existing state-of-the-art implementation for the polar decomposition.}, isbn = {9798400707858}, doi = {10.1145/3624062.3624248}, url = {https://dl.acm.org/doi/proceedings/10.1145/3624062}, author = {Sukkari, Dalal and Gates, Mark and Al Farhan, Mohammed and Anzt, Hartwig and Dongarra, Jack} } @article {, title = {Using Ginkgo{\textquoteright}s memory accessor for improving the accuracy of memory-bound low precision BLAS}, journal = {Software: Practice and Experience}, volume = {53}, year = {2023}, month = {Jan-01-2023}, pages = {81 - 98}, issn = {0038-0644}, doi = {10.1002/spe.3041}, url = {https://doi.org/10.1002/spe.3041}, author = {Gr{\"u}tzmacher, Thomas and Anzt, Hartwig and Quintana-Ort{\'\i}, Enrique S.} } @article {, title = {Accelerating Geostatistical Modeling and Prediction With Mixed-Precision Computations: A High-Productivity Approach With PaRSEC}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {33}, year = {2022}, month = {2022-04}, pages = {964 - 976}, abstract = {Geostatistical modeling, one of the prime motivating applications for exascale computing, is a technique for predicting desired quantities from geographically distributed data, based on statistical models and optimization of parameters. Spatial data are assumed to possess properties of stationarity or non-stationarity via a kernel fitted to a covariance matrix. A primary workhorse of stationary spatial statistics is Gaussian maximum log-likelihood estimation (MLE), whose central data structure is a dense, symmetric positive definite covariance matrix of the dimension of the number of correlated observations. Two essential operations in MLE are the application of the inverse and evaluation of the determinant of the covariance matrix. These can be rendered through the Cholesky decomposition and triangular solution. In this contribution, we reduce the precision of weakly correlated locations to single- or half-precision based on distance. 
We thus exploit mathematical structure to migrate MLE to a three-precision approximation that takes advantage of contemporary architectures offering BLAS3-like operations in a single instruction that are extremely fast for reduced precision. We illustrate application-expected accuracy worthy of double-precision from a majority half-precision computation, in a context where uniform single-precision is by itself insufficient. In tackling the complexity and imbalance caused by the mixing of three precisions, we deploy the PaRSEC runtime system. PaRSEC delivers on-demand casting of precisions while orchestrating tasks and data movement in a multi-GPU distributed-memory environment within a tile-based Cholesky factorization. Application-expected accuracy is maintained while achieving up to 1.59X by mixing FP64/FP32 operations on 1536 nodes of HAWK or 4096 nodes of Shaheen II, and up to 2.64X by mixing FP64/FP32/FP16 operations on 128 nodes of Summit, relative to FP64-only operations. This translates into up to 4.5, 4.7, ...}, keywords = {Computational modeling, Covariance matrices, Data models, Maximum likelihood estimation, Predictive models, runtime, Task analysis}, issn = {1045-9219}, doi = {10.1109/TPDS.2021.3084071}, url = {https://ieeexplore.ieee.org/document/9442267/}, author = {Abdulah, Sameh and Qinglei Cao and Pei, Yu and George Bosilca and Jack Dongarra and Genton, Marc G. and Keyes, David E. and Ltaief, Hatem and Sun, Ying} } @inproceedings {, title = {Addressing Irregular Patterns of Matrix Computations on GPUs and Their Impact on Applications Powered by Sparse Direct Solvers}, journal = {2022 International Conference for High Performance Computing, Networking, Storage and Analysis (SC22)}, year = {2022}, month = {2022-11}, pages = {354-367}, publisher = {IEEE Computer Society}, address = {Dallas, TX}, abstract = {Many scientific applications rely on sparse direct solvers for their numerical robustness. However, performance optimization for these solvers remains a challenging task, especially on GPUs. This is due to workloads of small dense matrices that are different in size. Matrix decompositions on such irregular workloads are rarely addressed on GPUs. This paper addresses irregular workloads of matrix computations on GPUs, and their application to accelerate sparse direct solvers. We design an interface for the basic matrix operations supporting problems of different sizes. The interface enables us to develop irrLU-GPU, an LU decomposition on matrices of different sizes. We demonstrate the impact of irrLU-GPU on sparse direct LU solvers using NVIDIA and AMD GPUs. 
Experimental results are shown for a sparse direct solver based on a multifrontal sparse LU decomposition applied to linear systems arising from the simulation, using finite element discretization on unstructured meshes, of a high-frequency indefinite Maxwell problem.}, keywords = {GPU computing, irregular computational workloads, lu factorization, multifrontal solvers, sparse direct solvers}, url = {https://dl.acm.org/doi/abs/10.5555/3571885.3571919}, author = {Ahmad Abdelfattah and Pieter Ghysels and Wajih Boukaram and Stanimire Tomov and Xiaoye Sherry Li and Jack Dongarra} } @inbook {, title = {Batch QR Factorization on GPUs: Design, Optimization, and Tuning}, booktitle = { Lecture Notes in Computer Science}, volume = {13350}, year = {2022}, month = {2022-06}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Cham}, abstract = {QR factorization of dense matrices is a ubiquitous tool in high performance computing (HPC). From solving linear systems and least squares problems to eigenvalue problems, and singular value decompositions, the impact of a high performance QR factorization is fundamental to computer simulations and many applications. More importantly, the QR factorization on a batch of relatively small matrices has acquired a lot of attention in sparse direct solvers and low-rank approximations for Hierarchical matrices. To address this interest and demand, we developed and present a high performance batch QR factorization for Graphics Processing Units (GPUs). We present a multi-level blocking strategy that adjusts various algorithmic designs to the size of the input matrices. We also show that following the LAPACK QR design convention, while still useful, is significantly outperformed by unconventional code structures that increase data reuse. The performance results show multi-fold speedups against the state of the art libraries on the latest GPU architectures from both NVIDIA and AMD.}, keywords = {Batch linear algebra, GPU computing, QR factorization}, isbn = {978-3-031-08750-9}, doi = {10.1007/978-3-031-08751-6_5}, url = {https://link.springer.com/chapter/10.1007/978-3-031-08751-6_5}, author = {Abdelfattah, Ahmad and Stanimire Tomov and Dongarra, Jack}, editor = {Groen, Derek and de Mulatier, C{\'e}lia and Paszy{\'n}ski, Maciej and Krzhizhanovskaya, Valeria V. and Dongarra, Jack J. and Sloot, Peter M. A.} } @techreport {, title = {Communication Avoiding LU with Tournament Pivoting in SLATE}, journal = {SLATE Working Notes}, number = {18, ICL-UT-22-01}, year = {2022}, month = {2022-01}, author = {Rabab Alomairy and Mark Gates and Sebastien Cayrols and Dalal Sukkari and Kadir Akbudak and Asim YarKhan and Paul Bagwell and Jack Dongarra} } @article {, title = {Compressed basis GMRES on high-performance graphics processing units}, journal = {The International Journal of High Performance Computing Applications}, year = {2022}, month = {2022-05}, abstract = {Krylov methods provide a fast and highly parallel numerical tool for the iterative solution of many large-scale sparse linear systems. To a large extent, the performance of practical realizations of these methods is constrained by the communication bandwidth in current computer architectures, motivating the investigation of sophisticated techniques to avoid, reduce, and/or hide the message-passing costs (in distributed platforms) and the memory accesses (in all architectures). 
This article leverages Ginkgo{\textquoteright}s memory accessor in order to integrate a communication-reduction strategy into the (Krylov) GMRES solver that decouples the storage format (i.e., the data representation in memory) of the orthogonal basis from the arithmetic precision that is employed during the operations with that basis. Given that the execution time of the GMRES solver is largely determined by the memory accesses, the cost of the datatype transforms can be mostly hidden, resulting in the acceleration of the iterative step via a decrease in the volume of bits being retrieved from memory. Together with the special properties of the orthonormal basis (whose elements are all bounded by 1), this paves the road toward the aggressive customization of the storage format, which includes some floating-point as well as fixed-point formats with mild impact on the convergence of the iterative process. We develop a high-performance implementation of the {\textquotedblleft}compressed basis GMRES{\textquotedblright} solver in the Ginkgo sparse linear algebra library using a large set of test problems from the SuiteSparse Matrix Collection. We demonstrate robustness and performance advantages on a modern NVIDIA V100 graphics processing unit (GPU) of up to 50\% over the standard GMRES solver that stores all data in IEEE double-precision.}, issn = {1094-3420}, doi = {10.1177/10943420221115140}, url = {http://journals.sagepub.com/doi/10.1177/10943420221115140}, author = {Aliaga, Jos{\'e} I and Anzt, Hartwig and Gr{\"u}tzmacher, Thomas and Quintana-Ort{\'\i}, Enrique S and Andres E. Thomas} } @article {, title = {Compression and load balancing for efficient sparse matrix-vector product on multicore processors and graphics processing units}, journal = {Concurrency and Computation: Practice and Experience}, volume = {34}, year = {2022}, month = {2022-06}, issn = {1532-0626}, doi = {10.1002/cpe.6515}, url = {https://doi.org/10.1002/cpe.6515}, author = {Aliaga, Jos{\'e} I. and Anzt, Hartwig and Gr{\"u}tzmacher, Thomas and Quintana-Orti, Enrique S. and Andres E. Thomas} } @article {, title = {Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing}, journal = {ACM Transactions on Mathematical Software}, volume = {48}, year = {2022}, month = {2022-03}, pages = {1 - 33}, abstract = {In this article, we present Ginkgo, a modern C++ math library for scientific high performance computing. While classical linear algebra libraries act on matrix and vector objects, Ginkgo{\textquoteright}s design principle abstracts all functionality as {\textquotedblleft}linear operators,{\textquotedblright} motivating the notation of a {\textquotedblleft}linear operator algebra library.{\textquotedblright} Ginkgo{\textquoteright}s current focus is oriented toward providing sparse linear algebra functionality for high performance graphics processing unit (GPU) architectures, but given the library design, this focus can be easily extended to accommodate other algorithms and hardware architectures. We introduce this sophisticated software architecture that separates core algorithms from architecture-specific backends and provide details on extensibility and sustainability measures. We also demonstrate Ginkgo{\textquoteright}s usability by providing examples on how to use its functionality inside the MFEM and deal.ii finite element ecosystems. 
Finally, we offer a practical demonstration of Ginkgo{\textquoteright}s high performance on state-of-the-art GPU architectures.}, issn = {0098-3500}, doi = {10.1145/3480935}, url = {https://dl.acm.org/doi/10.1145/3480935}, author = {Anzt, Hartwig and Cojean, Terry and Flegar, Goran and G{\"o}bel, Fritz and Gr{\"u}tzmacher, Thomas and Nayak, Pratik and Ribizel, Tobias and Tsai, Yuhsiang Mike and Quintana-Ort{\'\i}, Enrique S} } @inproceedings {, title = {{Integrating process, control-flow, and data resiliency layers using a hybrid Fenix/Kokkos approach}}, journal = {2022 IEEE International Conference on Cluster Computing (CLUSTER 2022)}, year = {2022}, month = {2022-09}, address = {Heidelberg, Germany}, keywords = {checkpointing, Fault tolerance, Fenix, HPC, Kokkos, MPI-ULFM, resilience}, url = {https://hal.archives-ouvertes.fr/hal-03772536}, author = {Whitlock, Matthew and Morales, Nicolas and George Bosilca and Bouteiller, Aur{\'e}lien and Nicolae, Bogdan and Teranishi, Keita and Giem, Elisabeth and Sarkar, Vivek} } @techreport {, title = {PAQR: Pivoting Avoiding QR factorization}, journal = {ICL Technical Report}, number = {ICL-UT-22-06}, year = {2022}, month = {2022-06}, abstract = {The solution of linear least-squares problems is at the heart of many scientific and engineering applications. While any method able to minimize the backward error of such problems is considered numerically stable, the theory states that the forward error depends on the condition number of the matrix in the system of equations. On the one hand, the QR factorization is an efficient method to solve such problems, but the solutions it produces may have large forward errors when the matrix is deficient. On the other hand, QR with column pivoting (QRCP) is able to produce smaller forward errors on deficient matrices, but its cost is prohibitive compared to QR. The aim of this paper is to propose PAQR, an alternative solution method with the same cost (or smaller) as QR and as accurate as QRCP in practical cases, for the solution of rank-deficient linear least-squares problems. After presenting the algorithm and its implementations on different architectures, we compare its accuracy and performance results on a variety of application problems. }, author = {Wissam M. Sid-Lakhdar and Sebastien Cayrols and Daniel Bielich and Ahmad Abdelfattah and Piotr Luszczek and Mark Gates and Stanimire Tomov and Hans Johansen and David Williams-Young and Timothy A. Davis and Jack Dongarra} } @inproceedings {, title = {Prediction of Optimal Solvers for Sparse Linear Systems Using Deep Learning}, journal = {2022 SIAM Conference on Parallel Processing for Scientific Computing (PP)}, year = {2022}, month = {2022}, pages = {14 - 24}, publisher = {Society for Industrial and Applied Mathematics}, address = {Philadelphia, PA}, abstract = {Solving sparse linear systems is a key task in a number of computational problems, such as data analysis and simulations, and majorly determines overall execution time. Choosing a suitable iterative solver algorithm, however, can significantly improve time-to-completion. We present a deep learning approach designed to predict the optimal iterative solver for a given sparse linear problem. For this, we detail useful linear system features to drive the prediction process, the metrics we use to quantify the iterative solvers{\textquoteright} time-to-approximation performance and a comprehensive experimental evaluation of the prediction quality of the neural network. 
Using a hyperparameter optimization and an ablation study on the SuiteSparse matrix collection we have inferred the importance of distinct features, achieving a top-1 classification accuracy of 60\%.}, doi = {10.1137/1.9781611977141.2}, url = {https://epubs.siam.org/doi/10.1137/1.9781611977141.2}, author = {Funk, Yannick and G{\"o}tz, Markus and Anzt, Hartwig}, editor = {Li, Xiaoye S. and Teranishi, Keita} } @techreport {, title = {Randomized Numerical Linear Algebra: A Perspective on the Field with an Eye to Software}, journal = {University of California, Berkeley EECS Technical Report}, number = {UCB/EECS-2022-258}, year = {2022}, month = {2022-11}, publisher = {University of California, Berkeley}, abstract = {Randomized numerical linear algebra {\textendash} RandNLA, for short {\textendash} concerns the use of randomization as a resource to develop improved algorithms for large-scale linear algebra computations. The origins of contemporary RandNLA lay in theoretical computer science, where it blossomed from a simple idea: randomization provides an avenue for computing approximate solutions to linear algebra problems more efficiently than deterministic algorithms. This idea proved fruitful in and was largely driven by the development of scalable algorithms for machine learning and statistical data analysis applications. However, the true potential of RandNLA only came into focus once it began to integrate with the fields of numerical analysis and {\textquotedblleft}classical{\textquotedblright} numerical linear algebra. Through the efforts of many individuals, randomized algorithms have been developed that provide full control over the accuracy of their solutions and that are every bit as reliable as algorithms that might be found in libraries such as LAPACK. The spectrum of possibilities offered by RandNLA has created a virtuous cycle of contributions by numerical analysts, statisticians, theoretical computer scientists, and the machine learning community. Recent years have even seen the incorporation of certain RandNLA methods into MATLAB, the NAG Library, and NVIDIA{\textquoteright}s cuSOLVER. In view of these developments, we believe the time is ripe to accelerate the adoption of RandNLA in the scientific community. In particular, we believe the community stands to benefit significantly from a suitably defined {\textquotedblleft}RandBLAS{\textquotedblright} and {\textquotedblleft}RandLAPACK,{\textquotedblright} to serve as standard libraries for RandNLA, in much the same way that BLAS and LAPACK serve as standards for deterministic linear algebra. This monograph surveys the field of RandNLA as a step toward building meaningful RandBLAS and RandLAPACK libraries. Section 1 begins by setting scope and design principles for RandLAPACK and summarizing subsequent sections of the monograph. Section 2 focuses on RandBLAS, which is to be responsible for sketching. Details of functionality suitable for RandLAPACK are covered in the five sections that follow. Specifically, Sections 3 to 5 cover least squares and optimization, low-rank approximation, and other select problems that are well-understood in how they benefit from randomized algorithms. The remaining sections {\textendash} on statistical leverage scores (Section 6) and tensor computations (Section 7) {\textendash} read more like traditional surveys. The different flavor of these latter sections reflects how, in our assessment, the literature on these topics is still maturing. 
We provide a substantial amount of pseudo-code and supplementary material over the course of five appendices. Much of the pseudo-code has been tested via publicly available Matlab and Python implementations.}, keywords = {Randomized algorithms}, doi = {10.48550/arXiv.2302.1147}, url = {https://www2.eecs.berkeley.edu/Pubs/TechRpts/2022/EECS-2022-258.html}, author = {Riley Murray and James Demmel and Michael W. Mahoney and N. Benjamin Erichson and Maksim Melnichenko and Osman Asif Malik and Laura Grigori and Piotr Luszczek and Micha{\l} Derezi{\'n}ski and Miles E. Lopes and Tianyu Liang and Hengrui Luo and Jack Dongarra} } @article {, title = {Reinventing High Performance Computing: Challenges and Opportunities}, number = {ICL-UT-22-03}, year = {2022}, month = {2022-03}, abstract = {The world of computing is in rapid transition, now dominated by a world of smartphones and cloud services, with profound implications for the future of advanced scientific computing. Simply put, high-performance computing (HPC) is at an important inflection point. For the last 60 years, the world{\textquoteright}s fastest supercomputers were almost exclusively produced in the United States on behalf of scientific research in the national laboratories. Change is now in the wind. While costs now stretch the limits of U.S. government funding for advanced computing, Japan and China are now leaders in the bespoke HPC systems funded by government mandates. Meanwhile, the global semiconductor shortage and political battles surrounding fabrication facilities affect everyone. However, another, perhaps even deeper, fundamental change has occurred. The major cloud vendors have invested in global networks of massive scale systems that dwarf today{\textquoteright}s HPC systems. Driven by the computing demands of AI, these cloud systems are increasingly built using custom semiconductors, reducing the financial leverage of traditional computing vendors. These cloud systems are now breaking barriers in game playing and computer vision, reshaping how we think about the nature of scientific computation. Building the next generation of leading edge HPC systems will require rethinking many fundamentals and historical approaches by embracing end-to-end co-design; custom hardware configurations and packaging; large-scale prototyping, as was common thirty years ago; and collaborative partnerships with the dominant computing ecosystem companies, smartphone and cloud computing vendors.}, author = {Daniel Reed and Dennis Gannon and Jack Dongarra} } @techreport {, title = {Report on the Oak Ridge National Laboratory{\textquoteright}s Frontier System}, journal = {ICL Technical Report}, number = {ICL-UT-22-05}, year = {2022}, month = {2022-05}, author = {Jack Dongarra and Al Geist} } @inproceedings {, title = {Reshaping Geostatistical Modeling and Prediction for Extreme-Scale Environmental Applications}, journal = {2022 International Conference for High Performance Computing, Networking, Storage and Analysis (SC22)}, year = {2022}, month = {2022-11}, publisher = {IEEE Press}, address = {Dallas, TX}, abstract = {We extend the capability of space-time geostatistical modeling using algebraic approximations, illustrating application-expected accuracy worthy of double precision from majority low-precision computations and low-rank matrix approximations. We exploit the mathematical structure of the dense covariance matrix whose inverse action and determinant are repeatedly required in Gaussian log-likelihood optimization. 
Geostatistics augments first-principles modeling approaches for the prediction of environmental phenomena given the availability of measurements at a large number of locations; however, traditional Cholesky-based approaches grow cubically in complexity, gating practical extension to continental and global datasets now available. We combine the linear algebraic contributions of mixed-precision and low-rank computations within a tile-based Cholesky solver with on-demand casting of precisions and dynamic runtime support from PaRSEC to orchestrate tasks and data movement. Our adaptive approach scales on various systems and leverages the Fujitsu A64FX nodes of Fugaku to achieve up to 12X performance speedup against the highly optimized dense Cholesky implementation.}, keywords = {climate/weather prediction, dynamic runtime systems, high performance computing, low-rank matrix approximations, mixed-precision computations, space-time geospatial statistics, Task-based programming models}, isbn = {9784665454445}, url = {https://dl.acm.org/doi/abs/10.5555/3571885.3571888}, author = {Cao, Qinglei and Abdulah, Sameh and Rabab Alomairy and Pei, Yu and Pratik Nag and George Bosilca and Dongarra, Jack and Genton, Marc G. and Keyes, David and Ltaief, Hatem and Sun, Ying} } @article {, title = {Resiliency in numerical algorithm design for extreme scale simulations}, journal = {The International Journal of High Performance Computing Applications}, volume = {36}, year = {2022}, month = {2022-03}, pages = {251 - 285}, keywords = {Fault tolerance, Numerical algorithms, parallel computer architecture, resilience}, issn = {1094-3420}, doi = {10.1177/10943420211055188}, url = {http://journals.sagepub.com/doi/10.1177/10943420211055188}, author = {Agullo, Emmanuel and Altenbernd, Mirco and Anzt, Hartwig and Bautista-Gomez, Leonardo and Benacchio, Tommaso and Bonaventura, Luca and Bungartz, Hans-Joachim and Chatterjee, Sanjay and Ciorba, Florina M and DeBardeleben, Nathan and Drzisga, Daniel and Eibl, Sebastian and Engelmann, Christian and Gansterer, Wilfried N and Giraud, Luc and G{\"o}ddeke, Dominik and Heisig, Marco and J{\'e}z{\'e}quel, Fabienne and Kohl, Nils and Li, Xiaoye Sherry and Lion, Romain and Mehl, Miriam and Mycek, Paul and Obersteiner, Michael and Quintana-Ort{\'\i}, Enrique S and Rizzi, Francesco and R{\"u}de, Ulrich and Schulz, Martin and Fung, Fred and Speck, Robert and Stals, Linda and Teranishi, Keita and Thibault, Samuel and Th{\"o}nnes, Dominik and Wagner, Andreas and Wohlmuth, Barbara} } @conference {, title = {Threshold Pivoting for Dense LU Factorization}, booktitle = {ScalAH22: 13th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Heterogeneous Systems}, year = {2022}, month = {2022-11}, publisher = {IEEE}, organization = {IEEE}, address = {Dallas, Texas}, abstract = {LU factorization is a key approach for solving large, dense systems of linear equations. Partial row pivoting is commonly used to ensure numerical stability; however, the data movement needed for the row interchanges can reduce performance. To improve this, we propose using threshold pivoting to find pivots almost as good as those selected by partial pivoting but that result in less data movement. 
Our theoretical analysis bounds the element growth similarly to partial pivoting; however, it also shows that the growth of threshold pivoting for a given matrix cannot be bounded by that of partial pivoting and vice versa. Additionally, we experimentally tested the approach on the Summit supercomputer. Threshold pivoting improved performance by up to 32\% without a significant effect on accuracy. For a more aggressive configuration with up to one digit of accuracy lost, the improvement was as high as 44\%.}, doi = {10.1109/ScalAH56622.2022.00010}, author = {Neil Lindquist and Mark Gates and Piotr Luszczek and Jack Dongarra} } @article {, title = {Callback-based completion notification using MPI Continuations}, journal = {Parallel Computing}, volume = {21238566}, year = {2021}, month = {Jan-05-2021}, pages = {102793}, abstract = {Asynchronous programming models (APM) are gaining more and more traction, allowing applications to expose the available concurrency to a runtime system tasked with coordinating the execution. While MPI has long provided support for multi-threaded communication and nonblocking operations, it falls short of adequately supporting APMs as correctly and efficiently handling MPI communication in different models is still a challenge. We have previously proposed an extension to the MPI standard providing operation completion notifications using callbacks, so-called MPI Continuations. This interface is flexible enough to accommodate a wide range of different APMs. In this paper, we present an extension to the previously described interface that allows for finer control of the behavior of the MPI Continuations interface. We then present some of our first experiences in using the interface in the context of different applications, including the NAS parallel benchmarks, the PaRSEC task-based runtime system, and a load-balancing scheme within an adaptive mesh refinement solver called ExaHyPE. We show that the interface, implemented inside Open MPI, enables low-latency, high-throughput completion notifications that outperform solutions implemented in the application space.}, keywords = {MPI, MPI Continuations, OmpSs, OpenMP, parsec, TAMPI, Task-based programming models}, issn = {0167-8191}, doi = {10.1016/j.parco.2021.102793}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0167819121000466?via\%3Dihub}, author = {Schuchart, Joseph and Samfass, Philipp and Niethammer, Christoph and Gracia, Jos{\'e} and George Bosilca} } @inproceedings {, title = {Evaluating Task Dropping Strategies for Overloaded Real-Time Systems (Work-In-Progress)}, journal = {42nd Real Time Systems Symposium (RTSS)}, year = {2021}, publisher = {IEEE Computer Society Press}, author = {Yiqin Gao and Guillaume Pallez and Yves Robert and Frederic Vivien} } @article {, title = {Ginkgo: A Sparse Linear Algebra Library for HPC}, year = {2021}, month = {2021-04}, publisher = {2021 ECP Annual Meeting}, author = {Hartwig Anzt and Natalie Beams and Terry Cojean and Fritz G{\"o}bel and Thomas Gr{\"u}tzmacher and Aditya Kashi and Pratik Nayak and Tobias Ribizel and Yuhsiang M. Tsai} } @article {, title = {An international survey on MPI users}, journal = {Parallel Computing}, volume = {108}, year = {2021}, month = {2021-12}, abstract = {The Message Passing Interface (MPI) plays a crucial part in the parallel computing ecosystem, a driving force behind many of the high-performance computing (HPC) successes. 
To maintain its relevance to the user community{\textemdash}and in particular to the growing HPC community at large{\textemdash}the MPI standard needs to identify and understand the MPI users{\textquoteright} concerns and expectations, and adapt accordingly to continue to efficiently bridge the gap between users and hardware. This questionnaire survey was conducted using two online questionnaire frameworks and has gathered more than 850 answers from 42 countries since February 2019. Some of preceding surveys of MPI uses are questionnaire surveys like ours, while others are conducted either by analyzing MPI programs to reveal static behavior or by using profiling tools to analyze the dynamic runtime behavior of MPI jobs. Our survey is different from other questionnaire surveys in terms of its larger number of participants and wide geographic spread. As a result, it is possible to illustrate the current status of MPI users more accurately and with a wider geographical distribution. In this report, we will show some interesting findings, compare the results with preceding studies when possible, and provide some recommendations for MPI Forum based on the findings.}, keywords = {message passing interface, MPI, survey}, doi = {10.1016/j.parco.2021.102853}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0167819121000983}, author = {Atsushi Hori and Emmanuel Jeannot and George Bosilca and Takahiro Ogura and Balazs Gerofi and Jie Yin and Yutaka Ishikawa} } @article {, title = {libCEED: Fast algebra for high-order element-based discretizations}, journal = {Journal of Open Source Software}, volume = {6}, number = {63}, year = {2021}, pages = {2945}, abstract = {Finite element methods are widely used to solve partial differential equations (PDE) in science and engineering, but their standard implementation (Arndt et al., 2020; Kirk et al., 2006; Logg et al., 2012) relies on assembling sparse matrices. Sparse matrix multiplication and triangular operations perform a scalar multiply and add for each nonzero entry, just 2 floating point operations (flops) per scalar that must be loaded from memory (Williams et al., 2009). Modern hardware is capable of nearly 100 flops per scalar streamed from memory (Rupp, 2020) so sparse matrix operations cannot achieve more than about 2\% utilization of arithmetic units. Matrix assembly becomes even more problematic when the polynomial degree p of the basis functions is increased, resulting in O(p^d) storage and O(p^{2d}) compute per degree of freedom (DoF) in d dimensions. Methods pioneered by the spectral element community (Deville et al., 2002; Orszag, 1980) exploit problem structure to reduce costs to O(1) storage and O(p) compute per DoF, with very high utilization of modern CPUs and GPUs. Unfortunately, high-quality implementations have been relegated to applications and intrusive frameworks that are often difficult to extend to new problems or incorporate into legacy applications, especially when strong preconditioners are required. libCEED, the Code for Efficient Extensible Discretization (Abdelfattah et al., 2021), is a lightweight library that provides a purely algebraic interface for linear and nonlinear operators and preconditioners with element-based discretizations. libCEED provides portable performance via run-time selection of implementations optimized for CPUs and GPUs, including support for just-in-time (JIT) compilation. 
It is designed for convenient use in new and legacy software, and offers interfaces in C99 (International Standards Organisation, 1999), Fortran77 (ANSI, 1978), Python (Python, 2021), Julia (Bezanson et al., 2017), and Rust (Rust, 2021). Users and library developers can integrate libCEED at a low level into existing applications in place of existing matrix-vector products without significant refactoring of their own discretization infrastructure. Alternatively, users can utilize integrated libCEED support in MFEM (Anderson et al., 2020; MFEM, 2021). In addition to supporting applications and discretization libraries, libCEED provides a platform for performance engineering and co-design, as well as an algebraic interface for solvers research like adaptive p-multigrid, much like how sparse matrix libraries enable development and deployment of algebraic multigrid solvers}, keywords = {finite elements, high-order methods, High-performance computing, matrix-free, spectral elements}, doi = {10.21105/joss.02945}, url = {https://doi.org/10.21105/joss.02945}, author = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jean-Sylvain Camier and Veselin Dobrev and Yohann Dudouit and Leila Ghaffari and Tzanio Kolev and David Medina and Will Pazner and Thilina Ratnayaka and Jeremy Thompson and Stanimire Tomov} } @conference {, title = {Quo Vadis MPI RMA? Towards a More Efficient Use of MPI One-Sided Communication}, booktitle = {EuroMPI{\textquoteright}21}, year = {2021}, address = {Garching, Munich Germany}, abstract = { The MPI standard has long included one-sided communication abstractions through the MPI Remote Memory Access (RMA) interface. Unfortunately, the MPI RMA chapter in the 4.0 version of the MPI standard still contains both well-known and lesser known short-comings for both implementations and users, which lead to potentially non-optimal usage patterns. In this paper, we identify a set of issues and propose ways for applications to better express anticipated usage of RMA routines, allowing the MPI implementation to better adapt to the application{\textquoteright}s needs. In order to increase the flexibility of the RMA interface, we add the capability to duplicate windows, allowing access to the same resources encapsulated by a window using different configurations. In the same vein, we introduce the concept of MPI memory handles, meant to provide life-time guarantees on memory attached to dynamic windows, removing the overhead currently present in using dynamically exposed memory. We will show that our extensions provide improved accumulate latencies, reduced overheads for multi-threaded flushes, and allow for zero overhead dynamic memory window usage. }, keywords = {Memory Handles, MPI, MPI-RMA, RDMA}, url = {https://arxiv.org/abs/2111.08142}, author = {Schuchart, Joseph and Niethammer, Christoph and Gracia, Jos{\'e} and George Bosilca} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms and LAPACK Routines}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {47}, number = {3}, year = {2021}, pages = {1{\textendash}23}, abstract = {This article describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. 
The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular, half precision is used in many very large scale applications, such as those associated with machine learning.}, keywords = {Computations on matrices, Mathematical analysis, Mathematics of computing, Numerical analysis}, doi = {10.1145/3431921}, author = {Abdelfattah, Ahmad and Costa, Timothy and Jack Dongarra and Mark Gates and Haidar, Azzam and Hammarling, Sven and Higham, Nicholas J and Kurzak, Jakub and Piotr Luszczek and Stanimire Tomov and others} } @techreport {, title = {SLATE Performance Improvements: QR and Eigenvalues}, journal = {SLATE Working Notes}, number = {17, ICL-UT-21-02}, year = {2021}, month = {2021-04}, author = {Kadir Akbudak and Paul Bagwell and Sebastien Cayrols and Mark Gates and Dalal Sukkari and Asim YarKhan and Jack Dongarra} } @techreport {, title = {SLATE Port to AMD and Intel Platforms}, journal = {SLATE Working Notes}, number = {16, ICL-UT-21-01}, year = {2021}, month = {2021-04}, author = {Ahmad Abdelfattah and Mohammed Al Farhan and Cade Brown and Mark Gates and Dalal Sukkari and Asim YarKhan and Jack Dongarra} } @article {, title = {A survey of numerical linear algebra methods utilizing mixed-precision arithmetic}, journal = {The International Journal of High Performance Computing Applications}, volume = {35}, number = {4}, year = {2021}, pages = {344{\textendash}369}, abstract = {The efficient utilization of mixed-precision numerical linear algebra algorithms can offer attractive acceleration to scientific computing applications. Especially with the hardware integration of low-precision special-function units designed for machine learning applications, the traditional numerical algorithms community urgently needs to reconsider the floating point formats used in the distinct operations to efficiently leverage the available compute power. In this work, we provide a comprehensive survey of mixed-precision numerical linear algebra routines, including the underlying concepts, theoretical background, and experimental results for both dense and sparse linear algebra problems.}, keywords = {GPUs, High-performance computing, linear algebra, Mixed-precision arithmetic, numerical mathematics}, doi = {10.1177/10943420211003313}, author = {Abdelfattah, Ahmad and Anzt, Hartwig and Boman, Erik G and Carson, Erin and Cojean, Terry and Jack Dongarra and Fox, Alyson and Mark Gates and Higham, Nicholas J and Li, Xiaoye S and others} } @inproceedings {, title = {Task-graph scheduling extensions for efficient synchronization and communication}, journal = {Proceedings of the ACM International Conference on Supercomputing}, year = {2021}, pages = {88{\textendash}101}, abstract = {Task graphs have been studied for decades as a foundation for scheduling irregular parallel applications and incorporated in many programming models including OpenMP. While many high-performance parallel libraries are based on task graphs, they also have additional scheduling requirements, such as synchronization within inner levels of data parallelism and internal blocking communications. 
In this paper, we extend task-graph scheduling to support efficient synchronization and communication within tasks. Compared to past work, our scheduler avoids deadlock and oversubscription of worker threads, and refines victim selection to increase the overlap of sibling tasks. To the best of our knowledge, our approach is the first to combine gang-scheduling and work-stealing in a single runtime. Our approach has been evaluated on the SLATE high-performance linear algebra library. Relative to the LLVM OMP runtime, our runtime demonstrates performance improvements of up to 13.82\%, 15.2\%, and 36.94\% for LU, QR, and Cholesky, respectively, evaluated across different configurations related to matrix size, number of nodes, and use of CPUs vs GPUs.}, keywords = {Compilers, Computing methodologies, Parallel computing methodologies, Parallel programming languages, Runtime environments, Software and its engineering, Software notations and tools}, doi = {10.1145/3447818.3461616}, author = {Bak, Seonmyeong and Hernandez, Oscar and Mark Gates and Piotr Luszczek and Sarkar, Vivek} } @article {, title = {Translational process: Mathematical software perspective}, journal = {Journal of Computational Science}, volume = {52}, year = {2021}, pages = {101216}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, DATAFLOW scheduling runtimes, hardware accelerators}, doi = {10.1016/j.jocs.2020.101216}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {, title = {ASCR@40: Four Decades of Department of Energy Leadership in Advanced Scientific Computing Research}, year = {2020}, month = {2020-08}, publisher = {Advanced Scientific Computing Advisory Committee (ASCAC), US Department of Energy}, url = {https://computing.llnl.gov/misc/ASCR@40-Highlights.pdf}, author = {Bruce Hendrickson and Paul Messina and Buddy Bland and Jackie Chen and Phil Colella and Eli Dart and Jack Dongarra and Thom Dunning and Ian Foster and Richard Gerber and Rachel Harken and Wendy Huntoon and Bill Johnston and John Sarrao and Jeff Vetter} } @techreport {, title = {ASCR@40: Highlights and Impacts of ASCR{\textquoteright}s Programs}, year = {2020}, month = {2020-06}, publisher = {US Department of Energy{\textquoteright}s Office of Advanced Scientific Computing Research}, abstract = {The Office of Advanced Scientific Computing Research (ASCR) sits within the Office of Science in the Department of Energy (DOE). 
Per their web pages, {\textquotedblleft}the mission of the ASCR program is to discover, develop, and deploy computational and networking capabilities to analyze, model, simulate, and predict complex phenomena important to the DOE.{\textquotedblright} This succinct statement encompasses a wide range of responsibilities for computing and networking facilities; for procuring, deploying, and operating high performance computing, networking, and storage resources; for basic research in mathematics and computer science; for developing and sustaining a large body of software; and for partnering with organizations across the Office of Science and beyond. While its mission statement may seem very contemporary, the roots of ASCR are quite deep{\textemdash}long predating the creation of DOE. Applied mathematics and advanced computing were both elements of the Theoretical Division of the Manhattan Project. In the early 1950s, the Manhattan Project scientist and mathematician John von Neumann, then a commissioner for the AEC (Atomic Energy Commission), advocated for the creation of a Mathematics program to support the continued development and applications of digital computing. Los Alamos National Laboratory (LANL) scientist John Pasta created such a program to fund researchers at universities and AEC laboratories. Under several organizational name changes, this program has persisted ever since, and would eventually grow to become ASCR.}, doi = {https://doi.org/10.2172/1631812}, url = {https://www.osti.gov/servlets/purl/1631812}, author = {Bruce Hendrickson and Paul Messina and Buddy Bland and Jackie Chen and Phil Colella and Eli Dart and Jack Dongarra and Thom Dunning and Ian Foster and Richard Gerber and Rachel Harken and Wendy Huntoon and Bill Johnston and John Sarrao and Jeff Vetter} } @article {, title = {Clover: Computational Libraries Optimized via Exascale Research}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Mark Gates and Stanimire Tomov and Hartwig Anzt and Piotr Luszczek and Jack Dongarra} } @conference {, title = {Energy-Aware Strategies for Reliability-Oriented Real-Time Task Allocation on Heterogeneous Platforms}, booktitle = {49th International Conference on Parallel Processing (ICPP 2020)}, year = {2020}, publisher = {ACM Press}, organization = {ACM Press}, address = {Edmonton, AB, Canada}, author = {Li Han and Yiqin Gao and Jing Liu and Yves Robert and Frederic Vivien} } @article {1459, title = {Fault Tolerance of MPI Applications in Exascale Systems: The ULFM Solution}, journal = {Future Generation Computer Systems}, volume = {106}, year = {2020}, month = {2020-05}, pages = { 467-481}, abstract = {The growth in the number of computational resources used by high-performance computing (HPC) systems leads to an increase in failure rates. Fault-tolerant techniques will become essential for long-running applications executing in future exascale systems, not only to ensure the completion of their execution in these systems but also to improve their energy consumption. Although the Message Passing Interface (MPI) is the most popular programming model for distributed-memory HPC systems, as of now, it does not provide any fault-tolerant construct for users to handle failures. Thus, the recovery procedure is postponed until the application is aborted and re-spawned. 
The proposal of the User Level Failure Mitigation (ULFM) interface in the MPI forum provides new opportunities in this field, enabling the implementation of resilient MPI applications, system runtimes, and programming language constructs able to detect and react to failures without aborting their execution. This paper presents a global overview of the resilience interfaces provided by the ULFM specification, covers archetypal usage patterns and building blocks, and surveys the wide variety of application-driven solutions that have exploited them in recent years. The large and varied number of approaches in the literature proves that ULFM provides the necessary flexibility to implement efficient fault-tolerant MPI applications. All the proposed solutions are based on application-driven recovery mechanisms, which allows reducing the overhead and obtaining the required level of efficiency needed in the future exascale platforms.}, keywords = {Application-level checkpointing, MPI, resilience, ULFM}, issn = {0167-739X}, doi = {https://doi.org/10.1016/j.future.2020.01.026}, url = {https://www.sciencedirect.com/science/article/pii/S0167739X1930860X}, author = {Nuria Losada and Patricia Gonz{\'a}lez and Mar{\'\i}a J. Mart{\'\i}n and George Bosilca and Aurelien Bouteiller and Keita Teranishi} } @article {, title = {Ginkgo: A High Performance Numerical Linear Algebra Library}, journal = {Journal of Open Source Software}, volume = {5}, year = {2020}, month = {2020-08}, abstract = {Ginkgo is a production-ready sparse linear algebra library for high performance computing on GPU-centric architectures with a high level of performance portability and focuses on software sustainability. The library focuses on solving sparse linear systems and accommodates a large variety of matrix formats, state-of-the-art iterative (Krylov) solvers and preconditioners, which make the library suitable for a variety of scientific applications. Ginkgo supports many architectures such as multi-threaded CPU, NVIDIA GPUs, and AMD GPUs. The heavy use of modern C++ features simplifies the addition of new executor paradigms and algorithmic functionality without introducing significant performance overhead. Solving linear systems is usually one of the most computationally and memory intensive aspects of any application. Hence there has been a significant amount of effort in this direction with software libraries such as UMFPACK (Davis, 2004) and CHOLMOD (Chen, Davis, Hager, \& Rajamanickam, 2008) for solving linear systems with direct methods and PETSc (Balay et al., 2020), Trilinos ({\textquotedblleft}The Trilinos Project Website,{\textquotedblright} 2020), Eigen (Guennebaud, Jacob, \& others, 2010) and many more to solve linear systems with iterative methods. With Ginkgo, we aim to ensure high performance while not compromising portability. Hence, we provide very efficient low level kernels optimized for different architectures and separate these kernels from the algorithms thereby ensuring extensibility and ease of use. Ginkgo is also a part of the xSDK effort (Bartlett et al., 2017) and available as a Spack (Gamblin et al., 2015) package. xSDK aims to provide infrastructure for and interoperability between a collection of related and complementary software elements to foster rapid and efficient development of scientific applications using High Performance Computing. Within this effort, we provide interoperability with application libraries such as deal.ii (Arndt et al., 2019) and mfem (Anderson et al., 2020). 
Ginkgo provides wrappers within these two libraries so that they can take advantage of the features of Ginkgo.}, doi = {https://doi.org/10.21105/joss.02260}, author = {Hartwig Anzt and Terry Cojean and Yen-Chen Chen and Fritz Goebel and Thomas Gruetzmacher and Pratik Nayak and Tobias Ribizel and Yu-Hsiang Tsai} } @article {, title = {Ginkgo: A Node-Level Sparse Linear Algebra Library for HPC (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Hartwig Anzt and Terry Cojean and Yen-Chen Chen and Fritz Goebel and Thomas Gruetzmacher and Pratik Nayak and Tobias Ribizel and Yu-Hsiang Tsai and Jack Dongarra} } @article {, title = {MAGMA Templates for Scalable Linear Algebra on Emerging Architectures}, journal = {The International Journal of High Performance Computing Applications}, volume = {34}, year = {2020}, month = {2020-11}, pages = {645-658}, abstract = {With the acquisition and widespread use of more resources that rely on accelerator/wide vector{\textendash}based computing, there has been a strong demand for science and engineering applications to take advantage of these latest assets. This, however, has been extremely challenging due to the diversity of systems to support their extreme concurrency, complex memory hierarchies, costly data movement, and heterogeneous node architectures. To address these challenges, we design a programming model and describe its ease of use in the development of a new MAGMA Templates library that delivers high-performance scalable linear algebra portable on current and emerging architectures. MAGMA Templates derives its performance and portability by (1) building on existing state-of-the-art linear algebra libraries, like MAGMA, SLATE, Trilinos, and vendor-optimized math libraries, and (2) providing access (seamlessly to the users) to the latest algorithms and architecture-specific optimizations through a single, easy-to-use C++-based API.}, issn = {1094-3420}, doi = {https://doi.org/10.1177/1094342020938421}, author = {Mohammed Al Farhan and Ahmad Abdelfattah and Stanimire Tomov and Mark Gates and Dalal Sukkari and Azzam Haidar and Robert Rosenberg and Jack Dongarra} } @conference {, title = {Multiprecision Block-Jacobi for Iterative Triangular Solves}, booktitle = {European Conference on Parallel Processing (Euro-Par 2020)}, year = {2020}, month = {2020-08}, publisher = {Springer}, organization = {Springer}, abstract = {Recent research efforts have shown that Jacobi and block-Jacobi relaxation methods can be used as an effective and highly parallel approach for the solution of sparse triangular linear systems arising in the application of ILU-type preconditioners. Simultaneously, a few independent works have focused on designing efficient high performance adaptive-precision block-Jacobi preconditioning (block-diagonal scaling), in the context of the iterative solution of sparse linear systems, on manycore architectures. In this paper, we bridge the gap between relaxation methods based on regular splittings and preconditioners by demonstrating that iterative refinement can be leveraged to construct a relaxation method from the preconditioner. 
In addition, we exploit this insight to construct a highly-efficient sparse triangular system solver for graphics processors that combines iterative refinement with the block-Jacobi preconditioner available in the Ginkgo library.}, keywords = {Block-Jacobi, graphics processing units (GPUs), incomplete factorization preconditioning, multiprecision, sparse linear algebra}, doi = {https://doi.org/10.1007/978-3-030-57675-2_34}, author = {Fritz Goebel and Hartwig Anzt and Terry Cojean and Goran Flegar and Enrique S. Quintana-Orti} } @article {1476, title = {Numerical Algorithms for High-Performance Computational Science}, journal = {Philosophical Transactions of the Royal Society A}, volume = {378}, year = {2020}, abstract = {A number of features of today{\textquoteright}s high-performance computers make it challenging to exploit these machines fully for computational science. These include increasing core counts but stagnant clock frequencies; the high cost of data movement; use of accelerators (GPUs, FPGAs, coprocessors), making architectures increasingly heterogeneous; and multiple precisions of floating-point arithmetic, including half-precision. Moreover, as well as maximizing speed and accuracy, minimizing energy consumption is an important criterion. New generations of algorithms are needed to tackle these challenges. We discuss some approaches that we can take to develop numerical algorithms for high-performance computational science, with a view to exploiting the next generation of supercomputers.}, issn = {1471-2962}, doi = {https://doi.org/10.1098/rsta.2019.0066}, author = {Jack Dongarra and Laura Grigori and Nicholas J. Higham} } @techreport {1454, title = {Performance Tuning SLATE}, journal = {SLATE Working Notes}, number = {14, ICL-UT-20-01}, year = {2020}, month = {2020-01}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Mark Gates and Ali Charara and Asim YarKhan and Dalal Sukkari and Mohammed Al Farhan and Jack Dongarra} } @article {, title = {A Report of the MPI International Survey (Poster)}, year = {2020}, month = {2020-09}, publisher = {EuroMPI/USA {\textquoteright}20: 27th European MPI Users{\textquoteright} Group Meeting}, address = {Austin, TX}, author = {Atsushi Hori and Takahiro Ogura and Balazs Gerofi and Jie Yin and Yutaka Ishikawa and Emmanuel Jeannot and George Bosilca} } @conference {, title = {Reservation and Checkpointing Strategies for Stochastic Jobs}, booktitle = {34th IEEE International Parallel and Distributed Processing Symposium (IPDPS 2020)}, year = {2020}, month = {2020-05}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {New Orleans, LA}, author = {Ana Gainaru and Brice Goglin and Valentin Honor{\'e} and Guillaume Pallez and Padma Raghavan and Yves Robert and Hongyang Sun} } @techreport {, title = {Roadmap for Refactoring Classic PAPI to PAPI++: Part II: Formulation of Roadmap Based on Survey Results}, journal = {PAPI++ Working Notes}, number = {2, ICL-UT-20-09}, year = {2020}, month = {2020-07}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Heike Jagode and Anthony Danalis and Damien Genet} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms}, journal = {ACM Transactions on Mathematical Software}, year = {2020}, month = {2020-10}, abstract = {This paper describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS).
The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular half precision is used in many very large scale applications, such as those associated with machine learning.}, author = {Ahmad Abdelfattah and Timothy Costa and Jack Dongarra and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Mawussi Zounon} } @techreport {, title = {SLATE Performance Report: Updates to Cholesky and LU Factorizations}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-14}, year = {2020}, month = {2020-10}, publisher = {University of Tennessee}, author = {Asim YarKhan and Mohammed Al Farhan and Dalal Sukkari and Mark Gates and Jack Dongarra} } @article {, title = {SLATE: Software for Linear Algebra Targeting Exascale (POSTER)}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Mark Gates and Ali Charara and Jakub Kurzak and Asim YarKhan and Mohammed Al Farhan and Dalal Sukkari and Jack Dongarra} } @article {1464, title = {SLATE Tutorial}, year = {2020}, month = {2020-02}, publisher = {2020 ECP Annual Meeting}, address = {Houston, TX}, author = {Mark Gates and Jakub Kurzak and Asim YarKhan and Ali Charara and Jamie Finney and Dalal Sukkari and Mohammed Al Farhan and Ichitaro Yamazaki and Panruo Wu and Jack Dongarra} } @techreport {1278, title = {SLATE Users{\textquoteright} Guide}, journal = {SLATE Working Notes}, number = {10, ICL-UT-19-01}, year = {2020}, month = {2020-07}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Mark Gates and Ali Charara and Jakub Kurzak and Asim YarKhan and Mohammed Al Farhan and Dalal Sukkari and Jack Dongarra} } @techreport {, title = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic}, journal = {SLATE Working Notes}, number = {15, ICL-UT-20-08}, year = {2020}, month = {2020-07}, publisher = {University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Erik Boman and Erin Carson and Terry Cojean and Jack Dongarra and Mark Gates and Thomas Gruetzmacher and Nicholas J. 
Higham and Sherry Li and Neil Lindquist and Yang Liu and Jennifer Loe and Piotr Luszczek and Pratik Nayak and Sri Pranesh and Siva Rajamanickam and Tobias Ribizel and Barry Smith and Kasia Swirydowicz and Stephen Thomas and Stanimire Tomov and Yaohung Tsai and Ichitaro Yamazaki and Ulrike Meier Yang} } @conference {, title = {Task Bench: A Parameterized Benchmark for Evaluating Parallel Runtime Performance}, booktitle = {International Conference for High Performance Computing, Networking, Storage, and Analysis (SC20)}, year = {2020}, month = {2020-11}, publisher = {ACM}, organization = {ACM}, abstract = {We present Task Bench, a parameterized benchmark designed to explore the performance of distributed programming systems under a variety of application scenarios. Task Bench dramatically lowers the barrier to benchmarking and comparing multiple programming systems by making the implementation for a given system orthogonal to the benchmarks themselves: every benchmark constructed with Task Bench runs on every Task Bench implementation. Furthermore, Task Bench{\textquoteright}s parameterization enables a wide variety of benchmark scenarios that distill the key characteristics of larger applications. To assess the effectiveness and overheads of the tested systems, we introduce a novel metric, minimum effective task granularity (METG). We conduct a comprehensive study with 15 programming systems on up to 256 Haswell nodes of the Cori supercomputer. Running at scale, 100μs-long tasks are the finest granularity that any system runs efficiently with current technologies. We also study each system{\textquoteright}s scalability, ability to hide communication and mitigate load imbalance.}, url = {https://dl.acm.org/doi/10.5555/3433701.3433783}, author = {Elliott Slaughter and Wei Wu and Yuankun Fu and Legend Brandenburg and Nicolai Garcia and Wilhem Kautz and Emily Marx and Kaleb S. Morris and Qinglei Cao and George Bosilca and Seema Mirchandaney and Wonchan Lee and Sean Treichler and Patrick McCormick and Alex Aiken} } @article {, title = {Translational Process: Mathematical Software Perspective}, journal = {Journal of Computational Science}, year = {2020}, month = {2020-09}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors.
With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, DATAFLOW scheduling runtimes, hardware accelerators}, doi = {https://doi.org/10.1016/j.jocs.2020.101216}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {, title = {Translational Process: Mathematical Software Perspective}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-11}, year = {2020}, month = {2020-08}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, data flow scheduling runtimes, hardware accelerators}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {1400, title = {BDEC2 Platform White Paper}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-11}, year = {2019}, month = {2019-09}, publisher = {University of Tennessee}, author = {Todd Gamblin and Pete Beckman and Kate Keahey and Kento Sato and Masaaki Kondo and Balazs Gerofi} } @conference {1449, title = {Characterization of Power Usage and Performance in Data-Intensive Applications using MapReduce over MPI}, booktitle = {2019 International Conference on Parallel Computing (ParCo2019)}, year = {2019}, month = {2019-09}, address = {Prague, Czech Republic}, author = {Joshua Davis and Tao Gao and Sunita Chandrasekaran and Heike Jagode and Anthony Danalis and Pavan Balaji and Jack Dongarra and Michela Taufer} } @techreport {1398, title = {A Collection of Presentations from the BDEC2 Workshop in Kobe, Japan}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-09}, year = {2019}, month = {2019-02}, publisher = {University of Tennessee, Knoxville}, author = {Rosa M. Badia and Micah Beck and Fran{\c c}ois Bodin and Taisuke Boku and Franck Cappello and Alok Choudhary and Carlos Costa and Ewa Deelman and Nicola Ferrier and Katsuki Fujisawa and Kohei Fujita and Maria Girone and Geoffrey Fox and Shantenu Jha and Yoshinari Kameda and Christian Kniep and William Kramer and James Lin and Kengo Nakajima and Yiwei Qiu and Kishore Ramachandran and Glenn Ricart and Kim Serradell and Dan Stanzione and Lin Gan and Martin Swany and Christine Sweeney and Alex Szalay and Christine Kirkpatrick and Kenton McHenry and Alainna White and Steve Tuecke and Ian Foster and Joe Mambretti and William M.
Tang and Michela Taufer and Miguel V{\'a}zquez} } @techreport {1399, title = {A Collection of White Papers from the BDEC2 Workshop in Poznan, Poland}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-10}, year = {2019}, month = {2019-05}, publisher = {University of Tennessee, Knoxville}, author = {Gabriel Antoniu and Alexandru Costan and Ovidiu Marcu and Maria S. P{\'e}rez and Nenad Stojanovic and Rosa M. Badia and Miguel V{\'a}zquez and Sergi Girona and Micah Beck and Terry Moore and Piotr Luszczek and Ezra Kissel and Martin Swany and Geoffrey Fox and Vibhatha Abeykoon and Selahattin Akkas and Kannan Govindarajan and Gurhan Gunduz and Supun Kamburugamuve and Niranda Perera and Ahmet Uyar and Pulasthi Wickramasinghe and Chathura Widanage and Maria Girone and Toshihiro Hanawa and Richard Moreno and Ariel Oleksiak and Martin Swany and Ryousei Takano and M.P. van Haarlem and J. van Leeuwen and J.B.R. Oonk and T. Shimwell and L.V.E. Koopmans} } @techreport {1408, title = {A Collection of White Papers from the BDEC2 Workshop in San Diego, CA}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-13}, year = {2019}, month = {2019-10}, publisher = {University of Tennessee}, author = {Ilkay Altintas and Kyle Marcus and Volkan Vural and Shweta Purawat and Daniel Crawl and Gabriel Antoniu and Alexandru Costan and Ovidiu Marcu and Prasanna Balaprakash and Rongqiang Cao and Yangang Wang and Franck Cappello and Robert Underwood and Sheng Di and Justin M. Wozniak and Jon C. Calhoun and Cong Xu and Antonio Lain and Paolo Faraboschi and Nic Dube and Dejan Milojicic and Balazs Gerofi and Maria Girone and Viktor Khristenko and Tony Hey and Ezra Kissel and Yu Liu and Richard Loft and Pekka Manninen and Sebastian von Alfthan and Takemasa Miyoshi and Bruno Raffin and Olivier Richard and Denis Trystram and Maryam Rahnemoonfar and Robin Murphy and Joel Saltz and Kentaro Sano and Rupak Roy and Kento Sato and Jian Guo and Jens Domke and Weikuan Yu and Takaki Hatsui and Yasumasa Joti and Alex Szalay and William M. Tang and Michael R. Wyatt II and Michela Taufer and Todd Gamblin and Stephen Herbein and Adam Moody and Dong H. Ahn and Rich Wolski and Chandra Krintz and Fatih Bakir and Wei-tsung Lin and Gareth George} } @article {1313, title = {Co-Scheduling HPC Workloads on Cache-Partitioned CMP Platforms}, journal = {International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = {2019-11}, pages = {1221-1239}, abstract = {With the recent advent of many-core architectures such as chip multiprocessors (CMPs), the number of processing units accessing a global shared memory is constantly increasing. Co-scheduling techniques are used to improve application throughput on such architectures, but sharing resources often generates critical interferences. In this article, we focus on the interferences in the last level of cache (LLC) and use the Cache Allocation Technology (CAT) recently provided by Intel to partition the LLC and give each co-scheduled application their own cache area. We consider m iterative HPC applications running concurrently and answer to the following questions: (i) How to precisely model the behavior of these applications on the cache-partitioned platform? and (ii) how many cores and cache fractions should be assigned to each application to maximize the platform efficiency?
Here, platform efficiency is defined as maximizing the performance either globally, or as guaranteeing a fixed ratio of iterations per second for each application. Through extensive experiments using CAT, we demonstrate the impact of cache partitioning when multiple HPC applications are co-scheduled onto CMP platforms.}, keywords = {cache partitioning, chip multiprocessor, co-scheduling, HPC application}, doi = {https://doi.org/10.1177/1094342019846956}, author = {Guillaume Aupy and Anne Benoit and Brice Goglin and Lo{\"\i}c Pottier and Yves Robert} } @article {1369, title = {A Customized Precision Format Based on Mantissa Segmentation for Accelerating Sparse Linear Algebra}, journal = {Concurrency and Computation: Practice and Experience}, volume = {40319}, year = {2019}, month = {2019-01}, issn = {1532-0626}, doi = {https://doi.org/10.1002/cpe.5418}, author = {Thomas Gruetzmacher and Terry Cojean and Goran Flegar and Fritz G{\"o}bel and Hartwig Anzt} } @inproceedings {1404, title = {Least Squares Solvers for Distributed-Memory Machines with GPU Accelerators}, journal = {ACM International Conference on Supercomputing (ICS {\textquoteright}19)}, year = {2019}, month = {2019-06}, pages = {117{\textendash}126}, publisher = {ACM}, address = {Phoenix, Arizona}, isbn = {9781450360791}, doi = {https://dl.acm.org/doi/abs/10.1145/3330345.3330356}, author = {Jakub Kurzak and Mark Gates and Ali Charara and Asim YarKhan and Jack Dongarra} } @inproceedings {1405, title = {Linear Systems Solvers for Distributed-Memory Machines with GPU Accelerators}, journal = {Euro-Par 2019: Parallel Processing}, volume = {11725}, year = {2019}, month = {2019-08}, pages = {495{\textendash}506}, publisher = {Springer}, isbn = {978-3-030-29399-4}, doi = {https://doi.org/10.1007/978-3-030-29400-7_35}, url = {https://link.springer.com/chapter/10.1007/978-3-030-29400-7_35}, author = {Kurzak, Jakub and Mark Gates and Charara, Ali and Asim YarKhan and Yamazaki, Ichitaro and Jack Dongarra}, editor = {Yahyapour, Ramin} } @article {1230, title = {Local Rollback for Resilient MPI Applications with Application-Level Checkpointing and Message Logging}, journal = {Future Generation Computer Systems}, volume = {91}, year = {2019}, month = {2019-02}, pages = {450-464}, abstract = {The resilience approach generally used in high-performance computing (HPC) relies on coordinated checkpoint/restart, a global rollback of all the processes that are running the application. However, in many instances, the failure has a more localized scope and its impact is usually restricted to a subset of the resources being used. Thus, a global rollback would result in unnecessary overhead and energy consumption, since all processes, including those unaffected by the failure, discard their state and roll back to the last checkpoint to repeat computations that were already done. The User Level Failure Mitigation (ULFM) interface {\textendash} the last proposal for the inclusion of resilience features in the Message Passing Interface (MPI) standard {\textendash} enables the deployment of more flexible recovery strategies, including localized recovery. This work proposes a local rollback approach that can be generally applied to Single Program, Multiple Data (SPMD) applications by combining ULFM, the ComPiler for Portable Checkpointing (CPPC) tool, and the Open MPI VProtocol system-level message logging component. 
Only failed processes are recovered from the last checkpoint, while consistency before further progress in the execution is achieved through a two-level message logging process. To further optimize this approach point-to-point communications are logged by the Open MPI VProtocol component, while collective communications are optimally logged at the application level{\textemdash}thereby decoupling the logging protocol from the particular collective implementation. This spatially coordinated protocol applied by CPPC reduces the log size, the log memory requirements and overall the resilience impact on the applications.}, keywords = {Application-level checkpointing, Local rollback, Message logging, MPI, resilience}, doi = {https://doi.org/10.1016/j.future.2018.09.041}, author = {Nuria Losada and George Bosilca and Aurelien Bouteiller and Patricia Gonz{\'a}lez and Mar{\'\i}a J. Mart{\'\i}n} } @article {1366, title = {MagmaDNN 0.2 High-Performance Data Analytics for Manycore GPUs and CPUs}, year = {2019}, month = {2019-01}, publisher = {University of Tennessee}, doi = {10.13140/RG.2.2.14906.64961}, author = {Lucien Ng and Sihan Chen and Alex Gessinger and Daniel Nichols and Sophia Cheng and Anu Meenasorna and Kwai Wong and Stanimire Tomov and Azzam Haidar and Eduardo D{\textquoteright}Azevedo and Jack Dongarra} } @conference {1326, title = {MagmaDNN: Accelerated Deep Learning Using MAGMA}, booktitle = {Practice and Experience in Advanced Research Computing (PEARC {\textquoteright}19)}, year = {2019}, month = {2019-07}, publisher = {ACM}, organization = {ACM}, address = {Chicago, IL}, author = {Daniel Nichols and Kwai Wong and Stanimire Tomov and Lucien Ng and Sihan Chen and Alex Gessinger} } @conference {1373, title = {Massively Parallel Automated Software Tuning}, booktitle = {48th International Conference on Parallel Processing (ICPP 2019)}, year = {2019}, month = {2019-08}, publisher = {ACM Press}, organization = {ACM Press}, address = {Kyoto, Japan}, abstract = {This article presents an implementation of a distributed autotuning engine developed as part of the Bench-testing OpenN Software Autotuning Infrastructure project. The system is geared towards performance optimization of computational kernels for graphics processing units, and allows for the deployment of vast autotuning sweeps to massively parallel machines. The software implements dynamic work scheduling to distributed-memory resources and takes advantage of multithreading for parallel compilation and dispatches kernel launches to multiple accelerators. This paper lays out the main design principles of the system and discusses the basic mechanics of the initial implementation. 
Preliminary performance results are presented, encountered challenges are discussed, and the future directions are outlined.}, doi = {https://doi.org/10.1145/3337821.3337908}, author = {Jakub Kurzak and Yaohung Tsai and Mark Gates and Ahmad Abdelfattah and Jack Dongarra} } @article {1270, title = {PLASMA: Parallel Linear Algebra Software for Multicore Using OpenMP}, journal = {ACM Transactions on Mathematical Software}, volume = {45}, year = {2019}, month = {2019-06}, doi = {https://doi.org/10.1145/3264491}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Maksims Abalenkovs and Negin Bagherpour and Sven Hammarling and Jakub Sistek} } @article {1440, title = {Race to Exascale}, journal = {Computing in Science and Engineering}, volume = {21}, year = {2019}, month = {2019-03}, pages = {4-5}, abstract = {Whether called leadership computing, flagship computing, or just plain exascale, over the next few years, governments around the world are planning to spend over 10 billion dollars on a handful of new computer systems that will strive to reach an exascale level of performance. These systems and projects reflect the widespread and expanding recognition that almost all science and engineering endeavors now are intrinsically reliant on computing power not just for modeling and simulation but for data analysis, big data, and machine learning. Scientists and engineers consider computers as {\textquotedblleft}universal instruments{\textquotedblright} of insight.}, issn = {1558-366X}, doi = {https://doi.org/10.1109/MCSE.2018.2882574}, author = {Jack Dongarra and Steven Gottlieb and William T. Kramer} } @conference {1316, title = {Reservation Strategies for Stochastic Jobs}, booktitle = {33rd IEEE International Parallel and Distributed Processing Symposium (IPDPS 2019)}, year = {2019}, month = {2019-05}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Rio de Janeiro, Brazil}, author = {Guillaume Aupy and Ana Gainaru and Valentin Honor{\'e} and Padma Raghavan and Yves Robert and Hongyang Sun} } @conference {1339, title = {Scheduling Independent Stochastic Tasks on Heterogeneous Cloud Platforms}, booktitle = {IEEE Cluster 2019}, year = {2019}, month = {2019-09}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Albuquerque, New Mexico}, author = {Yiqin Gao and Louis-Claude Canon and Yves Robert and Frederic Vivien} } @article {1463, title = {SLATE: Design of a Modern Distributed and Accelerated Linear Algebra Library}, year = {2019}, month = {2019-11}, publisher = {International Conference for High Performance Computing, Networking, Storage and Analysis (SC19)}, address = {Denver, CO}, author = {Mark Gates and Jakub Kurzak and Ali Charara and Asim YarKhan and Jack Dongarra} } @conference {1450, title = {SLATE: Design of a Modern Distributed and Accelerated Linear Algebra Library}, booktitle = {International Conference for High Performance Computing, Networking, Storage and Analysis (SC19)}, year = {2019}, month = {2019-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, abstract = {The SLATE (Software for Linear Algebra Targeting Exascale) library is being developed to provide fundamental dense linear algebra capabilities for current and upcoming distributed high-performance systems, both accelerated CPU-GPU based and CPU based. 
SLATE will provide coverage of existing ScaLAPACK functionality, including the parallel BLAS; linear systems using LU and Cholesky; least squares problems using QR; and eigenvalue and singular value problems. In this respect, it will serve as a replacement for ScaLAPACK, which after two decades of operation, cannot adequately be retrofitted for modern accelerated architectures. SLATE uses modern techniques such as communication-avoiding algorithms, lookahead panels to overlap communication and computation, and task-based scheduling, along with a modern C++ framework. Here we present the design of SLATE and initial reports of several of its components.}, doi = {https://doi.org/10.1145/3295500.3356223}, author = {Mark Gates and Jakub Kurzak and Ali Charara and Asim YarKhan and Jack Dongarra} } @techreport {1279, title = {SLATE Developers{\textquoteright} Guide}, journal = {SLATE Working Notes}, number = {11, ICL-UT-19-02}, year = {2019}, month = {2019-12}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ali Charara and Mark Gates and Jakub Kurzak and Asim YarKhan and Jack Dongarra} } @techreport {1304, title = {SLATE Mixed Precision Performance Report}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-03}, year = {2019}, month = {2019-04}, publisher = {University of Tennessee}, author = {Ali Charara and Jack Dongarra and Mark Gates and Jakub Kurzak and Asim YarKhan} } @techreport {1321, title = {SLATE Working Note 12: Implementing Matrix Inversions}, journal = {SLATE Working Notes}, number = {12, ICL-UT-19-04}, year = {2019}, month = {2019-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Jakub Kurzak and Mark Gates and Ali Charara and Asim YarKhan and Jack Dongarra} } @techreport {1394, title = {SLATE Working Note 13: Implementing Singular Value and Symmetric/Hermitian Eigenvalue Solvers}, journal = {SLATE Working Notes}, number = {13, ICL-UT-19-07}, year = {2019}, note = {revision 06-2023}, month = {2019-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Mark Gates and Mohammed Al Farhan and Ali Charara and Jakub Kurzak and Dalal Sukkari and Asim YarKhan and Jack Dongarra} } @article {1317, title = {Toward a Modular Precision Ecosystem for High-Performance Computing}, journal = {The International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = {2019-11}, pages = {1069-1078}, abstract = {With the memory bandwidth of current computer architectures being significantly slower than the (floating point) arithmetic performance, many scientific computations only leverage a fraction of the computational power in today{\textquoteright}s high-performance architectures. At the same time, memory operations are the primary energy consumer of modern architectures, heavily impacting the resource cost of large-scale applications and the battery life of mobile devices. This article tackles this mismatch between floating point arithmetic throughput and memory bandwidth by advocating a disruptive paradigm change with respect to how data are stored and processed in scientific applications. Concretely, the goal is to radically decouple the data storage format from the processing format and, ultimately, design a {\textquotedblleft}modular precision ecosystem{\textquotedblright} that allows for more flexibility in terms of customized data access. 
For memory-bounded scientific applications, dynamically adapting the memory precision to the numerical requirements allows for attractive resource savings. In this article, we demonstrate the potential of employing a modular precision ecosystem for the block-Jacobi preconditioner and the PageRank algorithm{\textemdash}two applications that are popular in the communities and at the same time characteristic representatives for the field of numerical linear algebra and data analytics, respectively.}, keywords = {conjugate gradient, GPUs, Jacobi method, Modular precision, multicore processors, PageRank, parallel numerical linear algebra}, issn = {1094-3420}, doi = {https://doi.org/10.1177/1094342019846547}, author = {Hartwig Anzt and Goran Flegar and Thomas Gruetzmacher and Enrique S. Quintana-Orti} } @article {1272, title = {The 30th Anniversary of the Supercomputing Conference: Bringing the Future Closer{\textemdash}Supercomputing History and the Immortality of Now}, journal = {Computer}, volume = {51}, year = {2018}, month = {2018-11}, pages = {74{\textendash}85}, abstract = {A panel of experts{\textemdash}including Gordon Bell, Jack Dongarra, William E. (Bill) Johnston, Horst Simon, Erich Strohmaier, and Mateo Valero{\textemdash}discuss historical reflections on the past 30 years of the Supercomputing (SC) conference, its leading role for the professional community and some exciting future challenges.}, keywords = {High-performance computing, history of computing, SC, Scientific computing, supercomputing, Virtual Roundtable}, doi = {10.1109/MC.2018.3971352}, author = {Jack Dongarra and Vladimir Getov and Kevin Walsh} } @article {1336, title = {Accelerating Linear Algebra with MAGMA}, year = {2018}, month = {2018-02}, publisher = {ECP Annual Meeting 2018, Tutorial}, address = {Knoxville, TN}, author = {Stanimire Tomov and Mark Gates and Azzam Haidar} } @article {1161, title = {Accelerating the SVD Two Stage Bidiagonal Reduction and Divide and Conquer Using GPUs}, journal = {Parallel Computing}, volume = {74}, year = {2018}, month = {2018-05}, pages = {3{\textendash}18}, abstract = {The increasing gap between memory bandwidth and computation speed motivates the choice of algorithms to take full advantage of today{\textquoteright}s high performance computers. For dense matrices, the classic algorithm for the singular value decomposition (SVD) uses a one stage reduction to bidiagonal form, which is limited in performance by the memory bandwidth. To overcome this limitation, a two stage reduction to bidiagonal has been gaining popularity. It first reduces the matrix to band form using high performance Level 3 BLAS, then reduces the band matrix to bidiagonal form. As accelerators such as GPUs and co-processors are becoming increasingly widespread in high-performance computing, a question of great interest to many SVD users is how much the employment of a two stage reduction, as well as other current best practices in GPU computing, can accelerate this important routine. To fulfill this interest, we have developed an accelerated SVD employing a two stage reduction to bidiagonal and a number of other algorithms that are highly optimized for GPUs. Notably, we also parallelize and accelerate the divide and conquer algorithm used to solve the subsequent bidiagonal SVD. By accelerating all phases of the SVD algorithm, we provide a significant speedup compared to existing multi-core and GPU-based SVD implementations. 
In particular, using a P100 GPU, we illustrate a performance of up to 804 Gflop/s in double precision arithmetic to compute the full SVD of a 20k {\texttimes} 20k matrix in 90 seconds, which is 8.9 {\texttimes} faster than MKL on two 10 core Intel Haswell E5-2650 v3 CPUs, 3.7 {\texttimes} over the multi-core PLASMA two stage version, and 2.6 {\texttimes} over the previously accelerated one stage MAGMA version.}, keywords = {2-stage, accelerator, Divide and conquer, gpu, Singular value decomposition, SVD}, issn = {01678191}, doi = {10.1016/j.parco.2017.10.004}, url = {https://www.sciencedirect.com/science/article/pii/S0167819117301758}, author = {Mark Gates and Stanimire Tomov and Jack Dongarra} } @article {1268, title = {Autotuning in High-Performance Computing Applications}, journal = {Proceedings of the IEEE}, volume = {106}, year = {2018}, month = {2018-11}, pages = {2068{\textendash}2083}, abstract = {Autotuning refers to the automatic generation of a search space of possible implementations of a computation that are evaluated through models and/or empirical measurement to identify the most desirable implementation. Autotuning has the potential to dramatically improve the performance portability of petascale and exascale applications. To date, autotuning has been used primarily in high-performance applications through tunable libraries or previously tuned application code that is integrated directly into the application. This paper draws on the authors{\textquoteright} extensive experience applying autotuning to high-performance applications, describing both successes and future challenges. If autotuning is to be widely used in the HPC community, researchers must address the software engineering challenges, manage configuration overheads, and continue to demonstrate significant performance gains and portability across architectures. In particular, tools that configure the application must be integrated into the application build process so that tuning can be reapplied as the application and target architectures evolve.}, keywords = {High-performance computing, performance tuning programming systems}, doi = {10.1109/JPROC.2018.2841200}, author = {Prasanna Balaprakash and Jack Dongarra and Todd Gamblin and Mary Hall and Jeffrey Hollingsworth and Boyana Norris and Richard Vuduc} } @article {1271, title = {Autotuning Numerical Dense Linear Algebra for Batched Computation With GPU Hardware Accelerators}, journal = {Proceedings of the IEEE}, volume = {106}, year = {2018}, month = {2018-11}, pages = {2040{\textendash}2055}, abstract = {Computational problems in engineering and scientific disciplines often rely on the solution of many instances of small systems of linear equations, which are called batched solves. In this paper, we focus on the important variants of both batch Cholesky factorization and subsequent substitution. The former requires the linear system matrices to be symmetric positive definite (SPD). We describe the implementation and automated performance engineering of these kernels that implement the factorization and the two substitutions. Our target platforms are graphics processing units (GPUs), which over the past decade have become an attractive high-performance computing (HPC) target for solvers of linear systems of equations. Due to their throughput-oriented design, GPUs exhibit the highest processing rates among the available processors. However, without careful design and coding, this speed is mostly restricted to large matrix sizes. 
We show an automated exploration of the implementation space as well as a new data layout for the batched class of SPD solvers. Our tests involve the solution of many thousands of linear SPD systems of exactly the same size. The primary focus of our techniques is on the individual matrices in the batch that have dimensions ranging from 5-by-5 up to 100-by-100. We compare our autotuned solvers against the state-of-the-art solvers such as those provided through NVIDIA channels and publicly available in the optimized MAGMA library. The observed performance is competitive and many times superior for many practical cases. The advantage of the presented methodology lies in achieving these results in a portable manner across matrix storage formats and GPU hardware architecture platforms.}, keywords = {Dense numerical linear algebra, performance autotuning}, doi = {10.1109/JPROC.2018.2868961}, author = {Jack Dongarra and Mark Gates and Jakub Kurzak and Piotr Luszczek and Yaohung Tsai} } @article {1300, title = {Batched BLAS (Basic Linear Algebra Subprograms) 2018 Specification}, year = {2018}, month = {2018-07}, abstract = {This document describes an API for Batch Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). We focus on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The extensions beyond the original BLAS standard are considered that specify a programming interface not only for routines with uniformly-sized matrices and/or vectors but also for the situation where the sizes vary. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance manycore platforms. These include multicore and many-core CPU processors; GPUs and coprocessors; as well as other hardware accelerators with floating-point compute facility.}, author = {Jack Dongarra and Iain Duff and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jonathan Hogg and Pedro Valero Lara and Piotr Luszczek and Mawussi Zounon and Samuel D. Relton and Stanimire Tomov and Timothy Costa and Sarah Knepper} } @article {1211, title = {Big Data and Extreme-Scale Computing: Pathways to Convergence - Toward a Shaping Strategy for a Future Software and Data Ecosystem for Scientific Inquiry}, journal = {The International Journal of High Performance Computing Applications}, volume = {32}, year = {2018}, month = {2018-07}, pages = {435{\textendash}479}, abstract = {Over the past four years, the Big Data and Exascale Computing (BDEC) project organized a series of five international workshops that aimed to explore the ways in which the new forms of data-centric discovery introduced by the ongoing revolution in high-end data analysis (HDA) might be integrated with the established, simulation-centric paradigm of the high-performance computing (HPC) community. Based on those meetings, we argue that the rapid proliferation of digital data generators, the unprecedented growth in the volume and diversity of the data they generate, and the intense evolution of the methods for analyzing and using that data are radically reshaping the landscape of scientific computing. The most critical problems involve the logistics of wide-area, multistage workflows that will move back and forth across the computing continuum, between the multitude of distributed sensors, instruments and other devices at the networks edge, and the centralized resources of commercial clouds and HPC centers. 
We suggest that the prospects for the future integration of technological infrastructures and research ecosystems need to be considered at three different levels. First, we discuss the convergence of research applications and workflows that establish a research paradigm that combines both HPC and HDA, where ongoing progress is already motivating efforts at the other two levels. Second, we offer an account of some of the problems involved with creating a converged infrastructure for peripheral environments, that is, a shared infrastructure that can be deployed throughout the network in a scalable manner to meet the highly diverse requirements for processing, communication, and buffering/storage of massive data workflows of many different scientific domains. Third, we focus on some opportunities for software ecosystem convergence in big, logically centralized facilities that execute large-scale simulations and models and/or perform large-scale data analytics. We close by offering some conclusions and recommendations for future investment and policy review.}, doi = {https://doi.org/10.1177/1094342018778123}, author = {Mark Asch and Terry Moore and Rosa M. Badia and Micah Beck and Pete Beckman and Thierry Bidot and Fran{\c c}ois Bodin and Franck Cappello and Alok Choudhary and Bronis R. de Supinski and Ewa Deelman and Jack Dongarra and Anshu Dubey and Geoffrey Fox and Haohuan Fu and Sergi Girona and Michael Heroux and Yutaka Ishikawa and Kate Keahey and David Keyes and William T. Kramer and Jean-Fran{\c c}ois Lavignon and Yutong Lu and Satoshi Matsuoka and Bernd Mohr and St{\'e}phane Requena and Joel Saltz and Thomas Schulthess and Rick Stevens and Martin Swany and Alexander Szalay and William Tang and Ga{\"e}l Varoquaux and Jean-Pierre Vilotte and Robert W. Wisniewski and Zhiwei Xu and Igor Zacharov} } @techreport {1397, title = {A Collection of White Papers from the BDEC2 Workshop in Bloomington, IN}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-15}, year = {2018}, month = {2018-11}, publisher = {University of Tennessee, Knoxville}, author = {James Ahrens and Christopher M. Biwer and Alexandru Costan and Gabriel Antoniu and Maria S. P{\'e}rez and Nenad Stojanovic and Rosa Badia and Oliver Beckstein and Geoffrey Fox and Shantenu Jha and Micah Beck and Terry Moore and Sunita Chandrasekaran and Carlos Costa and Thierry Deutsch and Luigi Genovese and Tarek El-Ghazawi and Ian Foster and Dennis Gannon and Toshihiro Hanawa and Tevfik Kosar and William Kramer and Madhav V. Marathe and Christopher L. Barrett and Takemasa Miyoshi and Alex Pothen and Ariful Azad and Judy Qiu and Bo Peng and Ravi Teja and Sahil Tyagi and Chathura Widanage and Jon Koskey and Maryam Rahnemoonfar and Umakishore Ramachandran and Miles Deegan and William Tang and Osamu Tatebe and Michela Taufer and Michel Cuende and Ewa Deelman and Trilce Estrada and Rafael Ferreira Da Silva and Harrel Weinstein and Rodrigo Vargas and Miwako Tsuji and Kevin G. 
Yager and Wanling Gao and Jianfeng Zhan and Lei Wang and Chunjie Luo and Daoyi Zheng and Xu Wen and Rui Ren and Chen Zheng and Xiwen He and Hainan Ye and Haoning Tang and Zheng Cao and Shujie Zhang and Jiahui Dai} } @article {1263, title = {Computational Benefit of GPU Optimization for Atmospheric Chemistry Modeling}, journal = {Journal of Advances in Modeling Earth Systems}, volume = {10}, year = {2018}, month = {2018-08}, pages = {1952{\textendash}1969}, abstract = {Global chemistry-climate models are computationally burdened as the chemical mechanisms become more complex and realistic. Optimization for graphics processing units (GPU) may make longer global simulation with regional detail possible, but limited study has been done to explore the potential benefit for the atmospheric chemistry modeling. Hence, in this study, the second-order Rosenbrock solver of the chemistry module of CAM4-Chem is ported to the GPU to gauge potential speed-up. We find that on the CPU, the fastest performance is achieved using the Intel compiler with a block interleaved memory layout. Different combinations of compiler and memory layout lead to ~11.02{\texttimes} difference in the computational time. In contrast, the GPU version performs the best when using a combination of fully interleaved memory layout with block size equal to the warp size, CUDA streams for independent kernels, and constant memory. Moreover, the most efficient data transfer between CPU and GPU is gained by allocating the memory contiguously during the data initialization on the GPU. Compared to one CPU core, the speed-up of using one GPU alone reaches a factor of ~11.7{\texttimes} for the computation alone and ~3.82{\texttimes} when the data transfer between CPU and GPU is considered. Using one GPU alone is also generally faster than the multithreaded implementation for 16 CPU cores in a compute node and the single-source solution (OpenACC). The best performance is achieved by the implementation of the hybrid CPU/GPU version, but rescheduling the workload among the CPU cores is required before the practical CAM4-Chem simulation.}, keywords = {compiler, CUDA, data transfer, gpu, hybrid, memory layout}, doi = {https://doi.org/10.1029/2018MS001276}, author = {Jian Sun and Joshua Fu and John Drake and Qingzhao Zhu and Azzam Haidar and Mark Gates and Stanimire Tomov and Jack Dongarra} } @conference {1217, title = {Co-Scheduling HPC Workloads on Cache-Partitioned CMP Platforms}, booktitle = {Cluster 2018}, year = {2018}, month = {2018-09}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, address = {Belfast, UK}, author = {Guillaume Aupy and Anne Benoit and Brice Goglin and Lo{\"\i}c Pottier and Yves Robert} } @article {1089, title = {A Failure Detector for HPC Platforms}, journal = {The International Journal of High Performance Computing Applications}, volume = {32}, year = {2018}, month = {2018-01}, pages = {139{\textendash}158}, abstract = {Building an infrastructure for exascale applications requires, in addition to many other key components, a stable and efficient failure detector. This article describes the design and evaluation of a robust failure detector that can maintain and distribute the correct list of alive resources within proven and scalable bounds. The detection and distribution of the fault information follow different overlay topologies that together guarantee minimal disturbance to the applications. 
A virtual observation ring minimizes the overhead by allowing each node to be observed by another single node, providing an unobtrusive behavior. The propagation stage uses a nonuniform variant of a reliable broadcast over a circulant graph overlay network and guarantees a logarithmic fault propagation. Extensive simulations, together with experiments on the Titan Oak Ridge National Laboratory supercomputer, show that the algorithm performs extremely well and exhibits all the desired properties of an exascale-ready algorithm.}, keywords = {failure detection, Fault tolerance, MPI}, doi = {https://doi.org/10.1177/1094342017711505}, author = {George Bosilca and Aurelien Bouteiller and Amina Guermouche and Thomas Herault and Yves Robert and Pierre Sens and Jack Dongarra} } @conference {1235, title = {High-Performance GPU Implementation of PageRank with Reduced Precision based on Mantissa Segmentation}, booktitle = {8th Workshop on Irregular Applications: Architectures and Algorithms}, year = {2018}, author = {Anzt, Hartwig and Thomas Gruetzmacher and Enrique S. Quintana-Orti and Scheidegger, Florian} } @techreport {1203, title = {Implementation of the C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {07, ICL-UT-18-04}, year = {2018}, month = {2018-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Ahmad Abdelfattah and Mark Gates and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @techreport {1274, title = {Initial Integration and Evaluation of SLATE and STRUMPACK}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-11}, year = {2018}, month = {2018-12}, publisher = {University of Tennessee}, author = {Pieter Ghysels and Sherry Li and Asim YarKhan and Jack Dongarra} } @techreport {1273, title = {Least Squares Performance Report}, journal = {SLATE Working Notes}, number = {09, ICL-UT-18-10}, year = {2018}, month = {2018-12}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Mark Gates and Ali Charara and Jakub Kurzak and Asim YarKhan and Ichitaro Yamazaki and Jack Dongarra} } @techreport {1228, title = {Linear Systems Performance Report}, journal = {SLATE Working Notes}, number = {08, ICL-UT-18-08}, year = {2018}, month = {2018-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Ali Charara and Asim YarKhan and Jamie Finney and Gerald Ragghianti and Piotr Luszczek and Jack Dongarra} } @techreport {1191, title = {Parallel BLAS Performance Report}, journal = {SLATE Working Notes}, number = {05, ICL-UT-18-01}, year = {2018}, month = {2018-04}, publisher = {University of Tennessee}, author = {Jakub Kurzak and Mark Gates and Asim YarKhan and Ichitaro Yamazaki and Panruo Wu and Piotr Luszczek and Jamie Finney and Jack Dongarra} } @techreport {1206, title = {Parallel Norms Performance Report}, journal = {SLATE Working Notes}, number = {06, ICL-UT-18-06}, year = {2018}, month = {2018-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Jakub Kurzak and Mark Gates and Asim YarKhan and Ichitaro Yamazaki and Piotr Luszczek and Jamie Finney and Jack Dongarra} } @article {1258, title = {The Singular Value Decomposition: Anatomy of Optimizing an Algorithm for Extreme Scale}, journal = {SIAM Review}, volume = {60}, year = {2018}, month = {2018-11}, pages = {808{\textendash}865}, abstract = {The computation of 
the singular value decomposition, or SVD, has a long history with many improvements over the years, both in its implementations and algorithmically. Here, we survey the evolution of SVD algorithms for dense matrices, discussing the motivation and performance impacts of changes. There are two main branches of dense SVD methods: bidiagonalization and Jacobi. Bidiagonalization methods started with the implementation by Golub and Reinsch in Algol60, which was subsequently ported to Fortran in the EISPACK library, and was later more efficiently implemented in the LINPACK library, targeting contemporary vector machines. To address cache-based memory hierarchies, the SVD algorithm was reformulated to use Level 3 BLAS in the LAPACK library. To address new architectures, ScaLAPACK was introduced to take advantage of distributed computing, and MAGMA was developed for accelerators such as GPUs. Algorithmically, the divide and conquer and MRRR algorithms were developed to reduce the number of operations. Still, these methods remained memory bound, so two-stage algorithms were developed to reduce memory operations and increase the computational intensity, with efficient implementations in PLASMA, DPLASMA, and MAGMA. Jacobi methods started with the two-sided method of Kogbetliantz and the one-sided method of Hestenes. They have likewise had many developments, including parallel and block versions and preconditioning to improve convergence. In this paper, we investigate the impact of these changes by testing various historical and current implementations on a common, modern multicore machine and a distributed computing platform. We show that algorithmic and implementation improvements have increased the speed of the SVD by several orders of magnitude, while using up to 40 times less energy.}, keywords = {bidiagonal matrix, bisection, Divide and conquer, Hestenes method, Jacobi method, Kogbetliantz method, MRRR, QR iteration, Singular value decomposition, SVD}, issn = {0036-1445}, doi = {10.1137/17M1117732}, url = {https://epubs.siam.org/doi/10.1137/17M1117732}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @article {1231, title = {A Survey of MPI Usage in the US Exascale Computing Project}, journal = {Concurrency and Computation: Practice and Experience}, year = {2018}, month = {2018-09}, type = {Special Issue}, abstract = {The Exascale Computing Project (ECP) is currently the primary effort in the United States focused on developing {\textquotedblleft}exascale{\textquotedblright} levels of computing capabilities, including hardware, software, and applications. In order to obtain a more thorough understanding of how the software projects under the ECP are using, and planning to use, the Message Passing Interface (MPI), and help guide the work of our own project within the ECP, we created a survey. Of the 97 ECP projects active at the time the survey was distributed, we received 77 responses, 56 of which reported that their projects were using MPI. This paper reports the results of that survey for the benefit of the broader community of MPI developers.}, keywords = {exascale, MPI}, doi = {https://doi.org/10.1002/cpe.4851}, author = {David E. Bernholdt and Swen Boehm and George Bosilca and Manjunath Gorentla Venkata and Ryan E. Grant and Thomas Naughton and Howard P. Pritchard and Martin Schulz and Geoffroy R. 
Vallee} } @techreport {1280, title = {Tensor Contraction on Distributed Hybrid Architectures using a Task-Based Runtime System}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-13}, year = {2018}, month = {2018-12}, publisher = {University of Tennessee}, abstract = {The needs for predictive simulation of electronic structure in chemistry and materials science calls for fast/reduced-scaling formulations of quantum n-body methods that replace the traditional dense tensors with element-, block-, rank-, and block-rank-sparse (data-sparse) tensors. The resulting, highly irregular data structures are a poor match to imperative, bulk-synchronous parallel programming style due to the dynamic nature of the problem and to the lack of clear domain decomposition to guarantee a fair load-balance. TESSE runtime and the associated programming model aim to support performance-portable composition of applications involving irregular and dynamically changing data. In this paper we report an implementation of irregular dense tensor contraction in a paradigmatic electronic structure application based on the TESSE extension of PaRSEC, a distributed hybrid task runtime system, and analyze the resulting performance on a distributed memory cluster of multi-GPU nodes. Unprecedented strong scaling and promising efficiency indicate a viable future for task-based programming of complete production-quality reduced scaling models of electronic structure.}, author = {George Bosilca and Damien Genet and Robert Harrison and Thomas Herault and Mohammad Mahdi Javanmard and Chong Peng and Edward Valeev} } @conference {1234, title = {Variable-Size Batched Condition Number Calculation on GPUs}, booktitle = {SBAC-PAD}, year = {2018}, month = {2018-09}, address = {Lyon, France}, url = {https://ieeexplore.ieee.org/document/8645907}, author = {Hartwig Anzt and Jack Dongarra and Goran Flegar and Thomas Gruetzmacher} } @article {1176, title = {Argobots: A Lightweight Low-Level Threading and Tasking Framework}, journal = {IEEE Transactions on Parallel and Distributed Systems}, year = {2017}, month = {2017-10}, abstract = {In the past few decades, a number of user-level threading and tasking models have been proposed in the literature to address the shortcomings of OS-level threads, primarily with respect to cost and flexibility. Current state-of-the-art user-level threading and tasking models, however, are either too specific to applications or architectures or are not as powerful or flexible. In this paper, we present Argobots, a lightweight, low-level threading and tasking framework that is designed as a portable and performant substrate for high-level programming models or runtime systems. Argobots offers a carefully designed execution model that balances generality of functionality with providing a rich set of controls to allow specialization by the user or high-level programming model. We describe the design, implementation, and optimization of Argobots and present integrations with three example high-level models: OpenMP, MPI, and co-located I/O service. 
Evaluations show that (1) Argobots outperforms existing generic threading runtimes; (2) our OpenMP runtime offers more efficient interoperability capabilities than production OpenMP runtimes do; (3) when MPI interoperates with Argobots instead of Pthreads, it enjoys reduced synchronization costs and better latency hiding capabilities; and (4) I/O service with Argobots reduces interference with co-located applications, achieving performance competitive with that of the Pthreads version.}, keywords = {Argobots, context switch, I/O, interoperability, lightweight, MPI, OpenMP, stackable scheduler, tasklet, user-level thread}, doi = {10.1109/TPDS.2017.2766062}, url = {http://ieeexplore.ieee.org/document/8082139/}, author = {Sangmin Seo and Abdelhalim Amer and Pavan Balaji and Cyril Bordage and George Bosilca and Alex Brooks and Philip Carns and Adrian Castello and Damien Genet and Thomas Herault and Shintaro Iwasaki and Prateek Jindal and Sanjay Kale and Sriram Krishnamoorthy and Jonathan Lifflander and Huiwei Lu and Esteban Meneses and Mar Snir and Yanhua Sun and Kenjiro Taura and Pete Beckman} } @conference {1169, title = {Autotuning Batch Cholesky Factorization in CUDA with Interleaved Layout of Matrices}, booktitle = {Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2017}, month = {2017-06}, publisher = {IEEE}, organization = {IEEE}, address = {Orlando, FL}, abstract = {Batch matrix operations address the case of solving the same linear algebra problem for a very large number of very small matrices. In this paper, we focus on implementing the batch Cholesky factorization in CUDA, in single precision arithmetic, for NVIDIA GPUs. Specifically, we look into the benefits of using noncanonical data layouts, where consecutive memory locations store elements with the same row and column index in a set of consecutive matrices. We discuss a number of different implementation options and tuning parameters. 
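As a rough illustration of the interleaved-versus-canonical batched layout discussed in this entry, here is a minimal NumPy sketch; the batch size, matrix size, and array names are illustrative only and are not taken from the paper's CUDA kernels.

```python
import numpy as np

# Minimal sketch of the two batched storage schemes (sizes and names are illustrative).
batch, n = 10000, 8
rng = np.random.default_rng(0)
M = rng.standard_normal((batch, n, n))
A_canonical = M @ M.transpose(0, 2, 1) + n * np.eye(n)   # batch of SPD matrices, shape (batch, n, n)

# Interleaved layout: entry (i, j) of consecutive matrices becomes contiguous,
# i.e. shape (n, n, batch); on a GPU this favors coalesced memory access.
A_interleaved = np.ascontiguousarray(A_canonical.transpose(1, 2, 0))

# The factorization itself is unchanged; only the memory layout differs.
L = np.linalg.cholesky(A_canonical)
print(L.shape)   # (10000, 8, 8)
```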
We demonstrate superior performance to traditional implementations for the case of very small matrices.}, keywords = {batch computation, Cholesky Factorization, data layout, GPU computing, numerical linear algebra}, doi = {10.1109/IPDPSW.2017.18}, author = {Mark Gates and Jakub Kurzak and Piotr Luszczek and Yu Pei and Jack Dongarra} } @inbook {1167, title = {Bringing High Performance Computing to Big Data Algorithms}, booktitle = {Handbook of Big Data Technologies}, year = {2017}, publisher = {Springer}, organization = {Springer}, isbn = {978-3-319-49339-8}, doi = {10.1007/978-3-319-49340-4}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @techreport {1175, title = {C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {04, ICL-UT-17-12}, year = {2017}, month = {2017-12}, publisher = {University of Tennessee}, author = {Ahmad Abdelfattah and Konstantin Arturov and Cris Cecka and Jack Dongarra and Chip Freitag and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Panruo Wu} } @techreport {1081, title = {C++ API for BLAS and LAPACK}, journal = {SLATE Working Notes}, number = {02, ICL-UT-17-03}, year = {2017}, note = {Revision 02-21-2018}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Mark Gates and Piotr Luszczek and Ahmad Abdelfattah and Jakub Kurzak and Jack Dongarra and Konstantin Arturov and Cris Cecka and Chip Freitag} } @techreport {1133, title = {Designing SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {03, ICL-UT-17-06}, year = {2017}, month = {2017-10}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Jakub Kurzak and Panruo Wu and Mark Gates and Ichitaro Yamazaki and Piotr Luszczek and Gerald Ragghianti and Jack Dongarra} } @techreport {1130, title = {MAGMA-sparse Interface Design Whitepaper}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-05}, year = {2017}, month = {2017-09}, type = {Technical Report}, abstract = {In this report we describe the logic and interface we develop for the MAGMA-sparse library to allow for easy integration as third-party library into a top-level software ecosystem. The design choices are based on extensive consultation with other software library developers, in particular the Trilinos software development team. The interface documentation is at this point not exhaustive, but a first proposal for setting a standard. Although the interface description targets the MAGMA-sparse software module, we hope that the design choices carry beyond this specific library, and are attractive for adoption in other packages. 
This report is not intended as static document, but will be updated over time to reflect the agile software development in the ECP 1.3.3.11 STMS11-PEEKS project.}, author = {Hartwig Anzt and Erik Boman and Jack Dongarra and Goran Flegar and Mark Gates and Mike Heroux and Mark Hoemmen and Jakub Kurzak and Piotr Luszczek and Sivasankaran Rajamanickam and Stanimire Tomov and Stephen Wood and Ichitaro Yamazaki} } @techreport {1173, title = {PLASMA 17 Performance Report}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-11}, year = {2017}, month = {2017-06}, publisher = {University of Tennessee}, abstract = {PLASMA (Parallel Linear Algebra for Multicore Architectures) is a dense linear algebra package at the forefront of multicore computing. PLASMA is designed to deliver the highest possible performance from a system with multiple sockets of multicore processors. PLASMA achieves this objective by combining state of the art solutions in parallel algorithms, scheduling, and software engineering. PLASMA currently offers a collection of routines for solving linear systems of equations and least square problems.}, author = {Maksims Abalenkovs and Negin Bagherpour and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Samuel Relton and Jakub Sistek and David Stevens and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Mawussi Zounon} } @techreport {1172, title = {PLASMA 17.1 Functionality Report}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-10}, year = {2017}, month = {2017-06}, publisher = {University of Tennessee}, abstract = {PLASMA (Parallel Linear Algebra for Multicore Architectures) is a dense linear algebra package at the forefront of multicore computing. PLASMA is designed to deliver the highest possible performance from a system with multiple sockets of multicore processors. PLASMA achieves this objective by combining state of the art solutions in parallel algorithms, scheduling, and software engineering. PLASMA currently offers a collection of routines for solving linear systems of equations and least square problems.}, author = {Maksims Abalenkovs and Negin Bagherpour and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Samuel Relton and Jakub Sistek and David Stevens and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Mawussi Zounon} } @article {1067, title = {Preconditioned Krylov Solvers on GPUs}, journal = {Parallel Computing}, year = {2017}, month = {2017-06}, abstract = {In this paper, we study the effect of enhancing GPU-accelerated Krylov solvers with preconditioners. We consider the BiCGSTAB, CGS, QMR, and IDR(s) Krylov solvers. For a large set of test matrices, we assess the impact of Jacobi and incomplete factorization preconditioning on the solvers{\textquoteright} numerical stability and time-to-solution performance. 
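To make the Jacobi and incomplete-factorization preconditioning studied in this entry concrete, the following small SciPy sketch applies both to a toy Poisson-like system; the matrix, sizes, and solver choices are illustrative and are not the GPU implementation evaluated in the paper.

```python
import numpy as np
import scipy.sparse as sp
import scipy.sparse.linalg as spla

# Small 2-D Poisson-like test matrix (illustrative only).
n = 64
A = sp.diags([-1, -1, 4, -1, -1], [-n, -1, 0, 1, n], shape=(n * n, n * n), format="csc")
b = np.ones(n * n)

# Jacobi preconditioner: apply the inverse of the diagonal.
M_jacobi = spla.LinearOperator(A.shape, matvec=lambda x: x / A.diagonal())

# Incomplete LU preconditioner.
ilu = spla.spilu(A)
M_ilu = spla.LinearOperator(A.shape, matvec=ilu.solve)

x_j, info_j = spla.bicgstab(A, b, M=M_jacobi)
x_i, info_i = spla.bicgstab(A, b, M=M_ilu)
print(info_j, info_i)   # 0 means the iteration converged
```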
We also analyze how the use of a preconditioner impacts the choice of the fastest solver.}, keywords = {gpu, ILU, Jacobi, Krylov solvers, Preconditioning}, issn = {01678191}, doi = {10.1016/j.parco.2017.05.006}, url = {http://www.sciencedirect.com/science/article/pii/S0167819117300777}, author = {Hartwig Anzt and Mark Gates and Jack Dongarra and Moritz Kreutzer and Gerhard Wellein and Martin Kohler} } @techreport {1080, title = {Roadmap for the Development of a Linear Algebra Library for Exascale Computing: SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {01, ICL-UT-17-02}, year = {2017}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Aurelien Bouteiller and Anthony Danalis and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Stephen Wood and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan} } @article {, title = {With Extreme Computing, the Rules Have Changed}, journal = {Computing in Science \& Engineering}, volume = {19}, year = {2017}, month = {2017-05}, pages = {52-62}, abstract = {On the eve of exascale computing, traditional wisdom no longer applies. High-performance computing is gone as we know it. This article discusses a range of new algorithmic techniques emerging in the context of exascale computing, many of which defy the common wisdom of high-performance computing and are considered unorthodox, but could turn out to be a necessity in near future.}, doi = {https://doi.org/10.1109/MCSE.2017.48}, author = {Jack Dongarra and Stanimire Tomov and Piotr Luszczek and Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Hartwig Anzt and Azzam Haidar and Ahmad Abdelfattah} } @inproceedings {979, title = {Failure Detection and Propagation in HPC Systems}, journal = { Proceedings of the The International Conference for High Performance Computing, Networking, Storage and Analysis (SC{\textquoteright}16)}, year = {2016}, month = {2016-11}, pages = {27:1-27:11}, publisher = {IEEE Press}, address = {Salt Lake City, Utah}, keywords = {failure detection, fault-tolerance, MPI}, isbn = {978-1-4673-8815-3}, url = {http://dl.acm.org/citation.cfm?id=3014904.3014941}, author = {George Bosilca and Aurelien Bouteiller and Amina Guermouche and Thomas Herault and Yves Robert and Pierre Sens and Jack Dongarra} } @conference {939, title = {Heterogeneous Streaming}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper introduces a new heterogeneous streaming library called hetero Streams (hStreams). We show how a simple FIFO streaming model can be applied to heterogeneous systems that include manycore coprocessors and multicore CPUs. This model supports concurrency across nodes, among tasks within a node, and between data transfers and computation. We give examples for different approaches, show how the implementation can be layered, analyze overheads among layers, and apply those models to parallelize applications using simple, intuitive interfaces. We compare the features and versatility of hStreams, OpenMP, CUDA Streams1 and OmpSs. 
We show how the use of hStreams makes it easier for scientists to identify tasks and easily expose concurrency among them, and how it enables tuning experts and runtime systems to tailor execution for different heterogeneous targets. Practical application examples are taken from the field of numerical linear algebra, commercial structural simulation software, and a seismic processing application.}, keywords = {plasma}, author = {Chris J. Newburn and Gaurav Bansal and Michael Wood and Luis Crivelli and Judit Planas and Alejandro Duran and Paulo Souza and Leonardo Borges and Piotr Luszczek and Stanimire Tomov and Jack Dongarra and Hartwig Anzt and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Ichitaro Yamazaki and Jesus Labarta} } @article {1472, title = {Linear Algebra Software for Large-Scale Accelerated Multicore Computing}, journal = {Acta Numerica}, volume = {25}, year = {2016}, month = {2016-05}, pages = {1-160}, abstract = {Many crucial scientific computing applications, ranging from national security to medical advances, rely on high-performance linear algebra algorithms and technologies, underscoring their importance and broad impact. Here we present the state-of-the-art design and implementation practices for the acceleration of the predominant linear algebra algorithms on large-scale accelerated multicore systems. Examples are given with fundamental dense linear algebra algorithms {\textendash} from the LU, QR, Cholesky, and LDLT factorizations needed for solving linear systems of equations, to eigenvalue and singular value decomposition (SVD) problems. The implementations presented are readily available via the open-source PLASMA and MAGMA libraries, which represent the next generation modernization of the popular LAPACK library for accelerated multicore systems. To generate the extreme level of parallelism needed for the efficient use of these systems, algorithms of interest are redesigned and then split into well-chosen computational tasks. The task execution is scheduled over the computational components of a hybrid system of multicore CPUs with GPU accelerators and/or Xeon Phi coprocessors, using either static scheduling or light-weight runtime systems. The use of light-weight runtime systems keeps scheduling overheads low, similar to static scheduling, while enabling the expression of parallelism through sequential-like code. This simplifies the development effort and allows exploration of the unique strengths of the various hardware components. 
Finally, we emphasize the development of innovative linear algebra algorithms using three technologies {\textendash} mixed precision arithmetic, batched operations, and asynchronous iterations {\textendash} that are currently of high interest for accelerated multicore systems.}, doi = {10.1017/S0962492916000015}, author = {Ahmad Abdelfattah and Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Asim YarKhan} } @conference {963, title = {LU, QR, and Cholesky Factorizations: Programming Model, Performance Analysis and Optimization Techniques for the Intel Knights Landing Xeon Phi}, booktitle = {IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}16)}, year = {2016}, month = {2016-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {A wide variety of heterogeneous compute resources, ranging from multicore CPUs to GPUs and coprocessors, are available to modern computers, making it challenging to design unified numerical libraries that efficiently and productively use all these varied resources. For example, in order to efficiently use Intel{\textquoteright}s Knights Landing (KNL) processor, the next generation of Xeon Phi architectures, one must design and schedule an application in multiple degrees of parallelism and task grain sizes in order to obtain efficient performance. We propose a productive and portable programming model that allows us to write a serial-looking code, which, however, achieves parallelism and scalability by using a lightweight runtime environment to manage the resource-specific workload, and to control the dataflow and the parallel execution. This is done through multiple techniques ranging from multi-level data partitioning to adaptive task grain sizes, and dynamic task scheduling. In addition, our task abstractions enable unified algorithmic development across all the heterogeneous resources. Finally, we outline the strengths and the effectiveness of this approach {\textendash} especially in regards to hardware trends and ease of programming high-performance numerical software that current applications need {\textendash} in order to motivate current work and future directions for the next generation of parallel programming models for high-performance linear algebra libraries on heterogeneous systems.}, author = {Azzam Haidar and Stanimire Tomov and Konstantin Arturov and Murat Guney and Shane Story and Jack Dongarra} } @conference {968, title = {Performance Analysis and Acceleration of Explicit Integration for Large Kinetic Networks using Batched GPU Computations}, booktitle = {2016 IEEE High Performance Extreme Computing Conference (HPEC {\textquoteleft}16)}, year = {2016}, month = {2016-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {We demonstrate the systematic implementation of recently-developed fast explicit kinetic integration algorithms that solve efficiently N coupled ordinary differential equations (subject to initial conditions) on modern GPUs. We take representative test cases (Type Ia supernova explosions) and demonstrate two or more orders of magnitude increase in efficiency for solving such systems (of realistic thermonuclear networks coupled to fluid dynamics).
This implies that important coupled, multiphysics problems in various scientific and technical disciplines that were intractable, or could be simulated only with highly schematic kinetic networks, are now computationally feasible. As examples of such applications we present the computational techniques developed for our ongoing deployment of these new methods on modern GPU accelerators. We show that similarly to many other scientific applications, ranging from national security to medical advances, the computation can be split into many independent computational tasks, each of relatively small-size. As the size of each individual task does not provide sufficient parallelism for the underlying hardware, especially for accelerators, these tasks must be computed concurrently as a single routine, that we call batched routine, in order to saturate the hardware with enough work.}, author = {Azzam Haidar and Benjamin Brock and Stanimire Tomov and Michael Guidry and Jay Jay Billings and Daniel Shyles and Jack Dongarra} } @conference {962, title = {Search Space Generation and Pruning System for Autotuners}, booktitle = {30th IEEE International Parallel \& Distributed Processing Symposium (IPDPS)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This work tackles two simultaneous challenges faced by autotuners: the ease of describing a complex, multidimensional search space, and the speed of evaluating that space, while applying a multitude of pruning constraints. This article presents a declarative notation for describing a search space and a translation system for conversion to a standard C code for fast and multithreaded, as necessary, evaluation. The notation is Python-based and thus simple in syntax and easy to assimilate by the user interested in tuning rather than learning a new programming language. A large number of dimensions and a large number of pruning constraints may be expressed with little effort. The system is discussed in the context of autotuning the canonical matrix multiplication kernel for NVIDIA GPUs, where the search space has 15 dimensions and involves application of 10 complex pruning constrains. The speed of evaluation is compared against generators created using imperative programming style in various scripting and compiled languages.}, author = {Piotr Luszczek and Mark Gates and Jakub Kurzak and Anthony Danalis and Jack Dongarra} } @conference {880, title = {Accelerating Collaborative Filtering for Implicit Feedback Datasets using GPUs}, booktitle = {2015 IEEE International Conference on Big Data (IEEE BigData 2015)}, year = {2015}, month = {2015-11}, publisher = {IEEE}, organization = {IEEE}, address = {Santa Clara, CA}, abstract = {In this paper we accelerate the Alternating Least Squares (ALS) algorithm used for generating product recommendations on the basis of implicit feedback datasets. We approach the algorithm with concepts proven to be successful in High Performance Computing. This includes the formulation of the algorithm as a mix of cache-optimized algorithm-specific kernels and standard BLAS routines, acceleration via graphics processing units (GPUs), use of parallel batched kernels, and autotuning to identify performance winners. For benchmark datasets, the multi-threaded CPU implementation we propose achieves more than a 10 times speedup over the implementations available in the GraphLab and Spark MLlib software packages. 
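For context on the ALS formulation this entry accelerates, a minimal NumPy sketch of one confidence-weighted user-factor update follows; the factor size, regularization, and weighting constant are illustrative choices, not the tuned kernels from the paper.

```python
import numpy as np

# One user-factor update in ALS for implicit feedback
# (confidence-weighted least squares; names and constants are illustrative).
def update_user(Y, r_u, lam=0.1, alpha=40.0):
    """Y: item factors (n_items, k); r_u: implicit counts for one user (n_items,)."""
    k = Y.shape[1]
    c_u = 1.0 + alpha * r_u          # confidence weights
    p_u = (r_u > 0).astype(float)    # binarized preference
    A = Y.T @ (c_u[:, None] * Y) + lam * np.eye(k)
    b = Y.T @ (c_u * p_u)
    return np.linalg.solve(A, b)

rng = np.random.default_rng(0)
Y = rng.standard_normal((100, 8))
r_u = rng.poisson(0.3, size=100).astype(float)
x_u = update_user(Y, r_u)
```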
For the GPU implementation, the parameters of an algorithm-specific kernel were optimized using a comprehensive autotuning sweep. This results in an additional 2 times speedup over our CPU implementation.}, author = {Mark Gates and Hartwig Anzt and Jakub Kurzak and Jack Dongarra} } @article {820, title = {Algorithm-based Fault Tolerance for Dense Matrix Factorizations, Multiple Failures, and Accuracy}, journal = {ACM Transactions on Parallel Computing}, volume = {1}, number = {10}, year = {2015}, month = {2015-01}, pages = {10:1-10:28}, abstract = {Dense matrix factorizations, such as LU, Cholesky and QR, are widely used for scientific applications that require solving systems of linear equations, eigenvalues and linear least squares problems. Such computations are normally carried out on supercomputers, whose ever-growing scale induces a fast decline of the Mean Time To Failure (MTTF). This paper proposes a new hybrid approach, based on Algorithm-Based Fault Tolerance (ABFT), to help matrix factorizations algorithms survive fail-stop failures. We consider extreme conditions, such as the absence of any reliable node and the possibility of losing both data and checksum from a single failure. We will present a generic solution for protecting the right factor, where the updates are applied, of all above mentioned factorizations. For the left factor, where the panel has been applied, we propose a scalable checkpointing algorithm. This algorithm features high degree of checkpointing parallelism and cooperatively utilizes the checksum storage leftover from the right factor protection. The fault-tolerant algorithms derived from this hybrid solution is applicable to a wide range of dense matrix factorizations, with minor modifications. Theoretical analysis shows that the fault tolerance overhead decreases inversely to the scaling in the number of computing units and the problem size. Experimental results of LU and QR factorization on the Kraken (Cray XT5) supercomputer validate the theoretical evaluation and confirm negligible overhead, with- and without-errors. Applicability to tolerate multiple failures and accuracy after multiple recovery is also considered.}, keywords = {ABFT, algorithms, fault-tolerance, High Performance Computing, linear algebra}, doi = {10.1145/2686892}, author = {Aurelien Bouteiller and Thomas Herault and George Bosilca and Peng Du and Jack Dongarra}, editor = {Phillip B. Gibbons} } @conference {896, title = {Comparing Hybrid CPU-GPU and Native GPU-only Acceleration for Linear Algebra}, booktitle = {2015 SIAM Conference on Applied Linear Algebra}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {Accelerating dense linear algebra using GPUs admits two models: hybrid CPU-GPU and GPU-only. The hybrid model factors the panel on the CPU while updating the trailing matrix on the GPU, concentrating the GPU on high-performance matrix multiplies. The GPU-only model performs the entire computation on the GPU, avoiding costly data transfers to the CPU. 
We compare these two approaches for three QR-based algorithms: QR factorization, rank revealing QR, and reduction to Hessenberg.}, author = {Mark Gates and Stanimire Tomov and Azzam Haidar} } @article {1345, title = {On the Design, Autotuning, and Optimization of GPU Kernels for Kinetic Network Simulations Using Fast Explicit Integration and GPU Batched Computation}, year = {2015}, month = {2015-09}, publisher = {Joint Institute for Computational Sciences Seminar Series, Presentation}, address = {Oak Ridge, TN}, author = {Michael Guidry and Azzam Haidar} } @inbook {927, title = {High-Performance Computing}, booktitle = {The Princeton Companion to Applied Mathematics}, year = {2015}, pages = {839-842}, publisher = {Princeton University Press}, organization = {Princeton University Press}, address = {Princeton, New Jersey}, isbn = {9781400874477}, author = {Jack Dongarra and Nicholas J. Higham and Mark R. Dennis and Paul Glendinning and Paul A. Martin and Fadil Santosa and Jared Tanner} } @article {829, title = {HPC Programming on Intel Many-Integrated-Core Hardware with MAGMA Port to Xeon Phi}, journal = {Scientific Programming}, volume = {23}, year = {2015}, month = {2015-01}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms for multicore with Intel Xeon Phi Coprocessors. In particular, we consider algorithms for solving linear systems. Further, we give an overview of the MAGMA MIC library, an open source, high performance library that incorporates the developments presented, and in general provides to heterogeneous architectures of multicore with coprocessors the DLA functionality of the popular LAPACK library. The LAPACK-compliance simplifies the use of the MAGMA MIC library in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance BLAS, hardware-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components. 
Our methodology and programming techniques are incorporated into the MAGMA MIC API, which abstracts the application developer from the specifics of the Xeon Phi architecture and is therefore applicable to algorithms beyond the scope of DLA.}, keywords = {communication and computation overlap, dynamic runtime scheduling using dataflow dependences, hardware accelerators and coprocessors, Intel Xeon Phi processor, Many Integrated Cores, numerical linear algebra}, issn = {1058-9244}, doi = {10.3233/SPR-140404}, author = {Azzam Haidar and Jack Dongarra and Khairul Kabir and Mark Gates and Piotr Luszczek and Stanimire Tomov and Yulu Jia} } @article {881, title = {Implementation and Tuning of Batched Cholesky Factorization and Solve for NVIDIA GPUs}, journal = {IEEE Transactions on Parallel and Distributed Systems}, number = {1045-9219}, year = {2015}, month = {2015-11}, author = {Jakub Kurzak and Hartwig Anzt and Mark Gates and Jack Dongarra} } @article {1347, title = {MAGMA MIC: Optimizing Linear Algebra for Intel Xeon Phi}, year = {2015}, month = {2015-06}, publisher = {ISC High Performance (ISC15), Intel Booth Presentation}, address = {Frankfurt, Germany}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Khairul Kabir and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @article {936, title = {Parallel Programming Models for Dense Linear Algebra on Heterogeneous Systems}, journal = {Supercomputing Frontiers and Innovations}, volume = {2}, number = {4}, year = {2015}, month = {2015-10}, abstract = {We present a review of the current best practices in parallel programming models for dense linear algebra (DLA) on heterogeneous architectures. We consider multicore CPUs, stand alone manycore coprocessors, GPUs, and combinations of these. Of interest is the evolution of the programming models for DLA libraries {\textendash} in particular, the evolution from the popular LAPACK and ScaLAPACK libraries to their modernized counterparts PLASMA (for multicore CPUs) and MAGMA (for heterogeneous architectures), as well as other programming models and libraries. 
Besides providing insights into the programming techniques of the libraries considered, we outline our view of the current strengths and weaknesses of their programming models {\textendash} especially in regards to hardware trends and ease of programming high-performance numerical software that current applications need {\textendash} in order to motivate work and future directions for the next generation of parallel programming models for high-performance linear algebra libraries on heterogeneous systems.}, keywords = {dense linear algebra, gpu, HPC, Multicore, plasma, Programming models, runtime}, doi = {10.14529/jsfi1504}, author = {Maksims Abalenkovs and Ahmad Abdelfattah and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki and Asim YarKhan} } @techreport {872, title = {Practical Scalable Consensus for Pseudo-Synchronous Distributed Systems: Formal Proof}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-15-01}, year = {2015}, month = {2015-04}, author = {Thomas Herault and Aurelien Bouteiller and George Bosilca and Marc Gamell and Keita Teranishi and Manish Parashar and Jack Dongarra} } @conference {889, title = {Practical Scalable Consensus for Pseudo-Synchronous Distributed Systems}, booktitle = {The International Conference for High Performance Computing, Networking, Storage and Analysis (SC15)}, year = {2015}, month = {2015-11}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, abstract = {The ability to consistently handle faults in a distributed environment requires, among a small set of basic routines, an agreement algorithm allowing surviving entities to reach a consensual decision between a bounded set of volatile resources. This paper presents an algorithm that implements an Early Returning Agreement (ERA) in pseudo-synchronous systems, which optimistically allows a process to resume its activity while guaranteeing strong progress. We prove the correctness of our ERA algorithm, and expose its logarithmic behavior, which is an extremely desirable property for any algorithm which targets future exascale platforms. We detail a practical implementation of this consensus algorithm in the context of an MPI library, and evaluate both its efficiency and scalability through a set of benchmarks and two fault tolerant scientific applications.}, author = {Thomas Herault and Aurelien Bouteiller and George Bosilca and Marc Gamell and Keita Teranishi and Manish Parashar and Jack Dongarra} } @article {699, title = {A Survey of Recent Developments in Parallel Implementations of Gaussian Elimination}, journal = {Concurrency and Computation: Practice and Experience}, volume = {27}, year = {2015}, month = {2015-04}, pages = {1292-1309}, abstract = {Gaussian elimination is a canonical linear algebra procedure for solving linear systems of equations. In the last few years, the algorithm has received a lot of attention in an attempt to improve its parallel performance. This article surveys recent developments in parallel implementations of Gaussian elimination for shared memory architecture. Five different flavors are investigated. Three of them are based on different strategies for pivoting: partial pivoting, incremental pivoting, and tournament pivoting. The fourth one replaces pivoting with the Partial Random Butterfly Transformation, and finally, an implementation without pivoting is used as a performance baseline. 
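As a point of reference for the pivoting flavors surveyed in this entry, here is a minimal unblocked LU factorization with partial pivoting in NumPy; it is purely illustrative, whereas the paper's implementations are tiled and runtime-scheduled.

```python
import numpy as np

def lu_partial_pivoting(A):
    """Unblocked LU with partial pivoting; L is stored below the diagonal of the result."""
    A = A.astype(float).copy()
    n = A.shape[0]
    piv = np.arange(n)
    for k in range(n - 1):
        p = k + np.argmax(np.abs(A[k:, k]))                  # pivot row
        if p != k:
            A[[k, p]] = A[[p, k]]
            piv[[k, p]] = piv[[p, k]]
        A[k+1:, k] /= A[k, k]                                # multipliers
        A[k+1:, k+1:] -= np.outer(A[k+1:, k], A[k, k+1:])    # Schur complement update
    return A, piv

rng = np.random.default_rng(0)
A0 = rng.standard_normal((6, 6))
LU, piv = lu_partial_pivoting(A0)
L = np.tril(LU, -1) + np.eye(6)
U = np.triu(LU)
print(np.max(np.abs(L @ U - A0[piv])))   # small residual
```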
The technique of iterative refinement is applied to recover numerical accuracy when necessary. All parallel implementations are produced using dynamic, superscalar, runtime scheduling and tile matrix layout. Results on two multisocket multicore systems are presented. Performance and numerical accuracy is analyzed.}, keywords = {Gaussian elimination, lu factorization, Multicore, parallel, plasma, shared memory}, doi = {10.1002/cpe.3306}, author = {Simplice Donfack and Jack Dongarra and Mathieu Faverge and Mark Gates and Jakub Kurzak and Piotr Luszczek and Ichitaro Yamazaki} } @inproceedings {1309, title = {UCX: An Open Source Framework for HPC Network APIs and Beyond}, journal = {2015 IEEE 23rd Annual Symposium on High-Performance Interconnects}, year = {2015}, month = {Aug}, pages = {40-43}, publisher = {IEEE}, address = {Santa Clara, CA, USA}, abstract = {This paper presents Unified Communication X (UCX), a set of network APIs and their implementations for high throughput computing. UCX comes from the combined effort of national laboratories, industry, and academia to design and implement a high-performing and highly-scalable network stack for next generation applications and systems. UCX design provides the ability to tailor its APIs and network functionality to suit a wide variety of application domains and hardware. We envision these APIs to satisfy the networking needs of many programming models such as Message Passing Interface (MPI), OpenSHMEM, Partitioned Global Address Space (PGAS) languages, task-based paradigms and I/O bound applications. To evaluate the design we implement the APIs and protocols, and measure the performance of overhead-critical network primitives fundamental for implementing many parallel programming models and system libraries. Our results show that the latency, bandwidth, and message rate achieved by the portable UCX prototype is very close to that of the underlying driver. With UCX, we achieved a message exchange latency of 0.89 us, a bandwidth of 6138.5 MB/s, and a message rate of 14 million messages per second. As far as we know, this is the highest bandwidth and message rate achieved by any network stack (publicly known) on this hardware.}, keywords = {application program interfaces, Bandwidth, Electronics packaging, Hardware, high throughput computing, highly-scalable network stack, HPC, HPC network APIs, I/O bound applications, Infiniband, input-output programs, Libraries, Memory management, message passing, message passing interface, Middleware, MPI, open source framework, OpenSHMEM, parallel programming, parallel programming models, partitioned global address space languages, PGAS, PGAS languages, Programming, protocols, public domain software, RDMA, system libraries, task-based paradigms, UCX, Unified Communication X}, isbn = {978-1-4673-9160-3}, doi = {10.1109/HOTI.2015.13}, author = {P. Shamis and Manjunath Gorentla Venkata and M. Graham Lopez and M. B. Baker and O. Hernandez and Y. Itigin and M. Dubman and G. Shainer and R. L. Graham and L. Liss and Y. Shahar and S. Potluri and D. Rossetti and D. Becker and D. Poole and C. Lamb and S. Kumar and C. 
Stunkel and George Bosilca and Aurelien Bouteiller} } @conference {766, title = {Accelerating Eigenvector Computation in the Nonsymmetric Eigenvalue Problem}, booktitle = {VECPAR 2014}, year = {2014}, month = {2014-06}, address = {Eugene, OR}, abstract = {In the nonsymmetric eigenvalue problem, work has focused on the Hessenberg reduction and QR iteration, using efficient algorithms and fast, Level 3 BLAS routines. Comparatively, computation of eigenvectors performs poorly, limited to slow, Level 2 BLAS performance with little speedup on multi-core systems. It has thus become a dominant cost in the eigenvalue problem. To address this, we present improvements for the eigenvector computation to use Level 3 BLAS where applicable and parallelize the remaining triangular solves, achieving good parallel scaling and accelerating the overall eigenvalue problem more than three-fold.}, author = {Mark Gates and Azzam Haidar and Jack Dongarra} } @inbook {780, title = {Accelerating Numerical Dense Linear Algebra Calculations with GPUs}, booktitle = {Numerical Computations with GPUs}, year = {2014}, pages = {3-28}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, chapter = {1}, isbn = {978-3-319-06547-2}, doi = {10.1007/978-3-319-06548-9_1}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @conference {853, title = {Assembly Operations for Multicore Architectures using Task-Based Runtime Systems}, booktitle = {Euro-Par 2014}, year = {2014}, month = {2014-08}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Porto, Portugal}, abstract = {Traditionally, numerical simulations based on finite element methods consider the algorithm as being divided in three major steps: the generation of a set of blocks and vectors, the assembly of these blocks in a matrix and a big vector, and the inversion of the matrix. In this paper we tackle the second step, the block assembly, where no parallel algorithm is widely available. Several strategies are proposed to decompose the assembly problem while relying on a scheduling middle-ware to maximize the overlap between stages and increase the parallelism and thus the performance. These strategies are quantified using examples covering two extremes in the field, large number of non-overlapping small blocks for CFD-like problems, and a smaller number of larger blocks with significant overlap which can be met in sparse linear algebra solvers.}, author = {Damien Genet and Abdou Guermouche and George Bosilca} } @conference {836, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL }, booktitle = {International Workshop on OpenCL}, year = {2014}, month = {2014-05}, address = {Bristol University, England}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. 
High performance is obtained through use of the high-performance OpenCL BLAS, hardware and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @article {758, title = {A Novel Hybrid CPU-GPU Generalized Eigensolver for Electronic Structure Calculations Based on Fine Grained Memory Aware Tasks}, journal = {International Journal of High Performance Computing Applications}, volume = {28}, year = {2014}, month = {2014-05}, pages = {196-209}, chapter = {196}, abstract = {The adoption of hybrid CPU{\textendash}GPU nodes in traditional supercomputing platforms such as the Cray-XK6 opens acceleration opportunities for electronic structure calculations in materials science and chemistry applications, where medium-sized generalized eigenvalue problems must be solved many times. These eigenvalue problems are too small to effectively solve on distributed systems, but can benefit from the massive computing power concentrated on a single-node, hybrid CPU{\textendash}GPU system. However, hybrid systems call for the development of new algorithms that efficiently exploit heterogeneity and massive parallelism of not just GPUs, but of multicore/manycore CPUs as well. Addressing these demands, we developed a generalized eigensolver featuring novel algorithms of increased computational intensity (compared with the standard algorithms), decomposition of the computation into fine-grained memory aware tasks, and their hybrid execution. The resulting eigensolvers are state-of-the-art in high-performance computing, significantly outperforming existing libraries. We describe the algorithm and analyze its performance impact on applications of interest when different fractions of eigenvectors are needed by the host electronic structure code. }, keywords = {Eigensolver, electronic structure calculations, generalized eigensolver, gpu, high performance, hybrid, Multicore, two-stage}, doi = {10.1177/1094342013502097 }, author = {Azzam Haidar and Raffaele Solc{\`a} and Mark Gates and Stanimire Tomov and Thomas C. Schulthess and Jack Dongarra} } @conference {828, title = {Performance and Portability with OpenCL for Throughput-Oriented HPC Workloads Across Accelerators, Coprocessors, and Multicore Processors}, booktitle = {5th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA {\textquoteright}14)}, year = {2014}, month = {2014-11}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, abstract = {Ever since accelerators and coprocessors became the mainstream hardware for throughput-oriented HPC workloads, various programming techniques have been proposed to increase productivity in terms of both the performance and ease-of-use. We evaluate these aspects of OpenCL on a number of hardware platforms for an important subset of dense linear algebra operations that are relevant to a wide range of scientific applications. Our findings indicate that OpenCL portability has improved since our previous publication and many new and surprising usage scenarios are possible that rival those available after decades of software development on the CPUs. 
The combined performance-portability metric, even though not promised by the OpenCL standard, reflects the need for tuning performance-critical operations during the porting process, and we show how a large portion of the available efficiency is lost if the tuning is not done correctly.}, doi = {10.1109/ScalA.2014.8}, author = {Azzam Haidar and Chongxiao Cao and Ichitaro Yamazaki and Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {681, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL}, journal = {University of Tennessee Technical Report (LAWN 275)}, number = {UT-CS-13-706}, year = {2013}, month = {2013-03}, publisher = {University of Tennessee}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance OpenCL BLAS, hardware and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @inbook {762, title = {Keeneland: Computational Science Using Heterogeneous GPU Computing}, booktitle = {Contemporary High Performance Computing: From Petascale Toward Exascale}, series = {CRC Computational Science Series}, year = {2013}, publisher = {Taylor and Francis}, organization = {Taylor and Francis}, chapter = {7}, address = {Boca Raton, FL}, abstract = {The Keeneland Project is a five year Track 2D grant awarded by the National Science Foundation (NSF) under solicitation NSF 08-573 in August 2009 for the development and deployment of an innovative high performance computing system. The Keeneland project is led by the Georgia Institute of Technology (Georgia Tech) in collaboration with the University of Tennessee at Knoxville, National Institute of Computational Sciences, and Oak Ridge National Laboratory.}, author = {Jeffrey Vetter and Richard Glassbrook and Karsten Schwan and Sudha Yalamanchili and Mitch Horton and Ada Gavrilovska and Magda Slawinska and Jack Dongarra and Jeremy Meredith and Philip Roth and Kyle Spafford and Stanimire Tomov and John Wynkoop} } @article {756, title = {Level-3 Cholesky Factorization Routines Improve Performance of Many Cholesky Algorithms}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {39}, year = {2013}, month = {2013-02}, abstract = {Four routines called DPOTF3i, i = a,b,c,d, are presented. DPOTF3i are a novel type of level-3 BLAS for use by BPF (Blocked Packed Format) Cholesky factorization and LAPACK routine DPOTRF. Performance of routines DPOTF3i is still increasing when the performance of Level-2 routine DPOTF2 of LAPACK starts decreasing.
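To illustrate the kind of Level-3 blocking this entry builds on, the short right-looking blocked Cholesky sketch below expresses the bulk of the work as TRSM and SYRK updates; the block size and test matrix are illustrative, and this is not the register-blocked DPOTF3i kernel itself.

```python
import numpy as np
from scipy.linalg import solve_triangular

def blocked_cholesky(A, nb=48):
    """Right-looking blocked (Level-3) Cholesky, lower triangular; illustrative only."""
    A = A.copy()
    n = A.shape[0]
    for j in range(0, n, nb):
        b = min(nb, n - j)
        # Factor the diagonal block (the role played by DPOTF2/DPOTF3-type kernels).
        A[j:j+b, j:j+b] = np.linalg.cholesky(A[j:j+b, j:j+b])
        if j + b < n:
            L11 = A[j:j+b, j:j+b]
            # Panel solve: A21 <- A21 * L11^{-T}  (a TRSM, Level-3 BLAS).
            A[j+b:, j:j+b] = solve_triangular(L11, A[j+b:, j:j+b].T, lower=True).T
            L21 = A[j+b:, j:j+b]
            # Trailing update: A22 <- A22 - L21 * L21^T  (a SYRK, Level-3 BLAS).
            A[j+b:, j+b:] -= L21 @ L21.T
    return np.tril(A)

rng = np.random.default_rng(0)
G = rng.standard_normal((200, 200))
A = G @ G.T + 200 * np.eye(200)
L = blocked_cholesky(A)
print(np.linalg.norm(L @ L.T - A) / np.linalg.norm(A))   # should be ~1e-15
```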
This is our main result and it implies, due to the use of larger block size nb, that DGEMM, DSYRK, and DTRSM performance also increases! The four DPOTF3i routines use simple register blocking. Different platforms have different numbers of registers. Thus, our four routines have different register blocking sizes. BPF is introduced. LAPACK routines for POTRF and PPTRF using BPF instead of full and packed format are shown to be trivial modifications of LAPACK POTRF source codes. We call these codes BPTRF. There are two variants of BPF: lower and upper. Upper BPF is {\textquotedblleft}identical{\textquotedblright} to Square Block Packed Format (SBPF). {\textquotedblleft}LAPACK{\textquotedblright} implementations on multicore processors use SBPF. Lower BPF is less efficient than upper BPF. Vector in-place transposition converts lower BPF to upper BPF very efficiently. Corroborating performance results for DPOTF3i versus DPOTF2 on a variety of common platforms are given for n ≈ nb as well as results for large n comparing DBPTRF versus DPOTRF.}, doi = {10.1145/2427023.2427026}, author = {Fred G. Gustavson and Jerzy Wasniewski and Jack Dongarra and Jos{\'e} Herrero and Julien Langou} } @techreport {icl:733, title = {Multi-criteria checkpointing strategies: optimizing response-time versus resource utilization}, journal = {University of Tennessee Computer Science Technical Report}, number = {ICL-UT-13-01}, year = {2013}, month = {2013-02}, abstract = {Failures are increasingly threatening the efficiency of HPC systems, and current projections of Exascale platforms indicate that rollback recovery, the most convenient method for providing fault tolerance to general-purpose applications, reaches its own limits at such scales. One of the reasons explaining this unnerving situation comes from the focus that has been given to per-application completion time, rather than to platform efficiency. In this paper, we discuss the case of uncoordinated rollback recovery where the idle time spent waiting for recovering processors is used to progress a different, independent application from the system batch queue. We then propose an extended model of uncoordinated checkpointing that can discriminate between idle time and wasted computation. We instantiate this model in a simulator to demonstrate that, with this strategy, uncoordinated checkpointing per application completion time is unchanged, while it delivers near-perfect platform efficiency.}, author = {Aurelien Bouteiller and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert} } @conference {868, title = {Multi-criteria Checkpointing Strategies: Response-Time versus Resource Utilization}, booktitle = {Euro-Par 2013}, year = {2013}, month = {2013-08}, publisher = {Springer}, organization = {Springer}, address = {Aachen, Germany}, abstract = {Failures are increasingly threatening the efficiency of HPC systems, and current projections of Exascale platforms indicate that rollback recovery, the most convenient method for providing fault tolerance to general-purpose applications, reaches its own limits at such scales. One of the reasons explaining this unnerving situation comes from the focus that has been given to per-application completion time, rather than to platform efficiency. In this paper, we discuss the case of uncoordinated rollback recovery where the idle time spent waiting for recovering processors is used to progress a different, independent application from the system batch queue.
We then propose an extended model of uncoordinated checkpointing that can discriminate between idle time and wasted computation. We instantiate this model in a simulator to demonstrate that, with this strategy, uncoordinated checkpointing per application completion time is unchanged, while it delivers near-perfect platform efficiency.}, author = {Aurelien Bouteiller and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert} } @conference {753, title = {Portable HPC Programming on Intel Many-Integrated-Core Hardware with MAGMA Port to Xeon Phi}, booktitle = {PPAM 2013}, year = {2013}, month = {2013-09}, address = {Warsaw, Poland}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms for multicore with Intel Xeon Phi Coprocessors. In particular, we consider algorithms for solving linear systems. Further, we give an overview of the MAGMA MIC library, an open source, high performance library that incorporates the developments presented, and in general provides to heterogeneous architectures of multicore with coprocessors the DLA functionality of the popular LAPACK library. The LAPACK-compliance simplifies the use of the MAGMA MIC library in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance BLAS, hardware-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components. Our methodology and programming techniques are incorporated into the MAGMA MIC API, which abstracts the application developer from the specifics of the Xeon Phi architecture and is therefore applicable to algorithms beyond the scope of DLA.}, keywords = {magma, mic, xeon phi}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Piotr Luszczek and Stanimire Tomov} } @conference {761, title = {Standards for Graph Algorithm Primitives}, booktitle = {17th IEEE High Performance Extreme Computing Conference (HPEC {\textquoteright}13)}, year = {2013}, month = {2013-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {It is our view that the state of the art in constructing a large collection of graph algorithms in terms of linear algebraic operations is mature enough to support the emergence of a standard set of primitive building blocks. This paper is a position paper defining the problem and announcing our intention to launch an open effort to define this standard.}, keywords = {algorithms, graphs, linear algebra, software standards}, doi = {10.1109/HPEC.2013.6670338}, author = {Tim Mattson and David Bader and Jon Berry and Aydin Buluc and Jack Dongarra and Christos Faloutsos and John Feo and John Gilbert and Joseph Gonzalez and Bruce Hendrickson and Jeremy Kepner and Charles Leiserson and Andrew Lumsdaine and David Padua and Steve W.
Poole and Steve Reinhardt and Mike Stonebraker and Steve Wallach and Andrew Yoo} } @conference {686, title = {Toward a scalable multi-GPU eigensolver via compute-intensive kernels and efficient communication}, booktitle = {Proceedings of the 27th ACM International Conference on Supercomputing (ICS {\textquoteright}13)}, year = {2013}, month = {2013-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Eugene, Oregon, USA}, abstract = {The enormous gap between the high-performance capabilities of GPUs and the slow interconnect between them has made the development of numerical software that is scalable across multiple GPUs extremely challenging. We describe a successful methodology on how to address the challenges---starting from our algorithm design, kernel optimization and tuning, to our programming model---in the development of a scalable high-performance tridiagonal reduction algorithm for the symmetric eigenvalue problem. This is a fundamental linear algebra problem with many engineering and physics applications. We use a combination of a task-based approach to parallelism and a new algorithmic design to achieve high performance. The goal of the new design is to increase the computational intensity of the major compute kernels and to reduce synchronization and data transfers between GPUs. This may increase the number of flops, but the increase is offset by the more efficient execution and reduced data transfers. Our performance results are the best available, providing an enormous performance boost compared to current state-of-the-art solutions. In particular, our software scales up to 1070 Gflop/s using 16 Intel E5-2670 cores and eight M2090 GPUs, compared to 45 Gflop/s achieved by the optimized Intel Math Kernel Library (MKL) using only the 16 CPU cores.}, keywords = {eigenvalue, gpu communication, gpu computation, heterogeneous programming model, performance, reduction to tridiagonal, singular value decomposiiton, task parallelism}, isbn = {9781450321303}, doi = {10.1145/2464996.2465438}, url = {http://dl.acm.org/citation.cfm?doid=2464996.2465438}, author = {Azzam Haidar and Mark Gates and Stanimire Tomov and Jack Dongarra}, editor = {Allen D. Malony and Nemirovsky, Mario and Midkiff, Sam} } @article {748, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {Concurrency and Computation: Practice and Experience}, year = {2013}, month = {2013-11}, abstract = {In this paper, we present a unified model for several well-known checkpoint/restart protocols. The proposed model is generic enough to encompass both extremes of the checkpoint/restart space, from coordinated approaches to a variety of uncoordinated checkpoint strategies (with message logging). We identify a set of crucial parameters, instantiate them, and compare the expected efficiency of the fault tolerant protocols, for a given application/platform pair. We then propose a detailed analysis of several scenarios, including some of the most powerful currently available high performance computing platforms, as well as anticipated Exascale designs. The results of this analytical comparison are corroborated by a comprehensive set of simulations. 
Altogether, they outline comparative behaviors of checkpoint strategies at very large scale, thereby providing insight that is hardly accessible to direct experimentation.}, doi = {10.1002/cpe.3173}, author = {George Bosilca and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @conference {icl:692, title = {Virtual Systolic Array for QR Decomposition}, booktitle = {15th Workshop on Advances in Parallel and Distributed Computational Models, IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2013)}, year = {2013}, month = {2013-05}, publisher = {IEEE}, organization = {IEEE}, address = {Boston, MA}, abstract = {Systolic arrays offer a very attractive, data-centric, execution model as an alternative to the von Neumann architecture. Hardware implementations of systolic arrays turned out not to be viable solutions in the past. This article shows how the systolic design principles can be applied to a software solution to deliver an algorithm with unprecedented strong scaling capabilities. Systolic array for the QR decomposition is developed and a virtualization layer is used for mapping of the algorithm to a large distributed memory system. Strong scaling properties are discovered, superior to existing solutions.}, keywords = {dataflow programming, message passing, multi-core, QR decomposition, roofline model, systolic array}, doi = {10.1109/IPDPS.2013.119}, author = {Jakub Kurzak and Piotr Luszczek and Mark Gates and Ichitaro Yamazaki and Jack Dongarra} } @techreport {688, title = {On Algorithmic Variants of Parallel Gaussian Elimination: Comparison of Implementations in Terms of Performance and Numerical Properties}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-13-715}, year = {2012}, month = {2013-07}, abstract = {Gaussian elimination is a canonical linear algebra procedure for solving linear systems of equations. In the last few years, the algorithm received a lot of attention in an attempt to improve its parallel performance. This article surveys recent developments in parallel implementations of the Gaussian elimination. Five different flavors are investigated. Three of them are based on different strategies for pivoting: partial pivoting, incremental pivoting, and tournament pivoting. The fourth one replaces pivoting with the Random Butterfly Transformation, and finally, an implementation without pivoting is used as a performance baseline. The technique of iterative refinement is applied to recover numerical accuracy when necessary. All parallel implementations are produced using dynamic, superscalar, runtime scheduling and tile matrix layout. Results on two multi-socket multicore systems are presented. Performance and numerical accuracy is analyzed.}, author = {Simplice Donfack and Jack Dongarra and Mathieu Faverge and Mark Gates and Jakub Kurzak and Piotr Luszczek and Ichitaro Yamazaki} } @article {icl:697, title = {Block-asynchronous Multigrid Smoothers for GPU-accelerated Systems}, journal = {ICCS 2012}, year = {2012}, month = {2012-06}, address = {Omaha, NE}, author = {Hartwig Anzt and Stanimire Tomov and Mark Gates and Jack Dongarra and Vincent Heuveline} } @inproceedings {icl:685, title = {A Class of Communication-Avoiding Algorithms for Solving General Dense Linear Systems on CPU/GPU Parallel Machines}, journal = {Proc. 
of the International Conference on Computational Science (ICCS)}, volume = {9}, year = {2012}, month = {2012-06}, pages = {17-26}, keywords = {magma}, author = {Marc Baboulin and Simplice Donfack and Jack Dongarra and Laura Grigori and Adrien Remi and Stanimire Tomov} } @article {1349, title = {MAGMA: A New Generation of Linear Algebra Library for GPU and Multicore Architectures}, year = {2012}, month = {2012-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC12), Presentation}, address = {Salt Lake City, UT}, author = {Jack Dongarra and Tingxing Dong and Mark Gates and Azzam Haidar and Stanimire Tomov and Ichitaro Yamazaki} } @article {1354, title = {MAGMA MIC: Linear Algebra Library for Intel Xeon Phi Coprocessors}, year = {2012}, month = {2012-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC12)}, address = {Salt Lake City, UT}, author = {Jack Dongarra and Mark Gates and Yulu Jia and Khairul Kabir and Piotr Luszczek and Stanimire Tomov} } @article {1357, title = {MAGMA Tutorial}, year = {2012}, month = {2012-02}, publisher = {Keeneland Workshop}, address = {Atlanta, GA}, author = {Mark Gates} } @techreport {icl:716, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 269)}, number = {UT-CS-12-697}, year = {2012}, month = {2012-06}, author = {George Bosilca and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @inproceedings {icl:607, title = {3-D parallel frequency-domain visco-acoustic wave modelling based on a hybrid direct/iterative solver}, journal = {73rd EAGE Conference \& Exhibition incorporating SPE EUROPEC 2011, Vienna, Austria, 23-26 May}, year = {2011}, month = {2011-00}, author = {Azzam Haidar and Luc Giraud and Hafedh Ben-Hadj-Ali and Florent Sourbier and St{\'e}phane Operto and Jean Virieux} } @inproceedings {icl:605, title = {Algebraic Schwarz Preconditioning for the Schur Complement: Application to the Time-Harmonic Maxwell Equations Discretized by a Discontinuous Galerkin Method.}, journal = {The Twentieth International Conference on Domain Decomposition Methods}, year = {2011}, month = {2011-02}, address = {La Jolla, California}, url = {http://hal.inria.fr/inria-00577639}, author = {Emmanuel Agullo and Luc Giraud and Amina Guermouche and Azzam Haidar and Stephane Lanteri and Jean Roman} } @article {icl:661, title = {Block-asynchronous Multigrid Smoothers for GPU-accelerated Systems}, number = {UT-CS-11-689}, year = {2011}, month = {2011-12}, keywords = {magma}, author = {Hartwig Anzt and Stanimire Tomov and Mark Gates and Jack Dongarra and Vincent Heuveline} } @article {icl:643, title = {The International Exascale Software Project Roadmap}, journal = {International Journal of High Performance Computing}, volume = {25}, number = {1}, year = {2011}, month = {2011-01}, pages = {3-60}, abstract = {Over the last 20 years, the open-source community has provided more and more software on which the world{\textquoteright}s high-performance computing systems depend for performance and productivity. The community has invested millions of dollars and years of effort to build key components. 
However, although the investments in these separate software elements have been tremendously valuable, a great deal of productivity has also been lost because of the lack of planning, coordination, and key integration of technologies necessary to make them work together smoothly and efficiently, both within individual petascale systems and between different systems. It seems clear that this completely uncoordinated development model will not provide the software needed to support the unprecedented parallelism required for peta/exascale computation on millions of cores, or the flexibility required to exploit new hardware models and features, such as transactional memory, speculative execution, and graphics processing units. This report describes the work of the community to prepare for the challenges of exascale computing, ultimately combining their efforts in a coordinated International Exascale Software Project.}, doi = {https://doi.org/10.1177/1094342010391989}, author = {Jack Dongarra and Pete Beckman and Terry Moore and Patrick Aerts and Giovanni Aloisio and Jean-Claude Andre and David Barkai and Jean-Yves Berthou and Taisuke Boku and Bertrand Braunschweig and Franck Cappello and Barbara Chapman and Xuebin Chi and Alok Choudhary and Sudip Dosanjh and Thom Dunning and Sandro Fiore and Al Geist and Bill Gropp and Robert Harrison and Mark Hereld and Michael Heroux and Adolfy Hoisie and Koh Hotta and Zhong Jin and Yutaka Ishikawa and Fred Johnson and Sanjay Kale and Richard Kenway and David Keyes and Bill Kramer and Jesus Labarta and Alain Lichnewsky and Thomas Lippert and Bob Lucas and Barney MacCabe and Satoshi Matsuoka and Paul Messina and Peter Michielse and Bernd Mohr and Matthias S. Mueller and Wolfgang E. Nagel and Hiroshi Nakashima and Michael E. Papka and Dan Reed and Mitsuhisa Sato and Ed Seidel and John Shalf and David Skinner and Marc Snir and Thomas Sterling and Rick Stevens and Fred Streitz and Bob Sugar and Shinji Sumimoto and William Tang and John Taylor and Rajeev Thakur and Anne Trefethen and Mateo Valero and Aad van der Steen and Jeffrey Vetter and Peg Williams and Robert Wisniewski and Kathy Yelick} } @article {, title = {Keeneland: Bringing Heterogeneous GPU Computing to the Computational Science Community}, journal = {IEEE Computing in Science \& Engineering}, volume = {13}, year = {2011}, month = {2011-08}, pages = {90-95}, abstract = {The Keeneland project{\textquoteright}s goal is to develop and deploy an innovative, GPU-based high-performance computing system for the NSF computational science community.}, keywords = {Benchmark testing, Computational modeling, Computer architecture, Graphics processing unit, Hardware, Random access memory, Scientific computing}, doi = {https://doi.org/10.1109/MCSE.2011.83}, author = {Jeffrey Vetter and Richard Glassbrook and Jack Dongarra and Karsten Schwan and Bruce Loftis and Stephen McNally and Jeremy Meredith and James Rogers and Philip Roth and Kyle Spafford and Sudhakar Yalamanchili} } @inproceedings {icl:649, title = {Kernel Assisted Collective Intra-node MPI Communication Among Multi-core and Many-core CPUs}, journal = {Int{\textquoteright}l Conference on Parallel Processing (ICPP {\textquoteright}11)}, year = {2011}, month = {2011-09}, address = {Taipei, Taiwan}, author = {Teng Ma and George Bosilca and Aurelien Bouteiller and Brice Goglin and J.
Squyres and Jack Dongarra} } @article {icl:647, title = {OMPIO: A Modular Software Architecture for MPI I/O}, journal = {18th EuroMPI}, year = {2011}, month = {2011-09}, pages = {81-89}, publisher = {Springer}, address = {Santorini, Greece}, author = {Mohamad Chaarawi and Edgar Gabriel and Rainer Keller and Richard L. Graham and George Bosilca and Jack Dongarra}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @article {icl:606, title = {Parallel algebraic domain decomposition solver for the solution of augmented systems.}, journal = {Parallel, Distributed, Grid and Cloud Computing for Engineering, Ajaccio, Corsica, France, 12-15 April}, year = {2011}, month = {2011-00}, author = {Emmanuel Agullo and Luc Giraud and Amina Guermouche and Azzam Haidar and Jean Roman} } @article {icl:603, title = {Three-dimensional parallel frequency-domain visco-acoustic wave modelling based on a hybrid direct/iterative solver.}, journal = {To appear in Geophysical Prospecting journal.}, year = {2011}, month = {2011-00}, author = {Florent Sourbier and Azzam Haidar and Luc Giraud and Hafedh Ben-Hadj-Ali and St{\'e}phane Operto and Jean Virieux} } @inproceedings {icl:534, title = {Dodging the Cost of Unavoidable Memory Copies in Message Logging Protocols}, journal = {Proceedings of EuroMPI 2010}, year = {2010}, month = {2010-09}, publisher = {Springer}, address = {Stuttgart, Germany}, keywords = {ftmpi}, author = {George Bosilca and Aurelien Bouteiller and Thomas Herault and Pierre Lemariner and Jack Dongarra}, editor = {Jack Dongarra and Michael Resch and Rainer Keller and Edgar Gabriel} } @inproceedings {icl:527, title = {Improvement of parallelization efficiency of batch pattern BP training algorithm using Open MPI}, journal = {Proceedings of International Conference on Computational Science, ICCS 2010 (to appear)}, year = {2010}, month = {2010-06}, publisher = {Elsevier}, address = {Amsterdam The Netherlands}, keywords = {hpcchallenge}, author = {Volodymyr Turchenko and Lucio Grandinetti and George Bosilca and Jack Dongarra} } @techreport {icl:597, title = {Kernel Assisted Collective Intra-node Communication Among Multicore and Manycore CPUs}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-10-663}, year = {2010}, month = {2010-11}, author = {Teng Ma and George Bosilca and Aurelien Bouteiller and Brice Goglin and J. Squyres and Jack Dongarra} } @article {icl:573, title = {Level-3 Cholesky Kernel Subroutine of a Fully Portable High Performance Minimal Storage Hybrid Format Cholesky Algorithm}, journal = {ACM TOMS (submitted), also LAPACK Working Note (LAWN) 211}, year = {2010}, month = {2010-00}, author = {Fred G. 
Gustavson and Jerzy Wasniewski and Jack Dongarra} } @article {icl:553, title = {MaPHyS or the Development of a Parallel Algebraic Domain Decomposition Solver in the Course of the Solstice Project}, journal = {Sparse Days 2010 Meeting at CERFACS}, year = {2010}, month = {2010-06}, address = {Toulouse, France}, author = {Emmanuel Agullo and Luc Giraud and Amina Guermouche and Azzam Haidar and Jean Roman and Yohan Lee-Tin-Yien} } @inproceedings {icl:581, title = {Recent Advances in the Message Passing Interface, Lecture Notes in Computer Science (LNCS)}, journal = {EuroMPI 2010 Proceedings}, volume = {6305}, year = {2010}, month = {2010-09}, publisher = {Springer}, address = {Stuttgart, Germany}, editor = {Rainer Keller and Edgar Gabriel and Michael Resch and Jack Dongarra} } @article {icl:551, title = {Rectangular Full Packed Format for Cholesky{\textquoteright}s Algorithm: Factorization, Solution, and Inversion}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {37}, number = {2}, year = {2010}, month = {2010-04}, address = {Atlanta, GA}, author = {Fred G. Gustavson and Jerzy Wasniewski and Jack Dongarra and Julien Langou} } @article {icl:566, title = {SmartGridRPC: The new RPC model for high performance Grid Computing and Its Implementation in SmartGridSolve}, journal = {Concurrency and Computation: Practice and Experience (to appear)}, year = {2010}, month = {2010-01}, keywords = {netsolve}, author = {Thomas Brady and Alexey Lastovetsky and Keith Seymour and Michele Guidolin and Jack Dongarra} } @article {icl:545, title = {Sparse approximations of the Schur complement for parallel algebraic hybrid solvers in 3D}, journal = {Numerical Mathematics: Theory, Methods and Applications}, volume = {3}, number = {3}, year = {2010}, month = {2010-00}, pages = {64-82}, publisher = {Global Science Press}, address = {Beijing}, author = {Luc Giraud and Azzam Haidar and Yousef Saad}, editor = {C.
Zhiming} } @article {icl:552, title = {Towards a Complexity Analysis of Sparse Hybrid Linear Solvers}, journal = {PARA 2010}, year = {2010}, month = {2010-06}, address = {Reykjavik, Iceland}, author = {Emmanuel Agullo and Luc Giraud and Amina Guermouche and Azzam Haidar and Jean Roman} } @article {icl:544, title = {Using multiple levels of parallelism to enhance the performance of domain decomposition solvers}, journal = {Parallel Computing}, volume = {36}, number = {5-6}, year = {2010}, month = {2010-00}, pages = {285-296}, publisher = {Elsevier journals}, author = {Luc Giraud and Azzam Haidar and Stephane Pralet}, editor = {Costas Bekas and Pascua D{\textquoteright}Ambra and Ananth Grama and Yousef Saad and Petko Yanev} } @article {icl:482, title = {Computing the Conditioning of the Components of a Linear Least-squares Solution}, journal = {Numerical Linear Algebra with Applications}, volume = {16}, number = {7}, year = {2009}, month = {2009-00}, pages = {517-533}, author = {Marc Baboulin and Jack Dongarra and Serge Gratton and Julien Langou} } @inproceedings {icl:602, title = {Modeling the Office of Science Ten Year Facilities Plan: The PERI Architecture Tiger Team}, journal = {SciDAC 2009, Journal of Physics: Conference Series}, volume = {180(2009)012039}, year = {2009}, month = {2009-07}, publisher = {IOP Publishing}, address = {San Diego, California}, keywords = {test}, author = {Bronis R. de Supinski and Sadaf Alam and David Bailey and Laura Carrington and Chris Daley and Anshu Dubey and Todd Gamblin and Dan Gunter and Paul D. Hovland and Heike Jagode and Karen Karavanic and Gabriel Marin and John Mellor-Crummey and Shirley Moore and Boyana Norris and Leonid Oliker and Catherine Olschanowsky and Philip C. Roth and Martin Schulz and Sameer Shende and Allan Snavely} } @article {icl:511, title = {Rectangular Full Packed Format for Cholesky{\textquoteright}s Algorithm: Factorization, Solution and Inversion}, journal = {ACM TOMS (to appear)}, year = {2009}, month = {2009-00}, author = {Fred G. 
Gustavson and Jerzy Wasniewski and Jack Dongarra and Julien Langou} } @inproceedings {icl:519, title = {VGrADS: Enabling e-Science Workflows on Grids and Clouds with Fault Tolerance}, journal = {SC{\textquoteright}09 The International Conference for High Performance Computing, Networking, Storage and Analysis (to appear)}, year = {2009}, month = {2009-00}, address = {Portland, OR}, keywords = {grads}, author = {Lavanya Ramakrishnan and Daniel Nurmi and Anirban Mandal and Charles Koelbel and Dennis Gannon and Mark Huang and Yang-Suk Kee and Graziano Obertelli and Kiran Thyagaraja and Rich Wolski and Asim YarKhan and Dmitrii Zagorodnov} } @article {icl:457, title = {Computing the Conditioning of the Components of a Linear Least Squares Solution}, journal = {VECPAR {\textquoteright}08, High Performance Computing for Computational Science}, year = {2008}, month = {2008-01}, address = {Toulouse, France}, author = {Marc Baboulin and Jack Dongarra and Serge Gratton and Julien Langou} } @article {icl:451, title = {DARPA{\textquoteright}s HPCS Program: History, Models, Tools, Languages}, journal = {in Advances in Computers}, volume = {72}, year = {2008}, month = {2008-01}, publisher = {Elsevier}, author = {Jack Dongarra and Robert Graybill and William Harrod and Robert Lucas and Ewing Lusk and Piotr Luszczek and Janice McMahon and Allan Snavely and Jeffrey Vetter and Katherine Yelick and Sadaf Alam and Roy Campbell and Laura Carrington and Tzu-Yi Chen and Omid Khalili and Jeremy Meredith and Mustafa Tikir}, editor = {M. Zelkowitz} } @article {icl:449, title = {Exploiting Mixed Precision Floating Point Hardware in Scientific Computations}, journal = {in High Performance Computing and Grids in Action}, year = {2008}, month = {2008-01}, publisher = {IOS Press}, address = {Amsterdam}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julie Langou and Julien Langou and Piotr Luszczek and Stanimire Tomov}, editor = {Lucio Grandinetti} } @article {icl:409, title = {High Performance GridRPC Middleware}, journal = {Recent developments in Grid Technology and Applications}, year = {2008}, month = {2008-00}, publisher = {Nova Science Publishers}, keywords = {netsolve}, author = {Yves Caniou and Eddy Caron and Frederic Desprez and Hidemoto Nakada and Yoshio Tanaka and Keith Seymour}, editor = {George A. Gravvanis and John P. Morrison and Hamid R. Arabnia and D. A. Power} } @article {icl:425, title = {Netlib and NA-Net: Building a Scientific Computing Community}, journal = {IEEE Annals of the History of Computing}, volume = {30}, number = {2}, year = {2008}, month = {2008-01}, pages = {30-41}, author = {Jack Dongarra and Gene H. Golub and Eric Grosse and Cleve Moler and Keith Moore} } @techreport {icl:422, title = {Rectangular Full Packed Format for Cholesky{\textquoteright}s Algorithm: Factorization, Solution and Inversion}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-08-614 (also LAPACK Working Note 199)}, year = {2008}, month = {2008-04}, author = {Fred G.
Gustavson and Jerzy Wasniewski and Jack Dongarra} } @inproceedings {icl:412, title = {Usage of the Scalasca Toolset for Scalable Performance Analysis of Large-scale Parallel Applications}, journal = {Proceedings of the 2nd International Workshop on Tools for High Performance Computing}, year = {2008}, month = {2008-01}, pages = {157-167}, publisher = {Springer}, address = {Stuttgart, Germany}, keywords = {point}, author = {Felix Wolf and Brian Wylie and Erika Abraham and Wolfgang Frings and Karl F{\"u}rlinger and Markus Geimer and Marc-Andre Hermanns and Bernd Mohr and Shirley Moore and Matthias Pfeifer}, editor = {Michael Resch and Rainer Keller and Valentin Himmler and Bettina Krammer and A Schulz} } @techreport {icl:430, title = {Using dual techniques to derive componentwise and mixed condition numbers for a linear functional of a linear least squares solution}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-08-622 (also LAPACK Working Note 207)}, year = {2008}, month = {2008-01}, author = {Marc Baboulin and Serge Gratton} } @article {icl:360, title = {A Comparison of Application Performance Using Open MPI and Cray MPI}, journal = {Cray User Group, CUG 2007}, year = {2007}, month = {2007-05}, author = {Richard L. Graham and George Bosilca and Jelena Pjesivac{\textendash}Grbovic} } @techreport {icl:391, title = {Computing the Conditioning of the Components of a Linear Least Squares Solution}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-07-604, (also LAPACK Working Note 193)}, year = {2007}, month = {2007-01}, author = {Marc Baboulin and Jack Dongarra and Serge Gratton and Julien Langou} } @article {icl:359, title = {An Evaluation of Open MPI{\textquoteright}s Matching Transport Layer on the Cray XT}, journal = {EuroPVM/MPI 2007}, year = {2007}, month = {2007-09}, author = {Richard L. Graham and Ron Brightwell and Brian Barrett and George Bosilca and Jelena Pjesivac{\textendash}Grbovic} } @article {icl:392, title = {Exploiting Mixed Precision Floating Point Hardware in Scientific Computations}, journal = {In High Performance Computing and Grids in Action (to appear)}, year = {2007}, month = {2007-00}, publisher = {IOS Press}, address = {Amsterdam}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Julie Langou and Piotr Luszczek and Stanimire Tomov}, editor = {Lucio Grandinetti} } @inproceedings {icl:339, title = {GridSolve: The Evolution of Network Enabled Solver}, journal = {Grid-Based Problem Solving Environments: IFIP TC2/WG 2.5 Working Conference on Grid-Based Problem Solving Environments (Prescott, AZ, July 2006)}, year = {2007}, month = {2007-00}, pages = {215-226}, publisher = {Springer}, keywords = {netsolve}, author = {Asim YarKhan and Jack Dongarra and Keith Seymour}, editor = {Patrick Gaffney} } @article {, title = {The Impact of Multicore on Computational Science Software}, journal = {CTWatch Quarterly}, volume = {3}, year = {2007}, month = {2007-02}, author = {Jack Dongarra and Dennis Gannon and Geoffrey Fox and Ken Kennedy} } @article {icl:394, title = {Netlib and NA-Net: building a scientific computing community}, journal = {In IEEE Annals of the History of Computing (to appear)}, year = {2007}, month = {2007-08}, author = {Jack Dongarra and Gene H. 
Golub and Cleve Moler and Keith Moore} } @article {icl:358, title = {Performance Analysis of MPI Collective Operations}, journal = {Cluster computing}, volume = {10}, number = {2}, year = {2007}, month = {2007-06}, pages = {127-143}, publisher = {Springer Netherlands}, keywords = {ftmpi}, author = {Jelena Pjesivac{\textendash}Grbovic and Thara Angskun and George Bosilca and Graham Fagg and Edgar Gabriel and Jack Dongarra} } @inproceedings {icl:384, title = {Scalability Analysis of the SPEC OpenMP Benchmarks on Large-Scale Shared Memory Multiprocessors}, journal = {Proceedings of the 2007 International Conference on Computational Science (ICCS 2007)}, volume = {4487-4490}, year = {2007}, pages = {815-822}, publisher = {Springer LNCS}, address = {Beijing, China}, keywords = {kojak}, doi = {10.1007/978-3-540-72586-2_115}, author = {Karl F{\"u}rlinger and Michael Gerndt and Jack Dongarra}, editor = {Yong Shi and Jack Dongarra and Geert Dick van Albada and Peter M. Sloot} } @article {icl:385, title = {Specification and detection of performance problems with ASL}, journal = {Concurrency and Computation: Practice and Experience}, volume = {19}, number = {11}, year = {2007}, month = {2007-01}, pages = {1451-1464}, publisher = {John Wiley and Sons Ltd.}, author = {Michael Gerndt and Karl F{\"u}rlinger} } @inproceedings {icl:382, title = {On Using Incremental Profiling for the Performance Analysis of Shared Memory Parallel Applications}, journal = {Proceedings of the 13th International Euro-Par Conference on Parallel Processing (Euro-Par {\textquoteright}07)}, year = {2007}, month = {2007-01}, publisher = {Springer LNCS}, address = {Rennes, France}, keywords = {kojak}, author = {Karl F{\"u}rlinger and Jack Dongarra and Michael Gerndt} } @article {icl:652, title = {A High-Performance, Heterogeneous MPI}, journal = {HeteroPar 2006}, year = {2006}, month = {2006-09}, address = {Barcelona, Spain}, author = {Richard L. Graham and Galen M. Shipman and Brian Barrett and Ralph Castain and George Bosilca and Andrew Lumsdaine} } @article {icl:327, title = {Predicting the electronic properties of 3D, million-atom semiconductor nanostructure architectures}, journal = {J. Phys.: Conf. Ser. 46}, volume = {46}, doi = {10.1088/1742-6596/46/1/040}, year = {2006}, month = {2006-01}, pages = {292-298}, keywords = {DOE_NANO}, author = {Alex Zunger and Alberto Franceschetti and Gabriel Bester and Wesley B. Jones and Kwiseon Kim and Peter A. Graf and Lin-Wang Wang and Andrew Canning and Osni Marques and Christof Voemel and Jack Dongarra and Julien Langou and Stanimire Tomov} } @article {icl:370, title = {Prospectus for the Next LAPACK and ScaLAPACK Libraries}, journal = {PARA 2006}, year = {2006}, month = {2006-06}, address = {Umea, Sweden}, author = {James Demmel and Jack Dongarra and B. Parlett and William Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye Li and Osni Marques and Jason E. Riedy and Christof Voemel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov} } @techreport {icl:308, title = {Twenty-Plus Years of Netlib and NA-Net}, journal = {University of Tennessee Computer Science Department Technical Report, UT-CS-04-526}, year = {2006}, month = {2006-00}, author = {Jack Dongarra and Gene H.
Golub and Eric Grosse and Cleve Moler and Keith Moore} } @inproceedings {icl:265, title = {Fault Tolerant High Performance Computing by a Coding Approach}, journal = {Proceedings of ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (to appear)}, year = {2005}, month = {2005-01}, address = {Chicago, Illinois}, keywords = {ftmpi, grads, lacsi, sans}, author = {Zizhong Chen and Graham Fagg and Edgar Gabriel and Julien Langou and Thara Angskun and George Bosilca and Jack Dongarra} } @article {, title = {NanoPSE: A Nanoscience Problem Solving Environment for Atomistic Electronic Structure of Semiconductor Nanostructures}, journal = {Journal of Physics: Conference Series}, year = {2005}, month = {2005-06}, pages = {277-282}, abstract = {Researchers at the National Renewable Energy Laboratory and their collaborators have developed over the past ~10 years a set of algorithms for an atomistic description of the electronic structure of nanostructures, based on plane-wave pseudopotentials and configuration interaction. The present contribution describes the first step in assembling these various codes into a single, portable, integrated set of software packages. This package is part of an ongoing research project in the development stage. Components of NanoPSE include codes for atomistic nanostructure generation and passivation, valence force field model for atomic relaxation, code for potential field generation, empirical pseudopotential method solver, strained linear combination of bulk bands method solver, configuration interaction solver for excited states, selection of linear algebra methods, and several inverse band structure solvers. Although not available for general distribution at this time as it is being developed and tested, the design goal of the NanoPSE software is to provide a software context for collaboration. The software package is enabled by fcdev, an integrated collection of best practice GNU software for open source development and distribution augmented to better support FORTRAN.}, doi = {https://doi.org/10.1088/1742-6596/16/1/038}, url = {https://iopscience.iop.org/article/10.1088/1742-6596/16/1/038/meta}, author = {Wesley B. Jones and Gabriel Bester and Andrew Canning and Alberto Franceschetti and Peter A. Graf and Kwiseon Kim and Julien Langou and Lin-Wang Wang and Jack Dongarra and Alex Zunger} } @article {icl:276, title = {NetSolve: Grid Enabling Scientific Computing Environments}, journal = {Grid Computing and New Frontiers of High Performance Processing}, number = {14}, year = {2005}, month = {2005-00}, publisher = {Elsevier}, keywords = {netsolve}, author = {Keith Seymour and Asim YarKhan and Sudesh Agrawal and Jack Dongarra}, editor = {Lucio Grandinetti} } @article {icl:286, title = {On the Parallel Solution of Large Industrial Wave Propagation Problems}, journal = {Journal of Computational Acoustics (to appear)}, year = {2005}, month = {2005-01}, author = {Luc Giraud and Julien Langou and G. 
Sylvand} } @inproceedings {icl:249, title = {Performance Analysis of MPI Collective Operations}, journal = {4th International Workshop on Performance Modeling, Evaluation, and Optimization of Parallel and Distributed Systems (PMEO-PDS {\textquoteright}05)}, year = {2005}, month = {2005-04}, address = {Denver, Colorado}, keywords = {ftmpi}, author = {Jelena Pjesivac{\textendash}Grbovic and Thara Angskun and George Bosilca and Graham Fagg and Edgar Gabriel and Jack Dongarra} } @article {icl:306, title = {Performance Analysis of MPI Collective Operations}, journal = {Cluster Computing Journal (to appear)}, year = {2005}, month = {2005-01}, keywords = {ftmpi}, author = {Jelena Pjesivac{\textendash}Grbovic and Thara Angskun and George Bosilca and Graham Fagg and Edgar Gabriel and Jack Dongarra} } @article {icl:285, title = {Rounding Error Analysis of the Classical Gram-Schmidt Orthogonalization Process}, journal = {Numerische Mathematik}, volume = {101}, number = {1}, year = {2005}, month = {2005-01}, pages = {87-100}, author = {Luc Giraud and Julien Langou and Miroslav Rozlo{\v z}n{\'\i}k and Jasper van den Eshof} } @article {icl:244, title = {Self Adaptivity in Grid Computing}, journal = {Concurrency and Computation: Practice and Experience, Special Issue: Grid Performance}, volume = {17}, number = {2-4}, year = {2005}, month = {2005-00}, pages = {235-257}, keywords = {netsolve, sans}, author = {Sathish Vadhiyar and Jack Dongarra}, editor = {John Gurd and Anthony Hey and Juri Papay and Graham Riley} } @article {icl:236, title = {Cray X1 Evaluation Status Report}, journal = {Oak Ridge National Laboratory Report}, volume = {/-2004/13}, year = {2004}, month = {2004-01}, author = {Pratul Agarwal and R. A. Alexander and E. Apra and Satish Balay and Arthur S. Bland and James Colgan and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Tom Dunigan and Mark Fahey and Al Geist and M. Gordon and Robert Harrison and Dinesh Kaushik and M. Krishnakumar and Piotr Luszczek and Tony Mezzacappa and Jeff Nichols and Jarek Nieplocha and Leonid Oliker and T. Packwood and M. Pindzola and Thomas C. Schulthess and Jeffrey Vetter and James B White and T. Windus and Patrick H.
Worley and Thomas Zacharia} } @inproceedings {icl:230, title = {Extending the MPI Specification for Process Fault Tolerance on High Performance Computing Systems}, journal = {Proceedings of ISC2004 (to appear)}, year = {2004}, month = {2004-06}, address = {Heidelberg, Germany}, keywords = {ftmpi, lacsi}, author = {Graham Fagg and Edgar Gabriel and George Bosilca and Thara Angskun and Zizhong Chen and Jelena Pjesivac{\textendash}Grbovic and Kevin London and Jack Dongarra} } @techreport {icl:200, title = {NetBuild: Automated Installation and Use of Network-Accessible Software Libraries}, journal = {ICL Technical Report}, number = {ICL-UT-04-02}, year = {2004}, month = {2004-01}, keywords = {netbuild}, author = {Keith Moore and Jack Dongarra and Shirley Moore and Eric Grosse} } @article {icl:240, title = {Process Fault-Tolerance: Semantics, Design and Applications for High Performance Computing}, journal = {International Journal for High Performance Applications and Supercomputing (to appear)}, year = {2004}, month = {2004-04}, keywords = {ftmpi, lacsi}, author = {Graham Fagg and Edgar Gabriel and Zizhong Chen and Thara Angskun and George Bosilca and Jelena Pjesivac{\textendash}Grbovic and Jack Dongarra} } @article {icl:202, title = {The Virtual Instrument: Support for Grid-enabled Scientific Simulations}, journal = {International Journal of High Performance Computing Applications}, volume = {18}, number = {1}, year = {2004}, month = {2004-01}, pages = {3-17}, author = {Henri Casanova and Thomas Bartol and Francine Berman and Adam Birnbaum and Jack Dongarra and Mark Ellisman and Marcio Faerman and Erhan Gockay and Michelle Miller and Graziano Obertelli and Stuart Pomerantz and Terry Sejnowski and Joel Stiles and Rich Wolski} } @article {icl:169, title = {Computational Science {\textemdash} ICCS 2003}, journal = {Lecture Notes in Computer Science}, volume = {2657-2660}, year = {2003}, month = {2003-06}, publisher = {Springer-Verlag, Berlin}, address = {ICCS 2003, International Conference. Melbourne, Australia}, issn = {978-3-540-40194-0}, author = {Peter M. Sloot and David Abramson and Alexander V. 
Bogdanov and Jack Dongarra and Albert Zomaya and Yuriy Gorbachev} } @article {icl:145, title = {Evaluating The Performance Of MPI-2 Dynamic Communicators And One-Sided Communication}, journal = {Lecture Notes in Computer Science, Recent Advances in Parallel Virtual Machine and Message Passing Interface, 10th European PVM/MPI User{\textquoteright}s Group Meeting}, volume = {2840}, year = {2003}, month = {2003-09}, pages = {88-97}, publisher = {Springer-Verlag, Berlin}, address = {Venice, Italy}, keywords = {ftmpi}, author = {Edgar Gabriel and Graham Fagg and Jack Dongarra} } @inproceedings {icl:153, title = {Fault Tolerant Communication Library and Applications for High Performance Computing}, journal = {Los Alamos Computer Science Institute (LACSI) Symposium 2003 (presented)}, year = {2003}, month = {2003-10}, address = {Santa Fe, NM}, keywords = {ftmpi, lacsi}, author = {Graham Fagg and Edgar Gabriel and Zizhong Chen and Thara Angskun and George Bosilca and Antonin Bukovsky and Jack Dongarra} } @inproceedings {icl:144, title = {A Fault-Tolerant Communication Library for Grid Environments}, journal = {17th Annual ACM International Conference on Supercomputing (ICS{\textquoteright}03) International Workshop on Grid Computing and e-Science}, year = {2003}, month = {2003-06}, address = {San Francisco}, keywords = {ftmpi, lacsi}, author = {Edgar Gabriel and Graham Fagg and Antonin Bukovsky and Thara Angskun and Jack Dongarra} } @article {icl:80, title = {Automatic Optimisation of Parallel Linear Algebra Routines in Systems with Variable Load}, journal = {EuroPar 2002}, year = {2002}, month = {2002-08}, address = {Paderborn, Germany}, author = {Javier Cuenca and Domingo Giminez and Jos{\'e} Gonz{\'a}lez and Jack Dongarra and Kenneth Roche} } @inproceedings {icl:79, title = {Toward a Framework for Preparing and Executing Adaptive Grid Programs}, journal = {International Parallel and Distributed Processing Symposium: IPDPS 2002 Workshops}, year = {2002}, month = {2002-04}, pages = {0171}, address = {Fort Lauderdale, FL}, keywords = {grads}, author = {Ken Kennedy and John Mellor-Crummey and Keith Cooper and Linda Torczon and Francine Berman and Andrew Chien and Dave Angulo and Ian Foster and Dennis Gannon and Lennart Johnsson and Carl Kesselman and Jack Dongarra and Sathish Vadhiyar} } @article {icl:95, title = {The Virtual Instrument: Support for Grid-enabled Scientific Simulations}, journal = {Journal of Parallel and Distributed Computing (submitted)}, year = {2002}, month = {2002-10}, author = {Henri Casanova and Thomas Bartol and Francine Berman and Adam Birnbaum and Jack Dongarra and Mark Ellisman and Marcio Faerman and Erhan Gockay and Michelle Miller and Graziano Obertelli and Stuart Pomerantz and Terry Sejnowski and Joel Stiles and Rich Wolski} } @article {icl:90, title = {The GrADS Project: Software Support for High-Level Grid Application Development}, journal = {International Journal of High Performance Applications and Supercomputing}, volume = {15}, number = {4}, year = {2001}, month = {2001-01}, pages = {327-344}, keywords = {grads}, author = {Francine Berman and Andrew Chien and Keith Cooper and Jack Dongarra and Ian Foster and Dennis Gannon and Lennart Johnsson and Ken Kennedy and Carl Kesselman and John Mellor-Crummey and Dan Reed and Linda Torczon and Rich Wolski} } @article {icl:14, title = {Parallel IO Support for Meta-Computing Applications: MPI_Connect IO Applied to PACX-MPI}, journal = {8th European PVM/MPI User{\textquoteright}s Group Meeting, Lecture Notes in Computer 
Science}, volume = {2131}, year = {2001}, month = {2001-09}, publisher = {Springer Verlag, Berlin}, address = {Greece}, keywords = {ftmpi}, author = {Graham Fagg and Edgar Gabriel and Michael Resch} } @article {icl:82, title = {Telescoping Languages: A Strategy for Automatic Generation of Scientific Problem-Solving Systems from Annotated Libraries}, journal = {Journal of Parallel and Distributed Computing}, volume = {61}, number = {12}, year = {2001}, month = {2001-12}, pages = {1803-1826}, author = {Ken Kennedy and Bradley Broom and Keith Cooper and Jack Dongarra and Rob Fowler and Dennis Gannon and Lennart Johnsson and John Mellor-Crummey and Linda Torczon} } @techreport {icl:30, title = {The GrADS Project: Software Support for High-Level Grid Application Development}, journal = {Technical Report}, year = {2000}, month = {2000-02}, keywords = {grads}, author = {Francine Berman and Andrew Chien and Keith Cooper and Jack Dongarra and Ian Foster and Dennis Gannon and Lennart Johnsson and Ken Kennedy and Carl Kesselman and Dan Reed and Linda Torczon and Rich Wolski} } @article {icl:31, title = {A Portable Programming Interface for Performance Evaluation on Modern Processors}, journal = {The International Journal of High Performance Computing Applications}, volume = {14}, number = {3}, year = {2000}, month = {2000-09}, pages = {189-204}, keywords = {papi}, doi = {https://doi.org/10.1177/109434200001400303}, author = {Shirley Browne and Jack Dongarra and Nathan Garner and George Ho and Phil Mucci} } @techreport {icl:226, title = {A Portable Programming Interface for Performance Evaluation on Modern Processors}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-00-444}, year = {2000}, month = {2000-07}, author = {Shirley Browne and Jack Dongarra and Nathan Garner and Kevin London and Phil Mucci} } @inproceedings {icl:32, title = {A Scalable Cross-Platform Infrastructure for Application Performance Tuning Using Hardware Counters}, journal = {Proceedings of SuperComputing 2000 (SC{\textquoteright}00)}, year = {2000}, month = {2000-11}, address = {Dallas, TX}, keywords = {papi}, author = {Shirley Browne and Jack Dongarra and Nathan Garner and Kevin London and Phil Mucci} } @article {icl:55, title = {HARNESS: A Next Generation Distributed Virtual Machine}, journal = {International Journal on Future Generation Computer Systems}, volume = {15}, number = {5-6}, year = {1999}, month = {1999-01}, pages = {571-582}, keywords = {harness}, author = {Micah Beck and Jack Dongarra and Graham Fagg and Al Geist and Paul Gray and James Kohl and Mauro Migliardi and Keith Moore and Terry Moore and Philip Papadopoulous and Stephen L. Scott and Vaidy Sunderam} } @article {icl:50, title = {LAPACK Users{\textquoteright} Guide, 3rd ed.}, journal = {Philadelphia: Society for Industrial and Applied Mathematics}, year = {1999}, month = {1999-01}, author = {Ed Anderson and Zhaojun Bai and Christian Bischof and Susan Blackford and James Demmel and Jack Dongarra and Jeremy Du Croz and Anne Greenbaum and Sven Hammarling and Alan McKenney and Danny Sorensen} }