@techreport {, title = {Communication Avoiding LU with Tournament Pivoting in SLATE}, journal = {SLATE Working Notes}, number = {18, ICL-UT-22-01}, year = {2022}, month = {2022-01}, author = {Rabab Alomairy and Mark Gates and Sebastien Cayrols and Dalal Sukkari and Kadir Akbudak and Asim YarKhan and Paul Bagwell and Jack Dongarra} } @techreport {, title = {PAQR: Pivoting Avoiding QR factorization}, journal = {ICL Technical Report}, number = {ICL-UT-22-06}, year = {2022}, month = {2022-06}, abstract = {The solution of linear least-squares problems is at the heart of many scientific and engineering applications. While any method able to minimize the backward error of such problems is considered numerically stable, the theory states that the forward error depends on the condition number of the matrix in the system of equations. On the one hand, the QR factorization is an efficient method to solve such problems, but the solutions it produces may have large forward errors when the matrix is deficient. On the other hand, QR with column pivoting (QRCP) is able to produce smaller forward errors on deficient matrices, but its cost is prohibitive compared to QR. The aim of this paper is to propose PAQR, an alternative solution method with the same cost (or smaller) as QR and as accurate as QRCP in practical cases, for the solution of rank-deficient linear least-squares problems. After presenting the algorithm and its implementations on different architectures, we compare its accuracy and performance results on a variety of application problems. }, author = {Wissam M. Sid-Lakhdar and Sebastien Cayrols and Daniel Bielich and Ahmad Abdelfattah and Piotr Luszczek and Mark Gates and Stanimire Tomov and Hans Johansen and David Williams-Young and Timothy A. Davis and Jack Dongarra} } @conference {, title = {Threshold Pivoting for Dense LU Factorization}, booktitle = {ScalAH22: 13th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Heterogeneous Systems }, year = {2022}, month = {2022-11}, publisher = {IEEE}, organization = {IEEE}, address = {Dallas, Texas}, abstract = {LU factorization is a key approach for solving large, dense systems of linear equations. Partial row pivoting is commonly used to ensure numerical stability; however, the data movement needed for the row interchanges can reduce performance. To improve this, we propose using threshold pivoting to find pivots almost as good as those selected by partial pivoting but that result in less data movement. Our theoretical analysis bounds the element growth similarly to partial pivoting; however, it also shows that the growth of threshold pivoting for a given matrix cannot be bounded by that of partial pivoting and vice versa. Additionally, we experimentally tested the approach on the Summit supercomputer. Threshold pivoting improved performance by up to 32\% without a significant effect on accuracy. For a more aggressive configuration with up to one digit of accuracy lost, the improvement was as high as 44\%.}, doi = {10.1109/ScalAH56622.2022.00010}, author = {Neil Lindquist and Mark Gates and Piotr Luszczek and Jack Dongarra} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms and LAPACK Routines}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {47}, number = {3}, year = {2021}, pages = {1{\textendash}23}, abstract = {This article describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). 
The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular, half precision is used in many very large scale applications, such as those associated with machine learning.}, keywords = {Computations on matrices, Mathematical analysis, Mathematics of computing, Numerical analysis}, doi = {10.1145/3431921}, author = {Abdelfattah, Ahmad and Costa, Timothy and Jack Dongarra and Mark Gates and Haidar, Azzam and Hammarling, Sven and Higham, Nicholas J and Kurzak, Jakub and Piotr Luszczek and Stanimire Tomov and others} } @techreport {, title = {SLATE Performance Improvements: QR and Eigenvalues}, journal = {SLATE Working Notes}, number = {17, ICL-UT-21-02}, year = {2021}, month = {2021-04}, author = {Kadir Akbudak and Paul Bagwell and Sebastien Cayrols and Mark Gates and Dalal Sukkari and Asim YarKhan and Jack Dongarra} } @techreport {, title = {SLATE Port to AMD and Intel Platforms}, journal = {SLATE Working Notes}, number = {16, ICL-UT-21-01}, year = {2021}, month = {2021-04}, author = {Ahmad Abdelfattah and Mohammed Al Farhan and Cade Brown and Mark Gates and Dalal Sukkari and Asim YarKhan and Jack Dongarra} } @article {, title = {A survey of numerical linear algebra methods utilizing mixed-precision arithmetic}, journal = {The International Journal of High Performance Computing Applications}, volume = {35}, number = {4}, year = {2021}, pages = {344{\textendash}369}, abstract = {The efficient utilization of mixed-precision numerical linear algebra algorithms can offer attractive acceleration to scientific computing applications. Especially with the hardware integration of low-precision special-function units designed for machine learning applications, the traditional numerical algorithms community urgently needs to reconsider the floating point formats used in the distinct operations to efficiently leverage the available compute power. In this work, we provide a comprehensive survey of mixed-precision numerical linear algebra routines, including the underlying concepts, theoretical background, and experimental results for both dense and sparse linear algebra problems.}, keywords = {GPUs, High-performance computing, linear algebra, Mixed-precision arithmetic, numerical mathematics}, doi = {10.1177/10943420211003313}, author = {Abdelfattah, Ahmad and Anzt, Hartwig and Boman, Erik G and Carson, Erin and Cojean, Terry and Jack Dongarra and Fox, Alyson and Mark Gates and Higham, Nicholas J and Li, Xiaoye S and others} } @inproceedings {, title = {Task-graph scheduling extensions for efficient synchronization and communication}, journal = {Proceedings of the ACM International Conference on Supercomputing}, year = {2021}, pages = {88{\textendash}101}, abstract = {Task graphs have been studied for decades as a foundation for scheduling irregular parallel applications and incorporated in many programming models including OpenMP. 
While many high-performance parallel libraries are based on task graphs, they also have additional scheduling requirements, such as synchronization within inner levels of data parallelism and internal blocking communications. In this paper, we extend task-graph scheduling to support efficient synchronization and communication within tasks. Compared to past work, our scheduler avoids deadlock and oversubscription of worker threads, and refines victim selection to increase the overlap of sibling tasks. To the best of our knowledge, our approach is the first to combine gang-scheduling and work-stealing in a single runtime. Our approach has been evaluated on the SLATE high-performance linear algebra library. Relative to the LLVM OMP runtime, our runtime demonstrates performance improvements of up to 13.82\%, 15.2\%, and 36.94\% for LU, QR, and Cholesky, respectively, evaluated across different configurations related to matrix size, number of nodes, and use of CPUs vs GPUs.}, keywords = {Compilers, Computing methodologies, Parallel computing methodologies, Parallel programming languages, Runtime environments, Software and its engineering, Software notations and tools}, doi = {10.1145/3447818.3461616}, author = {Bak, Seonmyeong and Hernandez, Oscar and Mark Gates and Piotr Luszczek and Sarkar, Vivek} } @article {, title = {Translational process: Mathematical software perspective}, journal = {Journal of Computational Science}, volume = {52}, year = {2021}, pages = {101216}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, DATAFLOW scheduling runtimes, hardware accelerators}, doi = {10.1016/j.jocs.2020.101216}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @article {, title = {Clover: Computational Libraries Optimized via Exascale Research}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Mark Gates and Stanimire Tomov and Hartwig Anzt and Piotr Luszczek and Jack Dongarra} } @article {, title = {MAGMA Templates for Scalable Linear Algebra on Emerging Architectures}, journal = {The International Journal of High Performance Computing Applications}, volume = {34}, year = {2020}, month = {2020-11}, pages = {645-658}, abstract = {With the acquisition and widespread use of more resources that rely on accelerator/wide vector{\textendash}based computing, there has been a strong demand for science and engineering applications to take advantage of these latest assets. This, however, has been extremely challenging due to the diversity of systems to support their extreme concurrency, complex memory hierarchies, costly data movement, and heterogeneous node architectures. 
To address these challenges, we design a programming model and describe its ease of use in the development of a new MAGMA Templates library that delivers high-performance scalable linear algebra portable on current and emerging architectures. MAGMA Templates derives its performance and portability by (1) building on existing state-of-the-art linear algebra libraries, like MAGMA, SLATE, Trilinos, and vendor-optimized math libraries, and (2) providing access (seamlessly to the users) to the latest algorithms and architecture-specific optimizations through a single, easy-to-use C++-based API.}, issn = {1094-3420}, doi = {https://doi.org/10.1177/1094342020938421}, author = {Mohammed Al Farhan and Ahmad Abdelfattah and Stanimire Tomov and Mark Gates and Dalal Sukkari and Azzam Haidar and Robert Rosenberg and Jack Dongarra} } @techreport {1454, title = {Performance Tuning SLATE}, journal = {SLATE Working Notes}, number = {14, ICL-UT-20-01}, year = {2020}, month = {2020-01}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Mark Gates and Ali Charara and Asim YarKhan and Dalal Sukkari and Mohammed Al Farhan and Jack Dongarra} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms}, journal = {ACM Transactions on Mathematical Software}, year = {2020}, month = {2020-10}, abstract = {This paper describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular half precision is used in many very large scale applications, such as those associated with machine learning.}, author = {Ahmad Abdelfattah and Timothy Costa and Jack Dongarra and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. 
Higham and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Mawussi Zounon} } @techreport {, title = {SLATE Performance Report: Updates to Cholesky and LU Factorizations}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-14}, year = {2020}, month = {2020-10}, publisher = {University of Tennessee}, author = {Asim YarKhan and Mohammed Al Farhan and Dalal Sukkari and Mark Gates and Jack Dongarra} } @article {, title = {SLATE: Software for Linear Algebra Targeting Exascale (POSTER)}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Mark Gates and Ali Charara and Jakub Kurzak and Asim YarKhan and Mohammed Al Farhan and Dalal Sukkari and Jack Dongarra} } @article {1464, title = {SLATE Tutorial}, year = {2020}, month = {2020-02}, publisher = {2020 ECP Annual Meeting}, address = {Houston, TX}, author = {Mark Gates and Jakub Kurzak and Asim YarKhan and Ali Charara and Jamie Finney and Dalal Sukkari and Mohammed Al Farhan and Ichitaro Yamazaki and Panruo Wu and Jack Dongarra} } @techreport {1278, title = {SLATE Users{\textquoteright} Guide}, journal = {SLATE Working Notes}, number = {10, ICL-UT-19-01}, year = {2020}, month = {2020-07}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Mark Gates and Ali Charara and Jakub Kurzak and Asim YarKhan and Mohammed Al Farhan and Dalal Sukkari and Jack Dongarra} } @techreport {, title = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic}, journal = {SLATE Working Notes}, number = {15, ICL-UT-20-08}, year = {2020}, month = {2020-07}, publisher = {University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Erik Boman and Erin Carson and Terry Cojean and Jack Dongarra and Mark Gates and Thomas Gruetzmacher and Nicholas J. Higham and Sherry Li and Neil Lindquist and Yang Liu and Jennifer Loe and Piotr Luszczek and Pratik Nayak and Sri Pranesh and Siva Rajamanickam and Tobias Ribizel and Barry Smith and Kasia Swirydowicz and Stephen Thomas and Stanimire Tomov and Yaohung Tsai and Ichitaro Yamazaki and Ulrike Meier Yang} } @article {, title = {Translational Process: Mathematical Software Perspective}, journal = {Journal of Computational Science}, year = {2020}, month = {2020-09}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors.
With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, DATAFLOW scheduling runtimes, hardware accelerators}, doi = {https://doi.org/10.1016/j.jocs.2020.101216}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {, title = {Translational Process: Mathematical Software Perspective}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-11}, year = {2020}, month = {2020-08}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, data flow scheduling runtimes, hardware accelerators}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @inproceedings {1404, title = {Least Squares Solvers for Distributed-Memory Machines with GPU Accelerators}, journal = {ACM International Conference on Supercomputing (ICS {\textquoteright}19)}, year = {2019}, month = {2019-06}, pages = {117{\textendash}126}, publisher = {ACM}, address = {Phoenix, Arizona}, isbn = {9781450360791}, doi = {https://doi.org/10.1145/3330345.3330356}, author = {Jakub Kurzak and Mark Gates and Ali Charara and Asim YarKhan and Jack Dongarra} } @inproceedings {1405, title = {Linear Systems Solvers for Distributed-Memory Machines with GPU Accelerators}, journal = {Euro-Par 2019: Parallel Processing}, volume = {11725}, year = {2019}, month = {2019-08}, pages = {495{\textendash}506}, publisher = {Springer}, isbn = {978-3-030-29399-4}, doi = {https://doi.org/10.1007/978-3-030-29400-7_35}, url = {https://link.springer.com/chapter/10.1007/978-3-030-29400-7_35}, author = {Kurzak, Jakub and Mark Gates and Charara, Ali and Asim YarKhan and Yamazaki, Ichitaro and Jack Dongarra}, editor = {Yahyapour, Ramin} } @conference {1373, title = {Massively Parallel Automated Software Tuning}, booktitle = {48th International Conference on Parallel Processing (ICPP 2019)}, year = {2019}, month = {2019-08}, publisher = {ACM Press}, organization = {ACM Press}, address = {Kyoto, Japan}, abstract = {This article presents an implementation of a distributed autotuning engine developed as part of the Bench-testing OpeN Software Autotuning Infrastructure project. The system is geared towards performance optimization of computational kernels for graphics processing units, and allows for the deployment of vast autotuning sweeps to massively parallel machines. The software implements dynamic work scheduling to distributed-memory resources and takes advantage of multithreading for parallel compilation and dispatches kernel launches to multiple accelerators.
This paper lays out the main design principles of the system and discusses the basic mechanics of the initial implementation. Preliminary performance results are presented, encountered challenges are discussed, and the future directions are outlined.}, doi = {https://doi.org/10.1145/3337821.3337908}, author = {Jakub Kurzak and Yaohung Tsai and Mark Gates and Ahmad Abdelfattah and Jack Dongarra} } @article {1270, title = {PLASMA: Parallel Linear Algebra Software for Multicore Using OpenMP}, journal = {ACM Transactions on Mathematical Software}, volume = {45}, year = {2019}, month = {2019-06}, doi = {https://doi.org/10.1145/3264491}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Maksims Abalenkovs and Negin Bagherpour and Sven Hammarling and Jakub Sistek} } @article {1463, title = {SLATE: Design of a Modern Distributed and Accelerated Linear Algebra Library}, year = {2019}, month = {2019-11}, publisher = {International Conference for High Performance Computing, Networking, Storage and Analysis (SC19)}, address = {Denver, CO}, author = {Mark Gates and Jakub Kurzak and Ali Charara and Asim YarKhan and Jack Dongarra} } @conference {1450, title = {SLATE: Design of a Modern Distributed and Accelerated Linear Algebra Library}, booktitle = {International Conference for High Performance Computing, Networking, Storage and Analysis (SC19)}, year = {2019}, month = {2019-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, abstract = {The SLATE (Software for Linear Algebra Targeting Exascale) library is being developed to provide fundamental dense linear algebra capabilities for current and upcoming distributed high-performance systems, both accelerated CPU-GPU based and CPU based. SLATE will provide coverage of existing ScaLAPACK functionality, including the parallel BLAS; linear systems using LU and Cholesky; least squares problems using QR; and eigenvalue and singular value problems. In this respect, it will serve as a replacement for ScaLAPACK, which after two decades of operation, cannot adequately be retrofitted for modern accelerated architectures. SLATE uses modern techniques such as communication-avoiding algorithms, lookahead panels to overlap communication and computation, and task-based scheduling, along with a modern C++ framework. 
Here we present the design of SLATE and initial reports of several of its components.}, doi = {https://doi.org/10.1145/3295500.3356223}, author = {Mark Gates and Jakub Kurzak and Ali Charara and Asim YarKhan and Jack Dongarra} } @techreport {1279, title = {SLATE Developers{\textquoteright} Guide}, journal = {SLATE Working Notes}, number = {11, ICL-UT-19-02}, year = {2019}, month = {2019-12}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ali Charara and Mark Gates and Jakub Kurzak and Asim YarKhan and Jack Dongarra} } @techreport {1304, title = {SLATE Mixed Precision Performance Report}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-03}, year = {2019}, month = {2019-04}, publisher = {University of Tennessee}, author = {Ali Charara and Jack Dongarra and Mark Gates and Jakub Kurzak and Asim YarKhan} } @techreport {1321, title = {SLATE Working Note 12: Implementing Matrix Inversions}, journal = {SLATE Working Notes}, number = {12, ICL-UT-19-04}, year = {2019}, month = {2019-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Jakub Kurzak and Mark Gates and Ali Charara and Asim YarKhan and Jack Dongarra} } @techreport {1394, title = {SLATE Working Note 13: Implementing Singular Value and Symmetric/Hermitian Eigenvalue Solvers}, journal = {SLATE Working Notes}, number = {13, ICL-UT-19-07}, year = {2019}, note = {revision 06-2023}, month = {2019-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Mark Gates and Mohammed Al Farhan and Ali Charara and Jakub Kurzak and Dalal Sukkari and Asim YarKhan and Jack Dongarra} } @article {1336, title = {Accelerating Linear Algebra with MAGMA}, year = {2018}, month = {2018-02}, publisher = {ECP Annual Meeting 2018, Tutorial}, address = {Knoxville, TN}, author = {Stanimire Tomov and Mark Gates and Azzam Haidar} } @article {1161, title = {Accelerating the SVD Two Stage Bidiagonal Reduction and Divide and Conquer Using GPUs}, journal = {Parallel Computing}, volume = {74}, year = {2018}, month = {2018-05}, pages = {3{\textendash}18}, abstract = {The increasing gap between memory bandwidth and computation speed motivates the choice of algorithms to take full advantage of today{\textquoteright}s high performance computers. For dense matrices, the classic algorithm for the singular value decomposition (SVD) uses a one stage reduction to bidiagonal form, which is limited in performance by the memory bandwidth. To overcome this limitation, a two stage reduction to bidiagonal has been gaining popularity. It first reduces the matrix to band form using high performance Level 3 BLAS, then reduces the band matrix to bidiagonal form. As accelerators such as GPUs and co-processors are becoming increasingly widespread in high-performance computing, a question of great interest to many SVD users is how much the employment of a two stage reduction, as well as other current best practices in GPU computing, can accelerate this important routine. To fulfill this interest, we have developed an accelerated SVD employing a two stage reduction to bidiagonal and a number of other algorithms that are highly optimized for GPUs. Notably, we also parallelize and accelerate the divide and conquer algorithm used to solve the subsequent bidiagonal SVD. 
By accelerating all phases of the SVD algorithm, we provide a significant speedup compared to existing multi-core and GPU-based SVD implementations. In particular, using a P100 GPU, we illustrate a performance of up to 804 Gflop/s in double precision arithmetic to compute the full SVD of a 20k {\texttimes} 20k matrix in 90 seconds, which is 8.9 {\texttimes} faster than MKL on two 10 core Intel Haswell E5-2650 v3 CPUs, 3.7 {\texttimes} over the multi-core PLASMA two stage version, and 2.6 {\texttimes} over the previously accelerated one stage MAGMA version.}, keywords = {2-stage, accelerator, Divide and conquer, gpu, Singular value decomposition, SVD}, issn = {01678191}, doi = {10.1016/j.parco.2017.10.004}, url = {https://www.sciencedirect.com/science/article/pii/S0167819117301758}, author = {Mark Gates and Stanimire Tomov and Jack Dongarra} } @article {1271, title = {Autotuning Numerical Dense Linear Algebra for Batched Computation With GPU Hardware Accelerators}, journal = {Proceedings of the IEEE}, volume = {106}, year = {2018}, month = {2018-11}, pages = {2040{\textendash}2055}, abstract = {Computational problems in engineering and scientific disciplines often rely on the solution of many instances of small systems of linear equations, which are called batched solves. In this paper, we focus on the important variants of both batch Cholesky factorization and subsequent substitution. The former requires the linear system matrices to be symmetric positive definite (SPD). We describe the implementation and automated performance engineering of these kernels that implement the factorization and the two substitutions. Our target platforms are graphics processing units (GPUs), which over the past decade have become an attractive high-performance computing (HPC) target for solvers of linear systems of equations. Due to their throughput-oriented design, GPUs exhibit the highest processing rates among the available processors. However, without careful design and coding, this speed is mostly restricted to large matrix sizes. We show an automated exploration of the implementation space as well as a new data layout for the batched class of SPD solvers. Our tests involve the solution of many thousands of linear SPD systems of exactly the same size. The primary focus of our techniques is on the individual matrices in the batch that have dimensions ranging from 5-by-5 up to 100-by-100. We compare our autotuned solvers against the state-of-the-art solvers such as those provided through NVIDIA channels and publicly available in the optimized MAGMA library. The observed performance is competitive and many times superior for many practical cases. The advantage of the presented methodology lies in achieving these results in a portable manner across matrix storage formats and GPU hardware architecture platforms.}, keywords = {Dense numerical linear algebra, performance autotuning}, doi = {10.1109/JPROC.2018.2868961}, author = {Jack Dongarra and Mark Gates and Jakub Kurzak and Piotr Luszczek and Yaohung Tsai} } @article {1300, title = {Batched BLAS (Basic Linear Algebra Subprograms) 2018 Specification}, year = {2018}, month = {2018-07}, abstract = {This document describes an API for Batch Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). We focus on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. 
The extensions beyond the original BLAS standard are considered that specify a programming interface not only for routines with uniformly-sized matrices and/or vectors but also for the situation where the sizes vary. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance manycore platforms. These include multicore and many-core CPU processors; GPUs and coprocessors; as well as other hardware accelerators with floating-point compute facility.}, author = {Jack Dongarra and Iain Duff and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jonathan Hogg and Pedro Valero Lara and Piotr Luszczek and Mawussi Zounon and Samuel D. Relton and Stanimire Tomov and Timothy Costa and Sarah Knepper} } @article {1263, title = {Computational Benefit of GPU Optimization for Atmospheric Chemistry Modeling}, journal = {Journal of Advances in Modeling Earth Systems}, volume = {10}, year = {2018}, month = {2018-08}, pages = {1952{\textendash}1969}, abstract = {Global chemistry-climate models are computationally burdened as the chemical mechanisms become more complex and realistic. Optimization for graphics processing units (GPU) may make longer global simulation with regional detail possible, but limited study has been done to explore the potential benefit for the atmospheric chemistry modeling. Hence, in this study, the second-order Rosenbrock solver of the chemistry module of CAM4-Chem is ported to the GPU to gauge potential speed-up. We find that on the CPU, the fastest performance is achieved using the Intel compiler with a block interleaved memory layout. Different combinations of compiler and memory layout lead to ~11.02{\texttimes} difference in the computational time. In contrast, the GPU version performs the best when using a combination of fully interleaved memory layout with block size equal to the warp size, CUDA streams for independent kernels, and constant memory. Moreover, the most efficient data transfer between CPU and GPU is gained by allocating the memory contiguously during the data initialization on the GPU. Compared to one CPU core, the speed-up of using one GPU alone reaches a factor of ~11.7{\texttimes} for the computation alone and ~3.82{\texttimes} when the data transfer between CPU and GPU is considered. Using one GPU alone is also generally faster than the multithreaded implementation for 16 CPU cores in a compute node and the single-source solution (OpenACC). 
The best performance is achieved by the implementation of the hybrid CPU/GPU version, but rescheduling the workload among the CPU cores is required before the practical CAM4-Chem simulation.}, keywords = {compiler, CUDA, data transfer, gpu, hybrid, memory layout}, doi = {https://doi.org/10.1029/2018MS001276}, author = {Jian Sun and Joshua Fu and John Drake and Qingzhao Zhu and Azzam Haidar and Mark Gates and Stanimire Tomov and Jack Dongarra} } @techreport {1203, title = {Implementation of the C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {07, ICL-UT-18-04}, year = {2018}, month = {2018-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Ahmad Abdelfattah and Mark Gates and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @techreport {1273, title = {Least Squares Performance Report}, journal = {SLATE Working Notes}, number = {09, ICL-UT-18-10}, year = {2018}, month = {2018-12}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Mark Gates and Ali Charara and Jakub Kurzak and Asim YarKhan and Ichitaro Yamazaki and Jack Dongarra} } @techreport {1228, title = {Linear Systems Performance Report}, journal = {SLATE Working Notes}, number = {08, ICL-UT-18-08}, year = {2018}, month = {2018-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Ali Charara and Asim YarKhan and Jamie Finney and Gerald Ragghianti and Piotr Luszczek and Jack Dongarra} } @techreport {1191, title = {Parallel BLAS Performance Report}, journal = {SLATE Working Notes}, number = {05, ICL-UT-18-01}, year = {2018}, month = {2018-04}, publisher = {University of Tennessee}, author = {Jakub Kurzak and Mark Gates and Asim YarKhan and Ichitaro Yamazaki and Panruo Wu and Piotr Luszczek and Jamie Finney and Jack Dongarra} } @techreport {1206, title = {Parallel Norms Performance Report}, journal = {SLATE Working Notes}, number = {06, ICL-UT-18-06}, year = {2018}, month = {2018-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Jakub Kurzak and Mark Gates and Asim YarKhan and Ichitaro Yamazaki and Piotr Luszczek and Jamie Finney and Jack Dongarra} } @article {1258, title = {The Singular Value Decomposition: Anatomy of Optimizing an Algorithm for Extreme Scale}, journal = {SIAM Review}, volume = {60}, year = {2018}, month = {2018-11}, pages = {808{\textendash}865}, abstract = {The computation of the singular value decomposition, or SVD, has a long history with many improvements over the years, both in its implementations and algorithmically. Here, we survey the evolution of SVD algorithms for dense matrices, discussing the motivation and performance impacts of changes. There are two main branches of dense SVD methods: bidiagonalization and Jacobi. Bidiagonalization methods started with the implementation by Golub and Reinsch in Algol60, which was subsequently ported to Fortran in the EISPACK library, and was later more efficiently implemented in the LINPACK library, targeting contemporary vector machines. To address cache-based memory hierarchies, the SVD algorithm was reformulated to use Level 3 BLAS in the LAPACK library. To address new architectures, ScaLAPACK was introduced to take advantage of distributed computing, and MAGMA was developed for accelerators such as GPUs. 
Algorithmically, the divide and conquer and MRRR algorithms were developed to reduce the number of operations. Still, these methods remained memory bound, so two-stage algorithms were developed to reduce memory operations and increase the computational intensity, with efficient implementations in PLASMA, DPLASMA, and MAGMA. Jacobi methods started with the two-sided method of Kogbetliantz and the one-sided method of Hestenes. They have likewise had many developments, including parallel and block versions and preconditioning to improve convergence. In this paper, we investigate the impact of these changes by testing various historical and current implementations on a common, modern multicore machine and a distributed computing platform. We show that algorithmic and implementation improvements have increased the speed of the SVD by several orders of magnitude, while using up to 40 times less energy.}, keywords = {bidiagonal matrix, bisection, Divide and conquer, Hestenes method, Jacobi method, Kogbetliantz method, MRRR, QR iteration, Singular value decomposition, SVD}, issn = {0036-1445}, doi = {10.1137/17M1117732}, url = {https://epubs.siam.org/doi/10.1137/17M1117732}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @conference {1169, title = {Autotuning Batch Cholesky Factorization in CUDA with Interleaved Layout of Matrices}, booktitle = {Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2017}, month = {2017-06}, publisher = {IEEE}, organization = {IEEE}, address = {Orlando, FL}, abstract = {Batch matrix operations address the case of solving the same linear algebra problem for a very large number of very small matrices. In this paper, we focus on implementing the batch Cholesky factorization in CUDA, in single precision arithmetic, for NVIDIA GPUs. Specifically, we look into the benefits of using noncanonical data layouts, where consecutive memory locations store elements with the same row and column index in a set of consecutive matrices. We discuss a number of different implementation options and tuning parameters. 
We demonstrate superior performance to traditional implementations for the case of very small matrices.}, keywords = {batch computation, Cholesky Factorization, data layout, GPU computing, numerical linear algebra}, doi = {10.1109/IPDPSW.2017.18}, author = {Mark Gates and Jakub Kurzak and Piotr Luszczek and Yu Pei and Jack Dongarra} } @inbook {1167, title = {Bringing High Performance Computing to Big Data Algorithms}, booktitle = {Handbook of Big Data Technologies}, year = {2017}, publisher = {Springer}, organization = {Springer}, isbn = {978-3-319-49339-8}, doi = {10.1007/978-3-319-49340-4}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @techreport {1175, title = {C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {04, ICL-UT-17-12}, year = {2017}, month = {2017-12}, publisher = {University of Tennessee}, author = {Ahmad Abdelfattah and Konstantin Arturov and Cris Cecka and Jack Dongarra and Chip Freitag and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Panruo Wu} } @techreport {1081, title = {C++ API for BLAS and LAPACK}, journal = {SLATE Working Notes}, number = {02, ICL-UT-17-03}, year = {2017}, note = {Revision 02-21-2018}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Mark Gates and Piotr Luszczek and Ahmad Abdelfattah and Jakub Kurzak and Jack Dongarra and Konstantin Arturov and Cris Cecka and Chip Freitag} } @techreport {1133, title = {Designing SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {03, ICL-UT-17-06}, year = {2017}, month = {2017-10}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Jakub Kurzak and Panruo Wu and Mark Gates and Ichitaro Yamazaki and Piotr Luszczek and Gerald Ragghianti and Jack Dongarra} } @techreport {1130, title = {MAGMA-sparse Interface Design Whitepaper}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-05}, year = {2017}, month = {2017-09}, type = {Technical Report}, abstract = {In this report we describe the logic and interface we develop for the MAGMA-sparse library to allow for easy integration as third-party library into a top-level software ecosystem. The design choices are based on extensive consultation with other software library developers, in particular the Trilinos software development team. The interface documentation is at this point not exhaustive, but a first proposal for setting a standard. Although the interface description targets the MAGMA-sparse software module, we hope that the design choices carry beyond this specific library, and are attractive for adoption in other packages. 
This report is not intended as a static document, but will be updated over time to reflect the agile software development in the ECP 1.3.3.11 STMS11-PEEKS project.}, author = {Hartwig Anzt and Erik Boman and Jack Dongarra and Goran Flegar and Mark Gates and Mike Heroux and Mark Hoemmen and Jakub Kurzak and Piotr Luszczek and Sivasankaran Rajamanickam and Stanimire Tomov and Stephen Wood and Ichitaro Yamazaki} } @techreport {1173, title = {PLASMA 17 Performance Report}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-11}, year = {2017}, month = {2017-06}, publisher = {University of Tennessee}, abstract = {PLASMA (Parallel Linear Algebra for Multicore Architectures) is a dense linear algebra package at the forefront of multicore computing. PLASMA is designed to deliver the highest possible performance from a system with multiple sockets of multicore processors. PLASMA achieves this objective by combining state of the art solutions in parallel algorithms, scheduling, and software engineering. PLASMA currently offers a collection of routines for solving linear systems of equations and least squares problems.}, author = {Maksims Abalenkovs and Negin Bagherpour and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Samuel Relton and Jakub Sistek and David Stevens and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Mawussi Zounon} } @techreport {1172, title = {PLASMA 17.1 Functionality Report}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-10}, year = {2017}, month = {2017-06}, publisher = {University of Tennessee}, abstract = {PLASMA (Parallel Linear Algebra for Multicore Architectures) is a dense linear algebra package at the forefront of multicore computing. PLASMA is designed to deliver the highest possible performance from a system with multiple sockets of multicore processors. PLASMA achieves this objective by combining state of the art solutions in parallel algorithms, scheduling, and software engineering. PLASMA currently offers a collection of routines for solving linear systems of equations and least squares problems.}, author = {Maksims Abalenkovs and Negin Bagherpour and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Samuel Relton and Jakub Sistek and David Stevens and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Mawussi Zounon} } @article {1067, title = {Preconditioned Krylov Solvers on GPUs}, journal = {Parallel Computing}, year = {2017}, month = {2017-06}, abstract = {In this paper, we study the effect of enhancing GPU-accelerated Krylov solvers with preconditioners. We consider the BiCGSTAB, CGS, QMR, and IDR(s) Krylov solvers. For a large set of test matrices, we assess the impact of Jacobi and incomplete factorization preconditioning on the solvers{\textquoteright} numerical stability and time-to-solution performance.
We also analyze how the use of a preconditioner impacts the choice of the fastest solver.}, keywords = {gpu, ILU, Jacobi, Krylov solvers, Preconditioning}, issn = {01678191}, doi = {10.1016/j.parco.2017.05.006}, url = {http://www.sciencedirect.com/science/article/pii/S0167819117300777}, author = {Hartwig Anzt and Mark Gates and Jack Dongarra and Moritz Kreutzer and Gerhard Wellein and Martin Kohler} } @techreport {1080, title = {Roadmap for the Development of a Linear Algebra Library for Exascale Computing: SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {01, ICL-UT-17-02}, year = {2017}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Aurelien Bouteiller and Anthony Danalis and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Stephen Wood and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan} } @article {, title = {With Extreme Computing, the Rules Have Changed}, journal = {Computing in Science \& Engineering}, volume = {19}, year = {2017}, month = {2017-05}, pages = {52-62}, abstract = {On the eve of exascale computing, traditional wisdom no longer applies. High-performance computing is gone as we know it. This article discusses a range of new algorithmic techniques emerging in the context of exascale computing, many of which defy the common wisdom of high-performance computing and are considered unorthodox, but could turn out to be a necessity in near future.}, doi = {https://doi.org/10.1109/MCSE.2017.48}, author = {Jack Dongarra and Stanimire Tomov and Piotr Luszczek and Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Hartwig Anzt and Azzam Haidar and Ahmad Abdelfattah} } @conference {939, title = {Heterogeneous Streaming}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper introduces a new heterogeneous streaming library called hetero Streams (hStreams). We show how a simple FIFO streaming model can be applied to heterogeneous systems that include manycore coprocessors and multicore CPUs. This model supports concurrency across nodes, among tasks within a node, and between data transfers and computation. We give examples for different approaches, show how the implementation can be layered, analyze overheads among layers, and apply those models to parallelize applications using simple, intuitive interfaces. We compare the features and versatility of hStreams, OpenMP, CUDA Streams1 and OmpSs. We show how the use of hStreams makes it easier for scientists to identify tasks and easily expose concurrency among them, and how it enables tuning experts and runtime systems to tailor execution for different heterogeneous targets. Practical application examples are taken from the field of numerical linear algebra, commercial structural simulation software, and a seismic processing application.}, keywords = {plasma}, author = {Chris J. 
Newburn and Gaurav Bansal and Michael Wood and Luis Crivelli and Judit Planas and Alejandro Duran and Paulo Souza and Leonardo Borges and Piotr Luszczek and Stanimire Tomov and Jack Dongarra and Hartwig Anzt and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Ichitaro Yamazaki and Jesus Labarta} } @article {1472, title = {Linear Algebra Software for Large-Scale Accelerated Multicore Computing}, journal = {Acta Numerica}, volume = {25}, year = {2016}, month = {2016-05}, pages = {1-160}, abstract = {Many crucial scientific computing applications, ranging from national security to medical advances, rely on high-performance linear algebra algorithms and technologies, underscoring their importance and broad impact. Here we present the state-of-the-art design and implementation practices for the acceleration of the predominant linear algebra algorithms on large-scale accelerated multicore systems. Examples are given with fundamental dense linear algebra algorithms {\textendash} from the LU, QR, Cholesky, and LDLT factorizations needed for solving linear systems of equations, to eigenvalue and singular value decomposition (SVD) problems. The implementations presented are readily available via the open-source PLASMA and MAGMA libraries, which represent the next generation modernization of the popular LAPACK library for accelerated multicore systems. To generate the extreme level of parallelism needed for the efficient use of these systems, algorithms of interest are redesigned and then split into well-chosen computational tasks. The task execution is scheduled over the computational components of a hybrid system of multicore CPUs with GPU accelerators and/or Xeon Phi coprocessors, using either static scheduling or light-weight runtime systems. The use of light-weight runtime systems keeps scheduling overheads low, similar to static scheduling, while enabling the expression of parallelism through sequential-like code. This simplifies the development effort and allows exploration of the unique strengths of the various hardware components. Finally, we emphasize the development of innovative linear algebra algorithms using three technologies {\textendash} mixed precision arithmetic, batched operations, and asynchronous iterations {\textendash} that are currently of high interest for accelerated multicore systems.}, doi = {10.1017/S0962492916000015}, author = {Ahmad Abdelfattah and Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Asim YarKhan} } @conference {962, title = {Search Space Generation and Pruning System for Autotuners}, booktitle = {30th IEEE International Parallel \& Distributed Processing Symposium (IPDPS)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This work tackles two simultaneous challenges faced by autotuners: the ease of describing a complex, multidimensional search space, and the speed of evaluating that space, while applying a multitude of pruning constraints. This article presents a declarative notation for describing a search space and a translation system for conversion to a standard C code for fast and multithreaded, as necessary, evaluation. The notation is Python-based and thus simple in syntax and easy to assimilate by the user interested in tuning rather than learning a new programming language. A large number of dimensions and a large number of pruning constraints may be expressed with little effort.
The system is discussed in the context of autotuning the canonical matrix multiplication kernel for NVIDIA GPUs, where the search space has 15 dimensions and involves application of 10 complex pruning constraints. The speed of evaluation is compared against generators created using imperative programming style in various scripting and compiled languages.}, author = {Piotr Luszczek and Mark Gates and Jakub Kurzak and Anthony Danalis and Jack Dongarra} } @conference {880, title = {Accelerating Collaborative Filtering for Implicit Feedback Datasets using GPUs}, booktitle = {2015 IEEE International Conference on Big Data (IEEE BigData 2015)}, year = {2015}, month = {2015-11}, publisher = {IEEE}, organization = {IEEE}, address = {Santa Clara, CA}, abstract = {In this paper we accelerate the Alternating Least Squares (ALS) algorithm used for generating product recommendations on the basis of implicit feedback datasets. We approach the algorithm with concepts proven to be successful in High Performance Computing. This includes the formulation of the algorithm as a mix of cache-optimized algorithm-specific kernels and standard BLAS routines, acceleration via graphics processing units (GPUs), use of parallel batched kernels, and autotuning to identify performance winners. For benchmark datasets, the multi-threaded CPU implementation we propose achieves more than a 10 times speedup over the implementations available in the GraphLab and Spark MLlib software packages. For the GPU implementation, the parameters of an algorithm-specific kernel were optimized using a comprehensive autotuning sweep. This results in an additional 2 times speedup over our CPU implementation.}, author = {Mark Gates and Hartwig Anzt and Jakub Kurzak and Jack Dongarra} } @conference {896, title = {Comparing Hybrid CPU-GPU and Native GPU-only Acceleration for Linear Algebra}, booktitle = {2015 SIAM Conference on Applied Linear Algebra}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {Accelerating dense linear algebra using GPUs admits two models: hybrid CPU-GPU and GPU-only. The hybrid model factors the panel on the CPU while updating the trailing matrix on the GPU, concentrating the GPU on high-performance matrix multiplies. The GPU-only model performs the entire computation on the GPU, avoiding costly data transfers to the CPU. We compare these two approaches for three QR-based algorithms: QR factorization, rank revealing QR, and reduction to Hessenberg.}, author = {Mark Gates and Stanimire Tomov and Azzam Haidar} } @article {829, title = {HPC Programming on Intel Many-Integrated-Core Hardware with MAGMA Port to Xeon Phi}, journal = {Scientific Programming}, volume = {23}, year = {2015}, month = {2015-01}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms for multicore with Intel Xeon Phi Coprocessors. In particular, we consider algorithms for solving linear systems. Further, we give an overview of the MAGMA MIC library, an open source, high performance library that incorporates the developments presented, and in general provides to heterogeneous architectures of multicore with coprocessors the DLA functionality of the popular LAPACK library. The LAPACK-compliance simplifies the use of the MAGMA MIC library in applications, while providing them with portably performant DLA.
High performance is obtained through use of the high-performance BLAS, hardware-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components. Our methodology and programming techniques are incorporated into the MAGMA MIC API, which abstracts the application developer from the specifics of the Xeon Phi architecture and is therefore applicable to algorithms beyond the scope of DLA.}, keywords = {communication and computation overlap, dynamic runtime scheduling using dataflow dependences, hardware accelerators and coprocessors, Intel Xeon Phi processor, Many Integrated Cores, numerical linear algebra}, issn = {1058-9244}, doi = {10.3233/SPR-140404}, author = {Azzam Haidar and Jack Dongarra and Khairul Kabir and Mark Gates and Piotr Luszczek and Stanimire Tomov and Yulu Jia} } @article {881, title = {Implementation and Tuning of Batched Cholesky Factorization and Solve for NVIDIA GPUs}, journal = {IEEE Transactions on Parallel and Distributed Systems}, number = {1045-9219}, year = {2015}, month = {2015-11}, author = {Jakub Kurzak and Hartwig Anzt and Mark Gates and Jack Dongarra} } @article {1347, title = {MAGMA MIC: Optimizing Linear Algebra for Intel Xeon Phi}, year = {2015}, month = {2015-06}, publisher = {ISC High Performance (ISC15), Intel Booth Presentation}, address = {Frankfurt, Germany}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Khairul Kabir and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @article {936, title = {Parallel Programming Models for Dense Linear Algebra on Heterogeneous Systems}, journal = {Supercomputing Frontiers and Innovations}, volume = {2}, number = {4}, year = {2015}, month = {2015-10}, abstract = {We present a review of the current best practices in parallel programming models for dense linear algebra (DLA) on heterogeneous architectures. We consider multicore CPUs, stand alone manycore coprocessors, GPUs, and combinations of these. Of interest is the evolution of the programming models for DLA libraries {\textendash} in particular, the evolution from the popular LAPACK and ScaLAPACK libraries to their modernized counterparts PLASMA (for multicore CPUs) and MAGMA (for heterogeneous architectures), as well as other programming models and libraries. 
Besides providing insights into the programming techniques of the libraries considered, we outline our view of the current strengths and weaknesses of their programming models {\textendash} especially in regards to hardware trends and ease of programming high-performance numerical software that current applications need {\textendash} in order to motivate work and future directions for the next generation of parallel programming models for high-performance linear algebra libraries on heterogeneous systems.}, keywords = {dense linear algebra, gpu, HPC, Multicore, plasma, Programming models, runtime}, doi = {10.14529/jsfi1504}, author = {Maksims Abalenkovs and Ahmad Abdelfattah and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki and Asim YarKhan} } @article {699, title = {A Survey of Recent Developments in Parallel Implementations of Gaussian Elimination}, journal = {Concurrency and Computation: Practice and Experience}, volume = {27}, year = {2015}, month = {2015-04}, pages = {1292-1309}, abstract = {Gaussian elimination is a canonical linear algebra procedure for solving linear systems of equations. In the last few years, the algorithm has received a lot of attention in an attempt to improve its parallel performance. This article surveys recent developments in parallel implementations of Gaussian elimination for shared memory architecture. Five different flavors are investigated. Three of them are based on different strategies for pivoting: partial pivoting, incremental pivoting, and tournament pivoting. The fourth one replaces pivoting with the Partial Random Butterfly Transformation, and finally, an implementation without pivoting is used as a performance baseline. The technique of iterative refinement is applied to recover numerical accuracy when necessary. All parallel implementations are produced using dynamic, superscalar, runtime scheduling and tile matrix layout. Results on two multisocket multicore systems are presented. Performance and numerical accuracy is analyzed.}, keywords = {Gaussian elimination, lu factorization, Multicore, parallel, plasma, shared memory}, doi = {10.1002/cpe.3306}, author = {Simplice Donfack and Jack Dongarra and Mathieu Faverge and Mark Gates and Jakub Kurzak and Piotr Luszczek and Ichitaro Yamazaki} } @conference {766, title = {Accelerating Eigenvector Computation in the Nonsymmetric Eigenvalue Problem}, booktitle = {VECPAR 2014}, year = {2014}, month = {2014-06}, address = {Eugene, OR}, abstract = {In the nonsymmetric eigenvalue problem, work has focused on the Hessenberg reduction and QR iteration, using efficient algorithms and fast, Level 3 BLAS routines. Comparatively, computation of eigenvectors performs poorly, limited to slow, Level 2 BLAS performance with little speedup on multi-core systems. It has thus become a dominant cost in the eigenvalue problem. 
To address this, we present improvements for the eigenvector computation to use Level 3 BLAS where applicable and parallelize the remaining triangular solves, achieving good parallel scaling and accelerating the overall eigenvalue problem more than three-fold.}, author = {Mark Gates and Azzam Haidar and Jack Dongarra} } @inbook {780, title = {Accelerating Numerical Dense Linear Algebra Calculations with GPUs}, booktitle = {Numerical Computations with GPUs}, year = {2014}, pages = {3-28}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, chapter = {1}, isbn = {978-3-319-06547-2}, doi = {10.1007/978-3-319-06548-9_1}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @conference {836, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL }, booktitle = {International Workshop on OpenCL}, year = {2014}, month = {2014-05}, address = {Bristol University, England}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance OpenCL BLAS, hardware and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @article {758, title = {A Novel Hybrid CPU-GPU Generalized Eigensolver for Electronic Structure Calculations Based on Fine Grained Memory Aware Tasks}, journal = {International Journal of High Performance Computing Applications}, volume = {28}, year = {2014}, month = {2014-05}, pages = {196-209}, chapter = {196}, abstract = {The adoption of hybrid CPU{\textendash}GPU nodes in traditional supercomputing platforms such as the Cray-XK6 opens acceleration opportunities for electronic structure calculations in materials science and chemistry applications, where medium-sized generalized eigenvalue problems must be solved many times. These eigenvalue problems are too small to effectively solve on distributed systems, but can benefit from the massive computing power concentrated on a single-node, hybrid CPU{\textendash}GPU system. However, hybrid systems call for the development of new algorithms that efficiently exploit heterogeneity and massive parallelism of not just GPUs, but of multicore/manycore CPUs as well. Addressing these demands, we developed a generalized eigensolver featuring novel algorithms of increased computational intensity (compared with the standard algorithms), decomposition of the computation into fine-grained memory aware tasks, and their hybrid execution. 
The resulting eigensolvers are state-of-the-art in high-performance computing, significantly outperforming existing libraries. We describe the algorithm and analyze its performance impact on applications of interest when different fractions of eigenvectors are needed by the host electronic structure code.}, keywords = {Eigensolver, electronic structure calculations, generalized eigensolver, gpu, high performance, hybrid, Multicore, two-stage}, doi = {10.1177/1094342013502097}, author = {Azzam Haidar and Raffaele Solc{\`a} and Mark Gates and Stanimire Tomov and Thomas C. Schulthess and Jack Dongarra} } @conference {828, title = {Performance and Portability with OpenCL for Throughput-Oriented HPC Workloads Across Accelerators, Coprocessors, and Multicore Processors}, booktitle = {5th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA {\textquoteright}14)}, year = {2014}, month = {2014-11}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, abstract = {Ever since accelerators and coprocessors became the mainstream hardware for throughput-oriented HPC workloads, various programming techniques have been proposed to increase productivity in terms of both the performance and ease-of-use. We evaluate these aspects of OpenCL on a number of hardware platforms for an important subset of dense linear algebra operations that are relevant to a wide range of scientific applications. Our findings indicate that OpenCL portability has improved since our previous publication and many new and surprising usage scenarios are possible that rival those available after decades of software development on the CPUs. The combined performance-portability metric, even though not promised by the OpenCL standard, reflects the need for tuning performance-critical operations during the porting process and we show how a large portion of the available efficiency is lost if the tuning is not done correctly.}, doi = {10.1109/ScalA.2014.8}, author = {Azzam Haidar and Chongxiao Cao and Ichitaro Yamazaki and Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {681, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL}, journal = {University of Tennessee Technical Report (Lawn 275)}, number = {UT-CS-13-706}, year = {2013}, month = {2013-03}, publisher = {University of Tennessee}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance OpenCL BLAS, hardware and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. 
Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @conference {753, title = {Portable HPC Programming on Intel Many-Integrated-Core Hardware with MAGMA Port to Xeon Phi}, booktitle = {PPAM 2013}, year = {2013}, month = {2013-09}, address = {Warsaw, Poland}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms for multicore with Intel Xeon Phi Coprocessors. In particular, we consider algorithms for solving linear systems. Further, we give an overview of the MAGMA MIC library, an open source, high performance library that incorporates the developments presented, and in general provides to heterogeneous architectures of multicore with coprocessors the DLA functionality of the popular LAPACK library. The LAPACK-compliance simplifies the use of the MAGMA MIC library in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance BLAS, hardware-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components. Our methodology and programming techniques are incorporated into the MAGMA MIC API, which abstracts the application developer from the specifics of the Xeon Phi architecture and is therefore applicable to algorithms beyond the scope of DLA.}, keywords = {magma, mic, xeon phi}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Piotr Luszczek and Stanimire Tomov} } @conference {686, title = {Toward a scalable multi-GPU eigensolver via compute-intensive kernels and efficient communication}, booktitle = {Proceedings of the 27th ACM International Conference on Supercomputing (ICS {\textquoteright}13)}, year = {2013}, month = {2013-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Eugene, Oregon, USA}, abstract = {The enormous gap between the high-performance capabilities of GPUs and the slow interconnect between them has made the development of numerical software that is scalable across multiple GPUs extremely challenging. We describe a successful methodology on how to address the challenges---starting from our algorithm design, kernel optimization and tuning, to our programming model---in the development of a scalable high-performance tridiagonal reduction algorithm for the symmetric eigenvalue problem. This is a fundamental linear algebra problem with many engineering and physics applications. We use a combination of a task-based approach to parallelism and a new algorithmic design to achieve high performance. The goal of the new design is to increase the computational intensity of the major compute kernels and to reduce synchronization and data transfers between GPUs. This may increase the number of flops, but the increase is offset by the more efficient execution and reduced data transfers. 
Our performance results are the best available, providing an enormous performance boost compared to current state-of-the-art solutions. In particular, our software scales up to 1070 Gflop/s using 16 Intel E5-2670 cores and eight M2090 GPUs, compared to 45 Gflop/s achieved by the optimized Intel Math Kernel Library (MKL) using only the 16 CPU cores.}, keywords = {eigenvalue, gpu communication, gpu computation, heterogeneous programming model, performance, reduction to tridiagonal, singular value decomposition, task parallelism}, isbn = {9781450321303}, doi = {10.1145/2464996.2465438}, url = {http://dl.acm.org/citation.cfm?doid=2464996.2465438}, author = {Azzam Haidar and Mark Gates and Stanimire Tomov and Jack Dongarra}, editor = {Allen D. Malony and Mario Nemirovsky and Sam Midkiff} } @conference {icl:692, title = {Virtual Systolic Array for QR Decomposition}, booktitle = {15th Workshop on Advances in Parallel and Distributed Computational Models, IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2013)}, year = {2013}, month = {2013-05}, publisher = {IEEE}, organization = {IEEE}, address = {Boston, MA}, abstract = {Systolic arrays offer a very attractive, data-centric, execution model as an alternative to the von Neumann architecture. Hardware implementations of systolic arrays turned out not to be viable solutions in the past. This article shows how the systolic design principles can be applied to a software solution to deliver an algorithm with unprecedented strong scaling capabilities. A systolic array for the QR decomposition is developed, and a virtualization layer is used to map the algorithm onto a large distributed-memory system. Strong scaling properties are discovered, superior to existing solutions.}, keywords = {dataflow programming, message passing, multi-core, QR decomposition, roofline model, systolic array}, doi = {10.1109/IPDPS.2013.119}, author = {Jakub Kurzak and Piotr Luszczek and Mark Gates and Ichitaro Yamazaki and Jack Dongarra} } @techreport {688, title = {On Algorithmic Variants of Parallel Gaussian Elimination: Comparison of Implementations in Terms of Performance and Numerical Properties}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-13-715}, year = {2013}, month = {2013-07}, abstract = {Gaussian elimination is a canonical linear algebra procedure for solving linear systems of equations. In the last few years, the algorithm received a lot of attention in an attempt to improve its parallel performance. This article surveys recent developments in parallel implementations of Gaussian elimination. Five different flavors are investigated. Three of them are based on different strategies for pivoting: partial pivoting, incremental pivoting, and tournament pivoting. The fourth one replaces pivoting with the Random Butterfly Transformation, and finally, an implementation without pivoting is used as a performance baseline. The technique of iterative refinement is applied to recover numerical accuracy when necessary. All parallel implementations are produced using dynamic, superscalar, runtime scheduling and tile matrix layout. Results on two multi-socket multicore systems are presented. 
Performance and numerical accuracy are analyzed.}, author = {Simplice Donfack and Jack Dongarra and Mathieu Faverge and Mark Gates and Jakub Kurzak and Piotr Luszczek and Ichitaro Yamazaki} } @article {icl:697, title = {Block-asynchronous Multigrid Smoothers for GPU-accelerated Systems}, journal = {ICCS 2012}, year = {2012}, month = {2012-06}, address = {Omaha, NE}, author = {Hartwig Anzt and Stanimire Tomov and Mark Gates and Jack Dongarra and Vincent Heuveline} } @article {1349, title = {MAGMA: A New Generation of Linear Algebra Library for GPU and Multicore Architectures}, year = {2012}, month = {2012-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC12), Presentation}, address = {Salt Lake City, UT}, author = {Jack Dongarra and Tingxing Dong and Mark Gates and Azzam Haidar and Stanimire Tomov and Ichitaro Yamazaki} } @article {1354, title = {MAGMA MIC: Linear Algebra Library for Intel Xeon Phi Coprocessors}, year = {2012}, month = {2012-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC12)}, address = {Salt Lake City, UT}, author = {Jack Dongarra and Mark Gates and Yulu Jia and Khairul Kabir and Piotr Luszczek and Stanimire Tomov} } @article {1357, title = {MAGMA Tutorial}, year = {2012}, month = {2012-02}, publisher = {Keeneland Workshop}, address = {Atlanta, GA}, author = {Mark Gates} } @article {icl:661, title = {Block-asynchronous Multigrid Smoothers for GPU-accelerated Systems}, number = {UT-CS-11-689}, year = {2011}, month = {2011-12}, keywords = {magma}, author = {Hartwig Anzt and Stanimire Tomov and Mark Gates and Jack Dongarra and Vincent Heuveline} }