@inproceedings {, title = {Addressing Irregular Patterns of Matrix Computations on GPUs and Their Impact on Applications Powered by Sparse Direct Solvers}, journal = {2022 International Conference for High Performance Computing, Networking, Storage and Analysis (SC22)}, year = {2022}, month = {2022-11}, pages = {354-367}, publisher = {IEEE Computer Society}, address = {Dallas, TX}, abstract = {Many scientific applications rely on sparse direct solvers for their numerical robustness. However, performance optimization for these solvers remains a challenging task, especially on GPUs. This is due to workloads of small dense matrices that are different in size. Matrix decompositions on such irregular workloads are rarely addressed on GPUs. This paper addresses irregular workloads of matrix computations on GPUs, and their application to accelerate sparse direct solvers. We design an interface for the basic matrix operations supporting problems of different sizes. The interface enables us to develop irrLU-GPU, an LU decomposition on matrices of different sizes. We demonstrate the impact of irrLU-GPU on sparse direct LU solvers using NVIDIA and AMD GPUs. Experimental results are shown for a sparse direct solver based on a multifrontal sparse LU decomposition applied to linear systems arising from the simulation, using finite element discretization on unstructured meshes, of a high-frequency indefinite Maxwell problem.}, keywords = {GPU computing, irregular computational workloads, lu factorization, multifrontal solvers, sparse direct solvers}, url = {https://dl.acm.org/doi/abs/10.5555/3571885.3571919}, author = {Ahmad Abdelfattah and Pieter Ghysels and Wajih Boukaram and Stanimire Tomov and Xiaoye Sherry Li and Jack Dongarra} } @techreport {, title = {PAQR: Pivoting Avoiding QR factorization}, journal = {ICL Technical Report}, number = {ICL-UT-22-06}, year = {2022}, month = {2022-06}, abstract = {The solution of linear least-squares problems is at the heart of many scientific and engineering applications. While any method able to minimize the backward error of such problems is considered numerically stable, the theory states that the forward error depends on the condition number of the matrix in the system of equations. On the one hand, the QR factorization is an efficient method to solve such problems, but the solutions it produces may have large forward errors when the matrix is deficient. On the other hand, QR with column pivoting (QRCP) is able to produce smaller forward errors on deficient matrices, but its cost is prohibitive compared to QR. The aim of this paper is to propose PAQR, an alternative solution method with the same cost (or smaller) as QR and as accurate as QRCP in practical cases, for the solution of rank-deficient linear least-squares problems. After presenting the algorithm and its implementations on different architectures, we compare its accuracy and performance results on a variety of application problems. }, author = {Wissam M. Sid-Lakhdar and Sebastien Cayrols and Daniel Bielich and Ahmad Abdelfattah and Piotr Luszczek and Mark Gates and Stanimire Tomov and Hans Johansen and David Williams-Young and Timothy A. 
Davis and Jack Dongarra} } @article {, title = {libCEED: Fast algebra for high-order element-based discretizations}, journal = {Journal of Open Source Software}, volume = {6}, number = {63}, year = {2021}, pages = {2945}, abstract = {Finite element methods are widely used to solve partial differential equations (PDE) in science and engineering, but their standard implementation (Arndt et al., 2020; Kirk et al., 2006; Logg et al., 2012) relies on assembling sparse matrices. Sparse matrix multiplication and triangular operations perform a scalar multiply and add for each nonzero entry, just 2 floating point operations (flops) per scalar that must be loaded from memory (Williams et al., 2009). Modern hardware is capable of nearly 100 flops per scalar streamed from memory (Rupp, 2020) so sparse matrix operations cannot achieve more than about 2\% utilization of arithmetic units. Matrix assembly becomes even more problematic when the polynomial degree p of the basis functions is increased, resulting in O(p^d) storage and O(p^{2d}) compute per degree of freedom (DoF) in d dimensions. Methods pioneered by the spectral element community (Deville et al., 2002; Orszag, 1980) exploit problem structure to reduce costs to O(1) storage and O(p) compute per DoF, with very high utilization of modern CPUs and GPUs. Unfortunately, high-quality implementations have been relegated to applications and intrusive frameworks that are often difficult to extend to new problems or incorporate into legacy applications, especially when strong preconditioners are required. libCEED, the Code for Efficient Extensible Discretization (Abdelfattah et al., 2021), is a lightweight library that provides a purely algebraic interface for linear and nonlinear operators and preconditioners with element-based discretizations. libCEED provides portable performance via run-time selection of implementations optimized for CPUs and GPUs, including support for just-in-time (JIT) compilation. It is designed for convenient use in new and legacy software, and offers interfaces in C99 (International Standards Organisation, 1999), Fortran77 (ANSI, 1978), Python (Python, 2021), Julia (Bezanson et al., 2017), and Rust (Rust, 2021). Users and library developers can integrate libCEED at a low level into existing applications in place of existing matrix-vector products without significant refactoring of their own discretization infrastructure. Alternatively, users can utilize integrated libCEED support in MFEM (Anderson et al., 2020; MFEM, 2021).
In addition to supporting applications and discretization libraries, libCEED provides a platform for performance engineering and co-design, as well as an algebraic interface for solvers research like adaptive p-multigrid, much like how sparse matrix libraries enable development and deployment of algebraic multigrid solvers.}, keywords = {finite elements, high-order methods, High-performance computing, matrix-free, spectral elements}, doi = {10.21105/joss.02945}, url = {https://doi.org/10.21105/joss.02945}, author = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jean-Sylvain Camier and Veselin Dobrev and Yohann Dudouit and Leila Ghaffari and Tzanio Kolev and David Medina and Will Pazner and Thilina Ratnayaka and Jeremy Thompson and Stanimire Tomov} } @techreport {, title = {SLATE Port to AMD and Intel Platforms}, journal = {SLATE Working Notes}, number = {16, ICL-UT-21-01}, year = {2021}, month = {2021-04}, author = {Ahmad Abdelfattah and Mohammed Al Farhan and Cade Brown and Mark Gates and Dalal Sukkari and Asim YarKhan and Jack Dongarra} } @techreport {, title = {Design, Optimization, and Benchmarking of Dense Linear Algebra Algorithms on AMD GPUs}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-12}, year = {2020}, month = {2020-08}, publisher = {University of Tennessee}, abstract = {Dense linear algebra (DLA) has historically been in the vanguard of software that must be adapted first to hardware changes. This is because DLA is both critical to the accuracy and performance of so many different types of applications, and because they have proved to be outstanding vehicles for finding and implementing solutions to the problems that novel architectures pose. Therefore, in this paper we investigate the portability of the MAGMA DLA library to the latest AMD GPUs. We use automated tools to convert the CUDA code in MAGMA to the Heterogeneous-Compute Interface for Portability (HIP) language. MAGMA provides LAPACK for GPUs and benchmarks for fundamental DLA routines ranging from BLAS to dense factorizations, linear systems and eigen-problem solvers. We port these routines to HIP and quantify currently achievable performance through the MAGMA benchmarks for the main workload algorithms on MI25 and MI50 AMD GPUs. Comparisons with performance roofline models and theoretical expectations are used to identify current limitations and directions for future improvements.}, keywords = {AMD GPUs, GPU computing, HIP Runtime, HPC, numerical linear algebra, Portability}, author = {Cade Brown and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {, title = {Design, Optimization, and Benchmarking of Dense Linear Algebra Algorithms on AMD GPUs}, booktitle = {2020 IEEE High Performance Extreme Computing Virtual Conference}, year = {2020}, month = {2020-09}, publisher = {IEEE}, organization = {IEEE}, abstract = {Dense linear algebra (DLA) has historically been in the vanguard of software that must be adapted first to hardware changes. This is because DLA is both critical to the accuracy and performance of so many different types of applications, and because they have proved to be outstanding vehicles for finding and implementing solutions to the problems that novel architectures pose. Therefore, in this paper we investigate the portability of the MAGMA DLA library to the latest AMD GPUs. We use automated tools to convert the CUDA code in MAGMA to the Heterogeneous-Compute Interface for Portability (HIP) language. 
MAGMA provides LAPACK for GPUs and benchmarks for fundamental DLA routines ranging from BLAS to dense factorizations, linear systems and eigen-problem solvers. We port these routines to HIP and quantify currently achievable performance through the MAGMA benchmarks for the main workload algorithms on MI25 and MI50 AMD GPUs. Comparisons with performance roofline models and theoretical expectations are used to identify current limitations and directions for future improvements.}, author = {Cade Brown and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {, title = {Evaluating the Performance of NVIDIA{\textquoteright}s A100 Ampere GPU for Sparse and Batched Computations}, booktitle = {2020 IEEE/ACM Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)}, year = {2020}, month = {2020-11}, publisher = {IEEE}, organization = {IEEE}, abstract = {GPU accelerators have become an important backbone for scientific high-performance computing, and the performance advances obtained from adopting new GPU hardware are significant. In this paper we take a first look at NVIDIA{\textquoteright}s newest server-line GPU, the A100 architecture, part of the Ampere generation. Specifically, we assess its performance for sparse and batch computations, as these routines are relied upon in many scientific applications, and compare to the performance of the previous-generation V100 GPU.}, keywords = {Batched linear algebra, NVIDIA A100 GPU, sparse linear algebra, Sparse Matrix Vector Product}, author = {Hartwig Anzt and Yuhsiang M. Tsai and Ahmad Abdelfattah and Terry Cojean and Jack Dongarra} } @conference {, title = {High-Order Finite Element Method using Standard and Device-Level Batch GEMM on GPUs}, booktitle = {2020 IEEE/ACM 11th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA)}, year = {2020}, month = {2020-11}, publisher = {IEEE}, organization = {IEEE}, abstract = {We present new GPU implementations of the tensor contractions arising from basis-related computations for high-order finite element methods. We consider both tensor and non-tensor bases. In the case of tensor bases, we introduce new kernels based on a series of fused device-level matrix multiplications (GEMMs), specifically designed to utilize the fast memory of the GPU. For non-tensor bases, we develop a tuned framework for choosing standard batch-BLAS GEMMs that will maximize performance across groups of elements. The implementations are included in a backend of the libCEED library. We present benchmark results for the diffusion and mass operators using libCEED integration through the MFEM finite element library and compare to those of the previously best-performing GPU backends for stand-alone basis computations. In tensor cases, we see improvements of approximately 10-30\% for some cases, particularly for higher basis orders. 
For the non-tensor tests, the new batch-GEMMs implementation is twice as fast as what was previously available for basis function order greater than five and greater than approximately 10^5 degrees of freedom in the mesh; up to ten times speedup is seen for eighth-order basis functions.}, keywords = {Batched linear algebra, finite elements, gpu, high-order methods, matrix-free FEM, Tensor contractions}, author = {Natalie Beams and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra and Tzanio Kolev and Yohann Dudouit} } @booklet {, title = {hipMAGMA v1.0}, year = {2020}, month = {2020-03}, publisher = {Zenodo}, doi = {10.5281/zenodo.3908549}, url = {https://doi.org/10.5281/zenodo.3908549}, author = {Cade Brown and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @booklet {, title = {hipMAGMA v2.0}, year = {2020}, month = {2020-07}, publisher = {Zenodo}, doi = {10.5281/zenodo.3928667}, url = {https://doi.org/10.5281/zenodo.3928667}, author = {Cade Brown and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {1480, title = {Investigating the Benefit of FP16-Enabled Mixed-Precision Solvers for Symmetric Positive Definite Matrices using GPUs}, booktitle = {International Conference on Computational Science (ICCS 2020)}, year = {2020}, month = {2020-06}, publisher = {Springer, Cham}, organization = {Springer, Cham}, address = {Amsterdam, Netherlands}, abstract = {Half-precision computation refers to performing floating-point operations in a 16-bit format. While half-precision has been driven largely by machine learning applications, recent algorithmic advances in numerical linear algebra have discovered beneficial use cases for half precision in accelerating the solution of linear systems of equations at higher precisions. In this paper, we present a high-performance, mixed-precision linear solver (Ax = b) for symmetric positive definite systems in double-precision using graphics processing units (GPUs). The solver is based on a mixed-precision Cholesky factorization that utilizes the high-performance tensor core units in CUDA-enabled GPUs. Since the Cholesky factors are affected by the low precision, an iterative refinement (IR) solver is required to recover the solution back to double-precision accuracy. Two different types of IR solvers are discussed on a wide range of test matrices. A preprocessing step is also developed, which scales and shifts the matrix, if necessary, in order to preserve its positive-definiteness in lower precisions. Our experiments on the V100 GPU show that performance speedups are up to 4.7{\texttimes} against a direct double-precision solver. However, matrix properties such as the condition number and the eigenvalue distribution can affect the convergence rate, which would consequently affect the overall performance.}, doi = {https://doi.org/10.1007/978-3-030-50417-5_18}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {, title = {MAGMA Templates for Scalable Linear Algebra on Emerging Architectures}, journal = {The International Journal of High Performance Computing Applications}, volume = {34}, year = {2020}, month = {2020-11}, pages = {645-658}, abstract = {With the acquisition and widespread use of more resources that rely on accelerator/wide vector{\textendash}based computing, there has been a strong demand for science and engineering applications to take advantage of these latest assets. 
This, however, has been extremely challenging due to the diversity of systems to support their extreme concurrency, complex memory hierarchies, costly data movement, and heterogeneous node architectures. To address these challenges, we design a programming model and describe its ease of use in the development of a new MAGMA Templates library that delivers high-performance scalable linear algebra portable on current and emerging architectures. MAGMA Templates derives its performance and portability by (1) building on existing state-of-the-art linear algebra libraries, like MAGMA, SLATE, Trilinos, and vendor-optimized math libraries, and (2) providing access (seamlessly to the users) to the latest algorithms and architecture-specific optimizations through a single, easy-to-use C++-based API.}, issn = {1094-3420}, doi = {https://doi.org/10.1177/1094342020938421}, author = {Mohammed Al Farhan and Ahmad Abdelfattah and Stanimire Tomov and Mark Gates and Dalal Sukkari and Azzam Haidar and Robert Rosenberg and Jack Dongarra} } @article {, title = {Matrix Multiplication on Batches of Small Matrices in Half and Half-Complex Precisions}, journal = {Journal of Parallel and Distributed Computing}, volume = {145}, year = {2020}, month = {2020-11}, pages = {188-201}, abstract = {Machine learning and artificial intelligence (AI) applications often rely on performing many small matrix operations{\textemdash}in particular general matrix{\textendash}matrix multiplication (GEMM). These operations are usually performed in a reduced precision, such as the 16-bit floating-point format (i.e., half precision or FP16). The GEMM operation is also very important for dense linear algebra algorithms, and half-precision GEMM operations can be used in mixed-precision linear solvers. Therefore, high-performance batched GEMM operations in reduced precision are significantly important, not only for deep learning frameworks, but also for scientific applications that rely on batched linear algebra, such as tensor contractions and sparse direct solvers. This paper presents optimized batched GEMM kernels for graphics processing units (GPUs) in FP16 arithmetic. The paper addresses both real and complex half-precision computations on the GPU. The proposed design takes advantage of the Tensor Core technology that was recently introduced in CUDA-enabled GPUs. With eight tuning parameters introduced in the design, the developed kernels have a high degree of flexibility that overcomes the limitations imposed by the hardware and software (in the form of discrete configurations for the Tensor Core APIs). For real FP16 arithmetic, performance speedups are observed against cuBLAS for sizes up to 128. For the complex FP16 GEMM kernel, speedups are also achieved thanks to a design that uses the standard interleaved matrix layout, in contrast with the planar layout required by the vendor{\textquoteright}s solution. The paper also discusses special optimizations for extremely small matrices, where even higher performance gains are achievable.}, doi = {https://doi.org/10.1016/j.jpdc.2020.07.001}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms}, journal = {ACM Transactions on Mathematical Software}, year = {2020}, month = {2020-10}, abstract = {This paper describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). 
The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular, half precision is used in many very large-scale applications, such as those associated with machine learning.}, author = {Ahmad Abdelfattah and Timothy Costa and Jack Dongarra and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Mawussi Zounon} } @techreport {, title = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic}, journal = {SLATE Working Notes}, number = {15, ICL-UT-20-08}, year = {2020}, month = {2020-07}, publisher = {University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Erik Boman and Erin Carson and Terry Cojean and Jack Dongarra and Mark Gates and Thomas Gruetzmacher and Nicholas J. Higham and Sherry Li and Neil Lindquist and Yang Liu and Jennifer Loe and Piotr Luszczek and Pratik Nayak and Sri Pranesh and Siva Rajamanickam and Tobias Ribizel and Barry Smith and Kasia Swirydowicz and Stephen Thomas and Stanimire Tomov and Yaohung Tsai and Ichitaro Yamazaki and Ulrike Meier Yang} } @article {1262, title = {Algorithms and Optimization Techniques for High-Performance Matrix-Matrix Multiplications of Very Small Matrices}, journal = {Parallel Computing}, volume = {81}, year = {2019}, month = {2019-01}, pages = {1{\textendash}21}, abstract = {Expressing scientific computations in terms of BLAS, and in particular the general dense matrix-matrix multiplication (GEMM), is of fundamental importance for obtaining high performance portability across architectures. However, GEMMs for small matrices of sizes smaller than 32 are not sufficiently optimized in existing libraries. We consider the computation of many small GEMMs and its performance portability for a wide range of computer architectures, including Intel CPUs, ARM, IBM, Intel Xeon Phi, and GPUs. These computations often occur in applications like big data analytics, machine learning, high-order finite element methods (FEM), and others. The GEMMs are grouped together in a single batched routine. For these cases, we present algorithms and their optimization techniques that are specialized for the matrix sizes and architectures of interest. We derive a performance model and show that the new developments can be tuned to obtain performance that is within 90\% of the optimal for any of the architectures of interest. For example, on a V100 GPU for square matrices of size 32, we achieve an execution rate of about 1,600 gigaFLOP/s in double-precision arithmetic, which is 95\% of the theoretically derived peak for this computation on a V100 GPU. 
We also show that these results outperform currently available state-of-the-art implementations such as vendor-tuned math libraries, including Intel MKL and NVIDIA CUBLAS, as well as open-source libraries like OpenBLAS and Eigen.}, keywords = {Autotuning, Batched GEMM, HPC, Matrix-matrix product, optimization, Small matrices}, doi = {https://doi.org/10.1016/j.parco.2018.10.003}, author = {Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Marc Baboulin and Jo{\"e}l Falcou and Jack Dongarra} } @techreport {1433, title = {CEED ECP Milestone Report: Performance Tuning of CEED Software and 1st and 2nd Wave Apps}, year = {2019}, month = {2019-10}, publisher = {Zenodo}, doi = {https://doi.org/10.5281/zenodo.3477618}, author = {Stanimire Tomov and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jed Brown and Jean-Sylvain Camier and Veselin Dobrev and Jack Dongarra and Yohann Dudouit and Paul Fischer and Ali Karakus and Stefan Kerkemeier and Tzanio Kolev and YuHsiang Lan and Elia Merzari and Misun Min and Aleks Obabko and Scott Parker and Thilina Ratnayaka and Jeremy Thompson and Ananias Tomboulides and Vladimir Tomov and Tim Warburton} } @techreport {1434, title = {CEED ECP Milestone Report: Public release of CEED 2.0}, year = {2019}, month = {2019-04}, publisher = {Zenodo}, doi = {10.5281/zenodo.2641316}, url = {https://doi.org/10.5281/zenodo.2641316}, author = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Veselin Dobrev and Yohann Dudouit and Paul Fischer and Tzanio Kolev and David Medina and Misun Min and Thilina Ratnayaka and Cameron Smith and Jeremy Thompson and Stanimire Tomov and Vladimir Tomov and Tim Warburton} } @techreport {1396, title = {An Empirical View of SLATE Algorithms on Scalable Hybrid System}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-08}, year = {2019}, month = {2019-09}, publisher = {University of Tennessee, Knoxville}, author = {Asim YarKhan and Jakub Kurzak and Ahmad Abdelfattah and Jack Dongarra} } @conference {1323, title = {Fast Batched Matrix Multiplication for Small Sizes using Half Precision Arithmetic on GPUs}, booktitle = {33rd IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2019}, month = {2019-05}, publisher = {IEEE}, organization = {IEEE}, address = {Rio de Janeiro, Brazil}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {1373, title = {Massively Parallel Automated Software Tuning}, booktitle = {48th International Conference on Parallel Processing (ICPP 2019)}, year = {2019}, month = {2019-08}, publisher = {ACM Press}, organization = {ACM Press}, address = {Kyoto, Japan}, abstract = {This article presents an implementation of a distributed autotuning engine developed as part of the Bench-testing OpeN Software Autotuning Infrastructure (BONSAI) project. The system is geared towards performance optimization of computational kernels for graphics processing units, and allows for the deployment of vast autotuning sweeps to massively parallel machines. The software implements dynamic work scheduling to distributed-memory resources and takes advantage of multithreading for parallel compilation and dispatches kernel launches to multiple accelerators. This paper lays out the main design principles of the system and discusses the basic mechanics of the initial implementation. 
Preliminary performance results are presented, encountered challenges are discussed, and the future directions are outlined.}, doi = {https://doi.org/10.1145/3337821.3337908}, author = {Jakub Kurzak and Yaohung Tsai and Mark Gates and Ahmad Abdelfattah and Jack Dongarra} } @article {1374, title = {Optimizing Batch HGEMM on Small Sizes Using Tensor Cores}, year = {2019}, month = {2019-03}, publisher = {GPU Technology Conference (GTC)}, address = {San Jose, CA}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {1376, title = {Progressive Optimization of Batched LU Factorization on GPUs}, booktitle = {IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}19)}, year = {2019}, month = {2019-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {1435, title = {Towards Half-Precision Computation for Complex Matrices: A Case Study for Mixed Precision Solvers on GPUs}, booktitle = {ScalA19: 10th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, year = {2019}, month = {2019-11}, publisher = {IEEE}, organization = {IEEE}, address = {Denver, CO}, keywords = {Half precision, mixed-precision solvers, Tensor cores FP16 arithmetic}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @techreport {1229, title = {Algorithms and Optimization Techniques for High-Performance Matrix-Matrix Multiplications of Very Small Matrices}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-09}, year = {2018}, month = {2018-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, abstract = {Expressing scientific computations in terms of BLAS, and in particular the general dense matrix-matrix multiplication (GEMM), is of fundamental importance for obtaining high performance portability across architectures. However, GEMMs for small matrices of sizes smaller than 32 are not sufficiently optimized in existing libraries. We consider the computation of many small GEMMs and its performance portability for a wide range of computer architectures, including Intel CPUs, ARM, IBM, Intel Xeon Phi, and GPUs. These computations often occur in applications like big data analytics, machine learning, high-order finite element methods (FEM), and others. The GEMMs are grouped together in a single batched routine. For these cases, we present algorithms and their optimization techniques that are specialized for the matrix sizes and architectures of interest. We derive a performance model and show that the new developments can be tuned to obtain performance that is within 90\% of the optimal for any of the architectures of interest. For example, on a V100 GPU for square matrices of size 32, we achieve an execution rate of about 1,600 gigaFLOP/s in double-precision arithmetic, which is 95\% of the theoretically derived peak for this computation on a V100 GPU. 
We also show that these results outperform currently available state-of-the-art implementations such as vendor-tuned math libraries, including Intel MKL and NVIDIA CUBLAS, as well as open-source libraries like OpenBLAS and Eigen.}, author = {Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Marc Baboulin and Jo{\"e}l Falcou and Jack Dongarra} } @article {1260, title = {Analysis and Design Techniques towards High-Performance and Energy-Efficient Dense Linear Solvers on GPUs}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {29}, year = {2018}, month = {2018-12}, pages = {2700{\textendash}2712}, abstract = {Graphics Processing Units (GPUs) are widely used in accelerating dense linear solvers. The matrix factorizations, which dominate the runtime for these solvers, are often designed using a hybrid scheme, where GPUs perform trailing matrix updates, while the CPUs perform the panel factorizations. Consequently, hybrid solutions require high-end CPUs and optimized CPU software in order to deliver high performance. Furthermore, they lack the energy efficiency inherent for GPUs due to the use of less energy-efficient CPUs, as well as CPU-GPU communications. This paper presents analysis and design techniques that overcome the shortcomings of the hybrid algorithms, and allow the design of high-performance and energy-efficient dense LU and Cholesky factorizations that use GPUs only. The full GPU solution eliminates the need for a high-end CPU and optimized CPU software, which leads to a better energy efficiency. We discuss different design choices, and introduce optimized GPU kernels for panel factorizations. The developed solutions achieve 90+ percent of the performance of optimized hybrid solutions, while improving the energy efficiency by 50 percent. They outperform the vendor library by 30-50 percent in single precision, and 15-50 percent in double precision. We also show that hybrid designs trail the proposed solutions in performance when optimized CPU software is not available.}, keywords = {Dense linear solvers, energy efficiency, GPU computing}, doi = {10.1109/TPDS.2018.2842785}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {1195, title = {Analyzing Performance of BiCGStab with Hierarchical Matrix on GPU Clusters}, booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2018}, month = {2018-05}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {ppohBEM is an open-source software package implementing the boundary element method. One of its main software tasks is the solution of the dense linear system of equations, for which ppohBEM relies on another software package called HACApK. To reduce the cost of solving the linear system, HACApK hierarchically compresses the coefficient matrix using adaptive cross approximation. This hierarchical compression greatly reduces the storage and time complexities of the solver and enables the solution of large-scale boundary value problems. To extend the capability of ppohBEM, in this paper, we carefully port the HACApK{\textquoteright}s linear solver onto GPU clusters. Though the potential of the GPUs has been widely accepted in high-performance computing, it is still a challenge to utilize the GPUs for a solver, like HACApK{\textquoteright}s, that requires fine-grained computation and global communication. 
First, to utilize the GPUs, we integrate the batched GPU kernel that was recently released in the MAGMA software package. We discuss several techniques to improve the performance of the batched kernel. We then study various techniques to address the inter-GPU communication and study their effects on state-of-the-art GPU clusters. We believe that the techniques studied in this paper are of interest to a wide range of software packages running on GPUs, especially with the increasingly complex node architectures and the growing costs of the communication. We also hope that our efforts to integrate the GPU kernel or to set up the inter-GPU communication will influence the design of the future-generation batched kernels or the communication layer within a software stack.}, author = {Ichitaro Yamazaki and Ahmad Abdelfattah and Akihiro Ida and Satoshi Ohshima and Stanimire Tomov and Rio Yokota and Jack Dongarra} } @article {1209, title = {Batched One-Sided Factorizations of Tiny Matrices Using GPUs: Challenges and Countermeasures}, journal = {Journal of Computational Science}, volume = {26}, year = {2018}, month = {2018-05}, pages = {226{\textendash}236}, abstract = {The use of batched matrix computations recently gained a lot of interest for applications where the same operation is applied to many small independent matrices. The batched computational pattern is frequently encountered in applications of data analytics, direct/iterative solvers and preconditioners, computer vision, astrophysics, and more, and often requires specific designs for vectorization and extreme parallelism to map well on today{\textquoteright}s high-end many-core architectures. This has led to the development of optimized software for batch computations, and to an ongoing community effort to develop standard interfaces for batched linear algebra software. Furthering these developments, we present GPU design and optimization techniques for high-performance batched one-sided factorizations of millions of tiny matrices (of size 32 and less). We quantify the effects and relevance of different techniques in order to select the best-performing LU, QR, and Cholesky factorization designs. While we adapt common optimization techniques, such as optimal memory traffic, register blocking, and concurrency control, we also show that a different mindset and techniques are needed when matrices are tiny, and in particular, sub-vector/warp in size. The proposed routines are part of the MAGMA library and deliver significant speedups compared to their counterparts in currently available vendor-optimized libraries. Notably, we tune the developments for the newest V100 GPU from NVIDIA to show speedups of up to 11.8{\texttimes}.}, keywords = {batch computation, GPU computing, matrix factorization}, doi = {https://doi.org/10.1016/j.jocs.2018.01.005}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @inproceedings {1259, title = {The Design of Fast and Energy-Efficient Linear Solvers: On the Potential of Half-Precision Arithmetic and Iterative Refinement Techniques}, journal = {International Conference on Computational Science (ICCS 2018)}, volume = {10860}, year = {2018}, month = {2018-06}, pages = {586{\textendash}600}, publisher = {Springer}, address = {Wuxi, China}, abstract = {As parallel computers approach exascale, power efficiency in high-performance computing (HPC) systems is of increasing concern. 
Exploiting both the hardware features and algorithms is an effective solution to achieve power efficiency, and to address the energy constraints in modern and future HPC systems. In this work, we present a novel design and implementation of an energy-efficient solution for dense linear systems of equations, which are at the heart of large-scale HPC applications. The proposed energy-efficient linear system solvers are based on two main components: (1) iterative refinement techniques, and (2) reduced-precision computing features in modern accelerators and coprocessors. While most of the energy efficiency approaches aim to reduce the consumption with a minimal performance penalty, our method improves both the performance and the energy efficiency. Compared to highly-optimized linear system solvers, our kernels deliver a solution of the same accuracy up to 2{\texttimes} faster and reduce the energy consumption by up to half on Intel Knights Landing (KNL) architectures. By efficiently using the Tensor Cores available in the NVIDIA V100 PCIe GPUs, the speedups can be up to 4{\texttimes}, with more than 80\% reduction in the energy consumption.}, doi = {https://doi.org/10.1007/978-3-319-93698-7_45}, url = {https://rdcu.be/bcKSC}, author = {Azzam Haidar and Ahmad Abdelfattah and Mawussi Zounon and Panruo Wu and Srikara Pranesh and Stanimire Tomov and Jack Dongarra} } @article {1208, title = {A Guide for Achieving High Performance with Very Small Matrices on GPUs: A Case Study of Batched LU and Cholesky Factorizations}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {29}, year = {2018}, month = {2018-05}, pages = {973{\textendash}984}, abstract = {We present a high-performance GPU kernel with a substantial speedup over vendor libraries for very small matrix computations. In addition, we discuss most of the challenges that hinder the design of efficient GPU kernels for small matrix algorithms. We propose relevant algorithm analysis to harness the full power of a GPU, and strategies for predicting the performance, before introducing a proper implementation. We develop a theoretical analysis and a methodology for high-performance linear solvers for very small matrices. As test cases, we take the Cholesky and LU factorizations and show how the proposed methodology enables us to achieve a performance close to the theoretical upper bound of the hardware. This work investigates and proposes novel algorithms for designing highly optimized GPU kernels for solving batches of hundreds of thousands of small-size Cholesky and LU factorizations. Our focus on efficient batched Cholesky and batched LU kernels is motivated by the increasing need for these kernels in scientific simulations (e.g., astrophysics applications). Techniques for optimal memory traffic, register blocking, and tunable concurrency are incorporated in our proposed design. 
The proposed GPU kernels achieve performance speedups versus CUBLAS of up to 6{\texttimes} for the factorizations, using double precision arithmetic on an NVIDIA Pascal P100 GPU.}, doi = {10.1109/TPDS.2017.2783929}, author = {Azzam Haidar and Ahmad Abdelfattah and Mawussi Zounon and Stanimire Tomov and Jack Dongarra} } @article {1335, title = {Harnessing GPU{\textquoteright}s Tensor Cores Fast FP16 Arithmetic to Speedup Mixed-Precision Iterative Refinement Solvers and Achieve 74 Gflops/Watt on Nvidia V100}, year = {2018}, month = {2018-03}, publisher = {GPU Technology Conference (GTC), Poster}, address = {San Jose, CA}, author = {Azzam Haidar and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @techreport {1203, title = {Implementation of the C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {07, ICL-UT-18-04}, year = {2018}, month = {2018-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Ahmad Abdelfattah and Mark Gates and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @article {1330, title = {MATEDOR: MAtrix, TEnsor, and Deep-learning Optimized Routines}, year = {2018}, month = {2018-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC18), Research Poster}, address = {Dallas, TX}, author = {Ahmad Abdelfattah and Jack Dongarra and Azzam Haidar and Stanimire Tomov and Ichitaro Yamazaki} } @article {1333, title = {MAtrix, TEnsor, and Deep-learning Optimized Routines (MATEDOR)}, year = {2018}, month = {2018-04}, publisher = {NSF PI Meeting, Poster}, address = {Washington, DC}, doi = {https://doi.org/10.6084/m9.figshare.6174143.v3}, author = {Azzam Haidar and Stanimire Tomov and Ahmad Abdelfattah and Ichitaro Yamazaki and Jack Dongarra} } @conference {1210, title = {Optimizing GPU Kernels for Irregular Batch Workloads: A Case Study for Cholesky Factorization}, booktitle = {IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}18)}, year = {2018}, month = {2018-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {This paper introduces several frameworks for the design and implementation of high performance GPU kernels that target batch workloads with irregular sizes. Such workloads are ubiquitous in many scientific applications, including sparse direct solvers, astrophysics, and quantum chemistry. The paper addresses two main categories of frameworks, taking the Cholesky factorization as a case study. The first uses host-side kernel launches, and the second uses device-side launches. Within each category, different design options are introduced, with an emphasis on the advantages and the disadvantages of each approach. 
Our best-performing design outperforms the state-of-the-art CPU implementation, achieving up to a 4.7{\texttimes} speedup in double precision on a Pascal P100 GPU.}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1334, title = {Tensor Contractions using Optimized Batch GEMM Routines}, year = {2018}, month = {2018-03}, publisher = {GPU Technology Conference (GTC), Poster}, address = {San Jose, CA}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1332, title = {Using GPU FP16 Tensor Cores Arithmetic to Accelerate Mixed-Precision Iterative Refinement Solvers and Reduce Energy Consumption}, year = {2018}, month = {2018-06}, publisher = {ISC High Performance (ISC18), Best Poster Award}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Stanimire Tomov and Ahmad Abdelfattah and Mawussi Zounon and Jack Dongarra} } @conference {1265, title = {Using GPU FP16 Tensor Cores Arithmetic to Accelerate Mixed-Precision Iterative Refinement Solvers and Reduce Energy Consumption}, booktitle = {ISC High Performance (ISC{\textquoteright}18), Best Poster}, year = {2018}, month = {2018-06}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Stanimire Tomov and Ahmad Abdelfattah and Mawussi Zounon and Jack Dongarra} } @article {1341, title = {Accelerating Tensor Contractions in High-Order FEM with MAGMA Batched}, year = {2017}, month = {2017-03}, publisher = {SIAM Conference on Computer Science and Engineering (SIAM CSE17), Presentation}, address = {Atlanta, GA}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @techreport {1175, title = {C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {04, ICL-UT-17-12}, year = {2017}, month = {2017-12}, publisher = {University of Tennessee}, author = {Ahmad Abdelfattah and Konstantin Arturov and Cris Cecka and Jack Dongarra and Chip Freitag and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Panruo Wu} } @techreport {1081, title = {C++ API for BLAS and LAPACK}, journal = {SLATE Working Notes}, number = {02, ICL-UT-17-03}, year = {2017}, note = {Revision 02-21-2018}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Mark Gates and Piotr Luszczek and Ahmad Abdelfattah and Jakub Kurzak and Jack Dongarra and Konstantin Arturov and Cris Cecka and Chip Freitag} } @article {1103, title = {Factorization and Inversion of a Million Matrices using GPUs: Challenges and Countermeasures}, journal = {Procedia Computer Science}, volume = {108}, year = {2017}, month = {2017-06}, pages = {606{\textendash}615}, abstract = {This paper presents new algorithmic approaches and optimization techniques for LU factorization and matrix inversion of millions of very small matrices using GPUs. These problems appear in many scientific applications including astrophysics and generation of block-Jacobi preconditioners. We show that, for very small problem sizes, design and optimization of GPU kernels require a mindset different from the one usually used when designing LAPACK algorithms for GPUs. Techniques for optimal memory traffic, register blocking, and tunable concurrency are incorporated in our proposed design. 
We also take advantage of the small matrix sizes to eliminate the intermediate row interchanges in both the factorization and inversion kernels. The proposed GPU kernels achieve performance speedups vs. CUBLAS of up to 6{\texttimes} for the factorization, and 14{\texttimes} for the inversion, using double precision arithmetic on a Pascal P100 GPU.}, doi = {https://doi.org/10.1016/j.procs.2017.05.250}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1102, title = {Fast Cholesky Factorization on GPUs for Batch and Native Modes in MAGMA}, journal = {Journal of Computational Science}, volume = {20}, year = {2017}, month = {2017-05}, pages = {85{\textendash}93}, abstract = {This paper presents a GPU-accelerated Cholesky factorization for two different modes of operation. The first one is the batch mode, where many independent factorizations on small matrices can be performed concurrently. This mode supports fixed size and variable size problems, and is found in many scientific applications. The second mode is the native mode, where one factorization is performed on a large matrix without any CPU involvement, which allows the CPU to do other useful work. We show that, despite the different workloads, both modes of operation share a common code-base that uses the GPU only. We also show that the developed routines achieve significant speedups against a multicore CPU using the MKL library, and against a GPU implementation by cuSOLVER. This work is part of the MAGMA library.}, keywords = {GPU computing, Cholesky factorization, batched execution}, doi = {https://doi.org/10.1016/j.jocs.2016.12.009}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {1142, title = {High-performance Cholesky Factorization for GPU-only Execution}, booktitle = {Proceedings of the General Purpose GPUs (GPGPU-10)}, year = {2017}, month = {2017-02}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, abstract = {We present our performance analysis, algorithm designs, and the optimizations needed for the development of high-performance GPU-only algorithms, and in particular, for the dense Cholesky factorization. In contrast to currently promoted designs that solve parallelism challenges on multicore architectures by representing algorithms as Directed Acyclic Graphs (DAGs), where nodes are tasks of fine granularity and edges are the dependencies between the tasks, our designs explicitly target manycore architectures like GPUs and feature coarse granularity tasks (that can be hierarchically split into fine grain data-parallel subtasks). Furthermore, in contrast to hybrid algorithms that schedule difficult to parallelize tasks on CPUs, we develop highly-efficient code for entirely GPU execution. GPU-only codes remove the expensive CPU-to-GPU communications and the tuning challenges related to slow CPU and/or low CPU-to-GPU bandwidth. We show that on the latest GPUs, like the P100, this becomes so important that the GPU-only code even outperforms the hybrid MAGMA algorithms when the CPU tasks and communications cannot be entirely overlapped with GPU computations. We achieve up to 4,300 GFlop/s in double precision on a P100 GPU, which is about 7-8{\texttimes} faster than high-end multicore CPUs, e.g., two 10-core Intel Xeon E5-2650 v3 Haswell CPUs, where MKL runs up to about 500-600 Gflop/s. 
The new algorithm also significantly outperforms the GPU-only implementation currently available in the NVIDIA cuSOLVER library.}, doi = {https://doi.org/10.1145/3038228.3038237}, author = {Azzam Haidar and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {1084, title = {Novel HPC Techniques to Batch Execution of Many Variable Size BLAS Computations on GPUs}, booktitle = {International Conference on Supercomputing (ICS {\textquoteright}17)}, year = {2017}, month = {2017-06}, publisher = {ACM}, organization = {ACM}, address = {Chicago, Illinois}, doi = {10.1145/3079079.3079103}, url = {http://dl.acm.org/citation.cfm?id=3079103}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @techreport {1080, title = {Roadmap for the Development of a Linear Algebra Library for Exascale Computing: SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {01, ICL-UT-17-02}, year = {2017}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Aurelien Bouteiller and Anthony Danalis and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Stephen Wood and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan} } @techreport {1082, title = {Small Tensor Operations on Advanced Architectures for High-Order Applications}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-17-749}, year = {2017}, month = {2017-04}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @article {, title = {With Extreme Computing, the Rules Have Changed}, journal = {Computing in Science \& Engineering}, volume = {19}, year = {2017}, month = {2017-05}, pages = {52-62}, abstract = {On the eve of exascale computing, traditional wisdom no longer applies. High-performance computing is gone as we know it. 
This article discusses a range of new algorithmic techniques emerging in the context of exascale computing, many of which defy the common wisdom of high-performance computing and are considered unorthodox, but could turn out to be a necessity in the near future.}, doi = {https://doi.org/10.1109/MCSE.2017.48}, author = {Jack Dongarra and Stanimire Tomov and Piotr Luszczek and Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Hartwig Anzt and Azzam Haidar and Ahmad Abdelfattah} } @article {1342, title = {Accelerating Tensor Contractions for High-Order FEM on CPUs, GPUs, and KNLs}, year = {2016}, month = {2016-09}, publisher = {Smoky Mountains Computational Sciences and Engineering Conference (SMC16), Poster}, address = {Gatlinburg, TN}, author = {Azzam Haidar and Ahmad Abdelfattah and Veselin Dobrev and Ian Karlin and Tzanio Kolev and Stanimire Tomov and Jack Dongarra} } @article {1344, title = {Cholesky Factorization on Batches of Matrices with Fixed and Variable Sizes}, year = {2016}, month = {2016-04}, publisher = {GPU Technology Conference (GTC16), Poster}, address = {San Jose, CA}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {940, title = {On the Development of Variable Size Batched Computation for Heterogeneous Parallel Architectures}, booktitle = {The 17th IEEE International Workshop on Parallel and Distributed Scientific and Engineering Computing (PDSEC 2016), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {
Many scientific applications, ranging from national security to medical advances, require solving a number of relatively small-size independent problems. As the size of each individual problem does not provide sufficient parallelism for the underlying hardware, especially accelerators, these problems must be solved concurrently as a batch in order to saturate the hardware with enough work, hence the name batched computation. A possible simplification is to assume a uniform size for all problems. However, real applications do not necessarily satisfy such an assumption. Consequently, an efficient solution for variable-size batched computations is required.
This paper proposes a foundation for high performance variable-size batched matrix computation based on Graphics Processing Units (GPUs). Being throughput-oriented processors, GPUs favor regular computation and less divergence among threads, in order to achieve high performance. Therefore, the development of high performance numerical software for such problems is challenging. As a case study, we developed efficient batched Cholesky factorization algorithms for relatively small matrices of different sizes. However, most of the strategies and the software developed, and in particular a set of variable size batched BLAS kernels, can be used in many other dense matrix factorizations, large scale sparse direct multifrontal solvers, and applications. We propose new interfaces and mechanisms to handle the irregular computation pattern on the GPU. To the authors{\textquoteright} knowledge, this is the first attempt to develop high performance software for this class of problems. Using a K40c GPU, our performance tests show speedups of up to 2.5{\texttimes} against two Sandy Bridge CPUs (8-core each) running the Intel MKL library.
}, keywords = {batched computation, GPUs, variable small sizes}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {964, title = {High-performance Matrix-matrix Multiplications of Very Small Matrices}, booktitle = {22nd International European Conference on Parallel and Distributed Computing (Euro-Par{\textquoteright}16)}, year = {2016}, month = {2016-08}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Grenoble, France}, abstract = {The use of the general dense matrix-matrix multiplication (GEMM) is fundamental for obtaining high performance in many scientific computing applications. GEMMs for small matrices (of sizes less than 32), however, are not sufficiently optimized in existing libraries. In this paper we consider the case of many small GEMMs on either CPU or GPU architectures. This is a case that often occurs in applications like big data analytics, machine learning, high-order FEM, and others. The GEMMs are grouped together in a single batched routine. We present algorithms and optimization techniques specialized for these cases to obtain performance that is within 90\% of the optimal. We show that these results outperform currently available state-of-the-art implementations and vendor-tuned math libraries.}, author = {Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jo{\"e}l Falcou and Jack Dongarra} } @conference {942, title = {High-Performance Tensor Contractions for GPUs}, booktitle = {International Conference on Computational Science (ICCS{\textquoteright}16)}, year = {2016}, month = {2016-06}, address = {San Diego, CA}, abstract = {We present a computational framework for high-performance tensor contractions on GPUs. High-performance is difficult to obtain using existing libraries, especially for many independent contractions where each contraction is very small, e.g., sub-vector/warp in size. However, using our framework to batch contractions plus application-specifics, we demonstrate close to peak performance results. In particular, to accelerate large scale tensor-formulated high-order finite element method (FEM) simulations, which is the main focus and motivation for this work, we represent contractions as tensor index reordering plus matrix-matrix multiplications (GEMMs). This is a key factor to achieve algorithmically many-fold acceleration (vs. not using it) due to possible reuse of data loaded in fast memory. In addition to using this context knowledge, we design tensor data-structures, tensor algebra interfaces, and new tensor contraction algorithms and implementations to achieve 90+\% of a theoretically derived peak on GPUs. On a K40c GPU for contractions resulting in GEMMs on square matrices of size 8 for example, we are 2.8{\texttimes} faster than CUBLAS, and 8.5{\texttimes} faster than MKL on 16 cores of Intel Xeon E5-2670 (Sandy Bridge) 2.60GHz CPUs. 
Finally, we apply autotuning and code generation techniques to simplify tuning and provide an architecture-aware, user-friendly interface.}, keywords = {Applications, Batched linear algebra, FEM, gpu, Tensor contractions, Tensor HPC}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @techreport {929, title = {High-Performance Tensor Contractions for GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-16-738}, year = {2016}, month = {2016-01}, publisher = {University of Tennessee}, abstract = {We present a computational framework for high-performance tensor contractions on GPUs. High-performance is difficult to obtain using existing libraries, especially for many independent contractions where each contraction is very small, e.g., sub-vector/warp in size. However, using our framework to batch contractions plus application-specifics, we demonstrate close to peak performance results. In particular, to accelerate large scale tensor-formulated high-order finite element method (FEM) simulations, which is the main focus and motivation for this work, we represent contractions as tensor index reordering plus matrix-matrix multiplications (GEMMs). This is a key factor to achieve algorithmically many-fold acceleration (vs. not using it) due to possible reuse of data loaded in fast memory. In addition to using this context knowledge, we design tensor data-structures, tensor algebra interfaces, and new tensor contraction algorithms and implementations to achieve 90+\% of a theoretically derived peak on GPUs. On a K40c GPU for contractions resulting in GEMMs on square matrices of size 8 for example, we are 2.8{\texttimes} faster than CUBLAS, and 8.5{\texttimes} faster than MKL on 16 cores of Intel Xeon E5-2670 (Sandy Bridge) 2.60GHz CPUs. Finally, we apply autotuning and code generation techniques to simplify tuning and provide an architecture-aware, user-friendly interface.}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @article {1472, title = {Linear Algebra Software for Large-Scale Accelerated Multicore Computing}, journal = {Acta Numerica}, volume = {25}, year = {2016}, month = {2016-05}, pages = {1-160}, abstract = {Many crucial scientific computing applications, ranging from national security to medical advances, rely on high-performance linear algebra algorithms and technologies, underscoring their importance and broad impact. Here we present the state-of-the-art design and implementation practices for the acceleration of the predominant linear algebra algorithms on large-scale accelerated multicore systems. Examples are given with fundamental dense linear algebra algorithms {\textendash} from the LU, QR, Cholesky, and LDLT factorizations needed for solving linear systems of equations, to eigenvalue and singular value decomposition (SVD) problems. The implementations presented are readily available via the open-source PLASMA and MAGMA libraries, which represent the next generation modernization of the popular LAPACK library for accelerated multicore systems. 
To generate the extreme level of parallelism needed for the efficient use of these systems, algorithms of interest are redesigned and then split into well-chosen computational tasks. The task execution is scheduled over the computational components of a hybrid system of multicore CPUs with GPU accelerators and/or Xeon Phi coprocessors, using either static scheduling or light-weight runtime systems. The use of light-weight runtime systems keeps scheduling overheads low, similar to static scheduling, while enabling the expression of parallelism through sequential-like code. This simplifies the development effort and allows exploration of the unique strengths of the various hardware components. Finally, we emphasize the development of innovative linear algebra algorithms using three technologies {\textendash} mixed precision arithmetic, batched operations, and asynchronous iterations {\textendash} that are currently of high interest for accelerated multicore systems.}, doi = {10.1017/S0962492916000015}, author = {Ahmad Abdelfattah and Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Asim YarKhan} } @techreport {, title = {MAGMA Batched: A Batched BLAS Approach for Small Matrix Factorizations and Applications on GPUs}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-16-02}, year = {2016}, month = {2016-08}, publisher = {University of Tennessee}, abstract = {A particularly challenging class of problems arising in many applications, called batched problems, involves linear algebra operations on many small-sized matrices. We propose and design batched BLAS (Basic Linear Algebra Subroutines) routines, Level-2 GEMV and Level-3 GEMM, to solve them. We illustrate how batched GEMV and GEMM can assist batched advanced factorizations (e.g., bi-diagonalization) and other BLAS routines (e.g., triangular solve) in achieving optimal performance on GPUs. Our solutions achieved up to 2.8{\textendash}3{\texttimes} speedups compared to CUBLAS and MKL solutions, wherever possible. We illustrated the batched methodology on a real-world hydrodynamics application by reformulating the tensor operations into batched BLAS GEMV and GEMM operations. A 2.5{\texttimes} speedup and a 1.4{\texttimes} greenup are obtained by changing 10\% of the code. We accelerated and scaled it on the Titan supercomputer to 4096 nodes.}, author = {Tingxing Dong and Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Ahmad Abdelfattah and Jack Dongarra} } @techreport {934, title = {Performance, Design, and Autotuning of Batched GEMM for GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-16-739}, year = {2016}, month = {2016-02}, publisher = {University of Tennessee}, abstract = {The general matrix-matrix multiplication (GEMM) is the most important numerical kernel in dense linear algebra. It is the key component for obtaining high performance in most LAPACK routines. As batched computations on relatively small problems continue to gain interest in many scientific applications, a need arises for a high-performance GEMM kernel for batches of small matrices. Such a kernel should be well designed and tuned to handle small sizes, and to maintain high performance for realistic test cases found in the higher-level LAPACK routines, and scientific computing applications in general. This paper presents a high-performance batched GEMM kernel on Graphics Processing Units (GPUs).
We address batched problems with both fixed and variable sizes, and show that specialized GEMM designs and a comprehensive autotuning process are needed to handle problems of small sizes. For most performance tests reported in this paper, the proposed kernels outperform state-of-the-art approaches using a K40c GPU.}, keywords = {Autotuning, Batched GEMM, GEMM, GPU computing, HPC}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {944, title = {Performance, Design, and Autotuning of Batched GEMM for GPUs}, booktitle = {The International Supercomputing Conference (ISC High Performance 2016)}, year = {2016}, month = {2016-06}, address = {Frankfurt, Germany}, abstract = {The general matrix-matrix multiplication (GEMM) is the most important numerical kernel in dense linear algebra, and is the key component for obtaining high performance in most LAPACK routines. As batched computations on relatively small problems continue to gain interest in many scientific applications, a need arises for a high performance GEMM kernel for batches of small matrices. Such a kernel should be well designed and tuned to handle small sizes, and to maintain high performance for realistic test cases found in the higher level LAPACK routines, and scientific computing applications in general. This paper presents a high performance batched GEMM kernel on Graphics Processing Units (GPUs). We address batched problems with both fixed and variable sizes, and show that specialized GEMM designs and a comprehensive autotuning process are needed to handle problems of small sizes. For most performance tests reported in this paper, the proposed kernels outperform state-of-the-art approaches using a K40c GPU.}, keywords = {Autotuning, Batched GEMM, GEMM, GPU computing, HPC}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @inbook {997, title = {Performance, Design, and Autotuning of Batched GEMM for GPUs}, booktitle = {High Performance Computing: 31st International Conference, ISC High Performance 2016, Frankfurt, Germany, June 19-23, 2016, Proceedings}, number = {9697}, year = {2016}, pages = {21{\textendash}38}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {The general matrix-matrix multiplication (GEMM) is the most important numerical kernel in dense linear algebra, and is the key component for obtaining high performance in most LAPACK routines. As batched computations on relatively small problems continue to gain interest in many scientific applications, a need arises for a high performance GEMM kernel for batches of small matrices. Such a kernel should be well designed and tuned to handle small sizes, and to maintain high performance for realistic test cases found in the higher level LAPACK routines, and scientific computing applications in general. This paper presents a high performance batched GEMM kernel on Graphics Processing Units (GPUs). We address batched problems with both fixed and variable sizes, and show that specialized GEMM designs and a comprehensive autotuning process are needed to handle problems of small sizes. For most performance tests reported in this paper, the proposed kernels outperform state-of-the-art approaches using a K40c GPU.}, isbn = {978-3-319-41321-1}, doi = {10.1007/978-3-319-41321-1_2}, url = {http://dx.doi.org/10.1007/978-3-319-41321-1_2}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra}, editor = {Julian M.
Kunkel and Pavan Balaji and Jack Dongarra} } @article {985, title = {Performance optimization of Sparse Matrix-Vector Multiplication for multi-component PDE-based applications using GPUs}, journal = {Concurrency and Computation: Practice and Experience}, volume = {28}, year = {2016}, month = {2016-05}, pages = {3447 - 3465}, abstract = {Simulations of many multi-component PDE-based applications, such as petroleum reservoirs or reacting flows, are dominated by the solution, on each time step and within each Newton step, of large sparse linear systems. The standard solver is a preconditioned Krylov method. Along with application of the preconditioner, memory-bound Sparse Matrix-Vector Multiplication (SpMV) is the most time-consuming operation in such solvers. Multi-species models produce Jacobians with a dense block structure, where the block size can be as large as a few dozen. Failing to exploit this dense block structure vastly underutilizes hardware capable of delivering high performance on dense BLAS operations. This paper presents a GPU-accelerated SpMV kernel for block-sparse matrices. Dense matrix-vector multiplications within the sparse-block structure leverage optimization techniques from the KBLAS library, a high performance library for dense BLAS kernels. The design ideas of KBLAS can be applied to block-sparse matrices. Furthermore, a technique is proposed to balance the workload among thread blocks when there are large variations in the lengths of nonzero rows. Multi-GPU performance is highlighted. The proposed SpMV kernel outperforms existing state-of-the-art implementations using matrices with real structures from different applications.}, doi = {10.1002/cpe.3874}, url = {http://onlinelibrary.wiley.com/doi/10.1002/cpe.3874/full}, author = {Ahmad Abdelfattah and Hatem Ltaief and David Keyes and Jack Dongarra} } @conference {943, title = {Performance Tuning and Optimization Techniques of Fixed and Variable Size Batched Cholesky Factorization on GPUs}, booktitle = {International Conference on Computational Science (ICCS{\textquoteright}16)}, year = {2016}, month = {2016-06}, address = {San Diego, CA}, abstract = {Solving a large number of relatively small linear systems has recently drawn more attention in the HPC community, due to the importance of such computational workloads in many scientific applications, including sparse multifrontal solvers. Modern hardware accelerators and their architecture require a set of optimization techniques that are very different from the ones used in solving one relatively large matrix. In order to impose concurrency on such throughput-oriented architectures, a common practice is to batch the solution of these matrices as one task offloaded to the underlying hardware, rather than solving them individually.
This paper presents a high-performance batched Cholesky factorization on large sets of relatively small matrices using Graphics Processing Units (GPUs), and addresses both fixed and variable size batched problems. We investigate various algorithm designs and optimization techniques, and show that it is essential to combine kernel design with performance tuning in order to achieve the best possible performance. We compare our approaches against state-of-the-art CPU solutions as well as GPU-based solutions using existing libraries, and show that, on a K40c GPU for example, our kernels are more than 2{\texttimes} faster.
}, keywords = {batched computation, Cholesky Factorization, GPUs, Tuning}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {895, title = {Batched Matrix Computations on Hardware Accelerators Based on GPUs}, booktitle = {2015 SIAM Conference on Applied Linear Algebra (SIAM LA)}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {We will present techniques for small matrix computations on GPUs and their use for energy-efficient, high-performance solvers. Work on small problems delivers high performance through improved data reuse. Many numerical libraries and applications need this functionality to be further developed. We describe the main factorizations LU, QR, and Cholesky for a set of small dense matrices in parallel. We achieve significant acceleration and reduced energy consumption compared to other solutions. Our techniques are of interest to GPU application developers in general.}, author = {Azzam Haidar and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {936, title = {Parallel Programming Models for Dense Linear Algebra on Heterogeneous Systems}, journal = {Supercomputing Frontiers and Innovations}, volume = {2}, number = {4}, year = {2015}, month = {2015-10}, abstract = {We present a review of the current best practices in parallel programming models for dense linear algebra (DLA) on heterogeneous architectures. We consider multicore CPUs, stand-alone manycore coprocessors, GPUs, and combinations of these. Of interest is the evolution of the programming models for DLA libraries {\textendash} in particular, the evolution from the popular LAPACK and ScaLAPACK libraries to their modernized counterparts PLASMA (for multicore CPUs) and MAGMA (for heterogeneous architectures), as well as other programming models and libraries. Besides providing insights into the programming techniques of the libraries considered, we outline our view of the current strengths and weaknesses of their programming models {\textendash} especially in regard to hardware trends and the ease of programming high-performance numerical software that current applications need {\textendash} in order to motivate work and future directions for the next generation of parallel programming models for high-performance linear algebra libraries on heterogeneous systems.}, keywords = {dense linear algebra, gpu, HPC, Multicore, plasma, Programming models, runtime}, doi = {10.14529/jsfi1504}, author = {Maksims Abalenkovs and Ahmad Abdelfattah and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki and Asim YarKhan} } @article {icl:709, title = {Optimizing Memory-Bound Numerical Kernels on GPU Hardware Accelerators}, journal = {VECPAR 2012}, year = {2012}, month = {2012-07}, address = {Kobe, Japan}, author = {Ahmad Abdelfattah and Jack Dongarra and David Keyes and Hatem Ltaief} }
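A recurring theme in the entries above is the batched BLAS idea: grouping many small, independent matrix multiplications into a single routine invocation so a throughput-oriented GPU stays busy. As a concrete illustration of what such an interface looks like in practice, the following minimal C sketch calls the fixed-size batched GEMM exposed by cuBLAS (cublasDgemmBatched), the vendor baseline that several of these papers compare against. The matrix size, batch count, and zero-initialized data here are illustrative assumptions, not values taken from any of the cited papers.

/* Minimal sketch, assuming CUDA and cuBLAS are available: many independent
 * 8x8 double-precision GEMMs issued through the fixed-size batched interface.
 * Assumed build line: nvcc batched_gemm_sketch.c -lcublas -o batched_gemm_sketch
 */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

int main(void) {
    const int n = 8;          /* each matrix is n x n: tiny, sub-warp sized */
    const int batch = 1000;   /* number of independent GEMMs in the batch   */
    const size_t elems = (size_t)n * n * batch;
    const double alpha = 1.0, beta = 0.0;

    /* One contiguous slab per operand; matrix i lives at offset i*n*n. */
    double *dA, *dB, *dC;
    cudaMalloc((void **)&dA, elems * sizeof(double));
    cudaMalloc((void **)&dB, elems * sizeof(double));
    cudaMalloc((void **)&dC, elems * sizeof(double));
    cudaMemset(dA, 0, elems * sizeof(double));   /* placeholder input data */
    cudaMemset(dB, 0, elems * sizeof(double));

    /* The batched API takes device-resident arrays of per-matrix pointers. */
    double **hA = (double **)malloc(batch * sizeof(double *));
    double **hB = (double **)malloc(batch * sizeof(double *));
    double **hC = (double **)malloc(batch * sizeof(double *));
    for (int i = 0; i < batch; ++i) {
        hA[i] = dA + (size_t)i * n * n;
        hB[i] = dB + (size_t)i * n * n;
        hC[i] = dC + (size_t)i * n * n;
    }
    double **dAarr, **dBarr, **dCarr;
    cudaMalloc((void **)&dAarr, batch * sizeof(double *));
    cudaMalloc((void **)&dBarr, batch * sizeof(double *));
    cudaMalloc((void **)&dCarr, batch * sizeof(double *));
    cudaMemcpy(dAarr, hA, batch * sizeof(double *), cudaMemcpyHostToDevice);
    cudaMemcpy(dBarr, hB, batch * sizeof(double *), cudaMemcpyHostToDevice);
    cudaMemcpy(dCarr, hC, batch * sizeof(double *), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    /* C_i = alpha*A_i*B_i + beta*C_i for i = 0..batch-1, in a single call. */
    cublasStatus_t st = cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                           n, n, n, &alpha,
                                           (const double **)dAarr, n,
                                           (const double **)dBarr, n,
                                           &beta, dCarr, n, batch);
    cudaDeviceSynchronize();
    printf("cublasDgemmBatched returned %d\n", (int)st);

    cublasDestroy(handle);
    cudaFree(dAarr); cudaFree(dBarr); cudaFree(dCarr);
    cudaFree(dA);    cudaFree(dB);    cudaFree(dC);
    free(hA); free(hB); free(hC);
    return 0;
}

The variable-size ("vbatched") case studied in several of the entries above generalizes this interface by replacing the scalar dimensions m, n, k and the leading dimensions with per-problem arrays; MAGMA provides such vbatched routines, whose exact signatures are not reproduced here.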