@article {1262, title = {Algorithms and Optimization Techniques for High-Performance Matrix-Matrix Multiplications of Very Small Matrices}, journal = {Parallel Computing}, volume = {81}, year = {2019}, month = {2019-01}, pages = {1{\textendash}21}, abstract = {Expressing scientific computations in terms of BLAS, and in particular the general dense matrix-matrix multiplication (GEMM), is of fundamental importance for obtaining high performance portability across architectures. However, GEMMs for small matrices of sizes smaller than 32 are not sufficiently optimized in existing libraries. We consider the computation of many small GEMMs and its performance portability for a wide range of computer architectures, including Intel CPUs, ARM, IBM, Intel Xeon Phi, and GPUs. These computations often occur in applications like big data analytics, machine learning, high-order finite element methods (FEM), and others. The GEMMs are grouped together in a single batched routine. For these cases, we present algorithms and their optimization techniques that are specialized for the matrix sizes and architectures of interest. We derive a performance model and show that the new developments can be tuned to obtain performance that is within 90\% of the optimal for any of the architectures of interest. For example, on a V100 GPU for square matrices of size 32, we achieve an execution rate of about 1600 gigaFLOP/s in double-precision arithmetic, which is 95\% of the theoretically derived peak for this computation on a V100 GPU. 
We also show that these results outperform currently available state-of-the-art implementations such as vendor-tuned math libraries, including Intel MKL and NVIDIA CUBLAS, as well as open-source libraries like OpenBLAS and Eigen.}, keywords = {Autotuning, Batched GEMM, HPC, Matrix-matrix product, optimization, Small matrices}, doi = {10.1016/j.parco.2018.10.003}, author = {Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Marc Baboulin and Jo{\"e}l Falcou and Jack Dongarra} } @techreport {1229, title = {Algorithms and Optimization Techniques for High-Performance Matrix-Matrix Multiplications of Very Small Matrices}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-09}, year = {2018}, month = {2018-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, abstract = {Expressing scientific computations in terms of BLAS, and in particular the general dense matrix-matrix multiplication (GEMM), is of fundamental importance for obtaining high performance portability across architectures. However, GEMMs for small matrices of sizes smaller than 32 are not sufficiently optimized in existing libraries. We consider the computation of many small GEMMs and its performance portability for a wide range of computer architectures, including Intel CPUs, ARM, IBM, Intel Xeon Phi, and GPUs. These computations often occur in applications like big data analytics, machine learning, high-order finite element methods (FEM), and others. The GEMMs are grouped together in a single batched routine. For these cases, we present algorithms and their optimization techniques that are specialized for the matrix sizes and architectures of interest. We derive a performance model and show that the new developments can be tuned to obtain performance that is within 90\% of the optimal for any of the architectures of interest. 
For example, on a V100 GPU for square matrices of size 32, we achieve an execution rate of about 1,600 gigaFLOP/s in double-precision arithmetic, which is 95\% of the theoretically derived peak for this computation on a V100 GPU. We also show that these results outperform currently available state-of-the-art implementations such as vendor-tuned math libraries, including Intel MKL and NVIDIA CUBLAS, as well as open-source libraries like OpenBLAS and Eigen.}, author = {Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Marc Baboulin and Jo{\"e}l Falcou and Jack Dongarra} } @article {1341, title = {Accelerating Tensor Contractions in High-Order FEM with MAGMA Batched}, year = {2017}, month = {2017-03}, publisher = {SIAM Conference on Computer Science and Engineering (SIAM CSE17), Presentation}, address = {Atlanta, GA}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @techreport {1082, title = {Small Tensor Operations on Advanced Architectures for High-Order Applications}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-17-749}, year = {2017}, month = {2017-04}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @article {1019, title = {Solving Dense Symmetric Indefinite Systems using GPUs}, journal = {Concurrency and Computation: Practice and Experience}, volume = {29}, year = {2017}, month = {2017-03}, abstract = {This paper studies the performance of different algorithms for solving a dense symmetric indefinite linear system of equations on multicore CPUs with a Graphics Processing Unit (GPU). 
To ensure the numerical stability of the factorization, pivoting is required. Obtaining high performance of such algorithms on the GPU is difficult because all the existing pivoting strategies lead to frequent synchronizations and irregular data accesses. Until recently, there has not been any implementation of these algorithms on a hybrid CPU/GPU architecture. To improve their performance on the hybrid architecture, we explore different techniques to reduce the expensive data transfer and synchronization between the CPU and GPU, or on the GPU (e.g., factorizing the matrix entirely on the GPU or in a communication-avoiding fashion). We also study the performance of the solver using iterative refinements along with the factorization without pivoting combined with the preprocessing technique based on random butterfly transformations, or with the mixed-precision algorithm where the matrix is factorized in single precision. This randomization algorithm only has a probabilistic proof on the numerical stability, and for this paper, we only focused on the mixed-precision algorithm without pivoting. However, they demonstrate that we can obtain good performance on the GPU by avoiding the pivoting and using the lower precision arithmetics, respectively. As illustrated with the application in acoustics studied in this paper, in many practical cases, the matrices can be factorized without pivoting. 
Because the componentwise backward error computed in the iterative refinement signals when the algorithm failed to obtain the desired accuracy, the user can use these potentially unstable but efficient algorithms in most of the cases and fall back to a more stable algorithm with pivoting only in the case of the failure.}, doi = {10.1002/cpe.4055}, url = {http://onlinelibrary.wiley.com/doi/10.1002/cpe.4055/full}, author = {Marc Baboulin and Jack Dongarra and Adrien Remy and Stanimire Tomov and Ichitaro Yamazaki} } @article {994, title = {Accelerating the Conjugate Gradient Algorithm with GPU in CFD Simulations}, journal = {VECPAR}, year = {2016}, abstract = {This paper illustrates how GPU computing can be used to accelerate computational fluid dynamics (CFD) simulations. For sparse linear systems arising from finite volume discretization, we evaluate and optimize the performance of Conjugate Gradient (CG) routines designed for manycore accelerators and compare against an industrial CPU-based implementation. We also investigate how the recent advances in preconditioning, such as iterative Incomplete Cholesky (IC, as symmetric case of ILU) preconditioning, match the requirements for solving real world problems.}, url = {http://hgpu.org/?p=16264}, author = {Hartwig Anzt and Marc Baboulin and Jack Dongarra and Yvan Fournier and Frank Hulsemann and Amal Khabou and Yushan Wang} } @inbook {883, title = {Dense Symmetric Indefinite Factorization on GPU Accelerated Architectures}, booktitle = {Lecture Notes in Computer Science}, series = {11th International Conference, PPAM 2015, Krakow, Poland, September 6-9, 2015. 
Revised Selected Papers, Part I}, volume = {9573}, year = {2016}, month = {2015-09}, pages = {86-95}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, chapter = {Parallel Processing and Applied Mathematics}, abstract = {We study the performance of dense symmetric indefinite factorizations (Bunch-Kaufman and Aasen{\textquoteright}s algorithms) on multicore CPUs with a Graphics Processing Unit (GPU). Though such algorithms are needed in many scientific and engineering simulations, obtaining high performance of the factorization on the GPU is difficult because the pivoting that is required to ensure the numerical stability of the factorization leads to frequent synchronizations and irregular data accesses. As a result, until recently, there has not been any implementation of these algorithms on hybrid CPU/GPU architectures. To improve their performance on the hybrid architecture, we explore different techniques to reduce the expensive communication and synchronization between the CPU and GPU, or on the GPU. We also study the performance of an LDL^T factorization with no pivoting combined with the preprocessing technique based on Random Butterfly Transformations. Though such transformations only have probabilistic results on the numerical stability, they avoid the pivoting and obtain a great performance on the GPU. 
}, keywords = {Communication-avoiding, Dense symmetric indefinite factorization, gpu computation, randomization}, isbn = {978-3-319-32149-3}, doi = {10.1007/978-3-319-32149-3_9}, author = {Marc Baboulin and Jack Dongarra and Adrien Remy and Stanimire Tomov and Ichitaro Yamazaki}, editor = {Roman Wyrzykowski and Ewa Deelman and Konrad Karczewski and Jacek Kitowski and Kazimierz Wiatr} } @conference {942, title = {High-Performance Tensor Contractions for GPUs}, booktitle = {International Conference on Computational Science (ICCS{\textquoteright}16)}, year = {2016}, month = {2016-06}, address = {San Diego, CA}, abstract = {We present a computational framework for high-performance tensor contractions on GPUs. High-performance is difficult to obtain using existing libraries, especially for many independent contractions where each contraction is very small, e.g., sub-vector/warp in size. However, using our framework to batch contractions plus application-specifics, we demonstrate close to peak performance results. In particular, to accelerate large scale tensor-formulated high-order finite element method (FEM) simulations, which is the main focus and motivation for this work, we represent contractions as tensor index reordering plus matrix-matrix multiplications (GEMMs). This is a key factor to achieve algorithmically many-fold acceleration (vs. not using it) due to possible reuse of data loaded in fast memory. In addition to using this context knowledge, we design tensor data-structures, tensor algebra interfaces, and new tensor contraction algorithms and implementations to achieve 90+\% of a theoretically derived peak on GPUs. On a K40c GPU for contractions resulting in GEMMs on square matrices of size 8 for example, we are 2.8{\texttimes} faster than CUBLAS, and 8.5{\texttimes} faster than MKL on 16 cores of Intel Xeon E5-2670 (Sandy Bridge) 2.60GHz CPUs. 
Finally, we apply autotuning and code generation techniques to simplify tuning and provide an architecture-aware, user-friendly interface.}, keywords = {Applications, Batched linear algebra, FEM, gpu, Tensor contractions, Tensor HPC}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @techreport {929, title = {High-Performance Tensor Contractions for GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-16-738}, year = {2016}, month = {2016-01}, publisher = {University of Tennessee}, abstract = {We present a computational framework for high-performance tensor contractions on GPUs. High-performance is difficult to obtain using existing libraries, especially for many independent contractions where each contraction is very small, e.g., sub-vector/warp in size. However, using our framework to batch contractions plus application-specifics, we demonstrate close to peak performance results. In particular, to accelerate large scale tensor-formulated high-order finite element method (FEM) simulations, which is the main focus and motivation for this work, we represent contractions as tensor index reordering plus matrix-matrix multiplications (GEMMs). This is a key factor to achieve algorithmically many-fold acceleration (vs. not using it) due to possible reuse of data loaded in fast memory. In addition to using this context knowledge, we design tensor data-structures, tensor algebra interfaces, and new tensor contraction algorithms and implementations to achieve 90+\% of a theoretically derived peak on GPUs. On a K40c GPU for contractions resulting in GEMMs on square matrices of size 8 for example, we are 2.8{\texttimes} faster than CUBLAS, and 8.5{\texttimes} faster than MKL on 16 cores of Intel Xeon E5-2670 (Sandy Bridge) 2.60GHz CPUs. 
Finally, we apply autotuning and code generation techniques to simplify tuning and provide an architecture-aware, user-friendly interface.}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @article {1346, title = {Towards a High-Performance Tensor Algebra Package for Accelerators}, year = {2015}, month = {2015-09}, publisher = {Smoky Mountains Computational Sciences and Engineering Conference (SMC15)}, address = {Gatlinburg, TN}, author = {Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @conference {832, title = {Computing Least Squares Condition Numbers on Hybrid Multicore/GPU Systems}, booktitle = {International Interdisciplinary Conference on Applied Mathematics, Modeling and Computational Science (AMMCS)}, year = {2014}, month = {2014-08}, address = {Waterloo, Ontario, CA}, abstract = {This paper presents an efficient computation for least squares conditioning or estimates of it. We propose performance results using new routines on top of the multicore-GPU library MAGMA. This set of routines is based on an efficient computation of the variance-covariance matrix for which, to our knowledge, there is no implementation in current public domain libraries LAPACK and ScaLAPACK.}, author = {Marc Baboulin and Jack Dongarra and Remi Lacroix} } @article {821, title = {An Efficient Distributed Randomized Algorithm for Solving Large Dense Symmetric Indefinite Linear Systems}, journal = {Parallel Computing}, volume = {40}, year = {2014}, month = {2014-07}, pages = {213--223}, abstract = {Randomized algorithms are gaining ground in high-performance computing applications as they have the potential to outperform deterministic methods, while still providing accurate results. 
We propose a randomized solver for distributed multicore architectures to efficiently solve large dense symmetric indefinite linear systems that are encountered, for instance, in parameter estimation problems or electromagnetism simulations. The contribution of this paper is to propose efficient kernels for applying random butterfly transformations and a new distributed implementation combined with a runtime (PaRSEC) that automatically adjusts data structures, data mappings, and the scheduling as systems scale up. Both the parallel distributed solver and the supporting runtime environment are innovative. To our knowledge, the randomization approach associated with this solver has never been used in public domain software for symmetric indefinite systems. The underlying runtime framework allows seamless data mapping and task scheduling, mapping its capabilities to the underlying hardware features of heterogeneous distributed architectures. The performance of our software is similar to that obtained for symmetric positive definite systems, but requires only half the execution time and half the amount of data storage of a general dense solver.}, keywords = {Distributed linear algebra solvers, LDLT factorization, PaRSEC runtime, plasma, Randomized algorithms, Symmetric indefinite systems}, doi = {10.1016/j.parco.2013.12.003}, author = {Marc Baboulin and Dulceneia Becker and George Bosilca and Anthony Danalis and Jack Dongarra} } @article {icl:721, title = {Accelerating Linear System Solutions Using Randomization Techniques}, journal = {ACM Transactions on Mathematical Software (also LAWN 246)}, volume = {39}, year = {2013}, month = {2013-02}, abstract = {We illustrate how linear algebra calculations can be enhanced by statistical techniques in the case of a square linear system Ax = b. We study a random transformation of A that enables us to avoid pivoting and then to reduce the amount of communication. 
Numerical experiments show that this randomization can be performed at a very affordable computational price while providing us with a satisfying accuracy when compared to partial pivoting. This random transformation called Partial Random Butterfly Transformation (PRBT) is optimized in terms of data storage and flops count. We propose a solver where PRBT and the LU factorization with no pivoting take advantage of the current hybrid multicore/GPU machines and we compare its Gflop/s performance with a solver implemented in a current parallel library.}, keywords = {algorithms, dense linear algebra, experimentation, graphics processing units, linear systems, lu factorization, multiplicative preconditioning, numerical linear algebra, performance, plasma, randomization}, doi = {10.1145/2427023.2427025}, url = {http://dl.acm.org/citation.cfm?id=2427025}, author = {Marc Baboulin and Jack Dongarra and Julien Herrmann and Stanimire Tomov} } @conference {763, title = {A Parallel Solver for Incompressible Fluid Flows}, booktitle = {International Conference on Computational Science (ICCS 2013)}, year = {2013}, month = {2013-06}, publisher = {Elsevier B.V.}, organization = {Elsevier B.V.}, address = {Barcelona, Spain}, abstract = {The Navier-Stokes equations describe a large class of fluid flows but are difficult to solve analytically because of their nonlin- earity. We present in this paper a parallel solver for the 3-D Navier-Stokes equations of incompressible unsteady flows with constant coefficients, discretized by the finite difference method. We apply the prediction-projection method which transforms the Navier-Stokes equations into three Helmholtz equations and one Poisson equation. For each Helmholtz system, we apply the Alternating Direction Implicit (ADI) method resulting in three tridiagonal systems. The Poisson equation is solved using partial diagonalization which transforms the Laplacian operator into a tridiagonal one. 
We describe an implementation based on MPI where the computations are performed on each subdomain and information is exchanged on the interfaces, and where the tridiagonal system solutions are accelerated using vectorization techniques. We present performance results on a current multicore system.}, keywords = {ADI, Navier-Stokes equations, Parallel computing, Partial diagonalization, Prediction-projection, SIMD}, doi = {10.1016/j.procs.2013.05.207}, author = {Yushan Wang and Marc Baboulin and Jo{\"e}l Falcou and Yann Fraigneau and Olivier Le Ma{\^\i}tre} } @inproceedings {icl:685, title = {A Class of Communication-Avoiding Algorithms for Solving General Dense Linear Systems on CPU/GPU Parallel Machines}, journal = {Proc. of the International Conference on Computational Science (ICCS)}, volume = {9}, year = {2012}, month = {2012-06}, pages = {17--26}, keywords = {magma}, author = {Marc Baboulin and Simplice Donfack and Jack Dongarra and Laura Grigori and Adrien Remy and Stanimire Tomov} } @techreport {icl:683, title = {An efficient distributed randomized solver with application to large dense linear systems}, journal = {ICL Technical Report}, number = {ICL-UT-12-02}, year = {2012}, month = {2012-07}, keywords = {dague, dplasma, parsec, plasma}, author = {Marc Baboulin and Dulceneia Becker and George Bosilca and Anthony Danalis and Jack Dongarra} } @article {icl:696, title = {A Parallel Tiled Solver for Symmetric Indefinite Systems On Multicore Architectures}, journal = {IPDPS 2012}, year = {2012}, month = {2012-05}, address = {Shanghai, China}, author = {Marc Baboulin and Dulceneia Becker and Jack Dongarra} } @article {icl:722, title = {Reducing the Amount of Pivoting in Symmetric Indefinite Systems}, journal = {Parallel Processing and Applied Mathematics, Lecture Notes in Computer Science (PPAM 2011)}, volume = {7203}, year = {2012}, month = {2012-00}, pages = {133--142}, publisher = {Springer-Verlag Berlin Heidelberg}, author = {Dulceneia Becker and Marc 
Baboulin and Jack Dongarra}, editor = {Roman Wyrzykowski and Jack Dongarra and Konrad Karczewski and Jerzy Wasniewski} } @article {icl:637, title = {Accelerating Linear System Solutions Using Randomization Techniques}, journal = {INRIA RR-7616 / LAWN $\#$246 (presented at International AMMCS{\textquoteright}11)}, year = {2011}, month = {2011-07}, address = {Waterloo, Ontario, Canada}, keywords = {magma}, author = {Marc Baboulin and Jack Dongarra and Julien Herrmann and Stanimire Tomov} } @techreport {icl:641, title = {A parallel tiled solver for dense symmetric indefinite systems on multicore architectures}, journal = {University of Tennessee Computer Science Technical Report}, number = {ICL-UT-11-07}, year = {2011}, month = {2011-10}, keywords = {plasma, quark}, author = {Marc Baboulin and Dulceneia Becker and Jack Dongarra} } @techreport {icl:613, title = {Reducing the Amount of Pivoting in Symmetric Indefinite Systems}, journal = {University of Tennessee Innovative Computing Laboratory Technical Report}, number = {ICL-UT-11-06}, year = {2011}, month = {2011-05}, publisher = {Submitted to PPAM 2011}, address = {Knoxville, TN}, author = {Dulceneia Becker and Marc Baboulin and Jack Dongarra} } @article {icl:564, title = {Towards Dense Linear Algebra for Hybrid GPU Accelerated Manycore Systems}, journal = {Parallel Computing}, volume = {36}, number = {5-6}, year = {2010}, month = {2010-00}, pages = {232--240}, keywords = {magma}, author = {Stanimire Tomov and Jack Dongarra and Marc Baboulin} } @article {baboulin2009mixedprecision, title = {Accelerating Scientific Computations with Mixed Precision Algorithms}, journal = {Computer Physics Communications}, volume = {180}, year = {2009}, month = {2009-12}, pages = {2526--2533}, abstract = {On modern architectures, the performance of 32-bit operations is often at least twice as fast as the performance of 64-bit operations. 
By using a combination of 32-bit and 64-bit floating point arithmetic, the performance of many dense and sparse linear algebra algorithms can be significantly enhanced while maintaining the 64-bit accuracy of the resulting solution. The approach presented here can apply not only to conventional processors but also to other technologies such as Field Programmable Gate Arrays (FPGA), Graphical Processing Units (GPU), and the STI Cell BE processor. Results on modern processor architectures and the STI Cell BE are presented.}, doi = {10.1016/j.cpc.2008.11.005}, author = {Marc Baboulin and Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julie Langou and Julien Langou and Piotr Luszczek and Stanimire Tomov} } @article {icl:482, title = {Computing the Conditioning of the Components of a Linear Least-squares Solution}, journal = {Numerical Linear Algebra with Applications}, volume = {16}, number = {7}, year = {2009}, month = {2009-00}, pages = {517--533}, author = {Marc Baboulin and Jack Dongarra and Serge Gratton and Julien Langou} } @article {icl:457, title = {Computing the Conditioning of the Components of a Linear Least Squares Solution}, journal = {VECPAR {\textquoteright}08, High Performance Computing for Computational Science}, year = {2008}, month = {2008-01}, address = {Toulouse, France}, author = {Marc Baboulin and Jack Dongarra and Serge Gratton and Julien Langou} } @article {1353, title = {Enhancing the Performance of Dense Linear Algebra Solvers on GPUs (in the MAGMA Project)}, year = {2008}, month = {2008-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC08)}, address = {Austin, TX}, author = {Marc Baboulin and James Demmel and Jack Dongarra and Stanimire Tomov and Vasily Volkov} } @inproceedings {icl:516, title = {Some Issues in Dense Linear Algebra for Multicore and Special Purpose Architectures}, journal = {PARA 2008, 9th International Workshop on State-of-the-Art in 
Scientific and Parallel Computing}, year = {2008}, month = {2008-05}, address = {Trondheim Norway}, keywords = {magma}, author = {Marc Baboulin and Stanimire Tomov and Jack Dongarra} } @techreport {icl:415, title = {Some Issues in Dense Linear Algebra for Multicore and Special Purpose Architectures}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-08-615 (also LAPACK Working Note 200)}, year = {2008}, month = {2008-01}, keywords = {magma}, author = {Marc Baboulin and Jack Dongarra and Stanimire Tomov} } @techreport {icl:443, title = {Towards Dense Linear Algebra for Hybrid GPU Accelerated Manycore Systems}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-08-632 (also LAPACK Working Note 210)}, year = {2008}, month = {2008-01}, keywords = {magma}, author = {Stanimire Tomov and Jack Dongarra and Marc Baboulin} } @techreport {icl:430, title = {Using dual techniques to derive componentwise and mixed condition numbers for a linear functional of a linear least squares solution}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-08-622 (also LAPACK Working Note 207)}, year = {2008}, month = {2008-01}, author = {Marc Baboulin and Serge Gratton} } @techreport {icl:391, title = {Computing the Conditioning of the Components of a Linear Least Squares Solution}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-07-604, (also LAPACK Working Note 193)}, year = {2007}, month = {2007-01}, author = {Marc Baboulin and Jack Dongarra and Serge Gratton and Julien Langou} }