@article{1262,
  title    = {Algorithms and Optimization Techniques for High-Performance Matrix-Matrix Multiplications of Very Small Matrices},
  journal  = {Parallel Computing},
  volume   = {81},
  year     = {2019},
  month    = {2019-01},
  pages    = {1{\textendash}21},
  abstract = {Expressing scientific computations in terms of BLAS, and in particular the general dense matrix-matrix multiplication (GEMM), is of fundamental importance for obtaining high performance portability across architectures. However, GEMMs for small matrices (of sizes smaller than 32) are not sufficiently optimized in existing libraries. We consider the computation of many small GEMMs and its performance portability for a wide range of computer architectures, including Intel CPUs, ARM, IBM, Intel Xeon Phi, and GPUs. These computations often occur in applications like big data analytics, machine learning, high-order finite element methods (FEM), and others. The GEMMs are grouped together in a single batched routine. For these cases, we present algorithms and their optimization techniques that are specialized for the matrix sizes and architectures of interest. We derive a performance model and show that the new developments can be tuned to obtain performance that is within 90\% of the optimal for any of the architectures of interest. For example, on a V100 GPU for square matrices of size 32, we achieve an execution rate of about 1600 gigaFLOP/s in double-precision arithmetic, which is 95\% of the theoretically derived peak for this computation on a V100 GPU. We also show that these results outperform currently available state-of-the-art implementations such as vendor-tuned math libraries, including Intel MKL and NVIDIA CUBLAS, as well as open-source libraries like OpenBLAS and Eigen.},
  keywords = {Autotuning, Batched GEMM, HPC, Matrix-matrix product, optimization, Small matrices},
  doi      = {10.1016/j.parco.2018.10.003},
  author   = {Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Marc Baboulin and Jo{\"e}l Falcou and Jack Dongarra}
}

@techreport{934,
  title       = {Performance, Design, and Autotuning of Batched GEMM for GPUs},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-EECS-16-739},
  year        = {2016},
  month       = {2016-02},
  abstract    = {The general matrix-matrix multiplication (GEMM) is the most important numerical kernel in dense linear algebra. It is the key component for obtaining high performance in most LAPACK routines. As batched computations on relatively small problems continue to gain interest in many scientific applications, a need arises for a high performance GEMM kernel for batches of small matrices. Such a kernel should be well designed and tuned to handle small sizes, and to maintain high performance for realistic test cases found in the higher level LAPACK routines, and scientific computing applications in general. This paper presents a high performance batched GEMM kernel on Graphics Processing Units (GPUs). We address batched problems with both fixed and variable sizes, and show that specialized GEMM designs and a comprehensive autotuning process are needed to handle problems of small sizes.
For most performance tests reported in this paper, the proposed kernels outperform state-of-the-art approaches using a K40c GPU.},
  keywords    = {Autotuning, Batched GEMM, GEMM, GPU computing, HPC},
  author      = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra}
}

@conference{944,
  title     = {Performance, Design, and Autotuning of Batched GEMM for GPUs},
  booktitle = {The International Supercomputing Conference (ISC High Performance 2016)},
  year      = {2016},
  month     = {2016-06},
  address   = {Frankfurt, Germany},
  abstract  = {The general matrix-matrix multiplication (GEMM) is the most important numerical kernel in dense linear algebra, and is the key component for obtaining high performance in most LAPACK routines. As batched computations on relatively small problems continue to gain interest in many scientific applications, a need arises for a high performance GEMM kernel for batches of small matrices. Such a kernel should be well designed and tuned to handle small sizes, and to maintain high performance for realistic test cases found in the higher level LAPACK routines, and scientific computing applications in general. This paper presents a high performance batched GEMM kernel on Graphics Processing Units (GPUs). We address batched problems with both fixed and variable sizes, and show that specialized GEMM designs and a comprehensive autotuning process are needed to handle problems of small sizes. For most performance tests reported in this paper, the proposed kernels outperform state-of-the-art approaches using a K40c GPU.},
  keywords  = {Autotuning, Batched GEMM, GEMM, GPU computing, HPC},
  author    = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra}
}

@article{982,
  title    = {Experiences in Autotuning Matrix Multiplication for Energy Minimization on GPUs},
  journal  = {Concurrency and Computation: Practice and Experience},
  volume   = {27},
  year     = {2015},
  month    = {2015-10},
  pages    = {5096{\textendash}5113},
  abstract = {In this paper, we report extensive results and analysis of autotuning the computationally intensive graphics processing unit (GPU) kernel for dense matrix{\textendash}matrix multiplication in double precision. In contrast to traditional autotuning and/or optimization for runtime performance only, we also take energy efficiency into account. For kernels achieving equal performance, we show significant differences in their energy balance. We also identify memory throughput as the most influential metric that trades off performance and energy efficiency. As a result, the performance-optimal case ends up not being the most efficient kernel in overall resource use.},
  keywords = {Autotuning, energy efficiency, hardware accelerators, matrix multiplication, power},
  doi      = {10.1002/cpe.3516},
  url      = {http://doi.wiley.com/10.1002/cpe.3516},
  author   = {Hartwig Anzt and Blake Haugen and Jakub Kurzak and Piotr Luszczek and Jack Dongarra}
}
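The three GEMM entries above all build on the batched interface, in which many small, independent multiplications are grouped into a single routine. As a point of reference, the C sketch below shows the fixed-size batched call in cuBLAS (cublasDgemmBatched), one of the vendor kernels the papers compare against. The matrix size of 32 matches the V100 example quoted in the first abstract; the batch count, contiguous data layout, and omitted initialization and error checking are illustrative assumptions, not code from the papers.

    /* Minimal sketch: many independent 32x32 DGEMMs in one batched call.
     * Data initialization and error checking are omitted for brevity. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>
    #include <cublas_v2.h>

    int main(void) {
        const int n = 32;           /* square matrices of size 32 */
        const int batch = 1000;     /* illustrative batch count (assumption) */
        const double alpha = 1.0, beta = 0.0;
        const size_t mat = (size_t)n * n;

        /* One contiguous slab per operand; matrix i starts at offset i*n*n. */
        double *dA, *dB, *dC;
        cudaMalloc((void **)&dA, mat * batch * sizeof(double));
        cudaMalloc((void **)&dB, mat * batch * sizeof(double));
        cudaMalloc((void **)&dC, mat * batch * sizeof(double));

        /* cublasDgemmBatched consumes device arrays of per-matrix pointers. */
        double **hA = (double **)malloc(batch * sizeof(double *));
        double **hB = (double **)malloc(batch * sizeof(double *));
        double **hC = (double **)malloc(batch * sizeof(double *));
        for (int i = 0; i < batch; ++i) {
            hA[i] = dA + i * mat;
            hB[i] = dB + i * mat;
            hC[i] = dC + i * mat;
        }
        double **dAarr, **dBarr, **dCarr;
        cudaMalloc((void **)&dAarr, batch * sizeof(double *));
        cudaMalloc((void **)&dBarr, batch * sizeof(double *));
        cudaMalloc((void **)&dCarr, batch * sizeof(double *));
        cudaMemcpy(dAarr, hA, batch * sizeof(double *), cudaMemcpyHostToDevice);
        cudaMemcpy(dBarr, hB, batch * sizeof(double *), cudaMemcpyHostToDevice);
        cudaMemcpy(dCarr, hC, batch * sizeof(double *), cudaMemcpyHostToDevice);

        cublasHandle_t handle;
        cublasCreate(&handle);

        /* C_i = alpha * A_i * B_i + beta * C_i for all i, in a single call. */
        cublasDgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                           &alpha, (const double *const *)dAarr, n,
                                   (const double *const *)dBarr, n,
                           &beta,  dCarr, n, batch);
        cudaDeviceSynchronize();

        cublasDestroy(handle);
        cudaFree(dA); cudaFree(dB); cudaFree(dC);
        cudaFree(dAarr); cudaFree(dBarr); cudaFree(dCarr);
        free(hA); free(hB); free(hC);
        return 0;
    }

The 2016 entries also address variable sizes, which call for a different interface (e.g., per-problem dimension arrays, as in MAGMA's vbatched kernels); the fixed-size call above is the simplest instance of the idea.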
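The last entry trades performance against energy rather than tuning for runtime alone. One common way to obtain such energy numbers on NVIDIA GPUs, offered here only as a hedged sketch and not as the paper's actual measurement harness, is to poll board power through NVML and integrate it over the kernel's execution window. The 10 ms sampling interval and 1 s window below are arbitrary choices.

    /* Sketch: estimate GPU energy by polling board power via NVML
     * and integrating over time (Riemann sum). */
    #include <stdio.h>
    #include <unistd.h>
    #include <nvml.h>

    /* Instantaneous board power in watts (NVML reports milliwatts). */
    static double power_watts(nvmlDevice_t dev) {
        unsigned int mw = 0;
        if (nvmlDeviceGetPowerUsage(dev, &mw) != NVML_SUCCESS)
            return 0.0;
        return mw / 1000.0;
    }

    int main(void) {
        if (nvmlInit() != NVML_SUCCESS) return 1;
        nvmlDevice_t dev;
        nvmlDeviceGetHandleByIndex(0, &dev);

        /* In practice the window would bracket the launch and
         * completion of the GEMM kernel variant being tuned. */
        const double dt = 0.01;           /* 10 ms between samples */
        double joules = 0.0;
        for (int i = 0; i < 100; ++i) {
            joules += power_watts(dev) * dt;
            usleep((useconds_t)(dt * 1e6));
        }
        printf("approx. energy over window: %.2f J\n", joules);

        nvmlShutdown();
        return 0;
    }

Because power polling is coarse relative to a single small kernel, short-running kernels are typically looped many times and the average taken; comparing such integrated measurements across tuning variants is what exposes the performance-versus-energy trade-off the abstract describes.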