@article{,
  title = {A Set of Batched Basic Linear Algebra Subprograms},
  journal = {ACM Transactions on Mathematical Software},
  year = {2020},
  month = {2020-10},
  abstract = {This paper describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular, half precision is used in many very large scale applications, such as those associated with machine learning.},
  author = {Ahmad Abdelfattah and Timothy Costa and Jack Dongarra and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Mawussi Zounon}
}

@article{1300,
  title = {Batched BLAS (Basic Linear Algebra Subprograms) 2018 Specification},
  year = {2018},
  month = {2018-07},
  abstract = {This document describes an API for Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). We focus on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. We consider extensions beyond the original BLAS standard that specify a programming interface not only for routines with uniformly sized matrices and/or vectors but also for the situation where the sizes vary. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance manycore platforms. These include multicore and many-core CPU processors; GPUs and coprocessors; as well as other hardware accelerators with floating-point compute facility.},
  author = {Jack Dongarra and Iain Duff and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jonathan Hogg and Pedro Valero Lara and Piotr Luszczek and Mawussi Zounon and Samuel D. Relton and Stanimire Tomov and Timothy Costa and Sarah Knepper}
}

@inproceedings{1259,
  title = {The Design of Fast and Energy-Efficient Linear Solvers: On the Potential of Half-Precision Arithmetic and Iterative Refinement Techniques},
  booktitle = {International Conference on Computational Science (ICCS 2018)},
  volume = {10860},
  year = {2018},
  month = {2018-06},
  pages = {586{\textendash}600},
  publisher = {Springer},
  address = {Wuxi, China},
  abstract = {As parallel computers approach exascale, power efficiency in high-performance computing (HPC) systems is of increasing concern. Exploiting both the hardware features and algorithms is an effective solution to achieve power efficiency, and to address the energy constraints in modern and future HPC systems. In this work, we present a novel design and implementation of an energy-efficient solution for dense linear systems of equations, which are at the heart of large-scale HPC applications. The proposed energy-efficient linear system solvers are based on two main components: (1) iterative refinement techniques, and (2) reduced-precision computing features in modern accelerators and coprocessors. While most of the energy efficiency approaches aim to reduce the consumption with a minimal performance penalty, our method improves both the performance and the energy efficiency. Compared to highly-optimized linear system solvers, our kernels deliver a solution of the same accuracy up to 2{\texttimes} faster and reduce the energy consumption by up to half on Intel Knights Landing (KNL) architectures. By efficiently using the Tensor Cores available in the NVIDIA V100 PCIe GPUs, the speedups can be up to 4{\texttimes}, with more than 80\% reduction in the energy consumption.},
  doi = {10.1007/978-3-319-93698-7_45},
  url = {https://rdcu.be/bcKSC},
  author = {Azzam Haidar and Ahmad Abdelfattah and Mawussi Zounon and Panruo Wu and Srikara Pranesh and Stanimire Tomov and Jack Dongarra}
}
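The ICCS 2018 entry above builds its solvers from a reduced-precision factorization followed by iterative refinement carried out in higher precision. The listing below is a minimal sketch of that general idea in plain C, offered only as an illustration: standard C has no half type, so single precision stands in for FP16, the LU omits pivoting, and every function name is hypothetical rather than part of the authors' KNL or GPU implementation.

/* Minimal sketch only: single precision stands in for the half precision
 * used in the paper, and the LU has no pivoting. Not the authors' code. */
#include <stdio.h>

#define N 3

/* Unblocked LU factorization without pivoting, in reduced (single) precision. */
static void lu_factor_sp(float A[N][N]) {
    for (int k = 0; k < N; k++)
        for (int i = k + 1; i < N; i++) {
            A[i][k] /= A[k][k];
            for (int j = k + 1; j < N; j++)
                A[i][j] -= A[i][k] * A[k][j];
        }
}

/* Triangular solves with the single-precision factors; accumulate in double. */
static void lu_solve_sp(float LU[N][N], const double b[N], double x[N]) {
    for (int i = 0; i < N; i++) {                 /* L y = b (unit diagonal) */
        x[i] = b[i];
        for (int j = 0; j < i; j++) x[i] -= (double)LU[i][j] * x[j];
    }
    for (int i = N - 1; i >= 0; i--) {            /* U x = y */
        for (int j = i + 1; j < N; j++) x[i] -= (double)LU[i][j] * x[j];
        x[i] /= (double)LU[i][i];
    }
}

int main(void) {
    double A[N][N] = {{4, 1, 0}, {1, 3, 1}, {0, 1, 2}};
    double b[N] = {1, 2, 3};
    float LU[N][N];
    double x[N], r[N], d[N];

    /* The O(n^3) factorization runs in low precision. */
    for (int i = 0; i < N; i++)
        for (int j = 0; j < N; j++)
            LU[i][j] = (float)A[i][j];
    lu_factor_sp(LU);

    /* Initial solve, then a few refinement sweeps: r = b - A x in double,
     * solve A d = r with the low-precision factors, update x. */
    lu_solve_sp(LU, b, x);
    for (int it = 0; it < 5; it++) {
        for (int i = 0; i < N; i++) {
            r[i] = b[i];
            for (int j = 0; j < N; j++) r[i] -= A[i][j] * x[j];
        }
        lu_solve_sp(LU, r, d);
        for (int i = 0; i < N; i++) x[i] += d[i];
    }
    printf("refined solution: %.15f %.15f %.15f\n", x[0], x[1], x[2]);
    return 0;
}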
@article{1208,
  title = {A Guide for Achieving High Performance with Very Small Matrices on GPUs: A Case Study of Batched LU and Cholesky Factorizations},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  volume = {29},
  year = {2018},
  month = {2018-05},
  pages = {973{\textendash}984},
  abstract = {We present a high-performance GPU kernel with a substantial speedup over vendor libraries for very small matrix computations. In addition, we discuss most of the challenges that hinder the design of efficient GPU kernels for small matrix algorithms. We propose relevant algorithm analysis to harness the full power of a GPU, and strategies for predicting the performance, before introducing a proper implementation. We develop a theoretical analysis and a methodology for high-performance linear solvers for very small matrices. As test cases, we take the Cholesky and LU factorizations and show how the proposed methodology enables us to achieve a performance close to the theoretical upper bound of the hardware. This work investigates and proposes novel algorithms for designing highly optimized GPU kernels for solving batches of hundreds of thousands of small-size Cholesky and LU factorizations. Our focus on efficient batched Cholesky and batched LU kernels is motivated by the increasing need for these kernels in scientific simulations (e.g., astrophysics applications). Techniques for optimal memory traffic, register blocking, and tunable concurrency are incorporated in our proposed design. The proposed GPU kernels achieve performance speedups versus cuBLAS of up to 6{\texttimes} for the factorizations, using double precision arithmetic on an NVIDIA Pascal P100 GPU.},
  doi = {10.1109/TPDS.2017.2783929},
  author = {Azzam Haidar and Ahmad Abdelfattah and Mawussi Zounon and Stanimire Tomov and Jack Dongarra}
}

@article{1189,
  title = {Symmetric Indefinite Linear Solver using OpenMP Task on Multicore Architectures},
  journal = {IEEE Transactions on Parallel and Distributed Systems},
  volume = {29},
  year = {2018},
  month = {2018-08},
  pages = {1879{\textendash}1892},
  abstract = {Recently, the Open Multi-Processing (OpenMP) standard has incorporated task-based programming, where a function call with input and output data is treated as a task. At run time, OpenMP{\textquoteright}s superscalar scheduler tracks the data dependencies among the tasks and executes the tasks as their dependencies are resolved. On a shared-memory architecture with multiple cores, the independent tasks are executed on different cores in parallel, thereby enabling parallel execution of a seemingly sequential code. With the emergence of many-core architectures, this type of programming paradigm is gaining attention, not only because of its simplicity, but also because it breaks the artificial synchronization points of the program and improves its thread-level parallelization. In this paper, we use these new OpenMP features to develop a portable high-performance implementation of a dense symmetric indefinite linear solver. Obtaining high performance from this kind of solver is a challenge because the symmetric pivoting, which is required to maintain numerical stability, leads to data dependencies that prevent us from using some common performance-improving techniques. To fully utilize a large number of cores through tasking, while conforming to the OpenMP standard, we describe several techniques. Our performance results on current many-core architectures, including Intel{\textquoteright}s Broadwell, Intel{\textquoteright}s Knights Landing, IBM{\textquoteright}s Power8, and Arm{\textquoteright}s ARMv8, demonstrate the portable and superior performance of our implementation compared with the Linear Algebra PACKage (LAPACK). The resulting solver is now available as a part of the PLASMA software package.},
  keywords = {linear algebra, multithreading, runtime, symmetric indefinite matrices},
  doi = {10.1109/TPDS.2018.2808964},
  author = {Ichitaro Yamazaki and Jakub Kurzak and Panruo Wu and Mawussi Zounon and Jack Dongarra}
}
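The symmetric indefinite solver entry above relies on the OpenMP tasking model in which the runtime orders tasks by declared data dependencies. The following sketch shows only that mechanism, under the assumption of placeholder kernels and dummy scalar "tiles"; the depend clauses are standard OpenMP 4.0+ syntax, but nothing here is taken from the PLASMA solver itself.

/* Placeholder kernels on dummy scalars; the depend clauses are standard
 * OpenMP 4.0+ syntax, not an excerpt from the PLASMA solver. */
#include <stdio.h>
#include <omp.h>

static void kernel(const char *name, double *tile) {
    *tile += 1.0;                               /* stand-in for real BLAS work */
    printf("%s ran on thread %d\n", name, omp_get_thread_num());
}

int main(void) {
    double a = 0.0, b = 0.0, c = 0.0;

    #pragma omp parallel
    #pragma omp single
    {
        /* Independent tasks: the runtime may run them on different cores. */
        #pragma omp task depend(out: a)
        kernel("factor A", &a);
        #pragma omp task depend(out: b)
        kernel("factor B", &b);

        /* Reads a and b, so it is scheduled only after both tasks finish. */
        #pragma omp task depend(in: a, b) depend(out: c)
        kernel("update C", &c);

        #pragma omp taskwait
        printf("c = %.1f\n", c);
    }
    return 0;
}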
@conference{1265,
  title = {Using GPU FP16 Tensor Cores Arithmetic to Accelerate Mixed-Precision Iterative Refinement Solvers and Reduce Energy Consumption},
  booktitle = {ISC High Performance (ISC{\textquoteright}18), Best Poster Award},
  year = {2018},
  month = {2018-06},
  address = {Frankfurt, Germany},
  author = {Azzam Haidar and Stanimire Tomov and Ahmad Abdelfattah and Mawussi Zounon and Jack Dongarra}
}

@conference{1168,
  title = {The Design and Performance of Batched BLAS on Modern High-Performance Computing Systems},
  booktitle = {International Conference on Computational Science (ICCS 2017)},
  year = {2017},
  month = {2017-06},
  publisher = {Elsevier},
  organization = {Elsevier},
  address = {Z{\"u}rich, Switzerland},
  abstract = {A current trend in high-performance computing is to decompose a large linear algebra problem into batches containing thousands of smaller problems that can be solved independently, before collating the results. To standardize the interface to these routines, the community is developing an extension to the BLAS standard (the batched BLAS), enabling users to perform thousands of small BLAS operations in parallel whilst making efficient use of their hardware. We discuss the benefits and drawbacks of the current batched BLAS proposals and perform a number of experiments, focusing on a general matrix-matrix multiplication (GEMM), to explore their effect on performance. In particular, we analyze the effect of novel data layouts which, for example, interleave the matrices in memory to aid vectorization and prefetching of data. Utilizing these modifications, our code outperforms both MKL and cuBLAS by up to 6 times on the self-hosted Intel KNL (codenamed Knights Landing) and Kepler GPU architectures, for large numbers of double precision GEMM operations using matrices of size 2 {\texttimes} 2 to 20 {\texttimes} 20.},
  keywords = {Batched BLAS, BLAS, High-performance computing, Memory management, Parallel processing, Scientific computing},
  doi = {10.1016/j.procs.2017.05.138},
  author = {Jack Dongarra and Sven Hammarling and Nicholas J. Higham and Samuel Relton and Pedro Valero-Lara and Mawussi Zounon}
}
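The two batched BLAS entries around this point study interleaved data layouts that make the innermost loop run across the batch so it can be vectorized. The sketch below illustrates that idea for a batched matrix-matrix multiply, assuming a fully interleaved layout and the made-up names NMAT, BATCH, and gemm_batch_interleaved; the papers themselves use a block-interleaved variant with cache blocking and tuned kernels on top of this.

/* Fully interleaved layout for a batch of NMAT x NMAT matrices; the names
 * NMAT, BATCH and gemm_batch_interleaved are illustrative, not a library API. */
#include <stdlib.h>

#define NMAT 4                  /* small matrix dimension */
#define BATCH 10000             /* number of matrices in the batch */

/* Element (i, j) of matrix p: the batch index p has unit stride. */
#define IDX(i, j, p) (((i) * NMAT + (j)) * (size_t)BATCH + (p))

/* C := C + A * B for every matrix in the batch, interleaved storage. */
static void gemm_batch_interleaved(const double *A, const double *B, double *C) {
    for (int i = 0; i < NMAT; i++)
        for (int j = 0; j < NMAT; j++)
            for (int k = 0; k < NMAT; k++)
                for (int p = 0; p < BATCH; p++)   /* contiguous: vectorizes */
                    C[IDX(i, j, p)] += A[IDX(i, k, p)] * B[IDX(k, j, p)];
}

int main(void) {
    double *A = calloc((size_t)NMAT * NMAT * BATCH, sizeof *A);
    double *B = calloc((size_t)NMAT * NMAT * BATCH, sizeof *B);
    double *C = calloc((size_t)NMAT * NMAT * BATCH, sizeof *C);
    if (!A || !B || !C) return 1;

    for (int p = 0; p < BATCH; p++)               /* A = B = I as a sanity case */
        for (int i = 0; i < NMAT; i++)
            A[IDX(i, i, p)] = B[IDX(i, i, p)] = 1.0;

    gemm_batch_interleaved(A, B, C);              /* each C is now the identity */

    free(A); free(B); free(C);
    return 0;
}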
@conference{1170,
  title = {Optimized Batched Linear Algebra for Modern Architectures},
  booktitle = {Euro-Par 2017},
  year = {2017},
  month = {2017-08},
  publisher = {Springer},
  organization = {Springer},
  address = {Santiago de Compostela, Spain},
  abstract = {Solving large numbers of small linear algebra problems simultaneously is becoming increasingly important in many application areas. Whilst many researchers have investigated the design of efficient batch linear algebra kernels for GPU architectures, the common approach for many/multi-core CPUs is to use one core per subproblem in the batch. When solving batches of very small matrices, 2 {\texttimes} 2 for example, this design exhibits two main issues: it fails to fully utilize the vector units and the cache of modern architectures, since the matrices are too small. Our approach to resolve this is as follows: given a batch of small matrices spread throughout the primary memory, we first reorganize the elements of the matrices into a contiguous array, using a block interleaved memory format, which allows us to process the small independent problems as a single large matrix problem and enables cross-matrix vectorization. The large problem is solved using blocking strategies that attempt to optimize the use of the cache. The solution is then converted back to the original storage format. To explain our approach we focus on two BLAS routines: general matrix-matrix multiplication (GEMM) and the triangular solve (TRSM). We extend this idea to LAPACK routines using the Cholesky factorization and solve (POSV). Our focus is primarily on very small matrices ranging in size from 2 {\texttimes} 2 to 32 {\texttimes} 32. Compared to both MKL and OpenMP implementations, our approach can be up to 4 times faster for GEMM, up to 14 times faster for TRSM, and up to 40 times faster for POSV on the new Intel Xeon Phi processor, code-named Knights Landing (KNL). Furthermore, we discuss strategies to avoid data movement between sockets when using our interleaved approach on a NUMA node.},
  doi = {10.1007/978-3-319-64203-1_37},
  author = {Jack Dongarra and Sven Hammarling and Nicholas J. Higham and Samuel Relton and Mawussi Zounon}
}

@techreport{1173,
  title = {PLASMA 17 Performance Report},
  type = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-17-11},
  year = {2017},
  month = {2017-06},
  institution = {University of Tennessee},
  abstract = {PLASMA (Parallel Linear Algebra for Multicore Architectures) is a dense linear algebra package at the forefront of multicore computing. PLASMA is designed to deliver the highest possible performance from a system with multiple sockets of multicore processors. PLASMA achieves this objective by combining state-of-the-art solutions in parallel algorithms, scheduling, and software engineering. PLASMA currently offers a collection of routines for solving linear systems of equations and least squares problems.},
  author = {Maksims Abalenkovs and Negin Bagherpour and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Samuel Relton and Jakub Sistek and David Stevens and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Mawussi Zounon}
}
@techreport{1172,
  title = {PLASMA 17.1 Functionality Report},
  type = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-17-10},
  year = {2017},
  month = {2017-06},
  institution = {University of Tennessee},
  abstract = {PLASMA (Parallel Linear Algebra for Multicore Architectures) is a dense linear algebra package at the forefront of multicore computing. PLASMA is designed to deliver the highest possible performance from a system with multiple sockets of multicore processors. PLASMA achieves this objective by combining state-of-the-art solutions in parallel algorithms, scheduling, and software engineering. PLASMA currently offers a collection of routines for solving linear systems of equations and least squares problems.},
  author = {Maksims Abalenkovs and Negin Bagherpour and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Samuel Relton and Jakub Sistek and David Stevens and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan and Mawussi Zounon}
}

@article{1343,
  title = {A Standard for Batched BLAS Routines},
  year = {2016},
  month = {2016-04},
  publisher = {17th SIAM Conference on Parallel Processing for Scientific Computing (SIAM PP16)},
  address = {Paris, France},
  author = {Pedro Valero-Lara and Jack Dongarra and Azzam Haidar and Samuel D. Relton and Stanimire Tomov and Mawussi Zounon}
}
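Several entries in this list describe a group-based Batched BLAS interface in which matrices of equal size form a group and each group carries one set of sizes and scalars. The reference loop below sketches those semantics in plain C for a column-major, non-transposed double-precision GEMM; the function name dgemm_batch_sketch and its exact argument order are assumptions made for illustration, not the signature fixed by the published specification.

/* Reference semantics only: the name and argument order are illustrative,
 * not the interface fixed by the Batched BLAS specification. */
#include <stddef.h>

/* For group g and problem p in that group (column-major, no transposes):
 *   C := alpha[g] * A * B + beta[g] * C,   with sizes m[g] x n[g] x k[g]. */
static void dgemm_batch_sketch(int group_count, const int *group_size,
                               const int *m, const int *n, const int *k,
                               const double *alpha,
                               const double *const *a, const int *lda,
                               const double *const *b, const int *ldb,
                               const double *beta,
                               double *const *c, const int *ldc)
{
    size_t idx = 0;                    /* flat index over all problems */
    for (int g = 0; g < group_count; g++)
        for (int p = 0; p < group_size[g]; p++, idx++) {
            const double *A = a[idx], *B = b[idx];
            double *C = c[idx];
            for (int j = 0; j < n[g]; j++)
                for (int i = 0; i < m[g]; i++) {
                    double s = 0.0;
                    for (int l = 0; l < k[g]; l++)
                        s += A[i + (size_t)l * lda[g]] * B[l + (size_t)j * ldb[g]];
                    C[i + (size_t)j * ldc[g]] = alpha[g] * s
                                              + beta[g] * C[i + (size_t)j * ldc[g]];
                }
        }
}

int main(void) {
    /* One group of two 2 x 2 problems, C := 1.0 * A * B + 0.0 * C. */
    double A0[4] = {1, 0, 0, 1}, B0[4] = {1, 2, 3, 4}, C0[4] = {0, 0, 0, 0};
    double A1[4] = {2, 0, 0, 2}, B1[4] = {1, 1, 1, 1}, C1[4] = {0, 0, 0, 0};
    const double *a[] = {A0, A1}, *b[] = {B0, B1};
    double *c[] = {C0, C1};
    int group_size[] = {2}, m[] = {2}, n[] = {2}, k[] = {2};
    int lda[] = {2}, ldb[] = {2}, ldc[] = {2};
    double alpha[] = {1.0}, beta[] = {0.0};

    /* After the call, C0 equals B0 (A0 is the identity) and C1 equals 2 * B1. */
    dgemm_batch_sketch(1, group_size, m, n, k, alpha,
                       a, lda, b, ldb, beta, c, ldc);
    return 0;
}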