@inproceedings {, title = {Addressing Irregular Patterns of Matrix Computations on GPUs and Their Impact on Applications Powered by Sparse Direct Solvers}, journal = {2022 International Conference for High Performance Computing, Networking, Storage and Analysis (SC22)}, year = {2022}, month = {2022-11}, pages = {354-367}, publisher = {IEEE Computer Society}, address = {Dallas, TX}, abstract = {Many scientific applications rely on sparse direct solvers for their numerical robustness. However, performance optimization for these solvers remains a challenging task, especially on GPUs. This is due to workloads of small dense matrices that differ in size. Matrix decompositions on such irregular workloads are rarely addressed on GPUs. This paper addresses irregular workloads of matrix computations on GPUs, and their application to accelerate sparse direct solvers. We design an interface for the basic matrix operations supporting problems of different sizes. The interface enables us to develop irrLU-GPU, an LU decomposition on matrices of different sizes. We demonstrate the impact of irrLU-GPU on sparse direct LU solvers using NVIDIA and AMD GPUs. Experimental results are shown for a sparse direct solver based on a multifrontal sparse LU decomposition applied to linear systems arising from the simulation, using finite element discretization on unstructured meshes, of a high-frequency indefinite Maxwell problem.}, keywords = {GPU computing, irregular computational workloads, LU factorization, multifrontal solvers, sparse direct solvers}, url = {https://dl.acm.org/doi/abs/10.5555/3571885.3571919}, author = {Ahmad Abdelfattah and Pieter Ghysels and Wajih Boukaram and Stanimire Tomov and Xiaoye Sherry Li and Jack Dongarra} } @techreport {, title = {Analysis of the Communication and Computation Cost of FFT Libraries towards Exascale}, journal = {ICL Technical Report}, number = {ICL-UT-22-07}, year = {2022}, month = {2022-07}, publisher = {Innovative Computing Laboratory}, author = {Alan Ayala and Stanimire Tomov and Piotr Luszczek and Sebastien Cayrols and Gerald Ragghianti and Jack Dongarra} } @inbook {, title = {Batch QR Factorization on GPUs: Design, Optimization, and Tuning}, booktitle = {Lecture Notes in Computer Science}, volume = {13350}, year = {2022}, month = {2022-06}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Cham}, abstract = {QR factorization of dense matrices is a ubiquitous tool in high performance computing (HPC). From solving linear systems and least squares problems to eigenvalue problems and singular value decompositions, the impact of a high performance QR factorization is fundamental to computer simulations and many applications. More importantly, the QR factorization on a batch of relatively small matrices has attracted a lot of attention in sparse direct solvers and low-rank approximations for hierarchical matrices. To address this interest and demand, we developed and present a high performance batch QR factorization for Graphics Processing Units (GPUs). We present a multi-level blocking strategy that adjusts various algorithmic designs to the size of the input matrices. We also show that following the LAPACK QR design convention, while still useful, is significantly outperformed by unconventional code structures that increase data reuse. 
The performance results show multi-fold speedups against the state-of-the-art libraries on the latest GPU architectures from both NVIDIA and AMD.}, keywords = {Batch linear algebra, GPU computing, QR factorization}, isbn = {978-3-031-08750-9}, doi = {10.1007/978-3-031-08751-6_5}, url = {https://link.springer.com/chapter/10.1007/978-3-031-08751-6_5}, author = {Abdelfattah, Ahmad and Stanimire Tomov and Dongarra, Jack}, editor = {Groen, Derek and de Mulatier, C{\'e}lia and Paszy{\'n}ski, Maciej and Krzhizhanovskaya, Valeria V. and Dongarra, Jack J. and Sloot, Peter M. A.} } @conference {, title = {Extending MAGMA Portability with OneAPI}, booktitle = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC22), Ninth Workshop on Accelerator Programming Using Directives (WACCPD 2022)}, year = {2022}, month = {2022-11}, address = {Dallas, TX}, author = {Anna Fortenberry and Stanimire Tomov} } @article {, title = {Extending MAGMA Portability with OneAPI}, year = {2022}, month = {2022-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC22), ACM Student Research Competition}, address = {Dallas, TX}, url = {https://sc22.supercomputing.org/proceedings/src_poster/poster_files/spostu105s3-file1.pdf}, author = {Anna Fortenberry and Stanimire Tomov and Kwai Wong} } @techreport {, title = {FFT Benchmark Performance Experiments on Systems Targeting Exascale}, journal = {ICL Technical Report}, number = {ICL-UT-22-02}, year = {2022}, month = {2022-03}, author = {Alan Ayala and Stanimire Tomov and Piotr Luszczek and Sebastien Cayrols and Gerald Ragghianti and Jack Dongarra} } @inproceedings {, title = {Lossy all-to-all exchange for accelerating parallel 3-D FFTs on hybrid architectures with GPUs}, journal = {2022 IEEE International Conference on Cluster Computing (CLUSTER)}, year = {2022}, month = {2022-09}, pages = {152-160}, abstract = {In the context of parallel applications, communication is a critical part of the infrastructure and a potential bottleneck. The traditional approach to tackle communication challenges consists of redesigning algorithms so that the complexity or the communication volume is reduced. However, there are algorithms like the Fast Fourier Transform (FFT) where reducing the volume of communication is very challenging yet can reap large benefits in terms of time-to-completion. In this paper, we revisit the implementation of the MPI all-to-all routine at the core of 3D FFTs by using advanced MPI features, such as One-Sided Communication, and integrate data compression during communication to reduce the volume of data exchanged. Since some compression techniques are {\textquoteleft}lossy{\textquoteright} in the sense that they involve a loss of accuracy, we study the impact of lossy compression in heFFTe, the state-of-the-art FFT library for large-scale 3D FFTs on hybrid architectures with GPUs. Consequently, we design an approximate FFT algorithm that trades off user-controlled accuracy for speed. We show that we speed up the 3D FFTs proportionally to the compression rate. 
In terms of accuracy, comparing our approach with a reduced precision execution, where both the data and the computation are in reduced precision, we show that when the volume of communication is compressed to the size of the reduced precision data, the approximate FFT algorithm is as fast as the one in reduced precision while the accuracy is one order of magnitude better.}, doi = {10.1109/CLUSTER51413.2022.00029}, author = {Cayrols, Sebastien and Li, Jiali and George Bosilca and Stanimire Tomov and Ayala, Alan and Dongarra, Jack} } @techreport {, title = {Mixed precision and approximate 3D FFTs: Speed for accuracy trade-off with GPU-aware MPI and run-time data compression}, journal = {ICL Technical Report}, number = {ICL-UT-22-04}, year = {2022}, month = {2022-05}, keywords = {All-to-all, Approximate FFTs, ECP, heFFTe, Lossy compression, mixed-precision algorithms, MPI}, author = {Sebastien Cayrols and Jiali Li and George Bosilca and Stanimire Tomov and Alan Ayala and Jack Dongarra} } @techreport {, title = {PAQR: Pivoting Avoiding QR factorization}, journal = {ICL Technical Report}, number = {ICL-UT-22-06}, year = {2022}, month = {2022-06}, abstract = {The solution of linear least-squares problems is at the heart of many scientific and engineering applications. While any method able to minimize the backward error of such problems is considered numerically stable, the theory states that the forward error depends on the condition number of the matrix in the system of equations. On the one hand, the QR factorization is an efficient method to solve such problems, but the solutions it produces may have large forward errors when the matrix is deficient. On the other hand, QR with column pivoting (QRCP) is able to produce smaller forward errors on deficient matrices, but its cost is prohibitive compared to QR. The aim of this paper is to propose PAQR, an alternative solution method with the same (or smaller) cost as QR and as accurate as QRCP in practical cases, for the solution of rank-deficient linear least-squares problems. After presenting the algorithm and its implementations on different architectures, we compare its accuracy and performance results on a variety of application problems.}, author = {Wissam M. Sid-Lakhdar and Sebastien Cayrols and Daniel Bielich and Ahmad Abdelfattah and Piotr Luszczek and Mark Gates and Stanimire Tomov and Hans Johansen and David Williams-Young and Timothy A. 
Davis and Jack Dongarra} } @conference {, title = {Performance Analysis of Parallel FFT on Large Multi-GPU Systems}, booktitle = {2022 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2022}, month = {2022-08}, publisher = {IEEE}, organization = {IEEE}, address = {Lyon, France}, doi = {10.1109/IPDPSW55747.2022.00072}, url = {https://ieeexplore.ieee.org/document/9835388/}, author = {Ayala, Alan and Stanimire Tomov and Stoyanov, Miroslav and Haidar, Azzam and Dongarra, Jack} } @conference {, title = {A Python Library for Matrix Algebra on GPU and Multicore Architectures}, booktitle = {2022 IEEE 19th International Conference on Mobile Ad Hoc and Smart Systems (MASS)}, year = {2022}, month = {2022-12}, publisher = {IEEE}, organization = {IEEE}, address = {Denver, CO}, doi = {10.1109/MASS56207.2022.00121}, url = {https://ieeexplore.ieee.org/document/9973474/}, author = {Nance, Delario and Stanimire Tomov and Wong, Kwai} } @article {, title = {Accelerating FFT towards Exascale Computing}, year = {2021}, publisher = {NVIDIA GPU Technology Conference (GTC2021)}, author = {Alan Ayala and Stanimire Tomov and Haidar, Azzam and Stoyanov, M. and Cayrols, Sebastien and Li, Jiali and George Bosilca and Jack Dongarra} } @article {, title = {Efficient exascale discretizations: High-order finite element methods}, journal = {The International Journal of High Performance Computing Applications}, year = {2021}, pages = {10943420211020803}, abstract = {Efficient exploitation of exascale architectures requires rethinking of the numerical algorithms used in many large-scale applications. These architectures favor algorithms that expose ultra-fine-grain parallelism and maximize the ratio of floating point operations to energy-intensive data movement. One of the few viable approaches to achieve high efficiency in the area of PDE discretizations on unstructured grids is to use matrix-free/partially assembled high-order finite element methods, since these methods can increase the accuracy and/or lower the computational time due to reduced data motion. In this paper we provide an overview of the research and development activities in the Center for Efficient Exascale Discretizations (CEED), a co-design center in the Exascale Computing Project that is focused on the development of next-generation discretization software and algorithms to enable a wide range of finite element applications to run efficiently on future hardware. CEED is a research partnership involving more than 30 computational scientists from two US national labs and five universities, including members of the Nek5000, MFEM, MAGMA and PETSc projects. We discuss the CEED co-design activities based on targeted benchmarks, miniapps and discretization libraries and our work on performance optimizations for large-scale GPU architectures. 
We also provide a broad overview of research and development activities in areas such as unstructured adaptive mesh refinement algorithms, matrix-free linear solvers, high-order data visualization, and we list examples of collaborations with several ECP and external applications.}, keywords = {co-design, high-order discretizations, High-performance computing, PDEs, unstructured grids}, doi = {10.1177/10943420211020803}, author = {Kolev, Tzanio and Fischer, Paul and Min, Misun and Jack Dongarra and Brown, Jed and Dobrev, Veselin and Warburton, Tim and Stanimire Tomov and Shephard, Mark S and Abdelfattah, Ahmad and others} } @article {, title = {Exploiting Block Structures of KKT Matrices for Efficient Solution of Convex Optimization Problems}, journal = {IEEE Access}, year = {2021}, doi = {10.1109/ACCESS.2021.3106054}, author = {Iqbal, Zafar and Nooshabadi, Saeid and Yamazaki, Ichitaro and Stanimire Tomov and Jack Dongarra} } @techreport {, title = {Interim Report on Benchmarking FFT Libraries on High Performance Systems}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-21-03}, year = {2021}, month = {2021-07}, publisher = {University of Tennessee}, type = {ICL Tech Report}, abstract = {The Fast Fourier Transform (FFT) is used in many applications such as molecular dynamics, spectrum estimation, fast convolution and correlation, signal modulation, and many wireless multimedia applications. FFTs are also heavily used in ECP applications, such as EXAALT, Copa, ExaSky-HACC, ExaWind, WarpX, and many others. As these applications{\textquoteright} accuracy and speed depend on the performance of the FFTs, we designed an FFT benchmark to measure performance and scalability of currently available FFT packages and present the results from a pre-Exascale platform. Our benchmarking also stresses the overall capacity of system interconnect; thus, it may be considered an indicator of the bisection bandwidth, communication contention noise, and the software overheads in MPI collectives that are of interest to many other ECP applications and libraries. This FFT benchmarking project aims to show the strengths and weaknesses of multiple FFT libraries and to indicate what can be done to improve their performance. In particular, we believe that the benchmarking results could help design and implement a fast and robust FFT library for 2D and 3D inputs, while targeting large-scale heterogeneous systems with multicore processors and hardware accelerators that are co-designed in tandem with ECP applications. Our work involves studying and analyzing state-of-the-art FFT software both from vendors and available as open-source codes to better understand their performance.}, author = {Alan Ayala and Stanimire Tomov and Piotr Luszczek and Cayrols, Sebastien and Ragghianti, Gerald and Jack Dongarra} } @article {, title = {libCEED: Fast algebra for high-order element-based discretizations}, journal = {Journal of Open Source Software}, volume = {6}, number = {63}, year = {2021}, pages = {2945}, abstract = {Finite element methods are widely used to solve partial differential equations (PDE) in science and engineering, but their standard implementation (Arndt et al., 2020; Kirk et al., 2006; Logg et al., 2012) relies on assembling sparse matrices. Sparse matrix multiplication and triangular operations perform a scalar multiply and add for each nonzero entry, just 2 floating point operations (flops) per scalar that must be loaded from memory (Williams et al., 2009). 
Modern hardware is capable of nearly 100 flops per scalar streamed from memory (Rupp, 2020), so sparse matrix operations cannot achieve more than about 2\% utilization of arithmetic units. Matrix assembly becomes even more problematic when the polynomial degree p of the basis functions is increased, resulting in $O(p^d)$ storage and $O(p^{2d})$ compute per degree of freedom (DoF) in d dimensions. Methods pioneered by the spectral element community (Deville et al., 2002; Orszag, 1980) exploit problem structure to reduce costs to $O(1)$ storage and $O(p)$ compute per DoF, with very high utilization of modern CPUs and GPUs. Unfortunately, high-quality implementations have been relegated to applications and intrusive frameworks that are often difficult to extend to new problems or incorporate into legacy applications, especially when strong preconditioners are required. libCEED, the Code for Efficient Extensible Discretization (Abdelfattah et al., 2021), is a lightweight library that provides a purely algebraic interface for linear and nonlinear operators and preconditioners with element-based discretizations. libCEED provides portable performance via run-time selection of implementations optimized for CPUs and GPUs, including support for just-in-time (JIT) compilation. It is designed for convenient use in new and legacy software, and offers interfaces in C99 (International Standards Organisation, 1999), Fortran77 (ANSI, 1978), Python (Python, 2021), Julia (Bezanson et al., 2017), and Rust (Rust, 2021). Users and library developers can integrate libCEED at a low level into existing applications in place of existing matrix-vector products without significant refactoring of their own discretization infrastructure. Alternatively, users can utilize integrated libCEED support in MFEM (Anderson et al., 2020; MFEM, 2021). 
In addition to supporting applications and discretization libraries, libCEED provides a platform for performance engineering and co-design, as well as an algebraic interface for solvers research like adaptive p-multigrid, much like how sparse matrix libraries enable development and deployment of algebraic multigrid solvers.}, keywords = {finite elements, high-order methods, High-performance computing, matrix-free, spectral elements}, doi = {10.21105/joss.02945}, url = {https://doi.org/10.21105/joss.02945}, author = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jean-Sylvain Camier and Veselin Dobrev and Yohann Dudouit and Leila Ghaffari and Tzanio Kolev and David Medina and Will Pazner and Thilina Ratnayaka and Jeremy Thompson and Stanimire Tomov} } @article {, title = {Linear Algebra Preparation for Emergent Neural Network Architectures: MAGMA, BLAS, and Batched GPU Computing}, year = {2021}, month = {2021-11}, publisher = {LAPENNA Workshop}, address = {Virtual}, author = {Stanimire Tomov and Kwai Wong and Rocco Febbo and Julian Halloy} } @article {, title = {MAGMA: Evolution and Revolution}, year = {2021}, month = {2021-07}, publisher = {ICL Lunch Talk Seminar}, address = {Knoxville, TN}, author = {Stanimire Tomov} } @techreport {, title = {A More Portable HeFFTe: Implementing a Fallback Algorithm for Scalable Fourier Transforms}, journal = {ICL Technical Report}, number = {ICL-UT-21-04}, year = {2021}, note = {accepted at HPEC{\textquoteright}21}, month = {2021-08}, publisher = {University of Tennessee}, type = {ICL Tech Report}, author = {Daniel Sharp and Miroslav Stoyanov and Stanimire Tomov and Jack Dongarra} } @inproceedings {, title = {Scalability Issues in FFT Computation}, journal = {International Conference on Parallel Computing Technologies}, year = {2021}, pages = {279{\textendash}287}, publisher = {Springer}, abstract = {The fast Fourier transform (FFT) is one of the most important tools in mathematics, and it is widely required by several applications of science and engineering. State-of-the-art parallel implementations of the FFT algorithm, based on Cooley-Tukey developments, are known to be communication-bound, which causes critical issues when scaling the computational and architectural capabilities. In this paper, we study the main performance bottleneck of FFT computations on hybrid CPU and GPU systems at large scale. We provide numerical simulations and potential acceleration techniques that can be easily integrated into FFT distributed libraries. We present different experiments on performance scalability and runtime analysis on the world{\textquoteright}s most powerful supercomputers today: Summit, using up to 6,144 NVIDIA V100 GPUs, and Fugaku, using more than one million Fujitsu A64FX cores.}, keywords = {Hybrid systems, Parallel FFT, scalability}, isbn = {978-3-030-86359-3}, doi = {10.1007/978-3-030-86359-3_21}, author = {Alan Ayala and Stanimire Tomov and Stoyanov, Miroslav and Jack Dongarra} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms and LAPACK Routines}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {47}, number = {3}, year = {2021}, pages = {1{\textendash}23}, abstract = {This article describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. 
The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular, half precision is used in many very large-scale applications, such as those associated with machine learning.}, keywords = {Computations on matrices, Mathematical analysis, Mathematics of computing, Numerical analysis}, doi = {10.1145/3431921}, author = {Abdelfattah, Ahmad and Costa, Timothy and Jack Dongarra and Mark Gates and Haidar, Azzam and Hammarling, Sven and Higham, Nicholas J and Kurzak, Jakub and Piotr Luszczek and Stanimire Tomov and others} } @article {, title = {Translational process: Mathematical software perspective}, journal = {Journal of Computational Science}, volume = {52}, year = {2021}, pages = {101216}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, DATAFLOW scheduling runtimes, hardware accelerators}, doi = {10.1016/j.jocs.2020.101216}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {1465, title = {Asynchronous SGD for DNN Training on Shared-Memory Parallel Architectures}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-04}, year = {2020}, month = {2020-03}, publisher = {University of Tennessee, Knoxville}, abstract = {We present a parallel asynchronous Stochastic Gradient Descent algorithm for shared memory architectures. Unlike previous asynchronous algorithms, we consider the case where the gradient updates are not particularly sparse. In the context of the MagmaDNN framework, we compare the parallel efficiency of the asynchronous implementation with that of the traditional synchronous implementation. 
Tests are performed for training deep neural networks on multicore CPUs and GPU devices.}, keywords = {Asynchronous iterative methods, Deep learning, gpu, multicore CPU, Stochastic Gradient Descent}, author = {Florent Lopez and Edmond Chow and Stanimire Tomov and Jack Dongarra} } @conference {1485, title = {Asynchronous SGD for DNN Training on Shared-Memory Parallel Architectures}, booktitle = {Workshop on Scalable Deep Learning over Parallel And Distributed Infrastructures (ScaDL 2020)}, year = {2020}, month = {2020-05}, author = {Florent Lopez and Edmond Chow and Stanimire Tomov and Jack Dongarra} } @techreport {, title = {CEED ECP Milestone Report: Improve Performance and Capabilities of CEED-Enabled ECP Applications on Summit/Sierra}, journal = {ECP Milestone Reports}, year = {2020}, month = {2020-05}, publisher = {Zenodo}, doi = {https://doi.org/10.5281/zenodo.3860804}, url = {https://doi.org/10.5281/zenodo.3860804}, author = {Kolev, Tzanio and Fischer, Paul and Abdelfattah, Ahmad and Ananthan, Shreyas and Valeria Barra and Natalie Beams and Bleile, Ryan and Brown, Jed and Carson, Robert and Camier, Jean-Sylvain and Churchfield, Matthew and Dobrev, Veselin and Jack Dongarra and Dudouit, Yohann and Karakus, Ali and Kerkemeier, Stefan and Lan, YuHsiang and Medina, David and Merzari, Elia and Min, Misun and Parker, Scott and Ratnayaka, Thilina and Smith, Cameron and Sprague, Michael and Stitt, Thomas and Thompson, Jeremy and Tomboulides, Ananias and Stanimire Tomov and Tomov, Vladimir and Vargas, Arturo and Warburton, Tim and Weiss, Kenneth} } @article {, title = {Clover: Computational Libraries Optimized via Exascale Research}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Mark Gates and Stanimire Tomov and Hartwig Anzt and Piotr Luszczek and Jack Dongarra} } @techreport {, title = {Design, Optimization, and Benchmarking of Dense Linear Algebra Algorithms on AMD GPUs}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-12}, year = {2020}, month = {2020-08}, publisher = {University of Tennessee}, abstract = {Dense linear algebra (DLA) has historically been in the vanguard of software that must be adapted first to hardware changes. This is because DLA is both critical to the accuracy and performance of so many different types of applications, and because DLA routines have proved to be outstanding vehicles for finding and implementing solutions to the problems that novel architectures pose. Therefore, in this paper we investigate the portability of the MAGMA DLA library to the latest AMD GPUs. We use automated tools to convert the CUDA code in MAGMA to the Heterogeneous-Computing Interface for Portability (HIP) language. MAGMA provides LAPACK for GPUs and benchmarks for fundamental DLA routines ranging from BLAS to dense factorizations, linear systems and eigen-problem solvers. We port these routines to HIP and quantify currently achievable performance through the MAGMA benchmarks for the main workload algorithms on MI25 and MI50 AMD GPUs. 
Comparisons with performance roofline models and theoretical expectations are used to identify current limitations and directions for future improvements.}, keywords = {AMD GPUs, GPU computing, HIP Runtime, HPC, numerical linear algebra, Portability}, author = {Cade Brown and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {, title = {Design, Optimization, and Benchmarking of Dense Linear Algebra Algorithms on AMD GPUs}, booktitle = {2020 IEEE High Performance Extreme Computing Virtual Conference}, year = {2020}, month = {2020-09}, publisher = {IEEE}, organization = {IEEE}, abstract = {Dense linear algebra (DLA) has historically been in the vanguard of software that must be adapted first to hardware changes. This is because DLA is both critical to the accuracy and performance of so many different types of applications, and because DLA routines have proved to be outstanding vehicles for finding and implementing solutions to the problems that novel architectures pose. Therefore, in this paper we investigate the portability of the MAGMA DLA library to the latest AMD GPUs. We use automated tools to convert the CUDA code in MAGMA to the Heterogeneous-Computing Interface for Portability (HIP) language. MAGMA provides LAPACK for GPUs and benchmarks for fundamental DLA routines ranging from BLAS to dense factorizations, linear systems and eigen-problem solvers. We port these routines to HIP and quantify currently achievable performance through the MAGMA benchmarks for the main workload algorithms on MI25 and MI50 AMD GPUs. Comparisons with performance roofline models and theoretical expectations are used to identify current limitations and directions for future improvements.}, author = {Cade Brown and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @techreport {1461, title = {FFT-ECP API and High-Performance Library Prototype for 2-D and 3-D FFTs on Large-Scale Heterogeneous Systems with GPUs}, journal = {ECP Milestone Report}, number = {FFT-ECP STML13-27}, year = {2020}, note = {revision 01-2020}, month = {2020-01}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {ECP WBS 2.3.3.13 Milestone Report}, author = {Stanimire Tomov and Alan Ayala and Azzam Haidar and Jack Dongarra} } @conference {1481, title = {heFFTe: Highly Efficient FFT for Exascale}, booktitle = {International Conference on Computational Science (ICCS 2020)}, year = {2020}, month = {2020-06}, address = {Amsterdam, Netherlands}, abstract = {Exascale computing aspires to meet the increasing demands from large scientific applications. Software targeting exascale is typically designed for heterogeneous architectures; hence, it is not only important to develop well-designed software, but also to make it aware of the hardware architecture and efficiently exploit its power. Currently, several diverse applications, such as those that are part of the Exascale Computing Project (ECP) in the United States, rely on efficient computation of the Fast Fourier Transform (FFT). In this context, we present the design and implementation of the heFFTe (Highly Efficient FFT for Exascale) library, which targets the upcoming exascale supercomputers. We provide highly (linearly) scalable GPU kernels that achieve more than 40{\texttimes} speedup with respect to local kernels from CPU state-of-the-art libraries, and over 2{\texttimes} speedup for the whole FFT computation. A communication model for parallel FFTs is also provided to analyze the bottleneck for large-scale problems. 
We show experiments obtained on the Summit supercomputer at Oak Ridge National Laboratory, using up to 24,576 IBM Power9 cores and 6,144 NVIDIA V100 GPUs.}, keywords = {exascale, FFT, gpu, scalable algorithm}, doi = {https://doi.org/10.1007/978-3-030-50371-0_19}, author = {Alan Ayala and Stanimire Tomov and Azzam Haidar and Jack Dongarra} } @article {, title = {heFFTe: Highly Efficient FFT for Exascale (Poster)}, year = {2020}, month = {2020-10}, publisher = {NVIDIA GPU Technology Conference (GTC2020)}, author = {Alan Ayala and Stanimire Tomov and Azzam Haidar and Jack Dongarra} } @article {, title = {heFFTe: Highly Efficient FFT for Exascale (Poster)}, year = {2020}, month = {2020-02}, publisher = {SIAM Conference on Parallel Processing for Scientific Computing (SIAM PP20)}, address = {Seattle, WA}, abstract = {Considered one of the top 10 algorithms of the 20th century, the Fast Fourier Transform (FFT) is widely used by applications in science and engineering. Large-scale parallel applications targeting exascale, such as those that are part of the DOE Exascale Computing Project (ECP), are designed for heterogeneous architectures and, currently, more than a dozen ECP applications use FFTs in their codes. To address the applications{\textquoteright} needs, we developed the highly efficient FFTs for exascale (heFFTe) library. The heFFTe library release features very good weak and strong scalability and performance that is close to 90\% of the roofline peak performance. We present these performance results on the Summit supercomputer. heFFTe is also integrated in a number of applications and we present how the overall performance is improved by using heFFTe. A performance model, limitations, and challenges are discussed for current and upcoming computer architectures.}, author = {Alan Ayala and Stanimire Tomov and Azzam Haidar and Jack Dongarra} } @article {, title = {heFFTe: Highly Efficient FFT for Exascale (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Alan Ayala and Stanimire Tomov and Jack Dongarra and Azzam Haidar} } @conference {, title = {High-Order Finite Element Method using Standard and Device-Level Batch GEMM on GPUs}, booktitle = {2020 IEEE/ACM 11th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA)}, year = {2020}, month = {2020-11}, publisher = {IEEE}, organization = {IEEE}, abstract = {We present new GPU implementations of the tensor contractions arising from basis-related computations for high-order finite element methods. We consider both tensor and non-tensor bases. In the case of tensor bases, we introduce new kernels based on a series of fused device-level matrix multiplications (GEMMs), specifically designed to utilize the fast memory of the GPU. For non-tensor bases, we develop a tuned framework for choosing standard batch-BLAS GEMMs that will maximize performance across groups of elements. The implementations are included in a backend of the libCEED library. We present benchmark results for the diffusion and mass operators using libCEED integration through the MFEM finite element library and compare to those of the previously best-performing GPU backends for stand-alone basis computations. In tensor cases, we see improvements of approximately 10-30\% for some cases, particularly for higher basis orders. 
For the non-tensor tests, the new batch-GEMMs implementation is twice as fast as what was previously available for basis function order greater than five and greater than approximately $10^5$ degrees of freedom in the mesh; up to ten times speedup is seen for eighth-order basis functions.}, keywords = {Batched linear algebra, finite elements, gpu, high-order methods, matrix-free FEM, Tensor contractions}, author = {Natalie Beams and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra and Tzanio Kolev and Yohann Dudouit} } @booklet {, title = {hipMAGMA v1.0}, year = {2020}, month = {2020-03}, publisher = {Zenodo}, doi = {10.5281/zenodo.3908549}, url = {https://doi.org/10.5281/zenodo.3908549}, author = {Cade Brown and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @booklet {, title = {hipMAGMA v2.0}, year = {2020}, month = {2020-07}, publisher = {Zenodo}, doi = {10.5281/zenodo.3928667}, url = {https://doi.org/10.5281/zenodo.3928667}, author = {Cade Brown and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {, title = {How to Build Your Own Deep Neural Network}, year = {2020}, month = {2020-07}, publisher = {PEARC20}, keywords = {AI, Deep Neural Networks, dense linear algebra, HPC, ML}, author = {Kwai Wong and Stanimire Tomov and Daniel Nichols and Rocco Febbo and Florent Lopez and Julian Halloy and Xianfeng Ma} } @article {, title = {Integrating Deep Learning in Domain Science at Exascale (MagmaDNN)}, year = {2020}, month = {2020-12}, publisher = {DOD HPCMP seminar}, address = {virtual}, abstract = {We will present some of the current challenges in the design and integration of deep learning AI with traditional HPC simulations. We evaluate existing packages for readiness to run deep learning models and applications efficiently on large-scale HPC systems, identify challenges, and propose new asynchronous parallelization and optimization techniques for current large-scale heterogeneous systems and upcoming exascale systems. These developments, along with existing HPC AI software capabilities, have been integrated in MagmaDNN, an open source HPC deep learning framework. Many deep learning frameworks are targeted towards data scientists and fall short in providing quality integration into existing HPC workflows. This paper discusses the necessities of an HPC deep learning framework and how these can be provided, e.g., as in MagmaDNN, through a deep integration with existing HPC libraries such as MAGMA and its modular memory management, MPI, CuBLAS, CuDNN, MKL, and HIP. Advancements are also illustrated through the use of algorithmic enhancements in reduced- and mixed-precision and asynchronous optimization methods. Finally, we present illustrations and potential solutions for enhancing traditional compute- and data-intensive applications at ORNL and UTK with AI. 
The approaches and future challenges are illustrated in materials science, imaging, and climate applications.}, author = {Stanimire Tomov and Kwai Wong and Jack Dongarra and Rick Archibald and Edmond Chow and Eduardo D{\textquoteright}Azevedo and Markus Eisenbach and Rocco Febbo and Florent Lopez and Daniel Nichols and Junqi Yin} } @techreport {, title = {Integrating Deep Learning in Domain Sciences at Exascale}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-10}, year = {2020}, month = {2020-08}, publisher = {University of Tennessee}, abstract = {This paper presents some of the current challenges in designing deep learning artificial intelligence (AI) and integrating it with traditional high-performance computing (HPC) simulations. We evaluate existing packages for their ability to run deep learning models and applications on large-scale HPC systems efficiently, identify challenges, and propose new asynchronous parallelization and optimization techniques for current large-scale heterogeneous systems and upcoming exascale systems. These developments, along with existing HPC AI software capabilities, have been integrated into MagmaDNN, an open-source HPC deep learning framework. Many deep learning frameworks are targeted at data scientists and fall short in providing quality integration into existing HPC workflows. This paper discusses the necessities of an HPC deep learning framework and how those needs can be provided (e.g., as in MagmaDNN) through a deep integration with existing HPC libraries, such as MAGMA and its modular memory management, MPI, CuBLAS, CuDNN, MKL, and HIP. Advancements are also illustrated through the use of algorithmic enhancements in reduced- and mixed-precision, as well as asynchronous optimization methods. Finally, we present illustrations and potential solutions for enhancing traditional compute- and data-intensive applications at ORNL and UTK with AI. The approaches and future challenges are illustrated in materials science, imaging, and climate applications.}, author = {Rick Archibald and Edmond Chow and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Markus Eisenbach and Rocco Febbo and Florent Lopez and Daniel Nichols and Stanimire Tomov and Kwai Wong and Junqi Yin} } @conference {, title = {Integrating Deep Learning in Domain Sciences at Exascale}, booktitle = {2020 Smoky Mountains Computational Sciences and Engineering Conference (SMC 2020)}, year = {2020}, month = {2020-08}, abstract = {This paper presents some of the current challenges in designing deep learning artificial intelligence (AI) and integrating it with traditional high-performance computing (HPC) simulations. We evaluate existing packages for their ability to run deep learning models and applications on large-scale HPC systems efficiently, identify challenges, and propose new asynchronous parallelization and optimization techniques for current large-scale heterogeneous systems and upcoming exascale systems. These developments, along with existing HPC AI software capabilities, have been integrated into MagmaDNN, an open-source HPC deep learning framework. Many deep learning frameworks are targeted at data scientists and fall short in providing quality integration into existing HPC workflows. This paper discusses the necessities of an HPC deep learning framework and how those needs can be provided (e.g., as in MagmaDNN) through a deep integration with existing HPC libraries, such as MAGMA and its modular memory management, MPI, CuBLAS, CuDNN, MKL, and HIP. 
Advancements are also illustrated through the use of algorithmic enhancements in reduced- and mixed-precision, as well as asynchronous optimization methods. Finally, we present illustrations and potential solutions for enhancing traditional compute- and data-intensive applications at ORNL and UTK with AI. The approaches and future challenges are illustrated in materials science, imaging, and climate applications.}, author = {Rick Archibald and Edmond Chow and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Markus Eisenbach and Rocco Febbo and Florent Lopez and Daniel Nichols and Stanimire Tomov and Kwai Wong and Junqi Yin} } @conference {1480, title = {Investigating the Benefit of FP16-Enabled Mixed-Precision Solvers for Symmetric Positive Definite Matrices using GPUs}, booktitle = {International Conference on Computational Science (ICCS 2020)}, year = {2020}, month = {2020-06}, publisher = {Springer, Cham}, organization = {Springer, Cham}, address = {Amsterdam, Netherlands}, abstract = {Half-precision computation refers to performing floating-point operations in a 16-bit format. While half-precision has been driven largely by machine learning applications, recent algorithmic advances in numerical linear algebra have discovered beneficial use cases for half precision in accelerating the solution of linear systems of equations at higher precisions. In this paper, we present a high-performance, mixed-precision linear solver (Ax = b) for symmetric positive definite systems in double-precision using graphics processing units (GPUs). The solver is based on a mixed-precision Cholesky factorization that utilizes the high-performance tensor core units in CUDA-enabled GPUs. Since the Cholesky factors are affected by the low precision, an iterative refinement (IR) solver is required to recover the solution back to double-precision accuracy. Two different types of IR solvers are discussed on a wide range of test matrices. A preprocessing step is also developed, which scales and shifts the matrix, if necessary, in order to preserve its positive-definiteness in lower precisions. Our experiments on the V100 GPU show that performance speedups are up to 4.7{\texttimes} against a direct double-precision solver. However, matrix properties such as the condition number and the eigenvalue distribution can affect the convergence rate, which would consequently affect the overall performance.}, doi = {https://doi.org/10.1007/978-3-030-50417-5_18}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {, title = {Load-Balancing Sparse Matrix Vector Product Kernels on GPUs}, journal = {ACM Transactions on Parallel Computing}, volume = {7}, year = {2020}, month = {2020-03}, abstract = {Efficient processing of irregular matrices on Single Instruction, Multiple Data (SIMD)-type architectures is a persistent challenge. Resolving it requires innovations in the development of data formats, computational techniques, and implementations that strike a balance between thread divergence, which is inherent for irregular matrices, and padding, which alleviates the performance-detrimental thread divergence but introduces artificial overheads. To this end, in this article, we address the challenge of designing high-performance sparse matrix-vector product (SpMV) kernels for NVIDIA graphics processing units (GPUs). We present a compressed sparse row (CSR) format suitable for unbalanced matrices. 
We also provide a load-balancing kernel for the coordinate (COO) matrix format and extend it to a hybrid algorithm that stores part of the matrix in the SIMD-friendly Ellpack (ELL) format. The ratio between the ELL- and the COO-part is determined using a theoretical analysis of the nonzeros-per-row distribution. For the over 2,800 test matrices available in the SuiteSparse Matrix Collection, we compare the performance against SpMV kernels provided by NVIDIA{\textquoteright}s cuSPARSE library and a heavily-tuned sliced ELL (SELL-P) kernel that prevents unnecessary padding by considering the irregular matrices as a combination of matrix blocks stored in ELL format.}, doi = {https://doi.org/10.1145/3380930}, author = {Hartwig Anzt and Terry Cojean and Chen Yen-Chen and Jack Dongarra and Goran Flegar and Pratik Nayak and Stanimire Tomov and Yuhsiang M. Tsai and Weichung Wang} } @article {, title = {MAGMA Templates for Scalable Linear Algebra on Emerging Architectures}, journal = {The International Journal of High Performance Computing Applications}, volume = {34}, year = {2020}, month = {2020-11}, pages = {645-658}, abstract = {With the acquisition and widespread use of more resources that rely on accelerator/wide vector{\textendash}based computing, there has been a strong demand for science and engineering applications to take advantage of these latest assets. This, however, has been extremely challenging due to the diversity of systems to support their extreme concurrency, complex memory hierarchies, costly data movement, and heterogeneous node architectures. To address these challenges, we design a programming model and describe its ease of use in the development of a new MAGMA Templates library that delivers high-performance scalable linear algebra portable on current and emerging architectures. MAGMA Templates derives its performance and portability by (1) building on existing state-of-the-art linear algebra libraries, like MAGMA, SLATE, Trilinos, and vendor-optimized math libraries, and (2) providing access (seamlessly to the users) to the latest algorithms and architecture-specific optimizations through a single, easy-to-use C++-based API.}, issn = {1094-3420}, doi = {https://doi.org/10.1177/1094342020938421}, author = {Mohammed Al Farhan and Ahmad Abdelfattah and Stanimire Tomov and Mark Gates and Dalal Sukkari and Azzam Haidar and Robert Rosenberg and Jack Dongarra} } @article {, title = {MATEDOR: MAtrix, TEnsor, and Deep-learning Optimized Routines}, year = {2020}, month = {2020-02}, publisher = {2020 NSF Cyberinfrastructure for Sustained Scientific Innovation (CSSI) Principal Investigator Meeting}, address = {Seattle, WA}, author = {Stanimire Tomov} } @article {, title = {Matrix Multiplication on Batches of Small Matrices in Half and Half-Complex Precisions}, journal = {Journal of Parallel and Distributed Computing}, volume = {145}, year = {2020}, month = {2020-11}, pages = {188-201}, abstract = {Machine learning and artificial intelligence (AI) applications often rely on performing many small matrix operations{\textemdash}in particular general matrix{\textendash}matrix multiplication (GEMM). These operations are usually performed in a reduced precision, such as the 16-bit floating-point format (i.e., half precision or FP16). The GEMM operation is also very important for dense linear algebra algorithms, and half-precision GEMM operations can be used in mixed-precision linear solvers. 
Therefore, high-performance batched GEMM operations in reduced precision are especially important, not only for deep learning frameworks, but also for scientific applications that rely on batched linear algebra, such as tensor contractions and sparse direct solvers. This paper presents optimized batched GEMM kernels for graphics processing units (GPUs) in FP16 arithmetic. The paper addresses both real and complex half-precision computations on the GPU. The proposed design takes advantage of the Tensor Core technology that was recently introduced in CUDA-enabled GPUs. With eight tuning parameters introduced in the design, the developed kernels have a high degree of flexibility that overcomes the limitations imposed by the hardware and software (in the form of discrete configurations for the Tensor Core APIs). For real FP16 arithmetic, performance speedups are observed against cuBLAS for sizes up to 128, and range between and . For the complex FP16 GEMM kernel, the speedups are between and thanks to a design that uses the standard interleaved matrix layout, in contrast with the planar layout required by the vendor{\textquoteright}s solution. The paper also discusses special optimizations for extremely small matrices, where even higher performance gains are achievable.}, doi = {https://doi.org/10.1016/j.jpdc.2020.07.001}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {, title = {Mixed-Precision Iterative Refinement using Tensor Cores on GPUs to Accelerate Solution of Linear Systems}, journal = {Proceedings of the Royal Society A}, volume = {476}, year = {2020}, month = {2020-11}, abstract = {Double-precision floating-point arithmetic (FP64) has been the de facto standard for engineering and scientific simulations for several decades. Problem complexity and the sheer volume of data coming from various instruments and sensors motivate researchers to mix and match various approaches to optimize compute resources, including different levels of floating-point precision. In recent years, machine learning has motivated hardware support for half-precision floating-point arithmetic. A primary challenge in high-performance computing is to leverage reduced-precision and mixed-precision hardware. We show how the FP16/FP32 Tensor Cores on NVIDIA GPUs can be exploited to accelerate the solution of linear systems of equations Ax = b without sacrificing numerical stability. The techniques we employ include multiprecision LU factorization, the preconditioned generalized minimal residual algorithm (GMRES), and scaling and auto-adaptive rounding to avoid overflow. We also show how to efficiently handle systems with multiple right-hand sides. On the NVIDIA Quadro GV100 (Volta) GPU, we achieve a 4{\texttimes}-5{\texttimes} performance increase and 5{\texttimes} better energy efficiency versus the standard FP64 implementation while maintaining an FP64 level of numerical stability.}, keywords = {GMRES, LU factorization, GPU computing, half precision arithmetic, iterative refinement, mixed precision solvers}, issn = {1471-2946}, doi = {https://doi.org/10.1098/rspa.2020.0110}, author = {Azzam Haidar and Harun Bayraktar and Stanimire Tomov and Jack Dongarra and Nicholas J. 
Higham} } @techreport {1471, title = {Mixed-Precision Solution of Linear Systems Using Accelerator-Based Computing}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-05}, year = {2020}, month = {2020-05}, publisher = {University of Tennessee}, abstract = {Double-precision floating-point arithmetic (FP64) has been the de facto standard for engineering and scientific simulations for several decades. Problem complexity and the sheer volume of data coming from various instruments and sensors motivate researchers to mix and match various approaches to optimize compute resources, including different levels of floating-point precision. In recent years, machine learning has motivated hardware support for half-precision floating-point arithmetic. A primary challenge in high-performance computing is to leverage reduced- and mixed-precision hardware. We show how the FP16/FP32 Tensor Cores on NVIDIA GPUs can be exploited to accelerate the solution of linear systems of equations Ax = b without sacrificing numerical stability. We achieve a 4{\texttimes}{\textendash}5{\texttimes} performance increase and 5{\texttimes} better energy efficiency versus the standard FP64 implementation while maintaining an FP64 level of numerical stability.}, author = {Azzam Haidar and Harun Bayraktar and Stanimire Tomov and Jack Dongarra and Nicholas J. Higham} } @article {1456, title = {Project-Based Research and Training in High Performance Data Sciences, Data Analytics, and Machine Learning}, journal = {The Journal of Computational Science Education}, volume = {11}, year = {2020}, month = {2020-01}, pages = {36-44}, doi = {https://doi.org/10.22369/issn.2153-4136/11/1/7}, url = {http://www.jocse.org/articles/11/1/7/}, author = {Wong, Kwai and Stanimire Tomov and Jack Dongarra} } @article {1466, title = {Reducing the Amount of out-of-core Data Access for GPU-Accelerated Randomized SVD}, journal = {Concurrency and Computation: Practice and Experience}, year = {2020}, month = {2020-04}, keywords = {Divide and conquer, gpu, out-of-core computation, Singular value decomposition}, doi = {https://doi.org/10.1002/cpe.5754}, author = {Yuechao Lu and Ichitaro Yamazaki and Fumihiko Ino and Yasuyuki Matsushita and Stanimire Tomov and Jack Dongarra} } @article {, title = {A Set of Batched Basic Linear Algebra Subprograms}, journal = {ACM Transactions on Mathematical Software}, year = {2020}, month = {2020-10}, abstract = {This paper describes a standard API for a set of Batched Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). The focus is on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The matrices are grouped together in uniformly sized groups, with just one group if all the matrices are of equal size. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance many-core platforms. These include multicore and many-core CPU processors, GPUs and coprocessors, and other hardware accelerators with floating-point compute facility. As well as the standard types of single and double precision, we also include half and quadruple precision in the standard. In particular, half precision is used in many very large-scale applications, such as those associated with machine learning.}, author = {Ahmad Abdelfattah and Timothy Costa and Jack Dongarra and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. 
Higham and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Mawussi Zounon} } @techreport {, title = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic}, journal = {SLATE Working Notes}, number = {15, ICL-UT-20-08}, year = {2020}, month = {2020-07}, publisher = {University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Erik Boman and Erin Carson and Terry Cojean and Jack Dongarra and Mark Gates and Thomas Gruetzmacher and Nicholas J. Higham and Sherry Li and Neil Lindquist and Yang Liu and Jennifer Loe and Piotr Luszczek and Pratik Nayak and Sri Pranesh and Siva Rajamanickam and Tobias Ribizel and Barry Smith and Kasia Swirydowicz and Stephen Thomas and Stanimire Tomov and Yaohung Tsai and Ichitaro Yamazaki and Ulrike Meier Yang} } @article {, title = {Translational Process: Mathematical Software Perspective}, journal = {Journal of Computational Science}, year = {2020}, month = {2020-09}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, DATAFLOW scheduling runtimes, hardware accelerators}, doi = {https://doi.org/10.1016/j.jocs.2020.101216}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {, title = {Translational Process: Mathematical Software Perspective}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-11}, year = {2020}, month = {2020-08}, abstract = {Each successive generation of computer architecture has brought new challenges to achieving high performance mathematical solvers, necessitating development and analysis of new algorithms, which are then embodied in software libraries. These libraries hide architectural details from applications, allowing them to achieve a level of portability across platforms from desktops to world-class high performance computing (HPC) systems. Thus there has been an informal translational computer science process of developing algorithms and distributing them in open source software libraries for adoption by applications and vendors. 
With the move to exascale, increasing intentionality about this process will benefit the long-term sustainability of the scientific software stack.}, keywords = {communication avoiding algorithms, data flow scheduling runtimes, hardware accelerators}, author = {Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @article {1262, title = {Algorithms and Optimization Techniques for High-Performance Matrix-Matrix Multiplications of Very Small Matrices}, journal = {Parallel Computing}, volume = {81}, year = {2019}, month = {2019-01}, pages = {1{\textendash}21}, abstract = {Expressing scientific computations in terms of BLAS, and in particular the general dense matrix-matrix multiplication (GEMM), is of fundamental importance for obtaining high performance portability across architectures. However, GEMMs for small matrices of sizes smaller than 32 are not sufficiently optimized in existing libraries. We consider the computation of many small GEMMs and its performance portability for a wide range of computer architectures, including Intel CPUs, ARM, IBM, Intel Xeon Phi, and GPUs. These computations often occur in applications like big data analytics, machine learning, high-order finite element methods (FEM), and others. The GEMMs are grouped together in a single batched routine. For these cases, we present algorithms and their optimization techniques that are specialized for the matrix sizes and architectures of interest. We derive a performance model and show that the new developments can be tuned to obtain performance that is within 90\% of the optimal for any of the architectures of interest. For example, on a V100 GPU for square matrices of size 32, we achieve an execution rate of about 1600 gigaFLOP/s in double-precision arithmetic, which is 95\% of the theoretically derived peak for this computation on a V100 GPU. 
We also show that these results outperform currently available state-of-the-art implementations such as vendor-tuned math libraries, including Intel MKL and NVIDIA CUBLAS, as well as open-source libraries like OpenBLAS and Eigen.}, keywords = {Autotuning, Batched GEMM, HPC, Matrix-matrix product, optimization, Small matrices}, doi = {https://doi.org/10.1016/j.parco.2018.10.003}, author = {Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Marc Baboulin and Jo{\"e}l Falcou and Jack Dongarra} } @techreport {1433, title = {CEED ECP Milestone Report: Performance Tuning of CEED Software and 1st and 2nd Wave Apps}, year = {2019}, month = {2019-10}, publisher = {Zenodo}, doi = {https://doi.org/10.5281/zenodo.3477618}, author = {Stanimire Tomov and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jed Brown and Jean-Sylvain Camier and Veselin Dobrev and Jack Dongarra and Yohann Dudouit and Paul Fischer and Ali Karakus and Stefan Kerkemeier and Tzanio Kolev and YuHsiang Lan and Elia Merzari and Misun Min and Aleks Obabko and Scott Parker and Thilina Ratnayaka and Jeremy Thompson and Ananias Tomboulides and Vladimir Tomov and Tim Warburton} } @techreport {1434, title = {CEED ECP Milestone Report: Public release of CEED 2.0}, year = {2019}, month = {2019-04}, publisher = {Zenodo}, doi = {10.5281/zenodo.2641316}, url = {https://doi.org/10.5281/zenodo.2641316}, author = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Veselin Dobrev and Yohann Dudouit and Paul Fischer and Tzanio Kolev and David Medina and Misun Min and Thilina Ratnayaka and Cameron Smith and Jeremy Thompson and Stanimire Tomov and Vladimir Tomov and Tim Warburton} } @techreport {1322, title = {Design and Implementation for FFT-ECP on Distributed Accelerated Systems}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-05}, year = {2019}, month = {2019-04}, publisher = {University of Tennessee}, type = {ECP WBS 2.3.3.09 Milestone Report}, author = {Stanimire Tomov and Azzam Haidar and Alan Ayala and Daniel Schultz and Jack Dongarra} } @article {1269, title = {Evaluation of Directive-Based Performance Portable Programming Models}, journal = {International Journal of High Performance Computing and Networking}, volume = {14}, year = {2019}, month = {2019-07}, pages = {165-182}, abstract = {We present an extended exploration of the performance portability of directives provided by OpenMP 4 and OpenACC to program various types of node architecture with attached accelerators, both self-hosted multicore and offload multicore/GPU. Our goal is to examine how successful OpenACC and the newer offload features of OpenMP 4.5 are for moving codes between architectures, and we document how much tuning might be required and what lessons we can learn from these experiences. To do this, we use examples of algorithms with varying computational intensities for our evaluation, as both compute and data access efficiency are important considerations for overall application performance. To better understand fundamental compute vs. bandwidth bound characteristics, we add the compute-bound Level 3 BLAS GEMM kernel to our linear algebra evaluation.
We implement the kernels of interest using various methods provided by newer OpenACC and OpenMP implementations, and we evaluate their performance on various platforms including both x86_64 and Power8 with attached NVIDIA GPUs, x86_64 multicores, self-hosted Intel Xeon Phi KNL, as well as an x86_64 host system with Intel Xeon Phi coprocessors. We update these evaluations with the newest version of the NVIDIA Pascal architecture (P100), Intel KNL 7230, Power8+, and the newest supporting compiler implementations. Furthermore, we present in detail what factors affected the performance portability, including how to pick the right programming model, its programming style, its availability on different platforms, and how well compilers can optimise and target multiple platforms.}, keywords = {OpenACC, OpenMP 4, performance portability, Programming models}, doi = {http://dx.doi.org/10.1504/IJHPCN.2017.10009064}, author = {M. Graham Lopez and Wayne Joubert and Ver{\'o}nica Larrea and Oscar Hernandez and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {1323, title = {Fast Batched Matrix Multiplication for Small Sizes using Half Precision Arithmetic on GPUs}, booktitle = {33rd IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2019}, month = {2019-05}, publisher = {IEEE}, organization = {IEEE}, address = {Rio de Janeiro, Brazil}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {1329, title = {FFT-ECP Fast Fourier Transform}, year = {2019}, month = {2019-01}, publisher = {2019 ECP Annual Meeting (Research Poster)}, address = {Houston, TX}, author = {Stanimire Tomov and Azzam Haidar and Alan Ayala and Daniel Schultz and Jack Dongarra} } @techreport {1401, title = {FFT-ECP Implementation Optimizations and Features Phase}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-12}, year = {2019}, month = {2019-10}, publisher = {University of Tennessee}, author = {Stanimire Tomov and Azzam Haidar and Alan Ayala and Hejer Shaiek and Jack Dongarra} } @article {1385, title = {GPUDirect MPI Communications and Optimizations to Accelerate FFTs on Exascale Systems}, journal = {EuroMPI{\textquoteright}19 Posters, Zurich, Switzerland}, number = {icl-ut-19-06}, year = {2019}, month = {2019-09}, publisher = {ICL}, type = {Extended Abstract}, abstract = {Fast Fourier transforms (FFTs) are used in applications ranging from molecular dynamics and spectrum estimation to machine learning, fast convolution and correlation, signal modulation, wireless multimedia applications, and others. However, FFTs are memory bound, and therefore, to accelerate them, it is crucial to avoid and optimize the FFTs{\textquoteright} communications. To this end, we present a 3-D FFT design for distributed graphics processing unit (GPU) systems that: (1) efficiently uses GPUs{\textquoteright} high bandwidth, (2) reduces global communications algorithmically, when possible, and (3) employs GPUDirect technologies as well as MPI optimizations in the development of high-performance FFTs for large-scale GPU-accelerated systems.
We show that these developments and optimizations lead to very good strong scalability and a performance that is close to 90\% of the theoretical peak.}, keywords = {CUDA-Aware MPI, ECP, FFT, FFT-ECP, gpu, GPUDirect}, author = {Hejer Shaiek and Stanimire Tomov and Alan Ayala and Azzam Haidar and Jack Dongarra} } @conference {1325, title = {Hands-on Research and Training in High-Performance Data Sciences, Data Analytics, and Machine Learning for Emerging Environments}, booktitle = {ISC High Performance}, year = {2019}, month = {2019-06}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Frankfurt, Germany}, author = {Kwai Wong and Stanimire Tomov and Jack Dongarra} } @conference {1403, title = {Impacts of Multi-GPU MPI Collective Communications on Large FFT Computation}, booktitle = {Workshop on Exascale MPI (ExaMPI) at SC19}, year = {2019}, month = {2019-11}, address = {Denver, CO}, keywords = {Collective MPI, Exascale applications, FFT, Heterogeneous systems, scalable}, author = {Alan Ayala and Stanimire Tomov and Xi Luo and Hejer Shaiek and Azzam Haidar and George Bosilca and Jack Dongarra} } @article {1366, title = {MagmaDNN 0.2 High-Performance Data Analytics for Manycore GPUs and CPUs}, year = {2019}, month = {2019-01}, publisher = {University of Tennessee}, doi = {10.13140/RG.2.2.14906.64961}, author = {Lucien Ng and Sihan Chen and Alex Gessinger and Daniel Nichols and Sophia Cheng and Anu Meenasorna and Kwai Wong and Stanimire Tomov and Azzam Haidar and Eduardo D{\textquoteright}Azevedo and Jack Dongarra} } @conference {1326, title = {MagmaDNN: Accelerated Deep Learning Using MAGMA}, booktitle = {Practice and Experience in Advanced Research Computing (PEARC {\textquoteright}19)}, year = {2019}, month = {2019-07}, publisher = {ACM}, organization = {ACM}, address = {Chicago, IL}, author = {Daniel Nichols and Kwai Wong and Stanimire Tomov and Lucien Ng and Sihan Chen and Alex Gessinger} } @conference {1324, title = {MagmaDNN: Towards High-Performance Data Analytics and Machine Learning for Data-Driven Scientific Computing}, booktitle = {ISC High Performance}, year = {2019}, month = {2019-06}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Frankfurt, Germany}, abstract = {In this paper, we present work towards the development of a new data analytics and machine learning (ML) framework, called MagmaDNN. Our main goal is to provide scalable, high-performance data analytics and ML solutions for scientific applications running on current and upcoming heterogeneous many-core GPU-accelerated architectures. To this end, since many of the functionalities needed are based on standard linear algebra (LA) routines, we designed MagmaDNN to derive its performance power from the MAGMA library. The close integration provides the fundamental (scalable high-performance) LA routines available in MAGMA as a backend to MagmaDNN. We present some design issues for performance and scalability that are specific to ML using Deep Neural Networks (DNN), as well as the MagmaDNN designs towards overcoming them. In particular, MagmaDNN uses well established HPC techniques from the area of dense LA, including task-based parallelization, DAG representations, scheduling, mixed-precision algorithms, asynchronous solvers, and autotuned hyperparameter optimization. 
We illustrate these techniques and their incorporation and use to outperform other currently available frameworks.}, doi = {https://doi.org/10.1007/978-3-030-34356-9_37}, author = {Daniel Nichols and Natalie-Sofia Tomov and Frank Betancourt and Stanimire Tomov and Kwai Wong and Jack Dongarra} } @conference {1327, title = {OpenDIEL: A Parallel Workflow Engine and Data Analytics Framework}, booktitle = {Practice and Experience in Advanced Research Computing (PEARC {\textquoteright}19)}, year = {2019}, month = {2019-07}, publisher = {ACM}, organization = {ACM}, address = {Chicago, IL}, author = {Frank Betancourt and Kwai Wong and Efosa Asemota and Quindell Marshall and Daniel Nichols and Stanimire Tomov} } @article {1374, title = {Optimizing Batch HGEMM on Small Sizes Using Tensor Cores}, year = {2019}, month = {2019-03}, publisher = {GPU Technology Conference (GTC)}, address = {San Jose, CA}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {1376, title = {Progressive Optimization of Batched LU Factorization on GPUs}, booktitle = {IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}19)}, year = {2019}, month = {2019-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {1237, title = {Solving Linear Diophantine Systems on Parallel Architectures}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {30}, year = {2019}, month = {2019-05}, pages = {1158-1169}, abstract = {Solving linear Diophantine systems of equations is applied in discrete-event systems, model checking, formal languages and automata, logic programming, cryptography, networking, signal processing, and chemistry. For modeling discrete systems with Petri nets, a solution in non-negative integer numbers is required, which represents an intractable problem. For this reason, solving such kinds of tasks with significant speedup is highly appreciated. In this paper we design a new solver of linear Diophantine systems based on the parallel-sequential composition of the system clans. The solver is studied and implemented to run on parallel architectures using a two-level parallelization concept based on MPI and OpenMP. A decomposable system is usually represented by a sparse matrix; a minimal clan size of the decomposition restricts the granulation of the technique. MPI is applied for solving systems for clans using a parallel-sequential composition on distributed-memory computing nodes, while OpenMP is applied in solving a single indecomposable system on a single node using multiple cores. A dynamic task-dispatching subsystem is developed for distributing systems on nodes in the process of compositional solution.
Computational speedups are obtained on a series of test examples, illustrating that the best case achieves up to a 45-times speedup on 5 nodes with 20 cores each.}, keywords = {Mathematical model, Matrix decomposition, Parallel architectures, Petri nets, Software algorithms, Sparse matrices, Task analysis}, doi = {http://dx.doi.org/10.1109/TPDS.2018.2873354}, url = {https://ieeexplore.ieee.org/document/8482295}, author = {Dmitry Zaitsev and Stanimire Tomov and Jack Dongarra} } @conference {1435, title = {Towards Half-Precision Computation for Complex Matrices: A Case Study for Mixed Precision Solvers on GPUs}, booktitle = {ScalA19: 10th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, year = {2019}, month = {2019-11}, publisher = {IEEE}, organization = {IEEE}, address = {Denver, CO}, keywords = {Half precision, mixed-precision solvers, Tensor cores FP16 arithmetic}, author = {Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {1331, title = {Accelerating 2D FFT: Exploit GPU Tensor Cores through Mixed-Precision}, year = {2018}, month = {2018-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC18), ACM Student Research Poster}, address = {Dallas, TX}, author = {Xiaohe Cheng and Anumeena Soma and Eduardo D{\textquoteright}Azevedo and Kwai Wong and Stanimire Tomov} } @article {1336, title = {Accelerating Linear Algebra with MAGMA}, year = {2018}, month = {2018-02}, publisher = {ECP Annual Meeting 2018, Tutorial}, address = {Knoxville, TN}, author = {Stanimire Tomov and Mark Gates and Azzam Haidar} } @article {1266, title = {Accelerating the SVD Bi-Diagonalization of a Batch of Small Matrices using GPUs}, journal = {Journal of Computational Science}, volume = {26}, year = {2018}, month = {2018-05}, pages = {237{\textendash}245}, abstract = {The acceleration of many small-sized linear algebra problems has become extremely challenging for current many-core architectures, and in particular GPUs. Standard interfaces have been proposed for some of these problems, called batched problems, so that they get targeted for optimization and used in a standard way in applications, calling them directly from highly optimized, standard numerical libraries, like (batched) BLAS and LAPACK. While most of the developments have been for one-sided factorizations and solvers, many important applications {\textendash} from big data analytics to information retrieval, low-rank approximations for solvers and preconditioners {\textendash} require two-sided factorizations, and most notably the SVD factorization. To address these needs and the parallelization challenges related to them, we developed a number of new batched computing techniques and designed batched Basic Linear Algebra Subroutines (BLAS) routines, and in particular the Level-2 BLAS GEMV and the Level-3 BLAS GEMM routines, to solve them. We propose a device functions-based methodology and big-tile setting techniques in our batched BLAS design. The different optimization techniques result in many software versions that must be tuned, for which we adopt an auto-tuning strategy to automatically derive the optimized instances of the routines. We illustrate our batched BLAS approach to optimize batched SVD bi-diagonalization progressively on GPUs.
The progression is illustrated on an NVIDIA K40c GPU, and also ported to and presented on an AMD Fiji Nano GPU, using AMD{\textquoteright}s Heterogeneous-Compute Interface for Portability (HIP) C++ runtime API. We demonstrate achieving 80\% of the theoretically achievable peak performance for the overall algorithm, and significant acceleration of the needed Level-2 BLAS GEMV and Level-3 BLAS GEMM routines compared to vendor-optimized libraries on GPUs and multicore CPUs. The optimization techniques in this paper are applicable to the other two-sided factorizations as well.}, keywords = {Batched, Eigenvalue and singular value problems, hardware accelerators, numerical linear algebra, Two-sided factorization algorithms}, doi = {https://doi.org/10.1016/j.jocs.2018.01.007}, author = {Tingxing Dong and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1161, title = {Accelerating the SVD Two Stage Bidiagonal Reduction and Divide and Conquer Using GPUs}, journal = {Parallel Computing}, volume = {74}, year = {2018}, month = {2018-05}, pages = {3{\textendash}18}, abstract = {The increasing gap between memory bandwidth and computation speed motivates the choice of algorithms to take full advantage of today{\textquoteright}s high performance computers. For dense matrices, the classic algorithm for the singular value decomposition (SVD) uses a one stage reduction to bidiagonal form, which is limited in performance by the memory bandwidth. To overcome this limitation, a two stage reduction to bidiagonal has been gaining popularity. It first reduces the matrix to band form using high performance Level 3 BLAS, then reduces the band matrix to bidiagonal form. As accelerators such as GPUs and co-processors are becoming increasingly widespread in high-performance computing, a question of great interest to many SVD users is how much the employment of a two stage reduction, as well as other current best practices in GPU computing, can accelerate this important routine. To fulfill this interest, we have developed an accelerated SVD employing a two stage reduction to bidiagonal and a number of other algorithms that are highly optimized for GPUs. Notably, we also parallelize and accelerate the divide and conquer algorithm used to solve the subsequent bidiagonal SVD. By accelerating all phases of the SVD algorithm, we provide a significant speedup compared to existing multi-core and GPU-based SVD implementations.
In particular, using a P100 GPU, we illustrate a performance of up to 804 Gflop/s in double precision arithmetic to compute the full SVD of a 20k {\texttimes} 20k matrix in 90 seconds, which is 8.9 {\texttimes} faster than MKL on two 10-core Intel Haswell E5-2650 v3 CPUs, 3.7 {\texttimes} over the multi-core PLASMA two stage version, and 2.6 {\texttimes} over the previously accelerated one stage MAGMA version.}, keywords = {2-stage, accelerator, Divide and conquer, gpu, Singular value decomposition, SVD}, issn = {01678191}, doi = {10.1016/j.parco.2017.10.004}, url = {https://www.sciencedirect.com/science/article/pii/S0167819117301758}, author = {Mark Gates and Stanimire Tomov and Jack Dongarra} } @techreport {1229, title = {Algorithms and Optimization Techniques for High-Performance Matrix-Matrix Multiplications of Very Small Matrices}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-09}, year = {2018}, month = {2018-09}, publisher = {Innovative Computing Laboratory, University of Tennessee}, abstract = {Expressing scientific computations in terms of BLAS, and in particular the general dense matrix-matrix multiplication (GEMM), is of fundamental importance for obtaining high performance portability across architectures. However, GEMMs for small matrices of sizes smaller than 32 are not sufficiently optimized in existing libraries. We consider the computation of many small GEMMs and its performance portability for a wide range of computer architectures, including Intel CPUs, ARM, IBM, Intel Xeon Phi, and GPUs. These computations often occur in applications like big data analytics, machine learning, high-order finite element methods (FEM), and others. The GEMMs are grouped together in a single batched routine. For these cases, we present algorithms and their optimization techniques that are specialized for the matrix sizes and architectures of interest. We derive a performance model and show that the new developments can be tuned to obtain performance that is within 90\% of the optimal for any of the architectures of interest. For example, on a V100 GPU for square matrices of size 32, we achieve an execution rate of about 1,600 gigaFLOP/s in double-precision arithmetic, which is 95\% of the theoretically derived peak for this computation on a V100 GPU. We also show that these results outperform currently available state-of-the-art implementations such as vendor-tuned math libraries, including Intel MKL and NVIDIA CUBLAS, as well as open-source libraries like OpenBLAS and Eigen.}, author = {Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Marc Baboulin and Jo{\"e}l Falcou and Jack Dongarra} } @article {1260, title = {Analysis and Design Techniques towards High-Performance and Energy-Efficient Dense Linear Solvers on GPUs}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {29}, year = {2018}, month = {2018-12}, pages = {2700{\textendash}2712}, abstract = {Graphics Processing Units (GPUs) are widely used in accelerating dense linear solvers. The matrix factorizations, which dominate the runtime for these solvers, are often designed using a hybrid scheme, where GPUs perform trailing matrix updates, while the CPUs perform the panel factorizations. Consequently, hybrid solutions require high-end CPUs and optimized CPU software in order to deliver high performance. Furthermore, they lack the energy efficiency inherent for GPUs due to the use of less energy-efficient CPUs, as well as CPU-GPU communications.
This paper presents analysis and design techniques that overcome the shortcomings of the hybrid algorithms, and allow the design of high-performance and energy-efficient dense LU and Cholesky factorizations that use GPUs only. The full GPU solution eliminates the need for a high-end CPU and optimized CPU software, which leads to better energy efficiency. We discuss different design choices, and introduce optimized GPU kernels for panel factorizations. The developed solutions achieve 90+ percent of the performance of optimized hybrid solutions, while improving the energy efficiency by 50 percent. They outperform the vendor library by 30-50 percent in single precision, and 15-50 percent in double precision. We also show that hybrid designs trail the proposed solutions in performance when optimized CPU software is not available.}, keywords = {Dense linear solvers, energy efficiency, GPU computing}, doi = {10.1109/TPDS.2018.2842785}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {1195, title = {Analyzing Performance of BiCGStab with Hierarchical Matrix on GPU Clusters}, booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2018}, month = {2018-05}, publisher = {IEEE}, organization = {IEEE}, address = {Vancouver, BC, Canada}, abstract = {ppohBEM is an open-source software package implementing the boundary element method. One of its main software tasks is the solution of the dense linear system of equations, for which ppohBEM relies on another software package called HACApK. To reduce the cost of solving the linear system, HACApK hierarchically compresses the coefficient matrix using adaptive cross approximation. This hierarchical compression greatly reduces the storage and time complexities of the solver and enables the solution of large-scale boundary value problems. To extend the capability of ppohBEM, in this paper, we carefully port HACApK{\textquoteright}s linear solver onto GPU clusters. Though the potential of the GPUs has been widely accepted in high-performance computing, it is still a challenge to utilize the GPUs for a solver, like HACApK{\textquoteright}s, that requires fine-grained computation and global communication. First, to utilize the GPUs, we integrate the batched GPU kernel that was recently released in the MAGMA software package. We discuss several techniques to improve the performance of the batched kernel. We then study various techniques to address the inter-GPU communication and study their effects on state-of-the-art GPU clusters. We believe that the techniques studied in this paper are of interest to a wide range of software packages running on GPUs, especially with the increasingly complex node architectures and the growing costs of the communication. We also hope that our efforts to integrate the GPU kernel or to set up the inter-GPU communication will influence the design of the future-generation batched kernels or the communication layer within a software stack.}, author = {Ichitaro Yamazaki and Ahmad Abdelfattah and Akihiro Ida and Satoshi Ohshima and Stanimire Tomov and Rio Yokota and Jack Dongarra} } @article {1300, title = {Batched BLAS (Basic Linear Algebra Subprograms) 2018 Specification}, year = {2018}, month = {2018-07}, abstract = {This document describes an API for Batch Basic Linear Algebra Subprograms (Batched BLAS or BBLAS).
We focus on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. We consider extensions beyond the original BLAS standard that specify a programming interface not only for routines with uniformly-sized matrices and/or vectors, but also for situations where the sizes vary. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance manycore platforms. These include multicore and many-core CPU processors; GPUs and coprocessors; as well as other hardware accelerators with floating-point compute facility.}, author = {Jack Dongarra and Iain Duff and Mark Gates and Azzam Haidar and Sven Hammarling and Nicholas J. Higham and Jonathan Hogg and Pedro Valero Lara and Piotr Luszczek and Mawussi Zounon and Samuel D. Relton and Stanimire Tomov and Timothy Costa and Sarah Knepper} } @article {1209, title = {Batched One-Sided Factorizations of Tiny Matrices Using GPUs: Challenges and Countermeasures}, journal = {Journal of Computational Science}, volume = {26}, year = {2018}, month = {2018-05}, pages = {226{\textendash}236}, abstract = {The use of batched matrix computations recently gained a lot of interest for applications where the same operation is applied to many small independent matrices. The batched computational pattern is frequently encountered in applications of data analytics, direct/iterative solvers and preconditioners, computer vision, astrophysics, and more, and often requires specific designs for vectorization and extreme parallelism to map well on today{\textquoteright}s high-end many-core architectures. This has led to the development of optimized software for batch computations, and to an ongoing community effort to develop standard interfaces for batched linear algebra software. Furthering these developments, we present GPU design and optimization techniques for high-performance batched one-sided factorizations of millions of tiny matrices (of size 32 and less). We quantify the effects and relevance of different techniques in order to select the best-performing LU, QR, and Cholesky factorization designs. While we adapt common optimization techniques, such as optimal memory traffic, register blocking, and concurrency control, we also show that a different mindset and techniques are needed when matrices are tiny, and in particular, sub-vector/warp in size. The proposed routines are part of the MAGMA library and deliver significant speedups compared to their counterparts in currently available vendor-optimized libraries. Notably, we tune the developments for the newest V100 GPU from NVIDIA to show speedups of up to 11.8{\texttimes}.}, keywords = {batch computation, GPU computing, matrix factorization}, doi = {https://doi.org/10.1016/j.jocs.2018.01.005}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1263, title = {Computational Benefit of GPU Optimization for Atmospheric Chemistry Modeling}, journal = {Journal of Advances in Modeling Earth Systems}, volume = {10}, year = {2018}, month = {2018-08}, pages = {1952{\textendash}1969}, abstract = {Global chemistry-climate models are computationally burdened as the chemical mechanisms become more complex and realistic. Optimization for graphics processing units (GPU) may make longer global simulation with regional detail possible, but limited study has been done to explore the potential benefit for atmospheric chemistry modeling.
Hence, in this study, the second-order Rosenbrock solver of the chemistry module of CAM4-Chem is ported to the GPU to gauge potential speed-up. We find that on the CPU, the fastest performance is achieved using the Intel compiler with a block interleaved memory layout. Different combinations of compiler and memory layout lead to a ~11.02{\texttimes} difference in the computational time. In contrast, the GPU version performs the best when using a combination of fully interleaved memory layout with block size equal to the warp size, CUDA streams for independent kernels, and constant memory. Moreover, the most efficient data transfer between CPU and GPU is gained by allocating the memory contiguously during the data initialization on the GPU. Compared to one CPU core, the speed-up of using one GPU alone reaches a factor of ~11.7{\texttimes} for the computation alone and ~3.82{\texttimes} when the data transfer between CPU and GPU is considered. Using one GPU alone is also generally faster than the multithreaded implementation for 16 CPU cores in a compute node and the single-source solution (OpenACC). The best performance is achieved by the implementation of the hybrid CPU/GPU version, but rescheduling the workload among the CPU cores is required before a practical CAM4-Chem simulation.}, keywords = {compiler, CUDA, data transfer, gpu, hybrid, memory layout}, doi = {https://doi.org/10.1029/2018MS001276}, author = {Jian Sun and Joshua Fu and John Drake and Qingzhao Zhu and Azzam Haidar and Mark Gates and Stanimire Tomov and Jack Dongarra} } @inproceedings {1259, title = {The Design of Fast and Energy-Efficient Linear Solvers: On the Potential of Half-Precision Arithmetic and Iterative Refinement Techniques}, journal = {International Conference on Computational Science (ICCS 2018)}, volume = {10860}, year = {2018}, month = {2018-06}, pages = {586{\textendash}600}, publisher = {Springer}, address = {Wuxi, China}, abstract = {As parallel computers approach exascale, power efficiency in high-performance computing (HPC) systems is of increasing concern. Exploiting both the hardware features and algorithms is an effective solution to achieve power efficiency, and to address the energy constraints in modern and future HPC systems. In this work, we present a novel design and implementation of an energy-efficient solution for dense linear systems of equations, which are at the heart of large-scale HPC applications. The proposed energy-efficient linear system solvers are based on two main components: (1) iterative refinement techniques, and (2) reduced-precision computing features in modern accelerators and coprocessors. While most of the energy efficiency approaches aim to reduce the consumption with a minimal performance penalty, our method improves both the performance and the energy efficiency. Compared to highly-optimized linear system solvers, our kernels deliver the same accuracy solution up to 2{\texttimes} faster and reduce the energy consumption up to half on Intel Knights Landing (KNL) architectures.
By efficiently using the Tensor Cores available in the NVIDIA V100 PCIe GPUs, the speedups can be up to 4{\texttimes}, with more than 80\% reduction in the energy consumption.}, doi = {https://doi.org/10.1007/978-3-319-93698-7_45}, url = {https://rdcu.be/bcKSC}, author = {Azzam Haidar and Ahmad Abdelfattah and Mawussi Zounon and Panruo Wu and Srikara Pranesh and Stanimire Tomov and Jack Dongarra} } @techreport {1232, title = {Evaluation and Design of FFT for Distributed Accelerated Systems}, journal = {ECP WBS 2.3.3.09 Milestone Report}, number = {FFT-ECP ST-MS-10-1216}, year = {2018}, month = {2018-10}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Stanimire Tomov and Azzam Haidar and Daniel Schultz and Jack Dongarra} } @article {1208, title = {A Guide for Achieving High Performance with Very Small Matrices on GPUs: A Case Study of Batched LU and Cholesky Factorizations}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {29}, year = {2018}, month = {2018-05}, pages = {973{\textendash}984}, abstract = {We present a high-performance GPU kernel with a substantial speedup over vendor libraries for very small matrix computations. In addition, we discuss most of the challenges that hinder the design of efficient GPU kernels for small matrix algorithms. We propose relevant algorithm analysis to harness the full power of a GPU, and strategies for predicting the performance, before introducing a proper implementation. We develop a theoretical analysis and a methodology for high-performance linear solvers for very small matrices. As test cases, we take the Cholesky and LU factorizations and show how the proposed methodology enables us to achieve a performance close to the theoretical upper bound of the hardware. This work investigates and proposes novel algorithms for designing highly optimized GPU kernels for solving batches of hundreds of thousands of small-size Cholesky and LU factorizations. Our focus on efficient batched Cholesky and batched LU kernels is motivated by the increasing need for these kernels in scientific simulations (e.g., astrophysics applications). Techniques for optimal memory traffic, register blocking, and tunable concurrency are incorporated in our proposed design. The proposed GPU kernels achieve performance speedups versus CUBLAS of up to 6{\texttimes} for the factorizations, using double precision arithmetic on an NVIDIA Pascal P100 GPU.}, doi = {10.1109/TPDS.2017.2783929}, author = {Azzam Haidar and Ahmad Abdelfattah and Mawussi Zounon and Stanimire Tomov and Jack Dongarra} } @conference {1264, title = {Harnessing GPU Tensor Cores for Fast FP16 Arithmetic to Speed up Mixed-Precision Iterative Refinement Solvers}, booktitle = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC18)}, year = {2018}, month = {2018-11}, publisher = {IEEE}, organization = {IEEE}, address = {Dallas, TX}, abstract = {Low-precision floating-point arithmetic is a powerful tool for accelerating scientific computing applications, especially those in artificial intelligence. Here, we present an investigation showing that other high-performance computing (HPC) applications can also harness this power. Specifically, we use the general HPC problem, Ax = b, where A is a large dense matrix, and a double precision (FP64) solution is needed for accuracy.
Our approach is based on mixed-precision (FP16-FP64) iterative refinement, and we generalize and extend prior advances into a framework, for which we develop architecture-specific algorithms and highly tuned implementations. These new methods show how using half-precision Tensor Cores (FP16-TC) for the arithmetic can provide up to 4{\texttimes} speedup. This is due to the performance boost that the FP16-TC provide as well as to the improved accuracy over the classical FP16 arithmetic that is obtained because the GEMM accumulation occurs in FP32 arithmetic.}, doi = {https://doi.org/10.1109/SC.2018.00050}, author = {Azzam Haidar and Stanimire Tomov and Jack Dongarra and Nicholas J. Higham} } @article {1335, title = {Harnessing GPU{\textquoteright}s Tensor Cores Fast FP16 Arithmetic to Speedup Mixed-Precision Iterative Refinement Solvers and Achieve 74 Gflops/Watt on Nvidia V100}, year = {2018}, month = {2018-03}, publisher = {GPU Technology Conference (GTC), Poster}, address = {San Jose, CA}, author = {Azzam Haidar and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {1199, title = {Investigating Power Capping toward Energy-Efficient Scientific Applications}, journal = {Concurrency Computation: Practice and Experience}, volume = {2018}, year = {2018}, month = {2018-04}, pages = {1-14}, abstract = {The emergence of power efficiency as a primary constraint in processor and system design poses new challenges concerning power and energy awareness for numerical libraries and scientific applications. Power consumption also plays a major role in the design of data centers, which may house petascale or exascale-level computing systems. At these extreme scales, understanding and improving the energy efficiency of numerical libraries and their related applications becomes a crucial part of the successful implementation and operation of the computing system. In this paper, we study and investigate the practice of controlling a compute system{\textquoteright}s power usage, and we explore how different power caps affect the performance of numerical algorithms with different computational intensities. Further, we determine the impact, in terms of performance and energy usage, that these caps have on a system running scientific applications. This analysis will enable us to characterize the types of algorithms that benefit most from these power management schemes. Our experiments are performed using a set of representative kernels and several popular scientific benchmarks. 
We quantify a number of power and performance measurements and draw observations and conclusions that can be viewed as a roadmap to achieving energy efficiency in the design and execution of scientific algorithms.}, keywords = {energy efficiency, High Performance Computing, Intel Xeon Phi, Knights landing, papi, performance analysis, Performance Counters, power efficiency}, doi = {https://doi.org/10.1002/cpe.4485}, url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4485}, author = {Azzam Haidar and Heike Jagode and Phil Vaccaro and Asim YarKhan and Stanimire Tomov and Jack Dongarra} } @article {1330, title = {MATEDOR: MAtrix, TEnsor, and Deep-learning Optimized Routines}, year = {2018}, month = {2018-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC18), Research Poster}, address = {Dallas, TX}, author = {Ahmad Abdelfattah and Jack Dongarra and Azzam Haidar and Stanimire Tomov and Ichitaro Yamazaki} } @article {1333, title = {MAtrix, TEnsor, and Deep-learning Optimized Routines (MATEDOR)}, year = {2018}, month = {2018-04}, publisher = {NSF PI Meeting, Poster}, address = {Washington, DC}, doi = {https://doi.org/10.6084/m9.figshare.6174143.v3}, author = {Azzam Haidar and Stanimire Tomov and Ahmad Abdelfattah and Ichitaro Yamazaki and Jack Dongarra} } @conference {1210, title = {Optimizing GPU Kernels for Irregular Batch Workloads: A Case Study for Cholesky Factorization}, booktitle = {IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}18)}, year = {2018}, month = {2018-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {This paper introduces several frameworks for the design and implementation of high performance GPU kernels that target batch workloads with irregular sizes. Such workloads are ubiquitous in many scientific applications, including sparse direct solvers, astrophysics, and quantum chemistry. The paper addresses two main categories of frameworks, taking the Cholesky factorization as a case study. The first uses hostside kernel launches, and the second uses device-side launches. Within each category, different design options are introduced, with an emphasis on the advantages and the disadvantages of each approach. Our best performing design outperforms the state-of-the-art CPU implementation, scoring up to 4.7{\texttimes} speedup in double precision on a Pascal P100 GPU.}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1258, title = {The Singular Value Decomposition: Anatomy of Optimizing an Algorithm for Extreme Scale}, journal = {SIAM Review}, volume = {60}, year = {2018}, month = {2018-11}, pages = {808{\textendash}865}, abstract = {The computation of the singular value decomposition, or SVD, has a long history with many improvements over the years, both in its implementations and algorithmically. Here, we survey the evolution of SVD algorithms for dense matrices, discussing the motivation and performance impacts of changes. There are two main branches of dense SVD methods: bidiagonalization and Jacobi. Bidiagonalization methods started with the implementation by Golub and Reinsch in Algol60, which was subsequently ported to Fortran in the EISPACK library, and was later more efficiently implemented in the LINPACK library, targeting contemporary vector machines. To address cache-based memory hierarchies, the SVD algorithm was reformulated to use Level 3 BLAS in the LAPACK library. 
To address new architectures, ScaLAPACK was introduced to take advantage of distributed computing, and MAGMA was developed for accelerators such as GPUs. Algorithmically, the divide and conquer and MRRR algorithms were developed to reduce the number of operations. Still, these methods remained memory bound, so two-stage algorithms were developed to reduce memory operations and increase the computational intensity, with efficient implementations in PLASMA, DPLASMA, and MAGMA. Jacobi methods started with the two-sided method of Kogbetliantz and the one-sided method of Hestenes. They have likewise had many developments, including parallel and block versions and preconditioning to improve convergence. In this paper, we investigate the impact of these changes by testing various historical and current implementations on a common, modern multicore machine and a distributed computing platform. We show that algorithmic and implementation improvements have increased the speed of the SVD by several orders of magnitude, while using up to 40 times less energy.}, keywords = {bidiagonal matrix, bisection, Divide and conquer, Hestenes method, Jacobi method, Kogbetliantz method, MRRR, QR iteration, Singular value decomposition, SVD}, issn = {0036-1445}, doi = {10.1137/17M1117732}, url = {https://epubs.siam.org/doi/10.1137/17M1117732}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @techreport {1275, title = {Software-Defined Events (SDEs) in MAGMA-Sparse}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-12}, year = {2018}, month = {2018-12}, publisher = {University of Tennessee}, author = {Heike Jagode and Anthony Danalis and Hartwig Anzt and Ichitaro Yamazaki and Mark Hoemmen and Erik Boman and Stanimire Tomov and Jack Dongarra} } @article {1334, title = {Tensor Contractions using Optimized Batch GEMM Routines}, year = {2018}, month = {2018-03}, publisher = {GPU Technology Conference (GTC), Poster}, address = {San Jose, CA}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1332, title = {Using GPU FP16 Tensor Cores Arithmetic to Accelerate Mixed-Precision Iterative Refinement Solvers and Reduce Energy Consumption}, year = {2018}, month = {2018-06}, publisher = {ISC High Performance (ISC18), Best Poster Award}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Stanimire Tomov and Ahmad Abdelfattah and Mawussi Zounon and Jack Dongarra} } @conference {1265, title = {Using GPU FP16 Tensor Cores Arithmetic to Accelerate Mixed-Precision Iterative Refinement Solvers and Reduce Energy Consumption}, booktitle = {ISC High Performance (ISC{\textquoteright}18), Best Poster}, year = {2018}, month = {2018-06}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Stanimire Tomov and Ahmad Abdelfattah and Mawussi Zounon and Jack Dongarra} } @article {1341, title = {Accelerating Tensor Contractions in High-Order FEM with MAGMA Batched}, year = {2017}, month = {2017-03}, publisher = {SIAM Conference on Computer Science and Engineering (SIAM CSE17), Presentation}, address = {Atlanta, GA}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @inbook {1167, title = {Bringing High Performance Computing to Big Data Algorithms}, booktitle = {Handbook of Big Data Technologies}, year = 
{2017}, publisher = {Springer}, organization = {Springer}, isbn = {978-3-319-49339-8}, doi = {10.1007/978-3-319-49340-4}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @techreport {1175, title = {C++ API for Batch BLAS}, journal = {SLATE Working Notes}, number = {04, ICL-UT-17-12}, year = {2017}, month = {2017-12}, publisher = {University of Tennessee}, author = {Ahmad Abdelfattah and Konstantin Arturov and Cris Cecka and Jack Dongarra and Chip Freitag and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Panruo Wu} } @article {1103, title = {Factorization and Inversion of a Million Matrices using GPUs: Challenges and Countermeasures}, journal = {Procedia Computer Science}, volume = {108}, year = {2017}, month = {2017-06}, pages = {606{\textendash}615}, abstract = {This paper presents new algorithmic approaches and optimization techniques for LU factorization and matrix inversion of millions of very small matrices using GPUs. These problems appear in many scientific applications including astrophysics and generation of block-Jacobi preconditioners. We show that, for very small problem sizes, design and optimization of GPU kernels require a mindset different from the one usually used when designing LAPACK algorithms for GPUs. Techniques for optimal memory traffic, register blocking, and tunable concurrency are incorporated in our proposed design. We also take advantage of the small matrix sizes to eliminate the intermediate row interchanges in both the factorization and inversion kernels. The proposed GPU kernels achieve performance speedups vs. CUBLAS of up to 6{\texttimes} for the factorization, and 14{\texttimes} for the inversion, using double precision arithmetic on a Pascal P100 GPU.}, doi = {https://doi.org/10.1016/j.procs.2017.05.250}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1102, title = {Fast Cholesky Factorization on GPUs for Batch and Native Modes in MAGMA}, journal = {Journal of Computational Science}, volume = {20}, year = {2017}, month = {2017-05}, pages = {85{\textendash}93}, abstract = {This paper presents a GPU-accelerated Cholesky factorization for two different modes of operation. The first one is the batch mode, where many independent factorizations on small matrices can be performed concurrently. This mode supports fixed size and variable size problems, and is found in many scientific applications. The second mode is the native mode, where one factorization is performed on a large matrix without any CPU involvement, which allows the CPU to do other useful work. We show that, despite the different workloads, both modes of operation share a common code-base that uses the GPU only. We also show that the developed routines achieve significant speedups against a multicore CPU using the MKL library, and against a GPU implementation by cuSOLVER.
This work is part of the MAGMA library.}, keywords = {GPU computing, Cholesky factorization, Batched execution}, doi = {https://doi.org/10.1016/j.jocs.2016.12.009}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1083, title = {A Framework for Out of Memory SVD Algorithms}, journal = {ISC High Performance 2017}, year = {2017}, month = {2017-06}, pages = {158{\textendash}178}, abstract = {Many important applications {\textendash} from big data analytics to information retrieval, gene expression analysis, and numerical weather prediction {\textendash} require the solution of large dense singular value decompositions (SVD). In many cases the problems are too large to fit into the computer{\textquoteright}s main memory, and thus require specialized out-of-core algorithms that use disk storage. In this paper, we analyze the SVD communications, as related to hierarchical memories, and design a class of algorithms that minimizes them. This class includes out-of-core SVDs but can also be applied between other consecutive levels of the memory hierarchy, e.g., GPU SVD using the CPU memory for large problems. We call these out-of-memory (OOM) algorithms. To design OOM SVDs, we first study the communications for both classical one-stage blocked SVD and two-stage tiled SVD. We present the theoretical analysis and strategies to design, as well as implement, these communication avoiding OOM SVD algorithms. We show performance results for multicore architecture that illustrate our theoretical findings and match our performance models.}, doi = {https://doi.org/10.1007/978-3-319-58667-0_9}, author = {Khairul Kabir and Azzam Haidar and Stanimire Tomov and Aurelien Bouteiller and Jack Dongarra} } @conference {1142, title = {High-performance Cholesky Factorization for GPU-only Execution}, booktitle = {Proceedings of the General Purpose GPUs (GPGPU-10)}, year = {2017}, month = {2017-02}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, abstract = {We present our performance analysis, algorithm designs, and the optimizations needed for the development of high-performance GPU-only algorithms, and in particular, for the dense Cholesky factorization. In contrast to currently promoted designs that solve parallelism challenges on multicore architectures by representing algorithms as Directed Acyclic Graphs (DAGs), where nodes are tasks of fine granularity and edges are the dependencies between the tasks, our designs explicitly target manycore architectures like GPUs and feature coarse granularity tasks (that can be hierarchically split into fine grain data-parallel subtasks). Furthermore, in contrast to hybrid algorithms that schedule difficult to parallelize tasks on CPUs, we develop highly-efficient code for entirely GPU execution. GPU-only codes remove the expensive CPU-to-GPU communications and the tuning challenges related to slow CPU and/or low CPU-to-GPU bandwidth. We show that on latest GPUs, like the P100, this becomes so important that the GPU-only code even outperforms the hybrid MAGMA algorithms when the CPU tasks and communications cannot be entirely overlapped with GPU computations. We achieve up to 4,300 GFlop/s in double precision on a P100 GPU, which is about 7-8{\texttimes} faster than high-end multicore CPUs, e.g., two 10-core Intel Xeon E5-2650 v3 Haswell CPUs, where MKL runs up to about 500-600 Gflop/s.
The new algorithm also significantly outperforms the GPU-only implementation currently available in the NVIDIA cuSOLVER library.}, doi = {https://doi.org/10.1145/3038228.3038237}, author = {Azzam Haidar and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @conference {1140, title = {Investigating Half Precision Arithmetic to Accelerate Dense Linear System Solvers}, booktitle = {ScalA17: 8th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, year = {2017}, month = {2017-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, abstract = {The use of low-precision arithmetic in mixed-precision computing methods has been a powerful tool to accelerate numerous scientific computing applications. Artificial intelligence (AI) in particular has pushed this to current extremes, making use of half-precision floating-point arithmetic (FP16) in approaches based on neural networks. The appeal of FP16 is in the high performance that can be achieved using it on today{\textquoteright}s powerful manycore GPU accelerators, e.g., like the NVIDIA V100, that can provide 120 TeraFLOPS alone in FP16. We present an investigation showing that other HPC applications can harness this power too, and in particular, the general HPC problem of solving Ax = b, where A is a large dense matrix, and the solution is needed in FP32 or FP64 accuracy. Our approach is based on the mixed-precision iterative refinement technique {\textendash} we generalize and extend prior advances into a framework, for which we develop architecture-specific algorithms and highly-tuned implementations that resolve the main computational challenges of efficiently parallelizing, scaling, and using FP16 arithmetic in the approach on high-end GPUs. Subsequently, we show for the first time how the use of FP16 arithmetic can significantly accelerate, as well as make more energy efficient, FP32 or FP64-precision Ax = b solvers. Our results are reproducible and the developments will be made available through the MAGMA library. We quantify in practice the performance and limitations of the approach.}, author = {Azzam Haidar and Panruo Wu and Stanimire Tomov and Jack Dongarra} } @article {1340, title = {MAGMA Tensors and Batched Computing for Accelerating Applications on GPUs}, year = {2017}, month = {2017-05}, publisher = {GPU Technology Conference (GTC17), Presentation in Session S7728}, address = {San Jose, CA}, author = {Stanimire Tomov and Azzam Haidar} } @article {1337, title = {MagmaDNN {\textendash} High-Performance Data Analytics for Manycore GPUs and CPUs}, year = {2017}, month = {2017-12}, publisher = {2017 Summer Research Experiences for Undergraduate (REU), Presentation}, address = {Knoxville, TN}, author = {Lucien Ng and Kwai Wong and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @techreport {1130, title = {MAGMA-sparse Interface Design Whitepaper}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-05}, year = {2017}, month = {2017-09}, type = {Technical Report}, abstract = {In this report we describe the logic and interface we develop for the MAGMA-sparse library to allow for easy integration as a third-party library into a top-level software ecosystem. The design choices are based on extensive consultation with other software library developers, in particular the Trilinos software development team. The interface documentation is at this point not exhaustive, but a first proposal for setting a standard.
Although the interface description targets the MAGMA-sparse software module, we hope that the design choices carry beyond this specific library, and are attractive for adoption in other packages. This report is not intended as a static document, but will be updated over time to reflect the agile software development in the ECP 1.3.3.11 STMS11-PEEKS project.}, author = {Hartwig Anzt and Erik Boman and Jack Dongarra and Goran Flegar and Mark Gates and Mike Heroux and Mark Hoemmen and Jakub Kurzak and Piotr Luszczek and Sivasankaran Rajamanickam and Stanimire Tomov and Stephen Wood and Ichitaro Yamazaki} } @conference {1084, title = {Novel HPC Techniques to Batch Execution of Many Variable Size BLAS Computations on GPUs}, booktitle = {International Conference on Supercomputing (ICS {\textquoteright}17)}, year = {2017}, month = {2017-06}, publisher = {ACM}, organization = {ACM}, address = {Chicago, Illinois}, doi = {10.1145/3079079.3079103}, url = {http://dl.acm.org/citation.cfm?id=3079103}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {1085, title = {Optimizing the SVD Bidiagonalization Process for a Batch of Small Matrices}, booktitle = {International Conference on Computational Science (ICCS 2017)}, year = {2017}, month = {2017-06}, publisher = {Procedia Computer Science}, organization = {Procedia Computer Science}, address = {Zurich, Switzerland}, abstract = {A challenging class of problems arising in many GPU applications, called batched problems, involves linear algebra operations on many small-sized matrices. We designed batched BLAS (Basic Linear Algebra Subroutines) routines, and in particular the Level-2 BLAS GEMV and the Level-3 BLAS GEMM routines, to solve them. We proposed device functions and big-tile settings in our batched BLAS design. We adopted auto-tuning to optimize different instances of GEMV routines. We illustrated our batched BLAS approach to optimize batched bi-diagonalization progressively on a K40c GPU. The optimization techniques in this paper are applicable to the other two-sided factorizations as well.}, doi = {https://doi.org/10.1016/j.procs.2017.05.237}, url = {http://www.sciencedirect.com/science/article/pii/S1877050917308645}, author = {Tingxing Dong and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {1141, title = {Out of Memory SVD Solver for Big Data}, booktitle = {2017 IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}17)}, year = {2017}, month = {2017-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {Many applications {\textendash} from data compression to numerical weather prediction and information retrieval {\textendash} need to compute large dense singular value decompositions (SVD). When the problems are too large to fit into the computer{\textquoteright}s main memory, specialized out-of-core algorithms that use disk storage are required. A typical example is when trying to analyze a large data set through tools like MATLAB or Octave, but the data is just too large to be loaded. To overcome this, we designed a class of out-of-memory (OOM) algorithms to reduce communication, as well as to overlap it with computation. Of particular interest are OOM algorithms for matrices of size m{\texttimes}n, where m >> n or m << n, e.g., corresponding to cases of too many variables, or too many observations. To design OOM SVDs, we first study the communications cost for the SVD techniques as well as for the QR/LQ factorization followed by SVD.
We present a theoretical analysis of the data movement cost and strategies to design OOM SVD algorithms. We show performance results for a multicore architecture that illustrate our theoretical findings and match our performance models. Moreover, our experimental results show the feasibility and superiority of the OOM SVD.}, author = {Azzam Haidar and Khairul Kabir and Diana Fayad and Stanimire Tomov and Jack Dongarra} } @techreport {1171, title = {POMPEI: Programming with OpenMP4 for Exascale Investigations}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-09}, year = {2017}, month = {2017-12}, publisher = {University of Tennessee}, abstract = {The objective of the Programming with OpenMP4 for Exascale Investigations (POMPEI) project is to explore new task-based programming techniques together with data structure centric programming for scientific applications to harness the potential of extreme-scale systems. Tasking is by now a well-established approach on such systems as it has been used successfully to handle their large-scale parallelism and heterogeneity, which are leading challenges on the way to exascale computing. The approach is to harness the latest features of OpenMP4.5 and OpenACC2.5 to design abstractions shared among tasks and mapped efficiently to data-structure driven programming paradigms. This technical report describes the approach, along with its reference implementation and results for dense linear algebra algorithms.}, author = {Jack Dongarra and Azzam Haidar and Oscar Hernandez and Stanimire Tomov and Manjunath Gorentla Venkata} } @conference {1134, title = {Power-aware Computing: Measurement, Control, and Performance Analysis for Intel Xeon Phi}, booktitle = {2017 IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}17), Best Paper Finalist}, year = {2017}, month = {2017-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {The emergence of power efficiency as a primary constraint in processor and system designs poses new challenges concerning power and energy awareness for numerical libraries and scientific applications. Power consumption also plays a major role in the design of data centers, in particular for peta- and exascale systems. Understanding and improving the energy efficiency of numerical simulation becomes crucial. We present a detailed study and investigation toward controlling power usage and exploring how different power caps affect the performance of numerical algorithms with different computational intensities, and determine the impact and correlation with performance of scientific applications. Our analysis is performed using a set of representative kernels, as well as many highly used scientific benchmarks.
We quantify a number of power and performance measurements, and draw observations and conclusions that can be viewed as a roadmap toward achieving energy-efficient computing algorithms.}, doi = {https://doi.org/10.1109/HPEC.2017.8091085}, author = {Azzam Haidar and Heike Jagode and Asim YarKhan and Phil Vaccaro and Stanimire Tomov and Jack Dongarra} } @article {1338, title = {Power-Aware HPC on Intel Xeon Phi KNL Processors}, year = {2017}, month = {2017-06}, publisher = {ISC High Performance (ISC17), Intel Booth Presentation}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Heike Jagode and Asim YarKhan and Phil Vaccaro and Stanimire Tomov and Jack Dongarra} } @techreport {1080, title = {Roadmap for the Development of a Linear Algebra Library for Exascale Computing: SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {01, ICL-UT-17-02}, year = {2017}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Aurelien Bouteiller and Anthony Danalis and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Stephen Wood and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan} } @conference {1164, title = {Sampling Algorithms to Update Truncated SVD}, booktitle = {IEEE International Conference on Big Data}, year = {2017}, month = {2017-12}, publisher = {IEEE}, organization = {IEEE}, address = {Boston, MA}, author = {Ichitaro Yamazaki and Stanimire Tomov and Jack Dongarra} } @techreport {1082, title = {Small Tensor Operations on Advanced Architectures for High-Order Applications}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-17-749}, year = {2017}, month = {2017-04}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @article {1019, title = {Solving Dense Symmetric Indefinite Systems using GPUs}, journal = {Concurrency and Computation: Practice and Experience}, volume = {29}, year = {2017}, month = {2017-03}, abstract = {This paper studies the performance of different algorithms for solving a dense symmetric indefinite linear system of equations on multicore CPUs with a Graphics Processing Unit (GPU). To ensure the numerical stability of the factorization, pivoting is required. Obtaining high performance of such algorithms on the GPU is difficult because all the existing pivoting strategies lead to frequent synchronizations and irregular data accesses. Until recently, there has not been any implementation of these algorithms on a hybrid CPU/GPU architecture. To improve their performance on the hybrid architecture, we explore different techniques to reduce the expensive data transfer and synchronization between the CPU and GPU, or on the GPU (e.g., factorizing the matrix entirely on the GPU or in a communication-avoiding fashion). We also study the performance of the solver using iterative refinement along with the factorization without pivoting combined with the preprocessing technique based on random butterfly transformations, or with the mixed-precision algorithm where the matrix is factorized in single precision.
The randomization algorithm only has a probabilistic proof of numerical stability, and for this paper we focused on the mixed-precision algorithm without pivoting. Nevertheless, both approaches demonstrate that good performance can be obtained on the GPU by avoiding the pivoting and by using lower precision arithmetic, respectively. As illustrated with the application in acoustics studied in this paper, in many practical cases, the matrices can be factorized without pivoting. Because the componentwise backward error computed in the iterative refinement signals when the algorithm fails to obtain the desired accuracy, the user can use these potentially unstable but efficient algorithms in most cases and fall back to a more stable algorithm with pivoting only in the case of failure.}, doi = {10.1002/cpe.4055}, url = {http://onlinelibrary.wiley.com/doi/10.1002/cpe.4055/full}, author = {Marc Baboulin and Jack Dongarra and Adrien Remy and Stanimire Tomov and Ichitaro Yamazaki} } @article {1086, title = {Structure-aware Linear Solver for Realtime Convex Optimization for Embedded Systems}, journal = {IEEE Embedded Systems Letters}, volume = {9}, year = {2017}, month = {2017-05}, pages = {61{\textendash}64}, abstract = {With the increasing sophistication in the use of optimization algorithms such as deep learning on embedded systems, convex optimization solvers on embedded systems have found widespread use. This letter presents a novel linear solver technique to reduce the run-time of a convex optimization solver by using the property that some parameters are fixed during the solution iterations of a solve instance. Our experimental results show that the run-time can be reduced by two orders of magnitude.}, keywords = {Karush Kuhn Tucker (KKT), Realtime embedded convex optimization solver}, doi = {10.1109/LES.2017.2700401}, url = {http://ieeexplore.ieee.org/document/7917357/}, author = {Ichitaro Yamazaki and Saeid Nooshabadi and Stanimire Tomov and Jack Dongarra} } @article {, title = {With Extreme Computing, the Rules Have Changed}, journal = {Computing in Science \& Engineering}, volume = {19}, year = {2017}, month = {2017-05}, pages = {52-62}, abstract = {On the eve of exascale computing, traditional wisdom no longer applies. High-performance computing as we know it is gone.
This article discusses a range of new algorithmic techniques emerging in the context of exascale computing, many of which defy the common wisdom of high-performance computing and are considered unorthodox, but could turn out to be a necessity in the near future.}, doi = {https://doi.org/10.1109/MCSE.2017.48}, author = {Jack Dongarra and Stanimire Tomov and Piotr Luszczek and Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Hartwig Anzt and Azzam Haidar and Ahmad Abdelfattah} } @article {1342, title = {Accelerating Tensor Contractions for High-Order FEM on CPUs, GPUs, and KNLs}, year = {2016}, month = {2016-09}, publisher = {Smoky Mountains Computational Sciences and Engineering Conference (SMC16), Poster}, address = {Gatlinburg, TN}, author = {Azzam Haidar and Ahmad Abdelfattah and Veselin Dobrev and Ian Karlin and Tzanio Kolev and Stanimire Tomov and Jack Dongarra} } @article {1344, title = {Cholesky Factorization on Batches of Matrices with Fixed and Variable Sizes}, year = {2016}, month = {2016-04}, publisher = {GPU Technology Conference (GTC16), Poster}, address = {San Jose, CA}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @inbook {883, title = {Dense Symmetric Indefinite Factorization on GPU Accelerated Architectures}, booktitle = {Lecture Notes in Computer Science}, series = {11th International Conference, PPAM 2015, Krakow, Poland, September 6-9, 2015. Revised Selected Papers, Part I}, volume = {9573}, year = {2016}, month = {2015-09}, pages = {86-95}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, chapter = {Parallel Processing and Applied Mathematics}, abstract = {We study the performance of dense symmetric indefinite factorizations (Bunch-Kaufman and Aasen{\textquoteright}s algorithms) on multicore CPUs with a Graphics Processing Unit (GPU). Though such algorithms are needed in many scientific and engineering simulations, obtaining high performance of the factorization on the GPU is difficult because the pivoting that is required to ensure the numerical stability of the factorization leads to frequent synchronizations and irregular data accesses. As a result, until recently, there has not been any implementation of these algorithms on hybrid CPU/GPU architectures. To improve their performance on the hybrid architecture, we explore different techniques to reduce the expensive communication and synchronization between the CPU and GPU, or on the GPU. We also study the performance of an LDL^T factorization with no pivoting combined with the preprocessing technique based on Random Butterfly Transformations. Though such transformations only have probabilistic guarantees of numerical stability, they avoid the pivoting and obtain great performance on the GPU.
}, keywords = {Communication-avoiding, Dense symmetric indefinite factorization, gpu computation, randomization}, isbn = {978-3-319-32149-3}, doi = {10.1007/978-3-319-32149-3_9}, author = {Marc Baboulin and Jack Dongarra and Adrien Remy and Stanimire Tomov and Ichitaro Yamazaki}, editor = {Roman Wyrzykowski and Ewa Deelman and Konrad Karczewski and Jacek Kitowski and Kazimierz Wiatr} } @conference {940, title = {On the Development of Variable Size Batched Computation for Heterogeneous Parallel Architectures}, booktitle = {The 17th IEEE International Workshop on Parallel and Distributed Scientific and Engineering Computing (PDSEC 2016), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {

Many scientific applications, ranging from national security to medical advances, require solving a number of relatively small-size independent problems. As the size of each individual problem does not provide sufficient parallelism for the underlying hardware, especially accelerators, these problems must be solved concurrently as a batch in order to saturate the hardware with enough work, hence the name batched computation. A possible simplification is to assume a uniform size for all problems. However, real applications do not necessarily satisfy such an assumption. Consequently, an efficient solution for variable-size batched computations is required.

This paper proposes a foundation for high performance variable-size batched matrix computation based on Graphics Processing Units (GPUs). Being throughput-oriented processors, GPUs favor regular computation and less divergence among threads, in order to achieve high performance. Therefore, the development of high performance numerical software for this kind of problem is challenging. As a case study, we developed efficient batched Cholesky factorization algorithms for relatively small matrices of different sizes. However, most of the strategies and the software developed, and in particular a set of variable size batched BLAS kernels, can be used in many other dense matrix factorizations, large scale sparse direct multifrontal solvers, and applications. We propose new interfaces and mechanisms to handle the irregular computation pattern on the GPU. To the authors{\textquoteright} knowledge, this is the first attempt to develop high performance software for this class of problems. Using a K40c GPU, our performance tests show speedups of up to 2.5{\texttimes} against two Sandy Bridge CPUs (8 cores each) running the Intel MKL library.

}, keywords = {batched computation, GPUs, variable small sizes}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {939, title = {Heterogeneous Streaming}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper introduces a new heterogeneous streaming library called hetero Streams (hStreams). We show how a simple FIFO streaming model can be applied to heterogeneous systems that include manycore coprocessors and multicore CPUs. This model supports concurrency across nodes, among tasks within a node, and between data transfers and computation. We give examples for different approaches, show how the implementation can be layered, analyze overheads among layers, and apply those models to parallelize applications using simple, intuitive interfaces. We compare the features and versatility of hStreams, OpenMP, CUDA Streams, and OmpSs. We show how the use of hStreams makes it easier for scientists to identify tasks and easily expose concurrency among them, and how it enables tuning experts and runtime systems to tailor execution for different heterogeneous targets. Practical application examples are taken from the field of numerical linear algebra, commercial structural simulation software, and a seismic processing application.}, keywords = {plasma}, author = {Chris J. Newburn and Gaurav Bansal and Michael Wood and Luis Crivelli and Judit Planas and Alejandro Duran and Paulo Souza and Leonardo Borges and Piotr Luszczek and Stanimire Tomov and Jack Dongarra and Hartwig Anzt and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Ichitaro Yamazaki and Jesus Labarta} } @techreport {972, title = {High Performance Realtime Convex Solver for Embedded Systems}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-16-745}, year = {2016}, month = {2016-10}, abstract = {Convex optimization solvers for embedded systems find widespread use. This letter presents a novel technique to reduce the run-time of the decomposition of the KKT matrix for the convex optimization solver for an embedded system, by two orders of magnitude. We use the property that although the KKT matrix changes, some of its block sub-matrices are fixed during the solution iterations and the associated solving instances.}, keywords = {KKT, Realtime embedded convex optimization solver}, author = {Ichitaro Yamazaki and Saeid Nooshabadi and Stanimire Tomov and Jack Dongarra} } @conference {964, title = {High-performance Matrix-matrix Multiplications of Very Small Matrices}, booktitle = {22nd International European Conference on Parallel and Distributed Computing (Euro-Par{\textquoteright}16)}, year = {2016}, month = {2016-08}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Grenoble, France}, abstract = {The use of the general dense matrix-matrix multiplication (GEMM) is fundamental for obtaining high performance in many scientific computing applications. GEMMs for small matrices (of sizes less than 32), however, are not sufficiently optimized in existing libraries. In this paper we consider the case of many small GEMMs on either CPU or GPU architectures. This is a case that often occurs in applications like big data analytics, machine learning, high-order FEM, and others.
The GEMMs are grouped together in a single batched routine. We present algorithms and optimization techniques specialized for these cases to obtain performance within 90\% of the optimum. We show that these results outperform currently available state-of-the-art implementations and vendor-tuned math libraries.}, author = {Ian Masliah and Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jo{\"e}l Falcou and Jack Dongarra} } @conference {942, title = {High-Performance Tensor Contractions for GPUs}, booktitle = {International Conference on Computational Science (ICCS{\textquoteright}16)}, year = {2016}, month = {2016-06}, address = {San Diego, CA}, abstract = {We present a computational framework for high-performance tensor contractions on GPUs. High performance is difficult to obtain using existing libraries, especially for many independent contractions where each contraction is very small, e.g., sub-vector/warp in size. However, using our framework to batch contractions plus application-specifics, we demonstrate close to peak performance results. In particular, to accelerate large scale tensor-formulated high-order finite element method (FEM) simulations, which is the main focus and motivation for this work, we represent contractions as tensor index reordering plus matrix-matrix multiplications (GEMMs). This is a key factor to achieve algorithmically many-fold acceleration (vs. not using it) due to possible reuse of data loaded in fast memory. In addition to using this context knowledge, we design tensor data-structures, tensor algebra interfaces, and new tensor contraction algorithms and implementations to achieve 90+\% of a theoretically derived peak on GPUs. On a K40c GPU, for example, for contractions resulting in GEMMs on square matrices of size 8, we are 2.8{\texttimes} faster than CUBLAS, and 8.5{\texttimes} faster than MKL on 16 cores of Intel Xeon E5-2670 (Sandy Bridge) 2.60GHz CPUs. Finally, we apply autotuning and code generation techniques to simplify tuning and provide an architecture-aware, user-friendly interface.}, keywords = {Applications, Batched linear algebra, FEM, gpu, Tensor contractions, Tensor HPC}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @techreport {929, title = {High-Performance Tensor Contractions for GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-16-738}, year = {2016}, month = {2016-01}, publisher = {University of Tennessee}, abstract = {We present a computational framework for high-performance tensor contractions on GPUs. High performance is difficult to obtain using existing libraries, especially for many independent contractions where each contraction is very small, e.g., sub-vector/warp in size. However, using our framework to batch contractions plus application-specifics, we demonstrate close to peak performance results. In particular, to accelerate large scale tensor-formulated high-order finite element method (FEM) simulations, which is the main focus and motivation for this work, we represent contractions as tensor index reordering plus matrix-matrix multiplications (GEMMs). This is a key factor to achieve algorithmically many-fold acceleration (vs. not using it) due to possible reuse of data loaded in fast memory.
In addition to using this context knowledge, we design tensor data-structures, tensor algebra interfaces, and new tensor contraction algorithms and implementations to achieve 90+\% of a theoretically derived peak on GPUs. On a K40c GPU, for example, for contractions resulting in GEMMs on square matrices of size 8, we are 2.8{\texttimes} faster than CUBLAS, and 8.5{\texttimes} faster than MKL on 16 cores of Intel Xeon E5-2670 (Sandy Bridge) 2.60GHz CPUs. Finally, we apply autotuning and code generation techniques to simplify tuning and provide an architecture-aware, user-friendly interface.}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @article {1472, title = {Linear Algebra Software for Large-Scale Accelerated Multicore Computing}, journal = {Acta Numerica}, volume = {25}, year = {2016}, month = {2016-05}, pages = {1-160}, abstract = {Many crucial scientific computing applications, ranging from national security to medical advances, rely on high-performance linear algebra algorithms and technologies, underscoring their importance and broad impact. Here we present the state-of-the-art design and implementation practices for the acceleration of the predominant linear algebra algorithms on large-scale accelerated multicore systems. Examples are given with fundamental dense linear algebra algorithms {\textendash} from the LU, QR, Cholesky, and LDLT factorizations needed for solving linear systems of equations, to eigenvalue and singular value decomposition (SVD) problems. The implementations presented are readily available via the open-source PLASMA and MAGMA libraries, which represent the next generation modernization of the popular LAPACK library for accelerated multicore systems. To generate the extreme level of parallelism needed for the efficient use of these systems, algorithms of interest are redesigned and then split into well-chosen computational tasks. The task execution is scheduled over the computational components of a hybrid system of multicore CPUs with GPU accelerators and/or Xeon Phi coprocessors, using either static scheduling or light-weight runtime systems. The use of light-weight runtime systems keeps scheduling overheads low, similar to static scheduling, while enabling the expression of parallelism through sequential-like code. This simplifies the development effort and allows exploration of the unique strengths of the various hardware components.
Finally, we emphasize the development of innovative linear algebra algorithms using three technologies {\textendash} mixed precision arithmetic, batched operations, and asynchronous iterations {\textendash} that are currently of high interest for accelerated multicore systems.}, doi = {10.1017/S0962492916000015}, author = {Ahmad Abdelfattah and Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki and Asim YarKhan} } @conference {963, title = {LU, QR, and Cholesky Factorizations: Programming Model, Performance Analysis and Optimization Techniques for the Intel Knights Landing Xeon Phi}, booktitle = {IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}16)}, year = {2016}, month = {2016-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {A wide variety of heterogeneous compute resources, ranging from multicore CPUs to GPUs and coprocessors, are available to modern computers, making it challenging to design unified numerical libraries that efficiently and productively use all these varied resources. For example, in order to efficiently use Intel{\textquoteright}s Knights Landing (KNL) processor, the next generation of Xeon Phi architectures, one must design and schedule an application in multiple degrees of parallelism and task grain sizes in order to obtain efficient performance. We propose a productive and portable programming model that allows us to write a serial-looking code, which, however, achieves parallelism and scalability by using a lightweight runtime environment to manage the resource-specific workload, and to control the dataflow and the parallel execution. This is done through multiple techniques ranging from multi-level data partitioning to adaptive task grain sizes, and dynamic task scheduling. In addition, our task abstractions enable unified algorithmic development across all the heterogeneous resources. Finally, we outline the strengths and the effectiveness of this approach {\textendash} especially in regards to hardware trends and ease of programming high-performance numerical software that current applications need {\textendash} in order to motivate current work and future directions for the next generation of parallel programming models for high-performance linear algebra libraries on heterogeneous systems.}, author = {Azzam Haidar and Stanimire Tomov and Konstantin Arturov and Murat Guney and Shane Story and Jack Dongarra} } @techreport {, title = {MAGMA Batched: A Batched BLAS Approach for Small Matrix Factorizations and Applications on GPUs}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-16-02}, year = {2016}, month = {2016-08}, publisher = {University of Tennessee}, abstract = {A particularly challenging class of problems arising in many applications, called batched problems, involves linear algebra operations on many small-sized matrices. We proposed and designed batched BLAS (Basic Linear Algebra Subroutines), Level-2 GEMV and Level-3 GEMM, to solve them. We illustrate how batched GEMV and GEMM can assist batched advanced factorizations (e.g., bi-diagonalization) and other BLAS routines (e.g., triangular solve) in achieving optimal performance on GPUs. Our solutions achieved up to 2.8-3{\texttimes} speedups compared to CUBLAS and MKL solutions, wherever possible.
We illustrated the batched methodology on a real-world hydrodynamic application by reformulating the tensor operations into batched BLAS GEMV and GEMM operations. A 2.5{\texttimes} speedup and a 1.4{\texttimes} greenup are obtained by changing 10\% of the code. We accelerated and scaled it on the Titan supercomputer to 4096 nodes.}, author = {Tingxing Dong and Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Ahmad Abdelfattah and Jack Dongarra} } @article {971, title = {Non-GPU-resident Dense Symmetric Indefinite Factorization}, journal = {Concurrency and Computation: Practice and Experience}, year = {2016}, month = {2016-11}, abstract = {We study various algorithms to factorize a symmetric indefinite matrix that does not fit in the core memory of a computer. There are two sources of the data movement into the memory: one needed for selecting and applying pivots and the other needed to update each column of the matrix for the factorization. It is a challenge to obtain high performance of such an algorithm when the pivoting is required to ensure the numerical stability of the factorization. For example, when factorizing each column of the matrix, a diagonal entry, which ensures the stability, may need to be selected as a pivot among the remaining diagonals, and moved to the leading diagonal by swapping both the corresponding rows and columns of the matrix. If the pivot is not in the core memory, then it must be loaded into the core memory. For updating the matrix, the data locality may be improved by partitioning the matrix. For example, a right-looking partitioned algorithm first factorizes the leading columns, called a panel, and then uses the factorized panel to update the trailing submatrix. This algorithm only accesses the trailing submatrix after each panel factorization (instead of after each column factorization) and performs most of its floating-point operations (flops) using BLAS-3, which can take advantage of the memory hierarchy. However, because the pivots cannot be predetermined, the whole trailing submatrix must be updated before the next panel factorization can start. When the whole submatrix does not fit in the core memory all at once, loading the block columns into the memory can become the performance bottleneck. Similarly, the left-looking variant of the algorithm would require updating each panel with all of the previously factorized columns. This makes it a much greater challenge to implement an efficient out-of-core symmetric indefinite factorization compared with an out-of-core nonsymmetric LU factorization with partial pivoting, which only requires swapping the rows of the matrix and accesses the trailing submatrix after each in-core factorization (instead of after each panel factorization by the symmetric factorization). To reduce the amount of the data transfer, in this paper we use the recently proposed left-looking communication-avoiding variant of the symmetric factorization algorithm to factorize the columns in the core memory, and then perform the partitioned right-looking out-of-core trailing submatrix updates. This combination may still require loading the pivots into the core memory, but it only updates the trailing submatrix after each in-core factorization, while the previous algorithm updates it after each panel factorization. Although these in-core and out-of-core algorithms can be applied at any level of the memory hierarchy, we apply our designs to the GPU and CPU memory, respectively.
We call this specific implementation of the algorithm a non{\textendash}GPU-resident implementation. Our performance results on the current hybrid CPU/GPU architecture demonstrate that when the matrix is much larger than the GPU memory, the proposed algorithm can obtain significant speedups over the communication-hiding implementations of the previous algorithms.}, doi = {10.1002/cpe.4012}, author = {Ichitaro Yamazaki and Stanimire Tomov and Jack Dongarra} } @conference {968, title = {Performance Analysis and Acceleration of Explicit Integration for Large Kinetic Networks using Batched GPU Computations}, booktitle = {2016 IEEE High Performance Extreme Computing Conference (HPEC {\textquoteleft}16)}, year = {2016}, month = {2016-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {We demonstrate the systematic implementation of recently-developed fast explicit kinetic integration algorithms that efficiently solve N coupled ordinary differential equations (subject to initial conditions) on modern GPUs. We take representative test cases (Type Ia supernova explosions) and demonstrate two or more orders of magnitude increase in efficiency for solving such systems (of realistic thermonuclear networks coupled to fluid dynamics). This implies that important coupled, multiphysics problems in various scientific and technical disciplines that were intractable, or could be simulated only with highly schematic kinetic networks, are now computationally feasible. As examples of such applications we present the computational techniques developed for our ongoing deployment of these new methods on modern GPU accelerators. We show that similarly to many other scientific applications, ranging from national security to medical advances, the computation can be split into many independent computational tasks, each of relatively small size. As the size of each individual task does not provide sufficient parallelism for the underlying hardware, especially for accelerators, these tasks must be computed concurrently as a single routine, which we call a batched routine, in order to saturate the hardware with enough work.}, author = {Azzam Haidar and Benjamin Brock and Stanimire Tomov and Michael Guidry and Jay Jay Billings and Daniel Shyles and Jack Dongarra} } @article {990, title = {On the performance and energy efficiency of sparse linear algebra on GPUs}, journal = {International Journal of High Performance Computing Applications}, year = {2016}, month = {2016-10}, abstract = {In this paper we unveil some performance and energy efficiency frontiers for sparse computations on GPU-based supercomputers. We compare the resource efficiency of different sparse matrix{\textendash}vector products (SpMV) taken from libraries such as cuSPARSE and MAGMA for GPU and Intel{\textquoteright}s MKL for multicore CPUs, and develop a GPU sparse matrix{\textendash}matrix product (SpMM) implementation that handles the simultaneous multiplication of a sparse matrix with a set of vectors in block-wise fashion. While a typical sparse computation such as the SpMV reaches only a fraction of the peak of current GPUs, we show that the SpMM succeeds in exceeding the memory-bound limitations of the SpMV. We integrate this kernel into a GPU-accelerated Locally Optimal Block Preconditioned Conjugate Gradient (LOBPCG) eigensolver.
LOBPCG is chosen as a benchmark algorithm for this study as it combines an interesting mix of sparse and dense linear algebra operations that is typical for complex simulation applications, and allows for hardware-aware optimizations. In a detailed analysis we compare the performance and energy efficiency against a multi-threaded CPU counterpart. The reported performance and energy efficiency results are indicative of sparse computations on supercomputers.}, doi = {10.1177/1094342016672081}, url = {http://hpc.sagepub.com/content/early/2016/10/05/1094342016672081.abstract}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @techreport {934, title = {Performance, Design, and Autotuning of Batched GEMM for GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-16-739}, year = {2016}, month = {2016-02}, publisher = {University of Tennessee}, abstract = {The general matrix-matrix multiplication (GEMM) is the most important numerical kernel in dense linear algebra. It is the key component for obtaining high performance in most LAPACK routines. As batched computations on relatively small problems continue to gain interest in many scientific applications, a need arises for a high performance GEMM kernel for batches of small matrices. Such a kernel should be well designed and tuned to handle small sizes, and to maintain high performance for realistic test cases found in the higher level LAPACK routines, and scientific computing applications in general. This paper presents a high performance batched GEMM kernel on Graphics Processing Units (GPUs). We address batched problems with both fixed and variable sizes, and show that specialized GEMM designs and a comprehensive autotuning process are needed to handle problems of small sizes. For most performance tests reported in this paper, the proposed kernels outperform state-of-the-art approaches using a K40c GPU.}, keywords = {Autotuning, Batched GEMM, GEMM, GPU computing, HPC}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {944, title = {Performance, Design, and Autotuning of Batched GEMM for GPUs}, booktitle = {The International Supercomputing Conference (ISC High Performance 2016)}, year = {2016}, month = {2016-06}, address = {Frankfurt, Germany}, abstract = {The general matrix-matrix multiplication (GEMM) is the most important numerical kernel in dense linear algebra, and is the key component for obtaining high performance in most LAPACK routines. As batched computations on relatively small problems continue to gain interest in many scientific applications, a need arises for a high performance GEMM kernel for batches of small matrices. Such a kernel should be well designed and tuned to handle small sizes, and to maintain high performance for realistic test cases found in the higher level LAPACK routines, and scientific computing applications in general. This paper presents a high performance batched GEMM kernel on Graphics Processing Units (GPUs). We address batched problems with both fixed and variable sizes, and show that specialized GEMM designs and a comprehensive autotuning process are needed to handle problems of small sizes.
For most performance tests reported in this paper, the proposed kernels outperform state-of-the-art approaches using a K40c GPU.}, keywords = {Autotuning, Batched GEMM, GEMM, GPU computing, HPC}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @inbook {997, title = {Performance, Design, and Autotuning of Batched GEMM for GPUs}, booktitle = {High Performance Computing: 31st International Conference, ISC High Performance 2016, Frankfurt, Germany, June 19-23, 2016, Proceedings}, number = {9697}, year = {2016}, pages = {21{\textendash}38}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {The general matrix-matrix multiplication (GEMM) is the most important numerical kernel in dense linear algebra, and is the key component for obtaining high performance in most LAPACK routines. As batched computations on relatively small problems continue to gain interest in many scientific applications, a need arises for a high performance GEMM kernel for batches of small matrices. Such a kernel should be well designed and tuned to handle small sizes, and to maintain high performance for realistic test cases found in the higher level LAPACK routines, and scientific computing applications in general. This paper presents a high performance batched GEMM kernel on Graphics Processing Units (GPUs). We address batched problems with both fixed and variable sizes, and show that specialized GEMM designs and a comprehensive autotuning process are needed to handle problems of small sizes. For most performance tests reported in this paper, the proposed kernels outperform state-of-the-art approaches using a K40c GPU.}, isbn = {978-3-319-41321-1}, doi = {10.1007/978-3-319-41321-1_2}, url = {http://dx.doi.org/10.1007/978-3-319-41321-1_2}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra}, editor = {Julian M. Kunkel and Pavan Balaji and Jack Dongarra} } @conference {943, title = {Performance Tuning and Optimization Techniques of Fixed and Variable Size Batched Cholesky Factorization on GPUs}, booktitle = {International Conference on Computational Science (ICCS{\textquoteright}16)}, year = {2016}, month = {2016-06}, address = {San Diego, CA}, abstract = {

Solving a large number of relatively small linear systems has recently drawn more attention in the HPC community, due to the importance of such computational workloads in many scientific applications, including sparse multifrontal solvers. Modern hardware accelerators and their architecture require a set of optimization techniques that are very different from the ones used when solving one relatively large system. In order to impose concurrency on such throughput-oriented architectures, a common practice is to batch the solution of these systems as one task offloaded to the underlying hardware, rather than solving them individually.

This paper presents a high performance batched Cholesky factorization on large sets of relatively small matrices using Graphics Processing Units (GPUs), and addresses both fixed and variable size batched problems. We investigate various algorithm designs and optimization techniques, and show that it is essential to combine kernel design with performance tuning in order to achieve the best possible performance. We compare our approaches against state-of-the-art CPU solutions as well as GPU-based solutions using existing libraries, and show that, on a K40c GPU for example, our kernels are more than 2{\texttimes} faster.

}, keywords = {batched computation, Cholesky Factorization, GPUs, Tuning}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {935, title = {Stability and Performance of Various Singular Value QR Implementations on Multicore CPU with a GPU}, journal = {ACM Transactions on Mathematical Software (TOMS)}, volume = {43}, year = {2016}, month = {2016-10}, abstract = {To orthonormalize a set of dense vectors, Singular Value QR (SVQR) requires only one global reduction between the parallel processing units, and uses BLAS-3 kernels to perform most of its local computation. As a result, compared to other orthogonalization schemes, SVQR obtains superior performance on many of the current computers. In this paper, we study the stability and performance of various SVQR implementations on multicore CPUs with a GPU, focusing on the dense triangular solve, which performs half of the total floating-point operations in SVQR. As a part of this study, we examine its adaptive mixed-precision variant that decides if a lower-precision arithmetic can be used for the triangular solution at runtime without increasing the order of its orthogonality error. Since the backward error of this adaptive mixed-precision variant is significantly greater than that of the standard SVQR, we study its effects on the solution convergence of several subspace projection methods for solving a linear system of equations and for computing singular values or eigenvalues of a sparse matrix. Our experimental results indicate that in some cases, the convergence rate of the solver may not be affected by the larger backward errors, while reducing the time to solution.}, author = {Ichitaro Yamazaki and Stanimire Tomov and Jack Dongarra} } @article {1343, title = {A Standard for Batched BLAS Routines}, year = {2016}, month = {2016-04}, publisher = {17th SIAM Conference on Parallel Processing for Scientific Computing (SIAM PP16)}, address = {Paris, France}, author = {Pedro Valero-Lara and Jack Dongarra and Azzam Haidar and Samuel D. Relton and Stanimire Tomov and Mawussi Zounon} } @conference {977, title = {Towards Achieving Performance Portability Using Directives for Accelerators}, booktitle = {The International Conference for High Performance Computing, Networking, Storage and Analysis (SC{\textquoteright}16), Third Workshop on Accelerator Programming Using Directives (WACCPD)}, year = {2016}, month = {2016-11}, publisher = {Innovative Computing Laboratory, University of Tennessee}, organization = {Innovative Computing Laboratory, University of Tennessee}, address = {Salt Lake City, Utah}, abstract = {In this paper we explore the performance portability of directives provided by OpenMP 4 and OpenACC to program various types of node architectures with attached accelerators, both self-hosted multicore and offload multicore/GPU. Our goal is to examine how successful OpenACC and the newer offload features of OpenMP 4.5 are for moving codes between architectures, how much tuning might be required and what lessons we can learn from this experience. To do this, we use examples of algorithms with varying computational intensities for our evaluation, as both compute and data access efficiency are important considerations for overall application performance.
We implement these kernels using various methods provided by newer OpenACC and OpenMP implementations, and we evaluate their performance on various platforms, including x86\_64 with attached NVIDIA GPUs, self-hosted Intel Xeon Phi KNL, as well as an x86\_64 host system with Intel Xeon Phi coprocessors. In this paper, we explain what factors affect performance portability, such as how to pick the right programming model, its programming style, its availability on different platforms, and how well compilers can optimize and target multiple platforms.}, author = {M. Graham Lopez and Veronica Vergara Larrea and Wayne Joubert and Oscar Hernandez and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {914, title = {Accelerating the LOBPCG method on GPUs using a blocked Sparse Matrix Vector Product}, booktitle = {Spring Simulation Multi-Conference 2015 (SpringSim{\textquoteright}15)}, year = {2015}, month = {2015-04}, publisher = {SCS}, organization = {SCS}, address = {Alexandria, VA}, abstract = {This paper presents a heterogeneous CPU-GPU implementation for a sparse iterative eigensolver, the Locally Optimal Block Preconditioned Conjugate Gradient (LOBPCG). For the key routine generating the Krylov search spaces via the product of a sparse matrix and a block of vectors, we propose a GPU kernel based on a modified sliced ELLPACK format. Blocking a set of vectors and processing them simultaneously accelerates the computation of a set of consecutive SpMVs significantly. Comparing the performance against similar routines from Intel{\textquoteright}s MKL and NVIDIA{\textquoteright}s cuSPARSE library, we identify appealing performance improvements. We integrate it into the highly optimized LOBPCG implementation. Compared to the BLOBEX CPU implementation running on two eight-core Intel Xeon E5-2690s, we accelerate the computation of a small set of eigenvectors using NVIDIA{\textquoteright}s K40 GPU by typically more than an order of magnitude.}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @article {866, title = {Acceleration of GPU-based Krylov solvers via Data Transfer Reduction}, journal = {International Journal of High Performance Computing Applications}, year = {2015}, author = {Hartwig Anzt and William Sawyer and Stanimire Tomov and Piotr Luszczek and Jack Dongarra} } @conference {843, title = {Batched Matrix Computations on Hardware Accelerators}, booktitle = {EuroMPI/Asia 2015 Workshop}, year = {2015}, month = {2015-09}, address = {Bordeaux, France}, abstract = {Scientific applications require solvers that work on many small size problems that are independent from each other. At the same time, the high-end hardware evolves rapidly and becomes ever more throughput-oriented and thus there is an increasing need for an effective approach to develop energy efficient, high-performance codes for these small matrix problems that we call batched factorizations. The many applications that need this functionality could especially benefit from the use of GPUs, which currently are four to five times more energy efficient than multicore CPUs on important scientific workloads. This paper, consequently, describes the development of the most common, one-sided factorizations: Cholesky, LU, and QR for a set of small dense matrices. The algorithms we present together with their implementations are, by design, inherently parallel. In particular, our approach is based on representing the process as a sequence of batched BLAS routines that are executed entirely on a GPU.
Importantly, this is unlike the LAPACK and the hybrid MAGMA factorization algorithms that work under drastically different assumptions of hardware design and efficiency of execution of the various computational kernels involved in the implementation. Thus, our approach is more efficient than what works for a combination of multicore CPUs and GPUs for the problem sizes of interest in the application use cases. The paradigm whereby a single chip (a GPU or a CPU) factorizes a single problem at a time is not at all efficient in our applications{\textquoteright} context. We illustrate all these claims through a detailed performance analysis. With the help of profiling and tracing tools, we guide our development of batched factorizations to achieve up to a two-fold speedup and three-fold better energy efficiency as compared against our highly optimized batched CPU implementations based on the MKL library. The tested system featured two sockets of Intel Sandy Bridge CPUs. Compared with the batched LU factorization featured in the CUBLAS library for GPUs, we achieve as high as a 2.5x speedup on the NVIDIA K40 GPU.}, author = {Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {895, title = {Batched Matrix Computations on Hardware Accelerators Based on GPUs}, booktitle = {2015 SIAM Conference on Applied Linear Algebra (SIAM LA)}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {We will present techniques for small matrix computations on GPUs and their use for energy efficient, high-performance solvers. Work on small problems delivers high performance through improved data reuse. Many numerical libraries and applications need this functionality further developed. We describe the main factorizations LU, QR, and Cholesky for a set of small dense matrices in parallel. We achieve significant acceleration and reduced energy consumption against other solutions. Our techniques are of interest to GPU application developers in general.}, author = {Azzam Haidar and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra} } @article {858, title = {Batched matrix computations on hardware accelerators based on GPUs}, journal = {International Journal of High Performance Computing Applications}, year = {2015}, month = {2015-02}, abstract = {Scientific applications require solvers that work on many small size problems that are independent from each other. At the same time, the high-end hardware evolves rapidly and becomes ever more throughput-oriented and thus there is an increasing need for an effective approach to develop energy-efficient, high-performance codes for these small matrix problems that we call batched factorizations. The many applications that need this functionality could especially benefit from the use of GPUs, which currently are four to five times more energy efficient than multicore CPUs on important scientific workloads. This paper, consequently, describes the development of the most common, one-sided factorizations, Cholesky, LU, and QR, for a set of small dense matrices. The algorithms we present together with their implementations are, by design, inherently parallel. In particular, our approach is based on representing the process as a sequence of batched BLAS routines that are executed entirely on a GPU.
Importantly, this is unlike the LAPACK and the hybrid MAGMA factorization algorithms that work under drastically different assumptions of hardware design and efficiency of execution of the various computational kernels involved in the implementation. Thus, our approach is more efficient than what works for a combination of multicore CPUs and GPUs for the problem sizes of interest in the application use cases. The paradigm whereby a single chip (a GPU or a CPU) factorizes a single problem at a time is not at all efficient in our applications{\textquoteright} context. We illustrate all of these claims through a detailed performance analysis. With the help of profiling and tracing tools, we guide our development of batched factorizations to achieve up to a two-fold speedup and three-fold better energy efficiency as compared against our highly optimized batched CPU implementations based on the MKL library. The tested system featured two sockets of Intel Sandy Bridge CPUs. Compared with the batched LU factorization featured in the CUBLAS library for GPUs, we achieve as high as a 2.5{\texttimes} speedup on the NVIDIA K40 GPU.}, keywords = {batched factorization, hardware accelerators, numerical linear algebra, numerical software libraries, one-sided factorization algorithms}, doi = {10.1177/1094342014567546}, author = {Azzam Haidar and Tingxing Dong and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {928, title = {Cholesky Across Accelerators}, booktitle = {17th IEEE International Conference on High Performance Computing and Communications (HPCC 2015)}, year = {2015}, month = {2015-08}, publisher = {IEEE}, organization = {IEEE}, address = {Elizabeth, NJ}, author = {Asim YarKhan and Azzam Haidar and Chongxiao Cao and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {896, title = {Comparing Hybrid CPU-GPU and Native GPU-only Acceleration for Linear Algebra}, booktitle = {2015 SIAM Conference on Applied Linear Algebra}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {Accelerating dense linear algebra using GPUs admits two models: hybrid CPU-GPU and GPU-only. The hybrid model factors the panel on the CPU while updating the trailing matrix on the GPU, concentrating the GPU on high-performance matrix multiplies. The GPU-only model performs the entire computation on the GPU, avoiding costly data transfers to the CPU. We compare these two approaches for three QR-based algorithms: QR factorization, rank revealing QR, and reduction to Hessenberg.}, author = {Mark Gates and Stanimire Tomov and Azzam Haidar} } @article {851, title = {Computing Low-rank Approximation of a Dense Matrix on Multicore CPUs with a GPU and its Application to Solving a Hierarchically Semiseparable Linear System of Equations}, journal = {Scientific Programming}, year = {2015}, abstract = {Low-rank matrices arise in many scientific and engineering computations. Both computational and storage costs of manipulating such matrices may be reduced by taking advantage of their low-rank properties. To compute a low-rank approximation of a dense matrix, in this paper, we study the performance of QR factorization with column pivoting or with restricted pivoting on multicore CPUs with a GPU. We first propose several techniques to reduce the postprocessing time, which is required for restricted pivoting, on a modern CPU.
We then examine the potential of using a GPU to accelerate the factorization process with both column and restricted pivoting. Our performance results on two eight-core Intel Sandy Bridge CPUs with one NVIDIA Kepler GPU demonstrate that using the GPU, the factorization time can be reduced by a factor of more than two. In addition, to study the performance of our implementations in practice, we integrate them into the recently developed StruMF software, which algebraically exploits such low-rank structures for solving a general sparse linear system of equations. Our performance results for solving Poisson{\textquoteright}s equations demonstrate that the proposed techniques can significantly reduce the preconditioner construction time of StruMF on the CPUs, and the construction time can be further reduced by 10\%-50\% using the GPU.}, author = {Ichitaro Yamazaki and Stanimire Tomov and Jack Dongarra} } @conference {862, title = {On the Design, Development, and Analysis of Optimized Matrix-Vector Multiplication Routines for Coprocessors}, booktitle = {ISC High Performance 2015}, year = {2015}, month = {2015-07}, address = {Frankfurt, Germany}, abstract = {The dramatic change in computer architecture due to the manycore paradigm shift has made the development of optimal numerical routines extremely challenging. In this work, we target the development of numerical algorithms and implementations for Xeon Phi coprocessor architecture designs. In particular, we examine and optimize the general and symmetric matrix-vector multiplication routines (gemv/symv), which are some of the most heavily used linear algebra kernels in many important engineering and physics applications. We describe a successful approach to addressing the challenges for this problem, starting from our algorithm design, performance analysis, and programming model, to kernel optimization. Our goal, by targeting low-level, easy to understand fundamental kernels, is to develop new optimization strategies that can be effective elsewhere on manycore coprocessors, and to show significant performance improvements compared to the existing state-of-the-art implementations. Therefore, in addition to the new optimization strategies, analysis, and optimal performance results, we finally present the significance of using these routines/strategies to accelerate higher-level numerical algorithms for the eigenvalue problem (EVP) and the singular value decomposition (SVD) that by themselves are foundational for many important applications.}, author = {Khairul Kabir and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {894, title = {Efficient Eigensolver Algorithms on Accelerator Based Architectures}, booktitle = {2015 SIAM Conference on Applied Linear Algebra (SIAM LA)}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {The enormous gap between the high-performance capabilities of GPUs and the slow interconnect between them has made the development of numerical software that is scalable across multiple GPUs extremely challenging.
We describe a successful methodology for addressing these challenges {\textendash} from algorithm design, kernel optimization, and tuning to our programming model {\textendash} in the development of a scalable, high-performance symmetric eigenvalue and singular value solver.}, author = {Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {886, title = {Efficient Implementation Of Quantum Materials Simulations On Distributed CPU-GPU Systems}, booktitle = {The International Conference for High Performance Computing, Networking, Storage and Analysis (SC15)}, year = {2015}, month = {2015-11}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, abstract = {We present a scalable implementation of the Linearized Augmented Plane Wave method for distributed memory systems, which relies on an efficient distributed, block-cyclic setup of the Hamiltonian and overlap matrices and allows us to turn around highly accurate 1000+ atom all-electron quantum materials simulations on clusters with a few hundred nodes. The implementation runs efficiently on standard multicore CPU nodes, as well as hybrid CPU-GPU nodes. The key for the latter is a novel algorithm to solve the generalized eigenvalue problem for dense, complex Hermitian matrices on distributed hybrid CPU-GPU systems. Performance tests for Li-intercalated CoO2 supercells containing 1501 atoms demonstrate that high-accuracy, transferable quantum simulations can now be used in throughput materials search problems. While our application can benefit and get scalable performance through CPU-only libraries like ScaLAPACK or ELPA2, our new hybrid solver enables the efficient use of GPUs and shows that a hybrid CPU-GPU architecture scales to a desired performance using substantially fewer cluster nodes, and, notably, is considerably more energy efficient than traditional multicore CPU-only systems for such complex applications.}, author = {Raffaele Solc{\`a} and Anton Kozhevnikov and Azzam Haidar and Stanimire Tomov and Thomas C. Schulthess and Jack Dongarra} } @conference {857, title = {Energy Efficiency and Performance Frontiers for Sparse Computations on GPU Supercomputers}, booktitle = {Sixth International Workshop on Programming Models and Applications for Multicores and Manycores (PMAM {\textquoteright}15)}, year = {2015}, month = {2015-02}, publisher = {ACM}, organization = {ACM}, address = {San Francisco, CA}, abstract = {In this paper we unveil some energy efficiency and performance frontiers for sparse computations on GPU-based supercomputers. To do this, we consider state-of-the-art implementations of the sparse matrix-vector (SpMV) product in libraries like cuSPARSE, MKL, and MAGMA, and their use in the LOBPCG eigensolver. LOBPCG is chosen as a benchmark for this study as it combines an interesting mix of sparse and dense linear algebra operations with potential for hardware-aware optimizations. Most notably, LOBPCG includes a blocking technique that is a common performance optimization for many applications. In particular, multiple memory-bound SpMV operations are blocked into a SpM-matrix product (SpMM), which achieves significantly higher performance than a sequence of SpMVs. We provide details about the GPU kernels we use for the SpMV, SpMM, and the LOBPCG implementation design, and study performance and energy consumption compared to CPU solutions.
While a typical sparse computation like the SpMV reaches only a fraction of the peak of current GPUs, we show that the SpMM achieves up to a 6x performance improvement over the GPU{\textquoteright}s SpMV, and the GPU-accelerated LOBPCG based on this kernel is 3 to 5x faster than multicore CPUs with the same power draw, e.g., a K40 GPU vs. two Sandy Bridge CPUs (16 cores). In practice though, we show that currently available CPU implementations are much slower due to missed optimization opportunities. These performance results translate to similar improvements in energy consumption, and are indicative of today{\textquoteright}s frontiers in energy efficiency and performance for sparse computations on supercomputers.}, isbn = {978-1-4503-3404-4}, doi = {10.1145/2712386.2712387}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @conference {887, title = {Flexible Linear Algebra Development and Scheduling with Cholesky Factorization}, booktitle = {17th IEEE International Conference on High Performance Computing and Communications}, year = {2015}, month = {2015-08}, address = {Newark, NJ}, abstract = {Modern high performance computing environments are composed of networks of compute nodes that often contain a variety of heterogeneous compute resources, such as multicore CPUs, GPUs, and coprocessors. One challenge faced by domain scientists is how to efficiently use all these distributed, heterogeneous resources. In order to use the GPUs effectively, the workload parallelism needs to be much greater than the parallelism for a multicore CPU. On the other hand, a Xeon Phi coprocessor will work most effectively with a degree of parallelism between that of GPUs and multicore CPUs. Additionally, effectively using distributed memory nodes brings another level of complexity, where the workload must be carefully partitioned over the nodes. In this work we are using a lightweight runtime environment to handle many of the complexities in such distributed, heterogeneous systems. The runtime environment uses task-superscalar concepts to enable the developer to write serial code while providing parallel execution. The task-programming model allows the developer to write resource-specialization code, so that each resource gets the appropriately sized workload grain. Our task-programming abstraction enables the developer to write a single algorithm that will execute efficiently across the distributed heterogeneous machine. We demonstrate the effectiveness of our approach with performance results for dense linear algebra applications, specifically the Cholesky factorization.}, author = {Azzam Haidar and Asim YarKhan and Chongxiao Cao and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {864, title = {Framework for Batched and GPU-resident Factorization Algorithms to Block Householder Transformations}, booktitle = {ISC High Performance}, year = {2015}, month = {2015-07}, publisher = {Springer}, organization = {Springer}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Tingxing Dong and Stanimire Tomov and Piotr Luszczek and Jack Dongarra} } @article {829, title = {HPC Programming on Intel Many-Integrated-Core Hardware with MAGMA Port to Xeon Phi}, journal = {Scientific Programming}, volume = {23}, year = {2015}, month = {2015-01}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms for multicore CPUs with Intel Xeon Phi coprocessors. In particular, we consider algorithms for solving linear systems.
Further, we give an overview of the MAGMA MIC library, an open source, high performance library that incorporates the developments presented, and, in general, provides the DLA functionality of the popular LAPACK library to heterogeneous architectures of multicore CPUs with coprocessors. The LAPACK-compliance simplifies the use of the MAGMA MIC library in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance BLAS, hardware-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components. Our methodology and programming techniques are incorporated into the MAGMA MIC API, which abstracts the application developer from the specifics of the Xeon Phi architecture and is therefore applicable to algorithms beyond the scope of DLA.}, keywords = {communication and computation overlap, dynamic runtime scheduling using dataflow dependences, hardware accelerators and coprocessors, Intel Xeon Phi processor, Many Integrated Cores, numerical linear algebra}, issn = {1058-9244}, doi = {10.3233/SPR-140404}, author = {Azzam Haidar and Jack Dongarra and Khairul Kabir and Mark Gates and Piotr Luszczek and Stanimire Tomov and Yulu Jia} } @article {1348, title = {Linear Algebra Software for High-Performance Computing (Part 2: Software for Hardware Accelerators and Coprocessors)}, year = {2015}, month = {2015-06}, publisher = {ISC High Performance (ISC15), Tutorial Presentation}, address = {Frankfurt, Germany}, author = {Stanimire Tomov} } @conference {888, title = {MAGMA Embedded: Towards a Dense Linear Algebra Library for Energy Efficient Extreme Computing}, booktitle = {2015 IEEE High Performance Extreme Computing Conference (HPEC {\textquoteright}15), (Best Paper Award)}, year = {2015}, month = {2015-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {Embedded computing, not only in large systems like drones and hybrid vehicles, but also in small portable devices like smart phones and watches, is becoming more extreme to meet ever-increasing demands for extended and improved functionalities. This, combined with the typical constraints of low power consumption and small size, makes the design of numerical libraries for embedded systems challenging. In this paper, we present the design and implementation of embedded system-aware algorithms that target these challenges in the area of dense linear algebra. We consider the fundamental problems of solving linear systems of equations and least squares problems, using the LU, QR, and Cholesky factorizations, and illustrate our results, both in terms of performance and energy efficiency, on the Jetson TK1 development kit. We developed performance optimizations for both small and large problems. In contrast to the corresponding LAPACK algorithms, the new designs target the use of many-cores, readily available now even in mobile devices like the Jetson TK1, e.g., featuring 192 CUDA cores. The implementations presented will form the core of a MAGMA Embedded library, to be released as part of the MAGMA libraries.
}, author = {Azzam Haidar and Stanimire Tomov and Piotr Luszczek and Jack Dongarra} } @article {1347, title = {MAGMA MIC: Optimizing Linear Algebra for Intel Xeon Phi}, year = {2015}, month = {2015-06}, publisher = {ISC High Performance (ISC15), Intel Booth Presentation}, address = {Frankfurt, Germany}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Khairul Kabir and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @conference {891, title = {Mixed-precision Block Gram Schmidt Orthogonalization}, booktitle = {6th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, year = {2015}, month = {2015-11}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, abstract = {The mixed-precision Cholesky QR (CholQR) can orthogonalize the columns of a dense matrix with the minimum communication cost. Moreover, its orthogonality error depends only linearly on the condition number of the input matrix. However, when the desired higher precision is not supported by the hardware, software-emulated arithmetic is needed, which can significantly increase the computational cost. When there are a large number of columns to be orthogonalized, this computational overhead can have a significant impact on the orthogonalization time, and the mixed-precision CholQR can be much slower than the standard CholQR. In this paper, we examine several block variants of the algorithm, which reduce the computational overhead associated with the software-emulated arithmetic while maintaining the same orthogonality error bound as the mixed-precision CholQR. Our numerical and performance results on multicore CPUs with a GPU, as well as a hybrid CPU/GPU cluster, demonstrate that compared to the mixed-precision CholQR, such a block variant can obtain speedups of up to 7:1 while maintaining about the same order of numerical error.}, author = {Ichitaro Yamazaki and Stanimire Tomov and Jakub Kurzak and Jack Dongarra and Jesse Barlow} } @article {856, title = {Mixed-Precision Cholesky QR Factorization and its Case Studies on Multicore CPU with Multiple GPUs}, journal = {SIAM Journal on Scientific Computing}, volume = {37}, number = {3}, year = {2015}, month = {2015-05}, pages = {C203-C330}, abstract = {To orthonormalize the columns of a dense matrix, the Cholesky QR (CholQR) requires only one global reduction between the parallel processing units and performs most of its computation using BLAS-3 kernels. As a result, compared to other orthogonalization algorithms, CholQR obtains superior performance on many of the current computer architectures, where communication is becoming increasingly expensive compared to arithmetic operations. This is especially true when the input matrix is tall-skinny. Unfortunately, the orthogonality error of CholQR depends quadratically on the condition number of the input matrix, and it is numerically unstable when the matrix is ill-conditioned. To enhance the stability of CholQR, we recently used mixed-precision arithmetic; the input and output matrices are in the working precision, but some of the intermediate results are accumulated in the doubled precision. In this paper, we analyze the numerical properties of this mixed-precision CholQR. Our analysis shows that by selectively using the doubled precision, the orthogonality error of the mixed-precision CholQR depends only linearly on the condition number of the input matrix.
We provide numerical results to demonstrate the improved numerical stability of the mixed-precision CholQR in practice. We then study its performance. When the target hardware does not support the desired higher precision, software emulation is needed. For example, using software-emulated double-double precision for the working 64-bit double precision, the mixed-precision CholQR requires about 8.5x more floating-point instructions than the standard CholQR. On the other hand, the increase in the communication cost of using the double-double precision is less significant, and our performance results on a multicore CPU with different graphics processing units (GPUs) demonstrate that the overhead of using the double-double arithmetic is decreasing on newer architectures, where the computation is becoming less expensive compared to the communication. As a result, with the latest NVIDIA GPU, the mixed-precision CholQR was only 1.4x slower than the standard CholQR. Finally, we present case studies of using the mixed-precision CholQR within communication-avoiding variants of Krylov subspace projection methods for solving a nonsymmetric linear system of equations and for solving a symmetric eigenvalue problem, on a multicore CPU with multiple GPUs. These case studies demonstrate that by using the higher precision for this small but critical segment of the Krylov methods, we can improve not only the overall numerical stability of the solvers but also, in some cases, their performance.}, doi = {10.1137/14M0973773}, author = {Ichitaro Yamazaki and Stanimire Tomov and Jack Dongarra} } @conference {897, title = {Mixed-precision Orthogonalization Process Performance on Multicore CPUs with GPUs}, booktitle = {2015 SIAM Conference on Applied Linear Algebra}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {Orthogonalizing a set of dense vectors is an important computational kernel in subspace projection methods for solving large-scale problems. In this talk, we discuss our efforts to improve the performance of the kernel while maintaining its numerical accuracy. Our experimental results demonstrate the effectiveness of our approaches.}, author = {Ichitaro Yamazaki and Jesse Barlow and Stanimire Tomov and Jakub Kurzak and Jack Dongarra} } @conference {916, title = {Optimization for Performance and Energy for Batched Matrix Computations on GPUs}, booktitle = {8th Workshop on General Purpose Processing Using GPUs (GPGPU 8)}, year = {2015}, month = {2015-02}, publisher = {ACM}, organization = {ACM}, address = {San Francisco, CA}, abstract = {As modern hardware keeps evolving, an increasingly effective approach to developing energy-efficient, high-performance solvers is to design them to work on many small independent problems. Many applications already need this functionality, especially for GPUs, which are known to be currently about four to five times more energy efficient than multicore CPUs. We describe the development of the main one-sided factorizations that work for a set of small dense matrices in parallel, and we illustrate our techniques on the LU and Cholesky factorizations. We refer to this mode of operation as a batched factorization. Our approach is based on representing the algorithms as a sequence of batched BLAS routines for GPU-only execution.
The goal of avoiding multicore CPU use, e.g., as in the hybrid CPU-GPU algorithms, is to exclusively benefit from the GPU{\textquoteright}s significantly higher energy efficiency, as well as from the removal of the costly CPU-to-GPU communications. Furthermore, we do not use a single symmetric multiprocessor (on the GPU) to factorize a single problem at a time. We illustrate how our performance analysis and the use of profiling and tracing tools guided the development and optimization of batched factorizations to achieve up to a 2-fold speedup and a 3-fold better energy efficiency compared to our highly optimized batched CPU implementations based on the MKL library (when using two sockets of Intel Sandy Bridge CPUs). Compared to a batched LU factorization featured in the CUBLAS library for GPUs, we achieved up to a 2.5{\texttimes} speedup on the K40 GPU.}, keywords = {batched factorization, hardware accelerators, numerical linear algebra, numerical software libraries, one-sided factorization algorithms}, doi = {10.1145/2716282.2716288}, author = {Azzam Haidar and Tingxing Dong and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {936, title = {Parallel Programming Models for Dense Linear Algebra on Heterogeneous Systems}, journal = {Supercomputing Frontiers and Innovations}, volume = {2}, number = {4}, year = {2015}, month = {2015-10}, abstract = {We present a review of the current best practices in parallel programming models for dense linear algebra (DLA) on heterogeneous architectures. We consider multicore CPUs, stand-alone manycore coprocessors, GPUs, and combinations of these. Of interest is the evolution of the programming models for DLA libraries {\textendash} in particular, the evolution from the popular LAPACK and ScaLAPACK libraries to their modernized counterparts PLASMA (for multicore CPUs) and MAGMA (for heterogeneous architectures), as well as other programming models and libraries. Besides providing insights into the programming techniques of the libraries considered, we outline our view of the current strengths and weaknesses of their programming models {\textendash} especially with regard to hardware trends and the ease of programming high-performance numerical software that current applications need {\textendash} in order to motivate work and future directions for the next generation of parallel programming models for high-performance linear algebra libraries on heterogeneous systems.}, keywords = {dense linear algebra, gpu, HPC, Multicore, plasma, Programming models, runtime}, doi = {10.14529/jsfi1504}, author = {Maksims Abalenkovs and Ahmad Abdelfattah and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki and Asim YarKhan} } @conference {860, title = {Performance Analysis and Design of a Hessenberg Reduction using Stabilized Blocked Elementary Transformations for New Architectures}, booktitle = {The Spring Simulation Multi-Conference 2015 (SpringSim{\textquoteright}15), Best Paper Award}, year = {2015}, month = {2015-04}, address = {Alexandria, VA}, abstract = {The solution of nonsymmetric eigenvalue problems, Ax = λx, can be accelerated substantially by first reducing A to an upper Hessenberg matrix H that has the same eigenvalues as A. This can be done using Householder orthogonal transformations, which is a well-established standard, or using stabilized elementary transformations.
The latter approach, although having half the flops of the former, has been used less in practice, e.g., on computer architectures with well-developed hierarchical memories, because of its memory-bound operations and the complexity of stabilizing it. In this paper we revisit the stabilized elementary transformations approach in the context of new architectures {\textendash} both multicore CPUs and Xeon Phi coprocessors. We derive for the first time a blocked version of the algorithm. The blocked version reduces the memory-bound operations, and we analyze its performance. A performance model is developed that shows the limitations of both approaches. The competitiveness of using stabilized elementary transformations has been quantified, highlighting that it can be 20 to 30\% faster on current high-end multicore CPUs and Xeon Phi coprocessors.}, keywords = {Eigenvalues problem, Hessenberg reduction, Multi/Many-core, Stabilized Elementary Transformations}, author = {Khairul Kabir and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {861, title = {Performance Analysis and Optimization of Two-Sided Factorization Algorithms for Heterogeneous Platform}, booktitle = {International Conference on Computational Science (ICCS 2015)}, year = {2015}, month = {2015-06}, address = {Reykjav{\'\i}k, Iceland}, author = {Khairul Kabir and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {884, title = {Performance of Random Sampling for Computing Low-rank Approximations of a Dense Matrix on GPUs}, booktitle = {The International Conference for High Performance Computing, Networking, Storage and Analysis (SC15)}, year = {2015}, month = {2015-11}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, author = {Theo Mary and Ichitaro Yamazaki and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {1346, title = {Towards a High-Performance Tensor Algebra Package for Accelerators}, year = {2015}, month = {2015-09}, publisher = {Smoky Mountains Computational Sciences and Engineering Conference (SMC15)}, address = {Gatlinburg, TN}, author = {Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @conference {844, title = {Towards Batched Linear Solvers on Accelerated Hardware Platforms}, booktitle = {8th Workshop on General Purpose Processing Using GPUs (GPGPU 8) co-located with PPOPP 2015}, year = {2015}, month = {2015-02}, publisher = {ACM}, organization = {ACM}, address = {San Francisco, CA}, abstract = {As hardware evolves, an increasingly effective approach to developing energy-efficient, high-performance solvers is to design them to work on many small and independent problems. Indeed, many applications already need this functionality, especially for GPUs, which are known to be currently about four to five times more energy efficient than multicore CPUs per floating-point operation. In this paper, we describe the development of the main one-sided factorizations {\textendash} LU, QR, and Cholesky {\textendash} that are needed for a set of small dense matrices to work in parallel. We refer to such algorithms as batched factorizations. Our approach is based on representing the algorithms as a sequence of batched BLAS routines for GPU-contained execution. Note that this is similar in functionality to the LAPACK and the hybrid MAGMA algorithms for large-matrix factorizations.
However, it is different from a straightforward approach, whereby each of the GPU{\textquoteright}s symmetric multiprocessors factorizes a single problem at a time. We illustrate how our performance analysis, together with the profiling and tracing tools, guided the development of batched factorizations to achieve up to a 2-fold speedup and a 3-fold better energy efficiency compared to our highly optimized batched CPU implementations based on the MKL library on a two-socket Intel Sandy Bridge server. Compared to a batched LU factorization featured in NVIDIA{\textquoteright}s CUBLAS library for GPUs, we achieve up to a 2.5-fold speedup on the K40 GPU.}, keywords = {batched factorization, hardware accelerators, numerical linear algebra, numerical software libraries, one-sided factorization algorithms}, author = {Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @inproceedings {959, title = {Weighted Dynamic Scheduling with Many Parallelism Grains for Offloading of Numerical Workloads to Multiple Varied Accelerators}, journal = {Proceedings of the 6th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA{\textquoteright}15)}, volume = {No. 5}, year = {2015}, month = {2015-11}, publisher = {ACM}, address = {Austin, TX}, abstract = {A wide variety of heterogeneous compute resources are available to modern computers, including multiple sockets containing multicore CPUs, one or more GPUs of varying power, and coprocessors such as the Intel Xeon Phi. The challenge faced by domain scientists is how to efficiently and productively use these varied resources. For example, in order to use GPUs effectively, the workload must have a greater degree of parallelism than a workload designed for a multicore CPU. The domain scientist would have to design and schedule an application with multiple degrees of parallelism and task grain sizes in order to obtain efficient performance from the resources. We propose a productive programming model starting from serial code, which achieves parallelism and scalability by using a task-superscalar runtime environment to adapt the computation to the available resources. The adaptation is done at multiple points, including multi-level data partitioning, adaptive task grain sizes, and dynamic task scheduling.
The effectiveness of this approach for utilizing multi-way heterogeneous hardware resources is demonstrated by implementing dense linear algebra applications.}, keywords = {dataflow scheduling, hardware accelerators, multi-grain parallelism}, author = {Azzam Haidar and Yulu Jia and Piotr Luszczek and Stanimire Tomov and Asim YarKhan and Jack Dongarra} } @inbook {780, title = {Accelerating Numerical Dense Linear Algebra Calculations with GPUs}, booktitle = {Numerical Computations with GPUs}, year = {2014}, pages = {3-28}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, chapter = {1}, isbn = {978-3-319-06547-2}, doi = {10.1007/978-3-319-06548-9_1}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @techreport {837, title = {Accelerating the LOBPCG method on GPUs using a blocked Sparse Matrix Vector Product}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-14-731}, year = {2014}, month = {2014-10}, publisher = {University of Tennessee}, abstract = {This paper presents a heterogeneous CPU-GPU algorithm design and optimized implementation for an entire sparse iterative eigensolver {\textendash} the Locally Optimal Block Preconditioned Conjugate Gradient (LOBPCG) {\textendash} starting from low-level GPU data structures and kernels to the higher-level algorithmic choices and overall heterogeneous design. Most notably, the eigensolver leverages the high performance of a new GPU kernel developed for the simultaneous multiplication of a sparse matrix and a set of vectors (SpMM). This is a building block that serves as a backbone not only for block-Krylov methods, but also for other methods relying on blocking for acceleration in general. The heterogeneous LOBPCG developed here reveals the potential of this type of eigensolver by highly optimizing all of its components, and can be viewed as a benchmark for other SpMM-dependent applications. Compared to non-blocked algorithms, we show that the performance speedup factor of SpMM vs. SpMV-based algorithms is up to six on GPUs like NVIDIA{\textquoteright}s K40. In particular, a typical SpMV performance range in double precision is 20 to 25 GFlop/s, while the SpMM is in the range of 100 to 120 GFlop/s. Compared to highly optimized CPU implementations, e.g., the SpMM from MKL on two eight-core Intel Xeon E5-2690s, our kernel is 3 to 5x faster on a K40 GPU. For comparison to other computational loads, the same GPU-to-CPU performance acceleration is observed for the SpMV product, as well as for dense linear algebra, e.g., matrix-matrix multiplication and factorizations like LU, QR, and Cholesky. Thus, the modeled GPU (vs. CPU) acceleration for the entire solver is also 3 to 5x.
In practice though, currently available CPU implementations are much slower due to missed optimization opportunities, as we show.}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @conference {818, title = {Access-averse Framework for Computing Low-rank Matrix Approximations}, booktitle = {First International Workshop on High Performance Big Graph Data Management, Analysis, and Mining}, year = {2014}, month = {2014-10}, address = {Washington, DC}, author = {Ichitaro Yamazaki and Theo Mary and Jakub Kurzak and Stanimire Tomov and Jack Dongarra} } @conference {836, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL }, booktitle = {International Workshop on OpenCL}, year = {2014}, month = {2014-05}, address = {Bristol University, England}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance OpenCL BLAS, hardware and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @conference {819, title = {Deflation Strategies to Improve the Convergence of Communication-Avoiding GMRES}, booktitle = {5th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, year = {2014}, month = {2014-11}, address = {New Orleans, LA}, author = {Ichitaro Yamazaki and Stanimire Tomov and Jack Dongarra} } @conference {816, title = {Domain Decomposition Preconditioners for Communication-Avoiding Krylov Methods on a Hybrid CPU/GPU Cluster}, booktitle = {The International Conference for High Performance Computing, Networking, Storage and Analysis (SC 14)}, year = {2014}, month = {2014-11}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, author = {Ichitaro Yamazaki and Sivasankaran Rajamanickam and Eric G. Boman and Mark Hoemmen and Michael A. Heroux and Stanimire Tomov} } @conference {834, title = {Dynamically balanced synchronization-avoiding LU factorization with multicore and GPUs}, booktitle = {Fourth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2014}, year = {2014}, month = {2014-05}, abstract = {Graphics processing units (GPUs) brought huge performance improvements in the scientific and numerical fields. We present an efficient hybrid CPU/GPU approach that is portable, dynamically and efficiently balances the workload between the CPUs and the GPUs, and avoids data transfer bottlenecks that are frequently present in numerical algorithms. 
Our approach determines the amount of initial work to assign to the CPUs before the execution, and then dynamically balances workloads during the execution. We then present a theoretical model to guide the choice of the initial amount of work for the CPUs. The validation of our model allows our approach to self-adapt on any architecture using the manufacturer{\textquoteright}s characteristics of the underlying machine. We illustrate our method for the LU factorization. For this case, we show that the use of our approach combined with a communication-avoiding LU algorithm is efficient. For example, our experiments on a 24-core AMD Opteron 6172 show that by adding one GPU (Tesla S2050) we accelerate LU by up to 2.4x compared to the corresponding routine in MKL using 24 cores. The comparisons with MAGMA also show significant improvements.}, author = {Simplice Donfack and Stanimire Tomov and Jack Dongarra} } @conference {715, title = {A Fast Batched Cholesky Factorization on a GPU}, booktitle = {International Conference on Parallel Processing (ICPP-2014)}, year = {2014}, month = {2014-09}, address = {Minneapolis, MN}, abstract = {Currently, state-of-the-art libraries like MAGMA focus on very large linear algebra problems, while solving many small independent problems {\textendash} usually referred to as batched problems {\textendash} is not given adequate attention. In this paper, we propose a batched Cholesky factorization on a GPU. Three algorithms {\textendash} nonblocked, blocked, and recursive blocked {\textendash} are examined. The left-looking version of the Cholesky factorization is used to factorize the panel, and the right-looking Cholesky version is used to update the trailing matrix in the recursive blocked algorithm. Our batched Cholesky achieves up to a 1.8{\texttimes} speedup compared to the optimized parallel implementation in the MKL library on two sockets of Intel Sandy Bridge CPUs. Further, we use the new routines to develop a single Cholesky factorization solver which targets large matrix sizes. Our approach differs from MAGMA by having an entirely GPU implementation where both the panel factorization and the trailing matrix updates are on the GPU. Such an implementation does not depend on the speed of the CPU. Compared to the MAGMA library, our full GPU solution achieves 85\% of the performance of hybrid MAGMA, which uses 16 Sandy Bridge cores in addition to a K40 NVIDIA GPU. Moreover, we achieve 80\% of the practical dgemm peak of the machine, while MAGMA achieves only 75\%, and finally, in terms of energy consumption, we outperform MAGMA by 1.5{\texttimes} in performance per watt for large matrices.}, author = {Tingxing Dong and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @conference {765, title = {Heterogeneous Acceleration for Linear Algebra in Multi-Coprocessor Environments}, booktitle = {VECPAR 2014}, year = {2014}, month = {2014-06}, address = {Eugene, OR}, abstract = {We present an efficient and scalable programming model for the development of linear algebra in heterogeneous multi-coprocessor environments. The model incorporates some of the current best design and implementation practices for the heterogeneous acceleration of dense linear algebra (DLA). Examples are given for the basis algorithms for solving linear systems {\textendash} the LU, QR, and Cholesky factorizations. To generate the extreme level of parallelism needed for the efficient use of coprocessors, algorithms of interest are redesigned and then split into well-chosen computational tasks.
The tasks{\textquoteright} execution is scheduled over the computational components of a hybrid system of multicore CPUs and coprocessors using a lightweight runtime system. The use of lightweight runtime systems keeps scheduling overhead low, while enabling the expression of parallelism through otherwise sequential code. This simplifies the development effort and allows the exploration of the unique strengths of the various hardware components.}, keywords = {Computer science, factorization, Heterogeneous systems, Intel Xeon Phi, linear algebra}, author = {Azzam Haidar and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {812, title = {Hybrid Multi-Elimination ILU Preconditioners on GPUs}, booktitle = {International Heterogeneity in Computing Workshop (HCW), IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Iterative solvers for sparse linear systems often benefit from using preconditioners. While there are implementations for many iterative methods that leverage the computing power of accelerators, porting the latest developments in preconditioners to accelerators has been challenging. In this paper we develop a self-adaptive multi-elimination preconditioner for graphics processing units (GPUs). The preconditioner is based on a multi-level incomplete LU factorization and uses a direct dense solver for the bottom-level system. For test matrices from the University of Florida matrix collection, we investigate the influence of handling the triangular solvers in the distinct iteration steps in either single or double precision arithmetic. Integrated into a Conjugate Gradient method, we show that our multi-elimination algorithm is highly competitive with popular preconditioners, including multi-colored symmetric Gauss-Seidel relaxation preconditioners, and (multi-colored symmetric) ILU for numerous problems.}, author = {Dimitar Lukarski and Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @techreport {838, title = {Implementing a Sparse Matrix Vector Product for the SELL-C/SELL-C-σ formats on NVIDIA GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-14-727}, year = {2014}, month = {2014-04}, publisher = {University of Tennessee}, abstract = {Numerical methods in sparse linear algebra typically rely on a fast and efficient matrix vector product, as this usually is the backbone of iterative algorithms for solving eigenvalue problems or linear systems. Against the background of a large diversity in the characteristics of high performance computer architectures, it is a challenge to derive a cross-platform efficient storage format along with fast matrix vector kernels. Recently, attention has focused on the SELL-C-σ format, a sliced ELLPACK format enhanced by row-sorting to reduce the fill-in when padding rows with zeros. In this paper we propose an additional modification resulting in the padded sliced ELLPACK (SELLP) format, for which we develop a sparse matrix vector CUDA kernel that is able to efficiently exploit the computing power of NVIDIA GPUs.
We show that the kernel we developed outperforms straightforward implementations of the widespread CSR and ELLPACK formats, and is highly competitive with the implementations in the highly optimized CUSPARSE library.}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @conference {807, title = {Improving the performance of CA-GMRES on multicores with multiple GPUs}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {The Generalized Minimum Residual (GMRES) method is one of the most widely used iterative methods for solving nonsymmetric linear systems of equations. In recent years, techniques to avoid communication in GMRES have gained attention because, in comparison to floating-point operations, communication is becoming increasingly expensive on modern computers. Since graphics processing units (GPUs) are now becoming a crucial component in computing, we investigate the effectiveness of these techniques on multicore CPUs with multiple GPUs. While we present detailed performance studies of a matrix powers kernel on multiple GPUs, we particularly focus on orthogonalization strategies that have a great impact on both the numerical stability and performance of GMRES, especially as the matrix becomes sparser or ill-conditioned. We present experimental results on two eight-core Intel Sandy Bridge CPUs with three NVIDIA Fermi GPUs and demonstrate that significant speedups can be obtained by avoiding communication, either on a GPU or between the GPUs. As part of our study, we investigate several optimization techniques for the GPU kernels that can also be used in other iterative solvers besides GMRES. Hence, our studies not only emphasize the importance of avoiding communication on GPUs, but they also provide insight into the effects of these optimization techniques on the performance of the sparse solvers, and may have greater impact beyond GMRES.}, author = {Ichitaro Yamazaki and Hartwig Anzt and Stanimire Tomov and Mark Hoemmen and Jack Dongarra} } @conference {859, title = {LU Factorization of Small Matrices: Accelerating Batched DGETRF on the GPU}, booktitle = {16th IEEE International Conference on High Performance Computing and Communications (HPCC)}, year = {2014}, month = {2014-08}, publisher = {IEEE}, organization = {IEEE}, address = {Paris, France}, abstract = {Gaussian Elimination is commonly used to solve dense linear systems in scientific models. In a large number of applications, a need arises to solve many small-size problems instead of a few large linear systems. The size of each of these small linear systems depends, for example, on the number of ordinary differential equations (ODEs) used in the model, and can be on the order of hundreds of unknowns. To efficiently exploit the computing power of modern accelerator hardware, these linear systems are processed in batches. To improve the numerical stability of Gaussian Elimination, at least partial pivoting is required, most often accomplished with row pivoting. However, row pivoting can result in a severe performance penalty on GPUs because it brings in thread divergence and non-coalesced memory accesses. The state-of-the-art libraries for linear algebra that target GPUs, such as MAGMA, focus on large matrix sizes. They change the data layout by transposing the matrix to avoid these divergence and non-coalescing penalties.
However, the data movement associated with transposition is very expensive for small matrices. In this paper, we propose a batched LU factorization for GPUs by using a multi-level blocked right-looking algorithm that preserves the data layout but minimizes the penalty of partial pivoting. Our batched LU achieves up to a 2.5-fold speedup when compared to the alternative CUBLAS solutions on a K40c GPU and a 3.6-fold speedup over MKL on a node of the Titan supercomputer at ORNL in a nuclear reaction network simulation.}, author = {Tingxing Dong and Azzam Haidar and Piotr Luszczek and James Harris and Stanimire Tomov and Jack Dongarra} } @conference {713, title = {Mixed-precision orthogonalization scheme and adaptive step size for CA-GMRES on GPUs}, booktitle = {VECPAR 2014 (Best Paper)}, year = {2014}, month = {2014-06}, address = {Eugene, OR}, abstract = {We propose a mixed-precision orthogonalization scheme that takes the input matrix in a standard 32 or 64-bit floating-point precision, but uses higher-precision arithmetics to accumulate its intermediate results. For the 64-bit precision, our scheme uses software emulation for the higher-precision arithmetics, and requires about 20x more computation but about the same amount of communication as the standard orthogonalization scheme. Since the computation is becoming less expensive compared to the communication on new and emerging architectures, the relative cost of our mixed-precision scheme is decreasing. Our case studies with CA-GMRES on a GPU demonstrate that using mixed precision for this small but critical segment of CA-GMRES can improve not only its overall numerical stability but also, in some cases, its performance.}, author = {Ichitaro Yamazaki and Stanimire Tomov and Tingxing Dong and Jack Dongarra} } @article {831, title = {Model-Driven One-Sided Factorizations on Multicore, Accelerated Systems}, journal = {Supercomputing Frontiers and Innovations}, volume = {1}, year = {2014}, abstract = {Hardware heterogeneity of HPC platforms is no longer considered unusual but has instead become the most viable way forward towards Exascale. In fact, the multitude of heterogeneous resources available to modern computers are designed for different workloads, and their efficient use is closely aligned with the specialized role envisaged by their design. Commonly, in order to efficiently use such GPU resources, the workload in question must have a much greater degree of parallelism than workloads often associated with multicore processors (CPUs). Available GPU variants differ in their internal architecture and, as a result, are capable of handling workloads of varying degrees of complexity and a range of computational patterns. This vast array of applicable workloads will likely lead to an ever-accelerated mixing of multicore CPUs and GPUs in multi-user environments, with the ultimate goal of offering adequate computing facilities for a wide range of scientific and technical workloads. In the following paper, we present a research prototype that uses a lightweight runtime environment to manage the resource-specific workloads, and to control the dataflow and parallel execution in hybrid systems. Our lightweight runtime environment uses task-superscalar concepts to enable the developer to write serial code while providing parallel execution.
This concept is reminiscent of dataflow and systolic architectures in its conceptualization of a workload as a set of side-effect-free tasks that pass data items whenever the associated work assignment has been completed. Additionally, our task abstractions and their parametrization enable uniformity in the algorithmic development across all the heterogeneous resources without sacrificing precious compute cycles. We include performance results for dense linear algebra functions which demonstrate the practicality and effectiveness of our approach, which is capable of fully utilizing a wide range of accelerator hardware.}, keywords = {dense linear algebra, hardware accelerators, task superscalar scheduling}, doi = {http://dx.doi.org/10.14529/jsfi1401}, author = {Jack Dongarra and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Asim YarKhan} } @article {758, title = {A Novel Hybrid CPU-GPU Generalized Eigensolver for Electronic Structure Calculations Based on Fine Grained Memory Aware Tasks}, journal = {International Journal of High Performance Computing Applications}, volume = {28}, year = {2014}, month = {2014-05}, pages = {196-209}, chapter = {196}, abstract = {The adoption of hybrid CPU{\textendash}GPU nodes in traditional supercomputing platforms such as the Cray-XK6 opens acceleration opportunities for electronic structure calculations in materials science and chemistry applications, where medium-sized generalized eigenvalue problems must be solved many times. These eigenvalue problems are too small to effectively solve on distributed systems, but can benefit from the massive computing power concentrated on a single-node, hybrid CPU{\textendash}GPU system. However, hybrid systems call for the development of new algorithms that efficiently exploit heterogeneity and massive parallelism of not just GPUs, but of multicore/manycore CPUs as well. Addressing these demands, we developed a generalized eigensolver featuring novel algorithms of increased computational intensity (compared with the standard algorithms), decomposition of the computation into fine-grained memory-aware tasks, and their hybrid execution. The resulting eigensolvers are state-of-the-art in high-performance computing, significantly outperforming existing libraries. We describe the algorithm and analyze its performance impact on applications of interest when different fractions of eigenvectors are needed by the host electronic structure code.}, keywords = {Eigensolver, electronic structure calculations, generalized eigensolver, gpu, high performance, hybrid, Multicore, two-stage}, doi = {10.1177/1094342013502097}, author = {Azzam Haidar and Raffaele Solc{\`a} and Mark Gates and Stanimire Tomov and Thomas C. Schulthess and Jack Dongarra} } @conference {833, title = {Optimizing Krylov Subspace Solvers on Graphics Processing Units}, booktitle = {Fourth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Krylov subspace solvers are often the method of choice when solving sparse linear systems iteratively. At the same time, hardware accelerators such as graphics processing units (GPUs) continue to offer significant floating point performance gains for matrix and vector computations through easy-to-use libraries of computational kernels.
However, as these libraries are usually composed of a well-optimized but limited set of linear algebra operations, applications that use them often fail to leverage the full potential of the accelerator. In this paper we target the acceleration of the BiCGSTAB solver for GPUs, showing that significant improvement can be achieved by reformulating the method and developing application-specific kernels instead of using the generic CUBLAS library provided by NVIDIA. We propose an implementation that benefits from a significantly reduced number of kernel launches and GPU-host communication events, by means of increased data locality and a simultaneous reduction of multiple scalar products. Using experimental data, we show that, depending on the dominance of the untouched sparse matrix vector products, significant performance improvements can be achieved compared to a reference implementation based on the CUBLAS library. We feel that such optimizations are crucial for the subsequent development of high-level sparse linear algebra libraries.}, author = {Stanimire Tomov and Piotr Luszczek and Ichitaro Yamazaki and Jack Dongarra and Hartwig Anzt and William Sawyer} } @conference {828, title = {Performance and Portability with OpenCL for Throughput-Oriented HPC Workloads Across Accelerators, Coprocessors, and Multicore Processors}, booktitle = {5th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA {\textquoteright}14)}, year = {2014}, month = {2014-11}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, abstract = {Ever since accelerators and coprocessors became the mainstream hardware for throughput-oriented HPC workloads, various programming techniques have been proposed to increase productivity in terms of both performance and ease of use. We evaluate these aspects of OpenCL on a number of hardware platforms for an important subset of dense linear algebra operations that are relevant to a wide range of scientific applications. Our findings indicate that OpenCL portability has improved since our previous publication, and many new and surprising usage scenarios are possible that rival those available after decades of software development on the CPUs. The combined performance-portability metric, even though not promised by the OpenCL standard, reflects the need for tuning performance-critical operations during the porting process, and we show how a large portion of the available efficiency is lost if the tuning is not done correctly.}, doi = {10.1109/ScalA.2014.8}, author = {Azzam Haidar and Chongxiao Cao and Ichitaro Yamazaki and Jack Dongarra and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @conference {714, title = {Self-Adaptive Multiprecision Preconditioners on Multicore and Manycore Architectures}, booktitle = {VECPAR 2014}, year = {2014}, month = {2014-06}, address = {Eugene, OR}, abstract = {Based on the premise that preconditioners needed for scientific computing are not only required to be robust in the numerical sense, but also scalable for up to thousands of lightweight cores, we argue that this twofold goal is achieved for the recently developed self-adaptive multi-elimination preconditioner. For this purpose, we revise the underlying idea and analyze the performance of implementations realized in the PARALUTION and MAGMA open-source software libraries on GPU architectures (using either CUDA or OpenCL), Intel{\textquoteright}s Many Integrated Core Architecture, and Intel{\textquoteright}s Sandy Bridge processor.
The comparison with other well-established preconditioners, like multi-colored Gauss-Seidel, ILU(0), and multi-colored ILU(0), shows that the twofold goal of a numerically stable, cross-platform performant algorithm is achieved.}, author = {Hartwig Anzt and Dimitar Lukarski and Stanimire Tomov and Jack Dongarra} } @conference {767, title = {A Step towards Energy Efficient Computing: Redesigning A Hydrodynamic Application on CPU-GPU}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Power and energy consumption are becoming an increasing concern in high performance computing. Compared to multi-core CPUs, GPUs have a much better performance per watt. In this paper we discuss efforts to redesign the most computation-intensive parts of BLAST, an application that solves the equations for compressible hydrodynamics with high order finite elements, using GPUs [10, 1]. In order to exploit the hardware parallelism of GPUs and achieve high performance, we implemented custom linear algebra kernels. We intensively optimized our CUDA kernels by exploiting the memory hierarchy, and they substantially exceed the vendor{\textquoteright}s library routines in performance. We proposed an autotuning technique to adapt our CUDA kernels to the orders of the finite element method. Compared to a previous base implementation, our redesign and optimization lowered the energy consumption of the GPU in two aspects: 60\% less time to solution and 10\% less power required. Compared to the CPU-only solution, our GPU-accelerated BLAST obtained a 2.5x overall speedup and a 1.42x energy efficiency (greenup) using 4th order (Q4) finite elements, and a 1.9x speedup and a 1.27x greenup using 2nd order (Q2) finite elements.}, keywords = {Computer science, CUDA, FEM, Finite element method, linear algebra, nVidia, Tesla K20}, author = {Tingxing Dong and Veselin Dobrev and Tzanio Kolev and Robert Rieben and Stanimire Tomov and Jack Dongarra} } @conference {809, title = {Unified Development for Mixed Multi-GPU and Multi-Coprocessor Environments using a Lightweight Runtime Environment}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Many of the heterogeneous resources available to modern computers are designed for different workloads. In order to efficiently use GPU resources, the workload must have a greater degree of parallelism than a workload designed for multicore CPUs. Conceptually, the Intel Xeon Phi coprocessors are capable of handling workloads somewhere in between the two. This multitude of applicable workloads will likely lead to mixing multicore CPUs, GPUs, and Intel coprocessors in multi-user environments that must offer adequate computing facilities for a wide range of workloads. In this work, we are using a lightweight runtime environment to manage the resource-specific workload, and to control the dataflow and parallel execution in two-way hybrid systems. The lightweight runtime environment uses task-superscalar concepts to enable the developer to write serial code while providing parallel execution. In addition, our task abstractions enable unified algorithmic development across all the heterogeneous resources.
We provide performance results for dense linear algebra applications, demonstrating the effectiveness of our approach and full utilization of a wide variety of accelerator hardware.}, keywords = {algorithms, Computer science, CUDA, Heterogeneous systems, Intel Xeon Phi, linear algebra, nVidia, Tesla K20, Tesla M2090}, author = {Azzam Haidar and Chongxiao Cao and Jack Dongarra and Piotr Luszczek and Stanimire Tomov} } @article {icl:721, title = {Accelerating Linear System Solutions Using Randomization Techniques}, journal = {ACM Transactions on Mathematical Software (also LAWN 246)}, volume = {39}, year = {2013}, month = {2013-02}, abstract = {We illustrate how linear algebra calculations can be enhanced by statistical techniques in the case of a square linear system Ax = b. We study a random transformation of A that enables us to avoid pivoting and then to reduce the amount of communication. Numerical experiments show that this randomization can be performed at a very affordable computational price while providing us with a satisfying accuracy when compared to partial pivoting. This random transformation, called the Partial Random Butterfly Transformation (PRBT), is optimized in terms of data storage and flop count. We propose a solver where the PRBT and the LU factorization with no pivoting take advantage of current hybrid multicore/GPU machines, and we compare its Gflop/s performance with a solver implemented in a current parallel library.}, keywords = {algorithms, dense linear algebra, experimentation, graphics processing units, linear systems, lu factorization, multiplicative preconditioning, numerical linear algebra, performance, plasma, randomization}, doi = {10.1145/2427023.2427025}, url = {http://dl.acm.org/citation.cfm?id=2427025}, author = {Marc Baboulin and Jack Dongarra and Julien Herrmann and Stanimire Tomov} } @article {icl:719, title = {A Block-Asynchronous Relaxation Method for Graphics Processing Units}, journal = {Journal of Parallel and Distributed Computing}, volume = {73}, year = {2013}, month = {2013-12}, pages = {1613{\textendash}1626}, abstract = {In this paper, we analyze the potential of asynchronous relaxation methods on Graphics Processing Units (GPUs). We develop asynchronous iteration algorithms in CUDA and compare them with parallel implementations of synchronous relaxation methods on CPU- or GPU-based systems. For a set of test matrices from UFMC, we investigate convergence behavior, performance, and tolerance to hardware failure. We observe that even for our most basic asynchronous relaxation scheme, the method can efficiently leverage the GPU{\textquoteright}s computing power and is, despite its lower convergence rate compared to the Gauss{\textendash}Seidel relaxation, still able to provide solution approximations of certain accuracy in considerably shorter time than Gauss{\textendash}Seidel running on CPUs or GPU-based Jacobi. Hence, it overcompensates for the slower convergence by exploiting the scalability and the good fit of the asynchronous schemes for the highly parallel GPU architectures. Further, enhancing the most basic asynchronous approach with hybrid schemes {\textendash} using multiple iterations within the {\textquoteleft}{\textquoteleft}subdomain{\textquoteright}{\textquoteright} handled by a GPU thread block {\textendash} we manage not only to recover the loss of global convergence but often to accelerate convergence by up to two times, while keeping the execution time of a global iteration practically the same.
This, combined with the advantageous properties of asynchronous iteration methods with respect to hardware failure, identifies the high potential of the asynchronous methods for Exascale computing.}, doi = {http://dx.doi.org/10.1016/j.jpdc.2013.05.008}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra and Vincent Heuveline} } @techreport {681, title = {clMAGMA: High Performance Dense Linear Algebra with OpenCL}, journal = {University of Tennessee Technical Report (Lawn 275)}, number = {UT-CS-13-706}, year = {2013}, month = {2013-03}, publisher = {University of Tennessee}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms in OpenCL. In particular, these are linear system solvers and eigenvalue problem solvers. Further, we give an overview of the clMAGMA library, an open source, high performance OpenCL library that incorporates the developments presented, and in general provides to heterogeneous architectures the DLA functionality of the popular LAPACK library. The LAPACK-compliance and use of OpenCL simplify the use of clMAGMA in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance OpenCL BLAS, hardware- and OpenCL-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components.}, author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and Piotr Luszczek and Stanimire Tomov} } @techreport {689, title = {Dynamically balanced synchronization-avoiding LU factorization with multicore and GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {ut-cs-13-713}, year = {2013}, month = {2013-07}, abstract = {Graphics processing units (GPUs) have brought huge performance improvements in the scientific and numerical fields. We present an efficient hybrid CPU/GPU computing approach that is portable, dynamically and efficiently balances the workload between the CPUs and the GPUs, and avoids data transfer bottlenecks that are frequently present in numerical algorithms. Our approach determines the amount of initial work to assign to the CPUs before the execution, and then dynamically balances workloads during the execution. We then present a theoretical model to guide the choice of the initial amount of work for the CPUs. The validation of our model allows our approach to self-adapt on any architecture using the manufacturer{\textquoteright}s characteristics of the underlying machine. We illustrate our method for the LU factorization. For this case, we show that the use of our approach combined with a communication avoiding LU algorithm is efficient. For example, our experiments on high-end hybrid CPU/GPU systems show that our dynamically balanced synchronization-avoiding LU is both multicore and GPU scalable. Comparisons with state-of-the-art libraries like MKL (for multicore) and MAGMA (for hybrid systems) are provided, demonstrating significant performance improvements. The approach is applicable to other linear algebra algorithms.
The scheduling mechanisms and tuning models can be incorporated into dynamic runtime systems/schedulers and autotuning frameworks, respectively, for hybrid CPU/MIC/GPU architectures.}, author = {Simplice Donfack and Stanimire Tomov and Jack Dongarra} } @techreport {690, title = {Hydrodynamic Computation with Hybrid Programming on CPU-GPU Clusters}, journal = {University of Tennessee Computer Science Technical Report}, number = {ut-cs-13-714}, year = {2013}, month = {2013-07}, abstract = {The explosion of parallelism and heterogeneity in today{\textquoteright}s computer architectures has created opportunities as well as challenges for redesigning legacy numerical software to harness the power of new hardware. In this paper we address the main challenges in redesigning BLAST{\textemdash}a numerical library that solves the equations of compressible hydrodynamics using high order finite element methods (FEM) in a moving Lagrangian frame{\textemdash}to support CPU-GPU clusters. We use a hybrid MPI + OpenMP + CUDA programming model that includes two layers: domain decomposed MPI parallelization and OpenMP + CUDA acceleration in a given domain. To optimize the code, we implemented custom linear algebra kernels and introduced an auto-tuning technique to deal with heterogeneity and load balancing at runtime. Our tests show that 12 Intel Xeon cores and two M2050 GPUs deliver a 24x speedup compared to a single core, and a 2.5x speedup compared to 12 MPI tasks in one node. Further, we achieve perfect weak scaling, demonstrated on a cluster with up to 64 GPUs in 32 nodes. Our choice of programming model and proposed solutions, as related to parallelism and load balancing, specifically target high order FEM discretizations, and can be used equally successfully for applications beyond hydrodynamics. A major accomplishment is that we further establish the appeal of high order FEMs, which, despite their better approximation properties, are often avoided due to their high computational cost. GPUs, as we show, have the potential to make them the method of choice, as the increased computational cost is also localized, e.g., cast as Level 3 BLAS, and thus can be done very efficiently (close to {\textquoteleft}{\textquoteleft}free{\textquoteright}{\textquoteright} relative to the usual overheads inherent in sparse computations).}, author = {Tingxing Dong and Veselin Dobrev and Tzanio Kolev and Robert Rieben and Stanimire Tomov and Jack Dongarra} } @inbook {762, title = {Keeneland: Computational Science Using Heterogeneous GPU Computing}, booktitle = {Contemporary High Performance Computing: From Petascale Toward Exascale}, series = {CRC Computational Science Series}, year = {2013}, publisher = {Taylor and Francis}, organization = {Taylor and Francis}, chapter = {7}, address = {Boca Raton, FL}, abstract = {The Keeneland Project is a five-year Track 2D grant awarded by the National Science Foundation (NSF) under solicitation NSF 08-573 in August 2009 for the development and deployment of an innovative high performance computing system.
The Keeneland project is led by the Georgia Institute of Technology (Georgia Tech) in collaboration with the University of Tennessee at Knoxville, the National Institute for Computational Sciences, and Oak Ridge National Laboratory.}, author = {Jeffrey Vetter and Richard Glassbrook and Karsten Schwan and Sudha Yalamanchili and Mitch Horton and Ada Gavrilovska and Magda Slawinska and Jack Dongarra and Jeremy Meredith and Philip Roth and Kyle Spafford and Stanimire Tomov and John Wynkoop} } @inproceedings {757, title = {Leading Edge Hybrid Multi-GPU Algorithms for Generalized Eigenproblems in Electronic Structure Calculations}, journal = {International Supercomputing Conference (ISC)}, volume = {7905}, year = {2013}, month = {2013-06}, pages = {67-80}, publisher = {Springer Berlin Heidelberg}, edition = {Lecture Notes in Computer Science}, address = {Leipzig, Germany}, abstract = {Today{\textquoteright}s high computational demands from engineering fields and complex hardware development make it necessary to develop and optimize new algorithms toward achieving high performance and good scalability on the next generation of computers. The enormous gap between the high-performance capabilities of GPUs and the slow interconnect between them has made the development of numerical software that is scalable across multiple GPUs extremely challenging. We describe and analyze a successful methodology to address the challenges{\textemdash}starting from our algorithm design, kernel optimization and tuning, to our programming model{\textemdash}in the development of a scalable high-performance generalized eigenvalue solver in the context of electronic structure calculations in materials science applications. We developed a set of leading edge dense linear algebra algorithms, as part of a generalized eigensolver, featuring fine-grained memory-aware kernels, a task-based approach, and hybrid execution/scheduling. The goal of the new design is to increase the computational intensity of the major compute kernels and to reduce synchronization and data transfers between GPUs. We report the performance impact on the generalized eigensolver when different fractions of eigenvectors are needed. The algorithm described provides an enormous performance boost compared to current GPU-based solutions, and performance comparable to state-of-the-art distributed solutions, using a single node with multiple GPUs.}, isbn = {978-3-642-38750-0}, doi = {10.1007/978-3-642-38750-0_6}, author = {Azzam Haidar and Stanimire Tomov and Jack Dongarra and Raffaele Solc{\`a} and Thomas C. Schulthess} } @conference {753, title = {Portable HPC Programming on Intel Many-Integrated-Core Hardware with MAGMA Port to Xeon Phi}, booktitle = {PPAM 2013}, year = {2013}, month = {2013-09}, address = {Warsaw, Poland}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms for multicore with Intel Xeon Phi Coprocessors. In particular, we consider algorithms for solving linear systems. Further, we give an overview of the MAGMA MIC library, an open source, high performance library that incorporates the developments presented, and in general provides to heterogeneous architectures of multicore with coprocessors the DLA functionality of the popular LAPACK library. The LAPACK-compliance simplifies the use of the MAGMA MIC library in applications, while providing them with portably performant DLA.
High performance is obtained through use of the high-performance BLAS, hardware-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components. Our methodology and programming techniques are incorporated into the MAGMA MIC API, which abstracts the application developer from the specifics of the Xeon Phi architecture and is therefore applicable to algorithms beyond the scope of DLA.}, keywords = {magma, mic, xeon phi}, author = {Jack Dongarra and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Piotr Luszczek and Stanimire Tomov} } @inbook {695, title = {Scalable Dense Linear Algebra on Heterogeneous Hardware}, booktitle = {HPC: Transition Towards Exascale Processing, in the series Advances in Parallel Computing}, year = {2013}, abstract = {The design of systems exceeding 1 Pflop/s and the push toward 1 Eflop/s have forced a dramatic shift in hardware design. Various physical and engineering constraints have resulted in the introduction of massive parallelism and functional hybridization with the use of accelerator units. This paradigm change brings about a serious challenge for application developers, as the management of multicore proliferation and heterogeneity rests on software. It is reasonable to expect that this situation will not change in the foreseeable future. This chapter presents a methodology of dealing with this issue in three common scenarios. In the context of shared-memory multicore installations, we show how high performance and scalability go hand in hand when the well-known linear algebra algorithms are recast in terms of Direct Acyclic Graphs (DAGs), which are then transparently scheduled at runtime inside the Parallel Linear Algebra Software for Multicore Architectures (PLASMA) project. Similarly, Matrix Algebra on GPU and Multicore Architectures (MAGMA) schedules DAG-driven computations on multicore processors and accelerators. Finally, Distributed PLASMA (DPLASMA) takes the approach to distributed-memory machines with the use of automatic dependence analysis and the Direct Acyclic Graph Engine (DAGuE) to deliver high performance at the scale of many thousands of cores.}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Thomas Herault and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {755, title = {Soft Error Resilient QR Factorization for Hybrid System with GPGPU}, journal = {Journal of Computational Science}, volume = {4}, year = {2013}, month = {2013-11}, pages = {457{\textendash}464}, abstract = {General purpose graphics processing units (GPGPUs) are increasingly deployed for scientific computing due to their performance advantages over CPUs. Consequently, fault tolerance has become a more serious concern than in the period when GPGPUs were used exclusively for graphics applications. Using GPUs and CPUs together in a hybrid computing system increases flexibility and performance, but also increases the possibility of the computations being affected by soft errors, for example, in the form of bit flips. In this work, we propose a soft error resilient algorithm for QR factorization on such hybrid systems.
Our contributions include: (1) a checkpointing and recovery mechanism for the left-factor Q whose performance is scalable on hybrid systems; (2) optimized Givens rotation utilities on GPGPUs to efficiently reduce an upper Hessenberg matrix to an upper triangular form for the protection of the right factor R; and (3) a recovery algorithm based on QR update on GPGPUs. Experimental results show that our fault tolerant QR factorization can successfully detect and recover from soft errors in the entire matrix with little overhead on hybrid systems with GPGPUs.}, keywords = {gpgpu, gpu, magma}, doi = {http://dx.doi.org/10.1016/j.jocs.2013.01.004}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @conference {686, title = {Toward a scalable multi-GPU eigensolver via compute-intensive kernels and efficient communication}, booktitle = {Proceedings of the 27th ACM International Conference on Supercomputing (ICS {\textquoteright}13)}, year = {2013}, month = {2013-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Eugene, Oregon, USA}, abstract = {The enormous gap between the high-performance capabilities of GPUs and the slow interconnect between them has made the development of numerical software that is scalable across multiple GPUs extremely challenging. We describe a successful methodology on how to address the challenges{\textemdash}starting from our algorithm design, kernel optimization and tuning, to our programming model{\textemdash}in the development of a scalable high-performance tridiagonal reduction algorithm for the symmetric eigenvalue problem. This is a fundamental linear algebra problem with many engineering and physics applications. We use a combination of a task-based approach to parallelism and a new algorithmic design to achieve high performance. The goal of the new design is to increase the computational intensity of the major compute kernels and to reduce synchronization and data transfers between GPUs. This may increase the number of flops, but the increase is offset by the more efficient execution and reduced data transfers. Our performance results are the best available, providing an enormous performance boost compared to current state-of-the-art solutions. In particular, our software scales up to 1070 Gflop/s using 16 Intel E5-2670 cores and eight M2090 GPUs, compared to 45 Gflop/s achieved by the optimized Intel Math Kernel Library (MKL) using only the 16 CPU cores.}, keywords = {eigenvalue, gpu communication, gpu computation, heterogeneous programming model, performance, reduction to tridiagonal, singular value decomposition, task parallelism}, isbn = {9781450321303}, doi = {10.1145/2464996.2465438}, url = {http://dl.acm.org/citation.cfm?doid=2464996.2465438}, author = {Azzam Haidar and Mark Gates and Stanimire Tomov and Jack Dongarra}, editor = {Allen D. Malony and Mario Nemirovsky and Sam Midkiff} } @article {701, title = {Tridiagonalization of a dense symmetric matrix on multiple GPUs and its application to symmetric eigenvalue problems}, journal = {Concurrency and Computation: Practice and Experience}, year = {2013}, month = {2013-10}, abstract = {For software to fully exploit the computing power of emerging heterogeneous computers, not only must the required computational kernels be optimized for the specific hardware architectures but also an effective scheduling scheme is needed to utilize the available heterogeneous computational units and to hide the communication between them.
As a case study, we develop a static scheduling scheme for the tridiagonalization of a symmetric dense matrix on multicore CPUs with multiple graphics processing units (GPUs) on a single compute node. We then parallelize and optimize the Basic Linear Algebra Subroutines (BLAS)-2 symmetric matrix-vector multiplication and the BLAS-3 low rank symmetric matrix updates on the GPUs. We demonstrate the good scalability of these multi-GPU BLAS kernels and the effectiveness of our scheduling scheme on twelve Intel Xeon processors and three NVIDIA GPUs. We then integrate our hybrid CPU-GPU kernel into computational kernels at higher levels of the software stack, that is, a shared-memory dense eigensolver and a distributed-memory sparse eigensolver. Our experimental results show that our kernels greatly improve the performance of these higher-level kernels, not only reducing the solution time but also enabling the solution of larger-scale problems. Because such symmetric eigenvalue problems arise in many scientific and engineering simulations, our kernels could potentially lead to new scientific discoveries. Furthermore, these dense linear algebra algorithms present algorithmic characteristics that can be found in other algorithms. Hence, they are not only important computational kernels on their own but also useful testbeds to study the performance of the emerging computers and the effects of the various optimization techniques.}, author = {Ichitaro Yamazaki and Tingxing Dong and Raffaele Solc{\`a} and Stanimire Tomov and Jack Dongarra and Thomas C. Schulthess} } @conference {1328, title = {Tridiagonalization of a Symmetric Dense Matrix on a GPU Cluster}, booktitle = {The Third International Workshop on Accelerators and Hybrid Exascale Systems (AsHES)}, year = {2013}, month = {2013-05}, author = {Ichitaro Yamazaki and Tingxing Dong and Stanimire Tomov and Jack Dongarra} } @article {icl:731, title = {Acceleration of the BLAST Hydro Code on GPU}, journal = {Supercomputing {\textquoteright}12 (poster)}, year = {2012}, month = {2012-11}, publisher = {SC12}, address = {Salt Lake City, Utah}, author = {Tingxing Dong and Tzanio Kolev and Robert Rieben and Veselin Dobrev and Stanimire Tomov and Jack Dongarra} } @article {, title = {Autotuning GEMM Kernels for the Fermi GPU}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {23}, number = {11}, year = {2012}, month = {2012-11}, abstract = {In recent years, the use of graphics chips has been recognized as a viable way of accelerating scientific and engineering applications, even more so since the introduction of the Fermi architecture by NVIDIA, with features essential to numerical computing, such as fast double precision arithmetic and memory protected with error correction codes. Being the crucial component of numerical software packages, such as LAPACK and ScaLAPACK, the general dense matrix multiplication routine is one of the more important workloads to be implemented on these devices. This paper presents a methodology for producing matrix multiplication kernels tuned for a specific architecture, through a canonical process of heuristic autotuning, based on generation of multiple code variants and selecting the fastest ones through benchmarking. The key contribution of this work is in the method for generating the search space; specifically, pruning it to a manageable size.
Performance numbers match or exceed other available implementations.}, doi = {https://doi.org/10.1109/TPDS.2011.311}, author = {Jakub Kurzak and Stanimire Tomov and Jack Dongarra} } @article {icl:697, title = {Block-asynchronous Multigrid Smoothers for GPU-accelerated Systems}, journal = {ICCS 2012}, year = {2012}, month = {2012-06}, address = {Omaha, NE}, author = {Hartwig Anzt and Stanimire Tomov and Mark Gates and Jack Dongarra and Vincent Heuveline} } @inproceedings {icl:685, title = {A Class of Communication-Avoiding Algorithms for Solving General Dense Linear Systems on CPU/GPU Parallel Machines}, journal = {Proc. of the International Conference on Computational Science (ICCS)}, volume = {9}, year = {2012}, month = {2012-06}, pages = {17-26}, keywords = {magma}, author = {Marc Baboulin and Simplice Donfack and Jack Dongarra and Laura Grigori and Adrien Remi and Stanimire Tomov} } @article {icl:703, title = {Dense Linear Algebra on Accelerated Multicore Hardware}, journal = {High Performance Scientific Computing: Algorithms and Applications}, year = {2012}, month = {2012-00}, publisher = {Springer-Verlag}, address = {London, UK}, author = {Jack Dongarra and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov}, editor = {Michael Berry and others} } @article {icl:684, title = {Divide and Conquer on Hybrid GPU-Accelerated Multicore Systems}, journal = {SIAM Journal on Scientific Computing}, volume = {34(2)}, year = {2012}, month = {2012-04}, pages = {C70-C82}, keywords = {magma}, author = {Christof Voemel and Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:669, title = {Enabling and Scaling Matrix Computations on Heterogeneous Multi-Core and Multi-GPU Systems}, journal = {26th ACM International Conference on Supercomputing (ICS 2012)}, year = {2012}, month = {2012-06}, publisher = {ACM}, address = {San Servolo Island, Venice, Italy}, keywords = {magma}, author = {Fengguang Song and Stanimire Tomov and Jack Dongarra} } @article {icl:725, title = {From CUDA to OpenCL: Towards a Performance-portable Solution for Multi-platform GPU Programming}, journal = {Parallel Computing}, volume = {38}, number = {8}, year = {2012}, month = {2012-08}, pages = {391-407}, author = {Peng Du and Rick Weber and Piotr Luszczek and Stanimire Tomov and Gregory D.
Peterson and Jack Dongarra} } @article {1356, title = {The Future of Computing: Software Libraries}, year = {2012}, month = {2012-02}, publisher = {DOD CREATE Developers{\textquoteright} Review, Keynote Presentation}, address = {Savannah, GA}, author = {Stanimire Tomov and Jack Dongarra} } @article {1355, title = {MAGMA: A Breakthrough in Solvers for Eigenvalue Problems}, year = {2012}, month = {2012-05}, publisher = {GPU Technology Conference (GTC12), Presentation}, address = {San Jose, CA}, author = {Stanimire Tomov and Jack Dongarra and Azzam Haidar and Ichitaro Yamazaki and Tingxing Dong and Thomas Schulthess and Raffaele Solc{\`a}} } @article {1349, title = {MAGMA: A New Generation of Linear Algebra Library for GPU and Multicore Architectures}, year = {2012}, month = {2012-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC12), Presentation}, address = {Salt Lake City, UT}, author = {Jack Dongarra and Tingxing Dong and Mark Gates and Azzam Haidar and Stanimire Tomov and Ichitaro Yamazaki} } @article {1354, title = {MAGMA MIC: Linear Algebra Library for Intel Xeon Phi Coprocessors}, year = {2012}, month = {2012-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC12)}, address = {Salt Lake City, UT}, author = {Jack Dongarra and Mark Gates and Yulu Jia and Khairul Kabir and Piotr Luszczek and Stanimire Tomov} } @article {icl:730, title = {Matrices Over Runtime Systems at Exascale}, journal = {Supercomputing {\textquoteright}12 (poster)}, year = {2012}, month = {2012-11}, address = {Salt Lake City, Utah}, author = {Emmanuel Agullo and George Bosilca and Cedric Castagn{\`e}de and Jack Dongarra and Hatem Ltaeif and Stanimire Tomov} } @article {icl:729, title = {A Novel Hybrid CPU-GPU Generalized Eigensolver for Electronic Structure Calculations Based on Fine Grained Memory Aware Tasks}, journal = {Supercomputing {\textquoteright}12 (poster)}, year = {2012}, month = {2012-11}, address = {Salt Lake City, Utah}, author = {Raffaele Solc{\`a} and Azzam Haidar and Stanimire Tomov and Jack Dongarra and Thomas C. Schulthess} } @inproceedings {icl:678, title = {One-Sided Dense Matrix Factorizations on a Multicore with Multiple GPU Accelerators}, journal = {The International Conference on Computational Science (ICCS)}, year = {2012}, month = {2012-06}, keywords = {magma}, author = {Ichitaro Yamazaki and Stanimire Tomov and Jack Dongarra} } @techreport {icl:714, title = {Performance evaluation of LU factorization through hardware counter measurements}, journal = {University of Tennessee Computer Science Technical Report}, number = {ut-cs-12-700}, year = {2012}, month = {2012-10}, author = {Simplice Donfack and Stanimire Tomov and Jack Dongarra} } @article {icl:686, title = {Power Aware Computing on GPUs}, journal = {SAAHPC {\textquoteright}12 (Best Paper Award)}, year = {2012}, month = {2012-07}, address = {Argonne, IL}, keywords = {magma}, author = {Kiran Kasichayanula and Dan Terpstra and Piotr Luszczek and Stanimire Tomov and Shirley Moore and Gregory D. 
Peterson} } @article {icl:718, title = {Preliminary Results of Autotuning GEMM Kernels for the NVIDIA Kepler Architecture}, journal = {LAWN 267}, year = {2012}, month = {2012-00}, author = {Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @techreport {icl:715, title = {Providing GPU Capability to LU and QR within the ScaLAPACK Framework}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 272)}, number = {UT-CS-12-699}, year = {2012}, month = {2012-09}, author = {Peng Du and Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:713, title = {Weighted Block-Asynchronous Iteration on GPU-Accelerated Systems}, journal = {Tenth International Workshop on Algorithms, Models and Tools for Parallel Computing on Heterogeneous Platforms (Best Paper)}, year = {2012}, month = {2012-08}, address = {Rhodes Island, Greece}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra and Vincent Heuveline} } @article {icl:637, title = {Accelerating Linear System Solutions Using Randomization Techniques}, journal = {INRIA RR-7616 / LAWN $\#$246 (presented at International AMMCS{\textquoteright}11)}, year = {2011}, month = {2011-07}, address = {Waterloo, Ontario, Canada}, keywords = {magma}, author = {Marc Baboulin and Jack Dongarra and Julien Herrmann and Stanimire Tomov} } @techreport {icl:630, title = {Autotuning GEMMs for Fermi}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-11-671, (also Lawn 245)}, year = {2011}, month = {2011-04}, keywords = {magma}, author = {Jakub Kurzak and Stanimire Tomov and Jack Dongarra} } @article {icl:661, title = {Block-asynchronous Multigrid Smoothers for GPU-accelerated Systems}, number = {UT-CS-11-689}, year = {2011}, month = {2011-12}, keywords = {magma}, author = {Hartwig Anzt and Stanimire Tomov and Mark Gates and Jack Dongarra and Vincent Heuveline} } @techreport {icl:656, title = {A Block-Asynchronous Relaxation Method for Graphics Processing Units}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-11-687 / LAWN 258}, year = {2011}, month = {2011-11}, keywords = {magma}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra and Vincent Heuveline} } @inproceedings {icl:640, title = {A Class of Hybrid LAPACK Algorithms for Multicore and GPU Architectures}, journal = {Symposium for Application Accelerators in High Performance Computing (SAAHPC{\textquoteright}11)}, year = {2011}, month = {2011-07}, address = {Knoxville, TN}, keywords = {magma, quark}, author = {Mitch Horton and Stanimire Tomov and Jack Dongarra} } @techreport {icl:628, title = {Efficient Support for Matrix Computations on Heterogeneous Multi-core and Multi-GPU Architectures}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-11-668, (also Lawn 250)}, year = {2011}, month = {2011-06}, keywords = {magma, plasma}, author = {Fengguang Song and Stanimire Tomov and Jack Dongarra} } @article {icl:653, title = {A Hybridization Methodology for High-Performance Linear Algebra Software for GPUs}, journal = {in GPU Computing Gems, Jade Edition}, volume = {2}, year = {2011}, month = {2011-00}, pages = {473-484}, publisher = {Elsevier}, keywords = {magma, morse}, author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Hatem Ltaeif and Raymond Namyst and Samuel Thibault and Stanimire Tomov}, editor = {Wen-mei W. 
Hwu} } @article {icl:599, title = {LU Factorization for Accelerator-Based Systems}, journal = {IEEE/ACS AICCSA 2011}, year = {2011}, month = {2011-12}, address = {Sharm-El-Sheikh, Egypt}, keywords = {magma, morse}, author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Mathieu Faverge and Julien Langou and Hatem Ltaeif and Stanimire Tomov} } @article {1360, title = {MAGMA - LAPACK for GPUs}, year = {2011}, month = {2011-04}, publisher = {Keeneland GPU Tutorial}, address = {Atlanta, GA}, author = {Stanimire Tomov} } @article {1359, title = {MAGMA - LAPACK for HPC on Heterogeneous Architectures}, year = {2011}, month = {2011-08}, publisher = {Titan Summit at Oak Ridge National Laboratory, Presentation}, address = {Oak Ridge, TN}, author = {Stanimire Tomov and Jack Dongarra} } @article {1358, title = {Matrix Algebra on GPU and Multicore Architectures}, year = {2011}, month = {2011-05}, publisher = {Workshop on GPU-enabled Numerical Libraries, Presentation}, address = {Basel, Switzerland}, author = {Stanimire Tomov} } @inproceedings {icl:632, title = {Optimizing Symmetric Dense Matrix-Vector Multiplication on GPUs}, journal = {ACM/IEEE Conference on Supercomputing (SC{\textquoteright}11)}, year = {2011}, month = {2011-11}, address = {Seattle, WA}, keywords = {magma}, author = {Rajib Nath and Stanimire Tomov and Tingxing Dong and Jack Dongarra} } @conference {icl:633, title = {Parallel Performance Measurement of Heterogeneous Parallel Systems with GPUs}, booktitle = {International Conference on Parallel Processing (ICPP{\textquoteright}11)}, year = {2011}, month = {2011-09}, publisher = {ACM}, organization = {ACM}, address = {Taipei, Taiwan}, abstract = {The power of GPUs is giving rise to heterogeneous parallel computing, with new demands on programming environments, runtime systems, and tools to deliver high-performing applications. This paper studies the problems associated with performance measurement of heterogeneous machines with GPUs. A heterogeneous computation model and alternative host-GPU measurement approaches are discussed to set the stage for reporting new capabilities for heterogeneous parallel performance measurement in three leading HPC tools: PAPI, Vampir, and the TAU Performance System. Our work leverages the new CUPTI tool support in NVIDIA{\textquoteright}s CUDA device library. Heterogeneous benchmarks from the SHOC suite are used to demonstrate the measurement methods and tool support.}, keywords = {magma, mumi, papi}, isbn = {978-0-7695-4510-3}, doi = {10.1109/ICPP.2011.71}, author = {Allen D. 
Malony and Scott Biersdorff and Sameer Shende and Heike Jagode and Stanimire Tomov and Guido Juckeland and Robert Dietrich and Duncan Poole and Christopher Lamb} } @article {icl:636, title = {Performance Portability of a GPU Enabled Factorization with the DAGuE Framework}, journal = {IEEE Cluster: Workshop on Parallel Programming on Accelerator Clusters (PPAC)}, year = {2011}, month = {2011-06}, keywords = {dague, magma, parsec}, author = {George Bosilca and Aurelien Bouteiller and Thomas Herault and Pierre Lemariner and Narapat Ohm Saengpatsa and Stanimire Tomov and Jack Dongarra} } @article {1361, title = {Power-aware Computing on GPGPUs}, year = {2011}, month = {2011-09}, publisher = {Fall Creek Falls Conference, Poster}, address = {Gatlinburg, TN}, author = {Kiran Kasichayanula and Haihang You and Shirley Moore and Stanimire Tomov and Heike Jagode and Matt Johnson} } @techreport {icl:625, title = {Soft Error Resilient QR Factorization for Hybrid System}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-11-675}, year = {2011}, month = {2011-07}, address = {Knoxville, TN}, keywords = {ft-la}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {icl:635, title = {Soft Error Resilient QR Factorization for Hybrid System}, journal = {UT-CS-11-675 (also LAPACK Working Note $\#$252)}, number = {ICL-CS-11-675}, year = {2011}, month = {2011-07}, keywords = {magma}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {icl:642, title = {Soft Error Resilient QR Factorization for Hybrid System with GPGPU}, journal = {Journal of Computational Science}, year = {2011}, month = {2011-11}, publisher = {Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems at SC11}, address = {Seattle, WA}, keywords = {ft-la}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:593, title = {A Unified HPC Environment for Hybrid Manycore/GPU Distributed Systems}, journal = {IEEE International Parallel and Distributed Processing Symposium (submitted)}, year = {2011}, month = {2011-05}, address = {Anchorage, AK}, keywords = {dague}, author = {George Bosilca and Aurelien Bouteiller and Thomas Herault and Pierre Lemariner and Narapat Ohm Saengpatsa and Stanimire Tomov and Jack Dongarra} } @article {icl:546, title = {Accelerating GPU Kernels for Dense Linear Algebra}, journal = {Proc. of VECPAR{\textquoteright}10}, year = {2010}, month = {2010-06}, address = {Berkeley, CA}, keywords = {magma}, author = {Rajib Nath and Stanimire Tomov and Jack Dongarra} } @article {1363, title = {Accelerating Linear Algebra on Heterogeneous Architectures of Multicore and GPUs using MAGMA and DPLASMA and StarPU Schedulers}, year = {2010}, month = {2010-07}, publisher = {2010 Symposium on Application Accelerators in
High-Performance Computing (SAAHPC{\textquoteright}10), Tutorial}, author = {Stanimire Tomov and George Bosilca and Cedric Augonnet} } @article {icl:547, title = {Accelerating the Reduction to Upper Hessenberg, Tridiagonal, and Bidiagonal Forms through Hybrid GPU-Based Computing}, journal = {Parallel Computing}, volume = {36}, number = {12}, year = {2010}, month = {2010-00}, pages = {645-654}, keywords = {magma}, author = {Stanimire Tomov and Rajib Nath and Jack Dongarra} } @article {1364, title = {Autotuning Dense Linear Algebra Libraries on GPUs}, year = {2010}, month = {2010-06}, publisher = {Sixth International Workshop on Parallel Matrix Algorithms and Applications (PMAA 2010)}, address = {Basel, Switzerland}, author = {Rajib Nath and Stanimire Tomov and Emmanuel Agullo and Jack Dongarra} } @inbook {854, title = {Blas for GPUs}, booktitle = {Scientific Computing with Multicore and Accelerators}, series = {Chapman \& Hall/CRC Computational Science}, year = {2010}, publisher = {CRC Press}, organization = {CRC Press}, chapter = {4}, address = {Boca Raton, Florida}, isbn = {9781439825365}, author = {Rajib Nath and Stanimire Tomov and Jack Dongarra} } @inbook {855, title = {Dense Linear Algebra for Hybrid GPU-based Systems}, booktitle = {Scientific Computing with Multicore and Accelerators}, series = {Chapman \& Hall/CRC Computational Science}, year = {2010}, publisher = {CRC Press}, organization = {CRC Press}, chapter = {3}, address = {Boca Raton, Florida}, isbn = {9781439825365}, author = {Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:523, title = {Dense Linear Algebra Solvers for Multicore with GPU Accelerators}, journal = {Parallel Distributed Processing, Workshops and Phd Forum (IPDPSW), 2010 IEEE International Symposium on}, year = {2010}, pages = {1-8}, address = {Atlanta, GA}, abstract = {Solving dense linear systems of equations is a fundamental problem in scientific computing. Numerical simulations involving complex systems represented in terms of unknown variables and relations between them often lead to linear systems of equations that must be solved as fast as possible. We describe current efforts toward the development of these critical solvers in the area of dense linear algebra (DLA) for multicore with GPU accelerators. We describe how to code/develop solvers to effectively use the high computing power available in these new and emerging hybrid architectures. The approach taken is based on hybridization techniques in the context of Cholesky, LU, and QR factorizations. We use a high-level parallel programming model and leverage existing software infrastructure, e.g. optimized BLAS for CPU and GPU, and LAPACK for sequential CPU processing. Included also are architecture and algorithm-specific optimizations for standard solvers as well as mixed-precision iterative refinement solvers. The new algorithms, depending on the hardware configuration and routine parameters, can lead to orders of magnitude acceleration when compared to the same algorithms on standard multicore architectures that do not contain GPU accelerators. 
The newly developed DLA solvers are integrated and freely available through the MAGMA library.}, doi = {10.1109/IPDPSW.2010.5470941}, author = {Stanimire Tomov and Rajib Nath and Hatem Ltaeif and Jack Dongarra} } @article {1351, title = {Dense Linear Algebra Solvers for Multicore with GPU Accelerators}, year = {2010}, month = {2010-04}, publisher = {International Parallel and Distributed Processing Symposium (IPDPS 2010)}, address = {Atlanta, GA}, author = {Stanimire Tomov} } @article {icl:639, title = {Divide \& Conquer on Hybrid GPU-Accelerated Multicore Systems}, journal = {SIAM Journal on Scientific Computing (submitted)}, year = {2010}, month = {2010-08}, keywords = {magma}, author = {Christof Voemel and Stanimire Tomov and Jack Dongarra} } @techreport {icl:585, title = {Faster, Cheaper, Better - A Hybridization Methodology to Develop Linear Algebra Software for GPUs}, journal = {LAPACK Working Note}, number = {230}, year = {2010}, month = {2010-00}, keywords = {magma, morse}, author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Hatem Ltaeif and Raymond Namyst and Samuel Thibault and Stanimire Tomov} } @article {icl:526, title = {Hybrid Multicore Cholesky Factorization with Multiple GPU Accelerators}, journal = {IEEE Transactions on Parallel and Distributed Systems (submitted)}, year = {2010}, month = {2010-03}, keywords = {magma, plasma}, author = {Hatem Ltaeif and Stanimire Tomov and Rajib Nath and Jack Dongarra} } @techreport {icl:548, title = {An Improved MAGMA GEMM for Fermi GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-10-655 (also LAPACK working note 227)}, year = {2010}, month = {2010-07}, keywords = {magma}, author = {Rajib Nath and Stanimire Tomov and Jack Dongarra} } @article {icl:582, title = {An Improved MAGMA GEMM for Fermi GPUs}, journal = {International Journal of High Performance Computing}, volume = {24}, number = {4}, year = {2010}, month = {2010-00}, pages = {511-515}, keywords = {magma}, author = {Rajib Nath and Stanimire Tomov and Jack Dongarra} } @article {1350, title = {An Introduction to the MAGMA project - Acceleration of Dense Linear Algebra}, year = {2010}, month = {2010-06}, publisher = {NVIDIA Webinar}, url = {http://developer.download.nvidia.com/CUDA/training/introtomagma.mp4}, author = {Jack Dongarra and Stanimire Tomov} } @inproceedings {icl:562, title = {Mixed-Tool Performance Analysis on Hybrid Multicore Architectures}, journal = {First International Workshop on Parallel Software Tools and Tool Infrastructures (PSTI 2010)}, year = {2010}, month = {2010-09}, address = {San Diego, CA}, keywords = {magma}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:584, title = {Performance Evaluation for Petascale Quantum Simulation Tools}, journal = {Proceedings of the Cray Users{\textquoteright} Group Meeting}, year = {2010}, month = {2010-05}, address = {Atlanta, GA}, author = {Stanimire Tomov and Wenchang Lu and Jerzy Bernholc and Shirley Moore and Jack Dongarra} } @inproceedings {icl:577, title = {QR Factorization on a Multicore Node Enhanced with Multiple GPU Accelerators}, journal = {Proceedings of IPDPS 2011}, number = {ICL-UT-10-04}, year = {2010}, month = {2010-10}, address = {Anchorage, AK}, keywords = {magma, morse, plasma}, author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Mathieu Faverge and Hatem Ltaeif and Samuel Thibault and Stanimire Tomov} } @article {icl:554, title = {Scalability Study of a Quantum Simulation
Code}, journal = {PARA 2010}, year = {2010}, month = {2010-06}, address = {Reykjavik, Iceland}, author = {Jerzy Bernholc and Miroslav Hodak and Wenchang Lu and Shirley Moore and Stanimire Tomov} } @article {icl:521, title = {A Scalable High Performant Cholesky Factorization for Multicore with GPU Accelerators}, journal = {Proc. of VECPAR{\textquoteright}10 (to appear)}, year = {2010}, month = {2010-06}, address = {Berkeley, CA}, keywords = {magma, plasma}, author = {Hatem Ltaeif and Stanimire Tomov and Rajib Nath and Peng Du and Jack Dongarra} } @article {1362, title = {Scheduling Cholesky Factorization on Multicore Architectures with GPU Accelerators}, year = {2010}, month = {2010-07}, publisher = {2010 Symposium on Application Accelerators in High-Performance Computing (SAAHPC{\textquoteright}10), Poster}, address = {Knoxville, TN}, author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Hatem Ltaeif and Raymond Namyst and Rajib Nath and Jean Roman and Samuel Thibault and Stanimire Tomov} } @article {icl:564, title = {Towards Dense Linear Algebra for Hybrid GPU Accelerated Manycore Systems}, journal = {Parallel Computing}, volume = {36}, number = {5-6}, year = {2010}, month = {2010-00}, pages = {232-240}, keywords = {magma}, author = {Stanimire Tomov and Jack Dongarra and Marc Baboulin} } @article {icl:620, title = {Using MAGMA with PGI Fortran}, journal = {PGI Insider}, year = {2010}, month = {2010-11}, keywords = {magma}, author = {Stanimire Tomov and Mathieu Faverge and Piotr Luszczek and Jack Dongarra} } @article {, title = {Accelerating Scientific Computations with Mixed Precision Algorithms}, journal = {Computer Physics Communications}, volume = {180}, year = {2009}, month = {2009-12}, pages = {2526-2533}, abstract = {On modern architectures, the performance of 32-bit operations is often at least twice as fast as the performance of 64-bit operations. By using a combination of 32-bit and 64-bit floating point arithmetic, the performance of many dense and sparse linear algebra algorithms can be significantly enhanced while maintaining the 64-bit accuracy of the resulting solution. The approach presented here can apply not only to conventional processors but also to other technologies such as Field Programmable Gate Arrays (FPGA), Graphical Processing Units (GPU), and the STI Cell BE processor. Results on modern processor architectures and the STI Cell BE are presented.}, doi = {https://doi.org/10.1016/j.cpc.2008.11.005}, author = {Marc Baboulin and Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julie Langou and Julien Langou and Piotr Luszczek and Stanimire Tomov} } @techreport {icl:485, title = {Accelerating the Reduction to Upper Hessenberg Form through Hybrid GPU-Based Computing}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-09-642 (also LAPACK Working Note 219)}, year = {2009}, month = {2009-05}, keywords = {magma}, author = {Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:512, title = {A Note on Auto-tuning GEMM for GPUs}, journal = {9th International Conference on Computational Science (ICCS 2009)}, number = {5544-5545}, year = {2009}, month = {2009-05}, pages = {884-892}, address = {Baton Rouge, LA}, doi = {10.1007/978-3-642-01970-8_89}, author = {Yinan Li and Jack Dongarra and Stanimire Tomov}, editor = {Gabrielle Allen and Jaros{\l}aw Nabrzyski and E. Seidel and Geert Dick van Albada and Jack Dongarra and Peter M. 
Sloot} } @article {1352, title = {Numerical Linear Algebra on Emerging Architectures: The PLASMA and MAGMA Projects}, year = {2009}, month = {2009-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC09)}, address = {Portland, OR}, author = {Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaeif and Piotr Luszczek and Rajib Nath and Stanimire Tomov and Asim YarKhan and Vasily Volkov} } @inproceedings {icl:486, title = {Numerical Linear Algebra on Emerging Architectures: The PLASMA and MAGMA Projects}, journal = {Journal of Physics: Conference Series}, volume = {180}, year = {2009}, month = {2009-00}, keywords = {magma, plasma}, author = {Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaeif and Piotr Luszczek and Stanimire Tomov} } @article {1365, title = {Numerical Linear Algebra on Hybrid Architectures: Recent Developments in the MAGMA Project}, year = {2009}, month = {2009-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC09)}, address = {Portland, Oregon}, author = {Rajib Nath and Jack Dongarra and Stanimire Tomov and Hatem Ltaeif and Peng Du} } @inproceedings {icl:478, title = {Performance evaluation for petascale quantum simulation tools}, journal = {Proceedings of CUG09}, year = {2009}, month = {2009-05}, address = {Atlanta, GA}, keywords = {doe-nano}, author = {Stanimire Tomov and Wenchang Lu and Jerzy Bernholc and Shirley Moore and Jack Dongarra} } @article {1353, title = {Enhancing the Performance of Dense Linear Algebra Solvers on GPUs (in the MAGMA Project)}, year = {2008}, month = {2008-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC08)}, address = {Austin, TX}, author = {Marc Baboulin and James Demmel and Jack Dongarra and Stanimire Tomov and Vasily Volkov} } @article {icl:449, title = {Exploiting Mixed Precision Floating Point Hardware in Scientific Computations}, journal = {in High Performance Computing and Grids in Action}, year = {2008}, month = {2008-01}, publisher = {IOS Press}, address = {Amsterdam}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Julie Langou and Piotr Luszczek and Stanimire Tomov}, editor = {Lucio Grandinetti} } @inproceedings {icl:440, title = {Exploring New Architectures in Accelerating CFD for Air Force Applications}, journal = {Proceedings of the DoD HPCMP User Group Conference}, year = {2008}, month = {2008-01}, address = {Seattle, Washington}, keywords = {magma}, author = {Jack Dongarra and Shirley Moore and Gregory D.
Peterson and Stanimire Tomov and Jeff Allred and Vincent Natoli and David Richie} } @inproceedings {icl:416, title = {Interior State Computation of Nano Structures}, journal = {PARA 2008, 9th International Workshop on State-of-the-Art in Scientific and Parallel Computing}, year = {2008}, month = {2008-05}, address = {Trondheim, Norway}, author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang} } @inproceedings {icl:516, title = {Some Issues in Dense Linear Algebra for Multicore and Special Purpose Architectures}, journal = {PARA 2008, 9th International Workshop on State-of-the-Art in Scientific and Parallel Computing}, year = {2008}, month = {2008-05}, address = {Trondheim, Norway}, keywords = {magma}, author = {Marc Baboulin and Stanimire Tomov and Jack Dongarra} } @techreport {icl:415, title = {Some Issues in Dense Linear Algebra for Multicore and Special Purpose Architectures}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-08-615 (also LAPACK Working Note 200)}, year = {2008}, month = {2008-01}, keywords = {magma}, author = {Marc Baboulin and Jack Dongarra and Stanimire Tomov} } @article {icl:447, title = {State-of-the-Art Eigensolvers for Electronic Structure Calculations of Large Scale Nano-Systems}, journal = {Journal of Computational Physics}, volume = {227}, number = {15}, year = {2008}, month = {2008-01}, pages = {7113-7124}, author = {Christof Voemel and Stanimire Tomov and Osni Marques and Andrew Canning and Lin-Wang Wang and Jack Dongarra} } @techreport {icl:443, title = {Towards Dense Linear Algebra for Hybrid GPU Accelerated Manycore Systems}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-08-632 (also LAPACK Working Note 210)}, year = {2008}, month = {2008-01}, keywords = {magma}, author = {Stanimire Tomov and Jack Dongarra and Marc Baboulin} } @article {icl:424, title = {Using Mixed Precision for Sparse Matrix Computations to Enhance the Performance while Achieving 64-bit Accuracy}, journal = {ACM Transactions on Mathematical Software}, volume = {34}, number = {4}, year = {2008}, month = {2008-00}, pages = {17-22}, keywords = {plasma}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov} } @article {icl:392, title = {Exploiting Mixed Precision Floating Point Hardware in Scientific Computations}, journal = {In High Performance Computing and Grids in Action (to appear)}, year = {2007}, month = {2007-00}, publisher = {IOS Press}, address = {Amsterdam}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Julie Langou and Piotr Luszczek and Stanimire Tomov}, editor = {Lucio Grandinetti} } @article {icl:401, title = {The Use of Bulk States to Accelerate the Band Edge State Calculation of a Semiconductor Quantum Dot}, journal = {Journal of Computational Physics}, volume = {223}, year = {2007}, month = {2007-00}, pages = {774-782}, author = {Christof Voemel and Stanimire Tomov and Lin-Wang Wang and Osni Marques and Jack Dongarra} } @article {icl:402, title = {Conjugate-Gradient Eigenvalue Solvers in Computing Electronic Properties of Nanostructure Architectures}, journal = {International Journal of Computational Science and Engineering}, volume = {2}, number = {3/4}, year = {2006}, month = {2006-00}, pages = {205-212}, author = {Stanimire Tomov and Julien Langou and Jack Dongarra and Andrew Canning and Lin-Wang Wang} } @article {icl:369, title = {The Impact of Multicore
on Math Software}, journal = {PARA 2006}, year = {2006}, month = {2006-06}, address = {Ume{\aa}, Sweden}, keywords = {plasma}, author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Piotr Luszczek and Stanimire Tomov} } @inproceedings {icl:325, title = {Performance evaluation of eigensolvers in nano-structure computations}, journal = {IEEE/ACM Proceedings of HPCNano SC06 (to appear)}, year = {2006}, month = {2006-01}, keywords = {doe-nano}, author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang} } @article {icl:327, title = {Predicting the electronic properties of 3D, million-atom semiconductor nanostructure architectures}, journal = {J. Phys.: Conf. Ser.}, volume = {46}, year = {2006}, month = {2006-01}, pages = {292-298}, doi = {10.1088/1742-6596/46/1/040}, keywords = {doe-nano}, author = {Alex Zunger and Alberto Franceschetti and Gabriel Bester and Wesley B. Jones and Kwiseon Kim and Peter A. Graf and Lin-Wang Wang and Andrew Canning and Osni Marques and Christof Voemel and Jack Dongarra and Julien Langou and Stanimire Tomov} } @article {icl:370, title = {Prospectus for the Next LAPACK and ScaLAPACK Libraries}, journal = {PARA 2006}, year = {2006}, month = {2006-06}, address = {Ume{\aa}, Sweden}, author = {James Demmel and Jack Dongarra and B. Parlett and William Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye Li and Osni Marques and Jason E. Riedy and Christof Voemel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov} } @inproceedings {icl:324, title = {Towards bulk based preconditioning for quantum dot computations}, journal = {IEEE/ACM Proceedings of HPCNano SC06 (to appear)}, year = {2006}, month = {2006-01}, keywords = {doe-nano}, author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang} } @article {icl:326, title = {The use of bulk states to accelerate the band edge state calculation of a semiconductor quantum dot}, journal = {Journal of Computational Physics (submitted)}, year = {2006}, month = {2006-01}, keywords = {doe-nano}, author = {Christof Voemel and Stanimire Tomov and Lin-Wang Wang and Osni Marques and Jack Dongarra} } @inproceedings {icl:284, title = {Comparison of Nonlinear Conjugate-Gradient Methods for Computing the Electronic Properties of Nanostructure Architectures}, journal = {Proceedings of 5th International Conference on Computational Science (ICCS)}, year = {2005}, month = {2005-01}, pages = {317-325}, publisher = {Springer{\textquoteright}s Lecture Notes in Computer Science}, address = {Atlanta, GA, USA}, keywords = {doe-nano}, author = {Stanimire Tomov and Julien Langou and Andrew Canning and Lin-Wang Wang and Jack Dongarra}, editor = {V. S. Sunderam and Geert Dick van Albada and Peter M. Sloot and Jack Dongarra} } @article {icl:292, title = {Conjugate-Gradient Eigenvalue Solvers in Computing Electronic Properties of Nanostructure Architectures}, journal = {International Journal of Computational Science and Engineering (to appear)}, year = {2005}, month = {2005-01}, author = {Stanimire Tomov and Julien Langou and Andrew Canning and Lin-Wang Wang and Jack Dongarra} }