@article{1319,
  title    = {Adaptive Precision in Block-{Jacobi} Preconditioning for Iterative Sparse Linear System Solvers},
  journal  = {Concurrency and Computation: Practice and Experience},
  volume   = {31},
  number   = {6},
  year     = {2019},
  pages    = {e4460},
  abstract = {Summary We propose an adaptive scheme to reduce communication overhead caused by data movement by selectively storing the diagonal blocks of a block-Jacobi preconditioner in different precision formats (half, single, or double). This specialized preconditioner can then be combined with any Krylov subspace method for the solution of sparse linear systems to perform all arithmetic in double precision. We assess the effects of the adaptive precision preconditioner on the iteration count and data transfer cost of a preconditioned conjugate gradient solver. A preconditioned conjugate gradient method is, in general, a memory bandwidth-bound algorithm, and therefore its execution time and energy consumption are largely dominated by the costs of accessing the problem{\textquoteright}s data in memory. Given this observation, we propose a model that quantifies the time and energy savings of our approach based on the assumption that these two costs depend linearly on the bit length of a floating point number. Furthermore, we use a number of test problems from the SuiteSparse matrix collection to estimate the potential benefits of the adaptive block-Jacobi preconditioning scheme.},
  keywords = {adaptive precision, block-Jacobi preconditioning, communication reduction, energy efficiency, Krylov subspace methods, sparse linear systems},
  doi      = {10.1002/cpe.4460},
  url      = {https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4460},
  author   = {Anzt, Hartwig and Dongarra, Jack and Flegar, Goran and Higham, Nicholas J. and Quintana-Ort{\'\i}, Enrique S.}
}
@article{1267,
  title    = {Adaptive Precision in Block-{Jacobi} Preconditioning for Iterative Sparse Linear System Solvers},
  journal  = {Concurrency and Computation: Practice and Experience},
  year     = {2018},
  month    = mar,
  abstract = {We propose an adaptive scheme to reduce communication overhead caused by data movement by selectively storing the diagonal blocks of a block-Jacobi preconditioner in different precision formats (half, single, or double). This specialized preconditioner can then be combined with any Krylov subspace method for the solution of sparse linear systems to perform all arithmetic in double precision. We assess the effects of the adaptive precision preconditioner on the iteration count and data transfer cost of a preconditioned conjugate gradient solver. A preconditioned conjugate gradient method is, in general, a memory bandwidth-bound algorithm, and therefore its execution time and energy consumption are largely dominated by the costs of accessing the problem{\textquoteright}s data in memory. Given this observation, we propose a model that quantifies the time and energy savings of our approach based on the assumption that these two costs depend linearly on the bit length of a floating point number. Furthermore, we use a number of test problems from the SuiteSparse matrix collection to estimate the potential benefits of the adaptive block-Jacobi preconditioning scheme.},
  keywords = {adaptive precision, block-Jacobi preconditioning, communication reduction, energy efficiency, Krylov subspace methods, sparse linear systems},
  doi      = {10.1002/cpe.4460},
  url      = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/Anzt_et_al-2018-Concurrency.pdf},
  author   = {Anzt, Hartwig and Dongarra, Jack and Flegar, Goran and Higham, Nicholas J. and Quintana-Ort{\'\i}, Enrique S.},
  internal-note = {NOTE(review): same DOI as entry 1319 (early-view vs. final version of the same article) -- consider merging the two entries}
}
@article{1300,
  title    = {Batched {BLAS} ({Basic Linear Algebra Subprograms}) 2018 Specification},
  year     = {2018},
  month    = jul,
  abstract = {This document describes an API for Batch Basic Linear Algebra Subprograms (Batched BLAS or BBLAS). We focus on many independent BLAS operations on small matrices that are grouped together and processed by a single routine, called a Batched BLAS routine. The extensions beyond the original BLAS standard are considered that specify a programming interface not only for routines with uniformly-sized matrices and/or vectors but also for the situation where the sizes vary. The aim is to provide more efficient, but portable, implementations of algorithms on high-performance manycore platforms. These include multicore and many-core CPU processors; GPUs and coprocessors; as well as other hardware accelerators with floating-point compute facility.},
  author   = {Dongarra, Jack and Duff, Iain and Gates, Mark and Haidar, Azzam and Hammarling, Sven and Higham, Nicholas J. and Hogg, Jonathan and Valero-Lara, Pedro and Luszczek, Piotr and Zounon, Mawussi and Relton, Samuel D. and Tomov, Stanimire and Costa, Timothy and Knepper, Sarah},
  internal-note = {NOTE(review): @article with no journal field -- this looks like a specification/technical report; consider @techreport with an institution field. Verify.}
}
@inproceedings{1264,
  title        = {Harnessing {GPU} Tensor Cores for Fast {FP16} Arithmetic to Speed up Mixed-Precision Iterative Refinement Solvers},
  booktitle    = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC18)},
  year         = {2018},
  month        = nov,
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Dallas, TX},
  abstract     = {Low-precision floating-point arithmetic is a powerful tool for accelerating scientific computing applications, especially those in artificial intelligence. Here, we present an investigation showing that other high-performance computing (HPC) applications can also harness this power. Specifically, we use the general HPC problem, Ax = b, where A is a large dense matrix, and a double precision (FP64) solution is needed for accuracy. Our approach is based on mixed-precision (FP16-FP64) iterative refinement, and we generalize and extend prior advances into a framework, for which we develop architecture-specific algorithms and highly tuned implementations. These new methods show how using half-precision Tensor Cores (FP16-TC) for the arithmetic can provide up to 4{\texttimes} speedup. This is due to the performance boost that the FP16-TC provide as well as to the improved accuracy over the classical FP16 arithmetic that is obtained because the GEMM accumulation occurs in FP32 arithmetic.},
  author       = {Haidar, Azzam and Tomov, Stanimire and Dongarra, Jack and Higham, Nicholas J.}
}
@inproceedings{1168,
  title        = {The Design and Performance of Batched {BLAS} on Modern High-Performance Computing Systems},
  booktitle    = {International Conference on Computational Science (ICCS 2017)},
  year         = {2017},
  month        = jun,
  publisher    = {Elsevier},
  organization = {Elsevier},
  address      = {Z{\"u}rich, Switzerland},
  abstract     = {A current trend in high-performance computing is to decompose a large linear algebra problem into batches containing thousands of smaller problems, that can be solved independently, before collating the results. To standardize the interface to these routines, the community is developing an extension to the BLAS standard (the batched BLAS), enabling users to perform thousands of small BLAS operations in parallel whilst making efficient use of their hardware. We discuss the benefits and drawbacks of the current batched BLAS proposals and perform a number of experiments, focusing on a general matrix-matrix multiplication (GEMM), to explore their affect on the performance. In particular we analyze the effect of novel data layouts which, for example, interleave the matrices in memory to aid vectorization and prefetching of data. Utilizing these modifications our code outperforms both MKL1 CuBLAS2 by up to 6 times on the self-hosted Intel KNL (codenamed Knights Landing) and Kepler GPU architectures, for large numbers of double precision GEMM operations using matrices of size 2 {\texttimes} 2 to 20 {\texttimes} 20.},
  keywords     = {Batched BLAS, BLAS, High-performance computing, Memory management, Parallel processing, Scientific computing},
  doi          = {10.1016/j.procs.2017.05.138},
  author       = {Dongarra, Jack and Hammarling, Sven and Higham, Nicholas J. and Relton, Samuel D. and Valero-Lara, Pedro and Zounon, Mawussi}
}
@inproceedings{1170,
  title        = {Optimized Batched Linear Algebra for Modern Architectures},
  booktitle    = {Euro-Par 2017},
  year         = {2017},
  month        = aug,
  publisher    = {Springer},
  organization = {Springer},
  address      = {Santiago de Compostela, Spain},
  abstract     = {Solving large numbers of small linear algebra problems simultaneously is becoming increasingly important in many application areas. Whilst many researchers have investigated the design of efficient batch linear algebra kernels for GPU architectures, the common approach for many/multi-core CPUs is to use one core per subproblem in the batch. When solving batches of very small matrices, 2 {\texttimes} 2 for example, this design exhibits two main issues: it fails to fully utilize the vector units and the cache of modern architectures, since the matrices are too small. Our approach to resolve this is as follows: given a batch of small matrices spread throughout the primary memory, we first reorganize the elements of the matrices into a contiguous array, using a block interleaved memory format, which allows us to process the small independent problems as a single large matrix problem and enables cross-matrix vectorization. The large problem is solved using blocking strategies that attempt to optimize the use of the cache. The solution is then converted back to the original storage format. To explain our approach we focus on two BLAS routines: general matrix-matrix multiplication (GEMM) and the triangular solve (TRSM). We extend this idea to LAPACK routines using the Cholesky factorization and solve (POSV). Our focus is primarily on very small matrices ranging in size from 2 {\texttimes} 2 to 32 {\texttimes} 32. Compared to both MKL and OpenMP implementations, our approach can be up to 4 times faster for GEMM, up to 14 times faster for TRSM, and up to 40 times faster for POSV on the new Intel Xeon Phi processor, code-named Knights Landing (KNL). Furthermore, we discuss strategies to avoid data movement between sockets when using our interleaved approach on a NUMA node.},
  doi          = {10.1007/978-3-319-64203-1_37},
  author       = {Dongarra, Jack and Hammarling, Sven and Higham, Nicholas J. and Relton, Samuel D. and Zounon, Mawussi}
}
@incollection{927,
  title        = {High-Performance Computing},
  booktitle    = {The Princeton Companion to Applied Mathematics},
  year         = {2015},
  pages        = {839--842},
  publisher    = {Princeton University Press},
  organization = {Princeton University Press},
  address      = {Princeton, New Jersey},
  isbn         = {9781400874477},
  author       = {Dongarra, Jack and Higham, Nicholas J.},
  editor       = {Higham, Nicholas J. and Dennis, Mark R. and Glendinning, Paul and Martin, Paul A. and Santosa, Fadil and Tanner, Jared},
  internal-note = {NOTE(review): original entry listed the book's editorial board in the author field; split into author (chapter) and editor (volume) -- verify against the publisher's page}
}