@booklet {hoefler2023eve, title = {Earth Virtualization Engines - A Technical Perspective}, year = {2023}, month = sep, abstract = {Participants of the Berlin Summit on Earth Virtualization Engines (EVEs) discussed ideas and concepts to improve our ability to cope with climate change. EVEs aim to provide interactive and accessible climate simulations and data for a wide range of users. They combine high-resolution physics-based models with machine learning techniques to improve the fidelity, efficiency, and interpretability of climate projections. At their core, EVEs offer a federated data layer that enables simple and fast access to exabyte-sized climate data through simple interfaces. In this article, we summarize the technical challenges and opportunities for developing EVEs, and argue that they are essential for addressing the consequences of climate change.}, url = {https://arxiv.org/abs/2309.09002}, eprint = {2309.09002}, eprinttype = {arXiv}, author = {Torsten Hoefler and Bjorn Stevens and Andreas F. Prein and Johanna Baehr and Thomas Schulthess and Thomas F. Stocker and John Taylor and Daniel Klocke and Pekka Manninen and Piers M. Forster and Tobias K{\"o}lling and Nicolas Gruber and Hartwig Anzt and Claudia Frauen and Florian Ziemen and Milan Kl{\"o}wer and Karthik Kashinath and Christoph Sch{\"a}r and Oliver Fuhrer and Bryan N. Lawrence} } @article {aliaga2023csrc, title = {Sparse matrix-vector and matrix-multivector products for the truncated SVD on graphics processors}, journal = {Concurrency and Computation: Practice and Experience}, year = {2023}, month = aug, abstract = {Many practical algorithms for numerical rank computations implement an iterative procedure that involves repeated multiplications of a vector, or a collection of vectors, with both a sparse matrix A and its transpose. Unfortunately, the realization of these sparse products on current high performance libraries often deliver much lower arithmetic throughput when the matrix involved in the product is transposed. 
In this work, we propose a hybrid sparse matrix layout, named CSRC, that combines the flexibility of some well-known sparse formats to offer a number of appealing properties: (1) CSRC can be obtained at low cost from the popular CSR (compressed sparse row) format; (2) CSRC has similar storage requirements as CSR; and especially, (3) the implementation of the sparse product kernels delivers high performance for both the direct product and its transposed variant on modern graphics accelerators thanks to a significant reduction of atomic operations compared to a conventional implementation based on CSR. This solution thus renders considerably higher performance when integrated into an iterative algorithm for the truncated singular value decomposition (SVD), such as the randomized SVD or, as demonstrated in the experimental results, the block Golub{\textendash}Kahan{\textendash}Lanczos algorithm.}, keywords = {graphics processing units, Singular value decomposition, sparse matrix-multivector product, sparse matrix-vector product}, issn = {1532-0626}, doi = {10.1002/cpe.7871}, url = {https://onlinelibrary.wiley.com/doi/pdf/10.1002/cpe.7871}, author = {Jos{\'e} I. Aliaga and Hartwig Anzt and Enrique S. Quintana-Orti and Andres E. Thomas} } @article {, title = {Ginkgo{\textemdash}A math library designed for platform portability}, journal = {Parallel Computing}, volume = {111}, year = {2022}, month = {2022-02}, pages = {102902}, abstract = {In an era of increasing computer system diversity, the portability of software from one system to another plays a central role. Software portability is important for the software developers as many software projects have a lifetime longer than a specific system, e.g., a supercomputer, and it is important for the domain scientists that realize their scientific application in a software framework and want to be able to run on one or another system. 
On a high level, there exist two approaches for realizing platform portability: (1) implementing software using a portability layer leveraging any technique which always generates specific kernels from another language or through an interface for running on different architectures; and (2) providing backends for different hardware architectures, with the backends typically differing in how and in which programming language functionality is realized due to using the language of choice for each hardware (e.g., CUDA kernels for NVIDIA GPUs, SYCL (DPC++) kernels to targeting Intel GPUs and other supported hardware, {\textellipsis}). In practice, these two approaches can be combined in applications to leverage their respective strengths. In this paper, we present how we realize portability across different hardware architectures for the Ginkgo library by following the second strategy and the goal to not only port to new hardware architectures but also achieve good performance. We present the Ginkgo library design, separating algorithms from hardware-specific kernels forming the distinct hardware executors, and report our experience when adding execution backends for NVIDIA, AMD, and Intel GPUs. We also present the performance we achieve with this approach for distinct hardware backends.}, keywords = {AMD, Intel, nVidia, performance portability, Platform Portability, Porting to GPU accelerators}, issn = {0167-8191}, doi = {10.1016/j.parco.2022.102902}, url = {https://www.sciencedirect.com/science/article/pii/S0167819122000096}, author = {Terry Cojean and Yu-Hsiang Mike Tsai and Hartwig Anzt} } @article {anzt2021ginkgo, title = {Ginkgo: A Sparse Linear Algebra Library for HPC}, year = {2021}, month = apr, publisher = {2021 ECP Annual Meeting}, author = {Hartwig Anzt and Natalie Beams and Terry Cojean and Fritz G{\"o}bel and Thomas Gr{\"u}tzmacher and Aditya Kashi and Pratik Nayak and Tobias Ribizel and Yuhsiang M. 
Tsai} } @article {, title = {Clover: Computational Libraries Optimized via Exascale Research}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Mark Gates and Stanimire Tomov and Hartwig Anzt and Piotr Luszczek and Jack Dongarra} } @article {, title = {Evaluating Asynchronous Schwarz Solvers on GPUs}, journal = {International Journal of High Performance Computing Applications}, year = {2020}, month = {2020-08}, abstract = {With the commencement of the exascale computing era, we realize that the majority of the leadership supercomputers are heterogeneous and massively parallel. Even a single node can contain multiple co-processors such as GPUs and multiple CPU cores. For example, ORNL{\textquoteright}s Summit accumulates six NVIDIA Tesla V100 GPUs and 42 IBM Power9 cores on each node. Synchronizing across compute resources of multiple nodes can be prohibitively expensive. Hence, it is necessary to develop and study asynchronous algorithms that circumvent this issue of bulk-synchronous computing. In this study, we examine the asynchronous version of the abstract Restricted Additive Schwarz method as a solver. We do not explicitly synchronize, but allow the communication between the sub-domains to be completely asynchronous, thereby removing the bulk synchronous nature of the algorithm. We accomplish this by using the one-sided Remote Memory Access (RMA) functions of the MPI standard. We study the benefits of using such an asynchronous solver over its synchronous counterpart. We also study the communication patterns governed by the partitioning and the overlap between the sub-domains on the global solver. 
Finally, we show that this concept can render attractive performance benefits over the synchronous counterparts even for a well-balanced problem.}, keywords = {abstract Schwarz methods, Asynchronous solvers, exascale, GPUs, multicore processors, parallel numerical linear algebra}, doi = {https://doi.org/10.1177/1094342020946814}, author = {Pratik Nayak and Terry Cojean and Hartwig Anzt} } @conference {, title = {Evaluating the Performance of NVIDIA{\textquoteright}s A100 Ampere GPU for Sparse and Batched Computations}, booktitle = {2020 IEEE/ACM Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)}, year = {2020}, month = {2020-11}, publisher = {IEEE}, organization = {IEEE}, abstract = {GPU accelerators have become an important backbone for scientific high performance-computing, and the performance advances obtained from adopting new GPU hardware are significant. In this paper we take a first look at NVIDIA{\textquoteright}s newest server-line GPU, the A100 architecture, part of the Ampere generation. Specifically, we assess its performance for sparse and batch computations, as these routines are relied upon in many scientific applications, and compare to the p}, keywords = {Batched linear algebra, NVIDIA A100 GPU, sparse linear algebra, Sparse Matrix Vector Product}, author = {Hartwig Anzt and Yuhsiang M. Tsai and Ahmad Abdelfattah and Terry Cojean and Jack Dongarra} } @article {, title = {Ginkgo: A High Performance Numerical Linear Algebra Library}, journal = {Journal of Open Source Software}, volume = {5}, year = {2020}, month = {2020-08}, abstract = {Ginkgo is a production-ready sparse linear algebra library for high performance computing on GPU-centric architectures with a high level of performance portability and focuses on software sustainability. 
The library focuses on solving sparse linear systems and accommodates a large variety of matrix formats, state-of-the-art iterative (Krylov) solvers and preconditioners, which make the library suitable for a variety of scientific applications. Ginkgo supports many architectures such as multi-threaded CPU, NVIDIA GPUs, and AMD GPUs. The heavy use of modern C++ features simplifies the addition of new executor paradigms and algorithmic functionality without introducing significant performance overhead. Solving linear systems is usually one of the most computationally and memory intensive aspects of any application. Hence there has been a significant amount of effort in this direction with software libraries such as UMFPACK (Davis, 2004) and CHOLMOD (Chen, Davis, Hager, \& Rajamanickam, 2008) for solving linear systems with direct methods and PETSc (Balay et al., 2020), Trilinos ({\textquotedblleft}The Trilinos Project Website,{\textquotedblright} 2020), Eigen (Guennebaud, Jacob, \& others, 2010) and many more to solve linear systems with iterative methods. With Ginkgo, we aim to ensure high performance while not compromising portability. Hence, we provide very efficient low level kernels optimized for different architectures and separate these kernels from the algorithms thereby ensuring extensibility and ease of use. Ginkgo is also a part of the xSDK effort (Bartlett et al., 2017) and available as a Spack (Gamblin et al., 2015) package. xSDK aims to provide infrastructure for and interoperability between a collection of related and complementary software elements to foster rapid and efficient development of scientific applications using High Performance Computing. Within this effort, we provide interoperability with application libraries such as deal.ii (Arndt et al., 2019) and mfem (Anderson et al., 2020). 
Ginkgo provides wrappers within these two libraries so that they can take advantage of the features of Ginkgo.}, doi = {https://doi.org/10.21105/joss.02260}, author = {Hartwig Anzt and Terry Cojean and Yen-Chen Chen and Fritz Goebel and Thomas Gruetzmacher and Pratik Nayak and Tobias Ribizel and Yu-Hsiang Tsai} } @article {, title = {Ginkgo: A Node-Level Sparse Linear Algebra Library for HPC (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Hartwig Anzt and Terry Cojean and Yen-Chen Chen and Fritz Goebel and Thomas Gruetzmacher and Pratik Nayak and Tobias Ribizel and Yu-Hsiang Tsai and Jack Dongarra} } @article {, title = {Load-Balancing Sparse Matrix Vector Product Kernels on GPUs}, journal = {ACM Transactions on Parallel Computing}, volume = {7}, year = {2020}, month = {2020-03}, abstract = {Efficient processing of Irregular Matrices on Single Instruction, Multiple Data (SIMD)-type architectures is a persistent challenge. Resolving it requires innovations in the development of data formats, computational techniques, and implementations that strike a balance between thread divergence, which is inherent for Irregular Matrices, and padding, which alleviates the performance-detrimental thread divergence but introduces artificial overheads. To this end, in this article, we address the challenge of designing high performance sparse matrix-vector product (SpMV) kernels designed for Nvidia Graphics Processing Units (GPUs). We present a compressed sparse row (CSR) format suitable for unbalanced matrices. We also provide a load-balancing kernel for the coordinate (COO) matrix format and extend it to a hybrid algorithm that stores part of the matrix in SIMD-friendly Ellpack format (ELL) format. The ratio between the ELL- and the COO-part is determined using a theoretical analysis of the nonzeros-per-row distribution. 
For the over 2,800 test matrices available in the Suite Sparse matrix collection, we compare the performance against SpMV kernels provided by NVIDIA{\textquoteright}s cuSPARSE library and a heavily-tuned sliced ELL (SELL-P) kernel that prevents unnecessary padding by considering the irregular matrices as a combination of matrix blocks stored in ELL format.}, doi = {https://doi.org/10.1145/3380930}, author = {Hartwig Anzt and Terry Cojean and Chen Yen-Chen and Jack Dongarra and Goran Flegar and Pratik Nayak and Stanimire Tomov and Yuhsiang M. Tsai and Weichung Wang} } @conference {, title = {Multiprecision Block-Jacobi for Iterative Triangular Solves}, booktitle = {European Conference on Parallel Processing (Euro-Par 2020)}, year = {2020}, month = {2020-08}, publisher = {Springer}, organization = {Springer}, abstract = {Recent research efforts have shown that Jacobi and block-Jacobi relaxation methods can be used as an effective and highly parallel approach for the solution of sparse triangular linear systems arising in the application of ILU-type preconditioners. Simultaneously, a few independent works have focused on designing efficient high performance adaptive-precision block-Jacobi preconditioning (block-diagonal scaling), in the context of the iterative solution of sparse linear systems, on manycore architectures. In this paper, we bridge the gap between relaxation methods based on regular splittings and preconditioners by demonstrating that iterative refinement can be leveraged to construct a relaxation method from the preconditioner. 
In addition, we exploit this insight to construct a highly-efficient sparse triangular system solver for graphics processors that combines iterative refinement with the block-Jacobi preconditioner available in the Ginkgo library.}, keywords = {Block-Jacobi, graphics processing units (GPUs), incomplete factorization preconditioning, multiprecision, sparse linear algebra}, doi = {https://doi.org/10.1007/978-3-030-57675-2_34}, author = {Fritz Goebel and Hartwig Anzt and Terry Cojean and Goran Flegar and Enrique S. Quintana-Orti} } @conference {, title = {Scalable Data Generation for Evaluating Mixed-Precision Solvers}, booktitle = {2020 IEEE High Performance Extreme Computing Conference (HPEC)}, year = {2020}, month = {2020-09}, publisher = {IEEE}, organization = {IEEE}, address = { Waltham, MA, USA}, abstract = {We present techniques of generating data for mixed precision solvers that allows to test those solvers in a scalable manner. Our techniques focus on mixed precision hardware and software where both the solver and the hardware can take advantage of mixing multiple floating precision formats. This allows taking advantage of recently released generation of hardware platforms that focus on ML and DNN workloads but can also be utilized for HPC applications if a new breed of algorithms is combined with the custom floating-point formats to deliver performance levels beyond the standard IEEE data types while delivering a comparable accuracy of the results.}, doi = {https://doi.org/10.1109/HPEC43674.2020.9286145}, author = {Piotr Luszczek and Yaohung Tsai and Neil Lindquist and Hartwig Anzt and Jack Dongarra} } @conference {, title = {Sparse Linear Algebra on AMD and NVIDIA GPUs{\textemdash}The Race is On}, booktitle = {ISC High Performance}, year = {2020}, month = {2020-06}, publisher = {Springer}, organization = {Springer}, abstract = {Efficiently processing sparse matrices is a central and performance-critical part of many scientific simulation codes. 
Recognizing the adoption of manycore accelerators in HPC, we evaluate in this paper the performance of the currently best sparse matrix-vector product (SpMV) implementations on high-end GPUs from AMD and NVIDIA. Specifically, we optimize SpMV kernels for the CSR, COO, ELL, and HYB format taking the hardware characteristics of the latest GPU technologies into account. We compare for 2,800 test matrices the performance of our kernels against AMD{\textquoteright}s hipSPARSE library and NVIDIA{\textquoteright}s cuSPARSE library, and ultimately assess how the GPU technologies from AMD and NVIDIA compare in terms of SpMV performance.}, keywords = {AMD, GPUs, nVidia, sparse matrix vector product (SpMV)}, doi = {https://doi.org/10.1007/978-3-030-50743-5_16}, author = {Yuhsiang M. Tsai and Terry Cojean and Hartwig Anzt} } @techreport {, title = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic}, journal = {SLATE Working Notes}, number = {15, ICL-UT-20-08}, year = {2020}, month = {2020-07}, publisher = {University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Erik Boman and Erin Carson and Terry Cojean and Jack Dongarra and Mark Gates and Thomas Gruetzmacher and Nicholas J. 
Higham and Sherry Li and Neil Lindquist and Yang Liu and Jennifer Loe and Piotr Luszczek and Pratik Nayak and Sri Pranesh and Siva Rajamanickam and Tobias Ribizel and Barry Smith and Kasia Swirydowicz and Stephen Thomas and Stanimire Tomov and Yaohung Tsai and Ichitaro Yamazaki and Ulrike Meier Yang} } @article {1319, title = {Adaptive Precision in Block-Jacobi Preconditioning for Iterative Sparse Linear System Solvers}, journal = {Concurrency and Computation: Practice and Experience}, volume = {31}, number = {6}, year = {2019}, month = mar, pages = {e4460}, abstract = {We propose an adaptive scheme to reduce communication overhead caused by data movement by selectively storing the diagonal blocks of a block-Jacobi preconditioner in different precision formats (half, single, or double). This specialized preconditioner can then be combined with any Krylov subspace method for the solution of sparse linear systems to perform all arithmetic in double precision. We assess the effects of the adaptive precision preconditioner on the iteration count and data transfer cost of a preconditioned conjugate gradient solver. A preconditioned conjugate gradient method is, in general, a memory bandwidth-bound algorithm, and therefore its execution time and energy consumption are largely dominated by the costs of accessing the problem{\textquoteright}s data in memory. Given this observation, we propose a model that quantifies the time and energy savings of our approach based on the assumption that these two costs depend linearly on the bit length of a floating point number. 
Furthermore, we use a number of test problems from the SuiteSparse matrix collection to estimate the potential benefits of the adaptive block-Jacobi preconditioning scheme.}, keywords = {adaptive precision, block-Jacobi preconditioning, communication reduction, energy efficiency, Krylov subspace methods, sparse linear systems}, doi = {https://doi.org/10.1002/cpe.4460}, url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4460}, author = {Hartwig Anzt and Jack Dongarra and Goran Flegar and Nicholas J. Higham and Enrique S. Quintana-Orti} } @conference {1368, title = {Approximate and Exact Selection on GPUs}, booktitle = {2019 IEEE International Parallel and Distributed Processing Symposium Workshops}, year = {2019}, month = {2019-05}, publisher = {IEEE}, organization = {IEEE}, address = {Rio de Janeiro, Brazil}, abstract = {We present a novel algorithm for parallel selection on GPUs. The algorithm requires no assumptions on the input data distribution, and has a much lower recursion depth compared to many state-of-the-art algorithms. We implement the algorithm for different GPU generations, always using the respectively-available low-level communication features, and assess the performance on server-line hardware. The computational complexity of our SampleSelect algorithm is comparable to specialized algorithms designed for - and exploiting the characteristics of - "pleasant" data distributions. At the same time, as the SampleSelect does not work on the actual values but the ranks of the elements only, it is robust to the input data and can complete significantly faster for adversarial data distributions. Additionally to the exact SampleSelect, we address the use case of approximate selection by designing a variant that radically reduces the computational cost while preserving high approximation accuracy.}, doi = {10.1109/IPDPSW.2019.00088}, author = {Tobias Ribizel and Hartwig Anzt} } @conference {1367, title = {Are we Doing the Right Thing? 
{\textendash} A Critical Analysis of the Academic HPC Community}, booktitle = {2019 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2019}, month = may, publisher = {IEEE}, organization = {IEEE}, address = {Rio de Janeiro, Brazil}, abstract = {Like in any other research field, academically surviving in the High Performance Computing (HPC) community generally requires to publish papers, in the best case many of them and in high-ranked journals or at top-tier conferences. As a result, the number of scientific papers published each year in this relatively small community easily outnumbers what a single researcher can read. At the same time, many of the proposed and analyzed strategies, algorithms, and hardware-optimized implementations never make it beyond the prototype stage, as they are abandoned once they served the single purpose of yielding (another) publication. In a time and field where high-quality manpower is a scarce resource, this is extremely inefficient. In this position paper we promote a radical paradigm shift towards accepting high-quality software patches to community software packages as legitimate conference contributions. 
In consequence, the reputation and appointability of researchers is no longer based on the classical scientific metrics, but on the quality and documentation of open source software contributions - effectively improving and accelerating the collaborative development of community software.}, doi = {10.1109/IPDPSW.2019.00122}, author = {Hartwig Anzt and Goran Flegar} } @article {1369, title = {A Customized Precision Format Based on Mantissa Segmentation for Accelerating Sparse Linear Algebra}, journal = {Concurrency and Computation: Practice and Experience}, volume = {40319}, internal-note = {volume 40319 looks like an export artifact, not a real CPE volume number - verify against the published article}, year = {2019}, month = jan, issn = {1532-0626}, doi = {10.1002/cpe.5418}, author = {Thomas Gruetzmacher and Terry Cojean and Goran Flegar and Fritz G{\"o}bel and Hartwig Anzt} } @article {1377, title = {PAPI Software-Defined Events for in-Depth Performance Analysis}, journal = {The International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = nov, pages = {1113-1127}, abstract = {The methodology and standardization layer provided by the Performance Application Programming Interface (PAPI) has played a vital role in application profiling for almost two decades. It has enabled sophisticated performance analysis tool designers and performance-conscious scientists to gain insights into their applications by simply instrumenting their code using a handful of PAPI functions that {\textquotedblleft}just work{\textquotedblright} across different hardware components. In the past, PAPI development had focused primarily on hardware-specific performance metrics. However, the rapidly increasing complexity of software infrastructure poses new measurement and analysis challenges for the developers of large-scale applications. 
In particular, acquiring information regarding the behavior of libraries and runtimes{\textemdash}used by scientific applications{\textemdash}requires low-level binary instrumentation, or APIs specific to each library and runtime. No uniform API for monitoring events that originate from inside the software stack has emerged. In this article, we present our efforts to extend PAPI{\textquoteright}s role so that it becomes the de facto standard for exposing performance-critical events, which we refer to as software-defined events (SDEs), from different software layers. Upgrading PAPI with SDEs enables monitoring of both types of performance events{\textemdash}hardware- and software-related events{\textemdash}in a uniform way, through the same consistent PAPI. The goal of this article is threefold. First, we motivate the need for SDEs and describe our design decisions regarding the functionality we offer through PAPI{\textquoteright}s new SDE interface. Second, we illustrate how SDEs can be utilized by different software packages, specifically, by showcasing their use in the numerical linear algebra library MAGMA-Sparse, the tensor algebra library TAMM that is part of the NWChem suite, and the compiler-based performance analysis tool Byfl. Third, we provide a performance analysis of the overhead that results from monitoring SDEs and discuss the trade-offs between overhead and functionality.}, url = {https://doi.org/10.1177/1094342019846287}, author = {Heike Jagode and Anthony Danalis and Hartwig Anzt and Jack Dongarra} } @article {1437, title = {Parallel Selection on GPUs}, journal = {Parallel Computing}, volume = {91}, year = {2019}, month = {2020-03}, abstract = {We present a novel parallel selection algorithm for GPUs capable of handling single rank selection (single selection) and multiple rank selection (multiselection). 
The algorithm requires no assumptions on the input data distribution, and has a much lower recursion depth compared to many state-of-the-art algorithms. We implement the algorithm for different GPU generations, always leveraging the respectively-available low-level communication features, and assess the performance on server-line hardware. The computational complexity of our SampleSelect algorithm is comparable to specialized algorithms designed for {\textendash} and exploiting the characteristics of {\textendash} {\textquotedblleft}pleasant{\textquotedblright} data distributions. At the same time, as the proposed SampleSelect algorithm does not work on the actual element values but on the element ranks of the elements only, it is robust to the input data and can complete significantly faster for adversarial data distributions. We also address the use case of approximate selection by designing a variant that radically reduces the computational cost while preserving high approximation accuracy.}, keywords = {approximate selection, gpu, kth order statistics, multiselection, parallel selection algorithm}, doi = {https://doi.org/10.1016/j.parco.2019.102588}, url = {https://www.sciencedirect.com/science/article/pii/S0167819119301796}, author = {Tobias Ribizel and Hartwig Anzt} } @conference {1436, title = {ParILUT {\textendash} A Parallel Threshold ILU for GPUs}, booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2019}, month = {2019-05}, publisher = {IEEE}, organization = {IEEE}, address = {Rio de Janeiro, Brazil}, abstract = {In this paper, we present the first algorithm for computing threshold ILU factorizations on GPU architectures. The proposed ParILUT-GPU algorithm is based on interleaving parallel fixed-point iterations that approximate the incomplete factors for an existing nonzero pattern with a strategy that dynamically adapts the nonzero pattern to the problem characteristics. 
This requires the efficient selection of thresholds that separate the values to be dropped from the incomplete factors, and we design a novel selection algorithm tailored towards GPUs. All components of the ParILUT-GPU algorithm make heavy use of the features available in the latest NVIDIA GPU generations, and outperform existing multithreaded CPU implementations.}, doi = {https://doi.org/10.1109/IPDPS.2019.00033}, author = {Hartwig Anzt and Tobias Ribizel and Goran Flegar and Edmond Chow and Jack Dongarra} } @article {1317, title = {Toward a Modular Precision Ecosystem for High-Performance Computing}, journal = {The International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = {2019-11}, pages = {1069-1078}, abstract = {With the memory bandwidth of current computer architectures being significantly slower than the (floating point) arithmetic performance, many scientific computations only leverage a fraction of the computational power in today{\textquoteright}s high-performance architectures. At the same time, memory operations are the primary energy consumer of modern architectures, heavily impacting the resource cost of large-scale applications and the battery life of mobile devices. This article tackles this mismatch between floating point arithmetic throughput and memory bandwidth by advocating a disruptive paradigm change with respect to how data are stored and processed in scientific applications. Concretely, the goal is to radically decouple the data storage format from the processing format and, ultimately, design a {\textquotedblleft}modular precision ecosystem{\textquotedblright} that allows for more flexibility in terms of customized data access. For memory-bounded scientific applications, dynamically adapting the memory precision to the numerical requirements allows for attractive resource savings. 
In this article, we demonstrate the potential of employing a modular precision ecosystem for the block-Jacobi preconditioner and the PageRank algorithm{\textemdash}two applications that are popular in the communities and at the same characteristic representatives for the field of numerical linear algebra and data analytics, respectively.}, keywords = {conjugate gradient, GPUs, Jacobi method, Modular precision, multicore processors, PageRank, parallel numerical linear algebra}, issn = {1094-3420}, doi = {https://doi.org/10.1177/1094342019846547}, author = {Hartwig Anzt and Goran Flegar and Thomas Gruetzmacher and Enrique S. Quintana-Orti} } @article {1438, title = {Towards a New Peer Review Concept for Scientific Computing ensuring Technical Quality, Software Sustainability, and Result Reproducibility}, journal = {Proceedings in Applied Mathematics and Mechanics}, volume = {19}, year = {2019}, month = {2019-11}, abstract = {In this position paper we argue for implementing an alternative peer review process for scientific computing contributions that promotes high quality scientific software developments as fully-recognized conference submission. The idea is based on leveraging the code reviewers{\textquoteright} feedback on scientific software contributions to community software developments as a third-party review involvement. Providing open access to this technical review would complement the scientific review of the contribution, efficiently reduce the workload of the undisclosed reviewers, improve the algorithm implementation quality and software sustainability, and ensure full reproducibility of the reported results. Using this process creates incentives to publish scientific algorithms in open source software {\textendash} instead of designing prototype algorithms with the unique purpose of publishing a paper. 
In addition, the comments and suggestions of the community being archived in the versioning control systems ensure that also community reviewers are receiving credit for the review contributions {\textendash} unlike reviewers in the traditional peer review process. Finally, it reflects the particularity of the scientific computing community using conferences rather than journals as the main publication venue.}, issn = {1617-7061}, doi = {https://doi.org/10.1002/pamm.201900490}, author = {Hartwig Anzt and Terry Cojean and Eileen Kuhn} } @conference {1318, title = {Towards Continuous Benchmarking}, booktitle = {Platform for Advanced Scientific Computing Conference (PASC 2019)}, year = {2019}, month = {2019-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Zurich, Switzerland}, abstract = {We present an automated performance evaluation framework that enables an automated workflow for testing and performance evaluation of software libraries. Integrating this component into an ecosystem enables sustainable software development, as a community effort, via a web application for interactively evaluating the performance of individual software components. The performance evaluation tool is based exclusively on web technologies, which removes the burden of downloading performance data or installing additional software. We employ this framework for the Ginkgo software ecosystem, but the framework can be used with essentially any software project, including the comparison between different software libraries. The Continuous Integration (CI) framework of Ginkgo is also extended to automatically run a benchmark suite on predetermined HPC systems, store the state of the machine and the environment along with the compiled binaries, and collect results in a publicly accessible performance data repository based on Git. The Ginkgo performance explorer (GPE) can be used to retrieve the performance data from the repository, and visualizes it in a web browser. 
GPE also implements an interface that allows users to write scripts, archived in a Git repository, to extract particular data, compute particular metrics, and visualize them in many different formats (as specified by the script). The combination of these approaches creates a workflow which enables performance reproducibility and software sustainability of scientific software. In this paper, we present example scripts that extract and visualize performance data for Ginkgo{\textquoteright}s SpMV kernels that allow users to identify the optimal kernel for specific problem characteristics.}, isbn = {9781450367707}, doi = {https://doi.org/10.1145/3324989.3325719}, author = {Hartwig Anzt and Yen Chen Chen and Terry Cojean and Jack Dongarra and Goran Flegar and Pratik Nayak and Enrique S. Quintana-Orti and Yuhsiang M. Tsai and Weichung Wang} } @article {1220, title = {Variable-Size Batched Gauss-Jordan Elimination for Block-Jacobi Preconditioning on Graphics Processors}, journal = {Parallel Computing}, volume = {81}, year = {2019}, month = {2019-01}, pages = {131-146}, abstract = {In this work, we address the efficient realization of block-Jacobi preconditioning on graphics processing units (GPUs). This task requires the solution of a collection of small and independent linear systems. To fully realize this implementation, we develop a variable-size batched matrix inversion kernel that uses Gauss-Jordan elimination (GJE) along with a variable-size batched matrix{\textendash}vector multiplication kernel that transforms the linear systems{\textquoteright} right-hand sides into the solution vectors. Our kernels make heavy use of the increased register count and the warp-local communication associated with newer GPU architectures. Moreover, in the matrix inversion, we employ an implicit pivoting strategy that migrates the workload (i.e., operations) to the place where the data resides instead of moving the data to the executing cores. 
We complement the matrix inversion with extraction and insertion strategies that allow the block-Jacobi preconditioner to be set up rapidly. The experiments on NVIDIA{\textquoteright}s K40 and P100 architectures reveal that our variable-size batched matrix inversion routine outperforms the CUDA basic linear algebra subroutine (cuBLAS) library functions that provide the same (or even less) functionality. We also show that the preconditioner setup and preconditioner application cost can be somewhat offset by the faster convergence of the iterative solver.}, keywords = {Batched algorithms, Block-Jacobi, Gauss{\textendash}Jordan elimination, Graphics processor, matrix inversion, sparse linear systems}, doi = {https://doi.org/10.1016/j.parco.2017.12.006}, author = {Hartwig Anzt and Jack Dongarra and Goran Flegar and Enrique S. Quintana-Orti} } @article {1158, title = {Incomplete Sparse Approximate Inverses for Parallel Preconditioning}, journal = {Parallel Computing}, volume = {71}, year = {2018}, month = {2018-01}, pages = {1{\textendash}22}, abstract = {In this paper, we propose a new preconditioning method that can be seen as a generalization of block-Jacobi methods, or as a simplification of the sparse approximate inverse (SAI) preconditioners. The {\textquotedblleft}Incomplete Sparse Approximate Inverses{\textquotedblright} (ISAI) is in particular efficient in the solution of sparse triangular linear systems of equations. Those arise, for example, in the context of incomplete factorization preconditioning. ISAI preconditioners can be generated via an algorithm providing fine-grained parallelism, which makes them attractive for hardware with a high concurrency level. 
In a study covering a large number of matrices, we identify the ISAI preconditioner as an attractive alternative to exact triangular solves in the context of incomplete factorization preconditioning.}, issn = {01678191}, doi = {10.1016/j.parco.2017.10.003}, url = {http://www.sciencedirect.com/science/article/pii/S016781911730176X}, author = {Hartwig Anzt and Thomas Huckle and J{\"u}rgen Br{\"a}ckle and Jack Dongarra} } @article {1219, title = {Optimization and Performance Evaluation of the IDR Iterative Krylov Solver on GPUs}, journal = {The International Journal of High Performance Computing Applications}, volume = {32}, number = {2}, year = {2018}, month = {2018-03}, pages = {220{\textendash}230}, abstract = {In this paper, we present an optimized GPU implementation for the induced dimension reduction algorithm. We improve data locality, combine it with an efficient sparse matrix vector kernel, and investigate the potential of overlapping computation with communication as well as the possibility of concurrent kernel execution. A comprehensive performance evaluation is conducted using a suitable performance model. The analysis reveals efficiency of up to 90\%, which indicates that the implementation achieves performance close to the theoretically attainable bound.}, keywords = {co-design, gpu, Induced dimension reduction (IDR), kernel fusion, kernel overlap, roofline performance model}, doi = {https://doi.org/10.1177/1094342016646844}, author = {Hartwig Anzt and Moritz Kreutzer and Eduardo Ponce and Gregory D. Peterson and Gerhard Wellein and Jack Dongarra} } @article {1190, title = {ParILUT - A New Parallel Threshold ILU}, journal = {SIAM Journal on Scientific Computing}, volume = {40}, year = {2018}, month = {2018-07}, pages = {C503{\textendash}C519}, publisher = {SIAM}, abstract = {We propose a parallel algorithm for computing a threshold incomplete LU (ILU) factorization. 
The main idea is to interleave a parallel fixed-point iteration that approximates an incomplete factorization for a given sparsity pattern with a procedure that adjusts the pattern. We describe and test a strategy for identifying nonzeros to be added and nonzeros to be removed from the sparsity pattern. The resulting pattern may be different and more effective than that of existing threshold ILU algorithms. Also in contrast to other parallel threshold ILU algorithms, much of the new algorithm has fine-grained parallelism.}, doi = {https://doi.org/10.1137/16M1079506}, author = {Hartwig Anzt and Edmond Chow and Jack Dongarra} } @techreport {1275, title = {Software-Defined Events (SDEs) in MAGMA-Sparse}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-12}, year = {2018}, month = {2018-12}, publisher = {University of Tennessee}, author = {Heike Jagode and Anthony Danalis and Hartwig Anzt and Ichitaro Yamazaki and Mark Hoemmen and Erik Boman and Stanimire Tomov and Jack Dongarra} } @techreport {1204, title = {Solver Interface \& Performance on Cori}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-05}, year = {2018}, month = {2018-06}, publisher = {University of Tennessee}, author = {Hartwig Anzt and Ichitaro Yamazaki and Mark Hoemmen and Erik Boman and Jack Dongarra} } @article {1221, title = {Using Jacobi Iterations and Blocking for Solving Sparse Triangular Systems in Incomplete Factorization Preconditioning}, journal = {Journal of Parallel and Distributed Computing}, volume = {119}, year = {2018}, month = {2018-11}, pages = {219{\textendash}230}, abstract = {When using incomplete factorization preconditioners with an iterative method to solve large sparse linear systems, each application of the preconditioner involves solving two sparse triangular systems. These triangular systems are challenging to solve efficiently on computers with high levels of concurrency. 
On such computers, it has recently been proposed to use Jacobi iterations, which are highly parallel, to approximately solve the triangular systems from incomplete factorizations. The effectiveness of this approach, however, is problem-dependent: the Jacobi iterations may not always converge quickly enough for all problems. Thus, as a necessary and important step to evaluate this approach, we experimentally test the approach on a large number of realistic symmetric positive definite problems. We also show that by using block Jacobi iterations, we can extend the range of problems for which such an approach can be effective. For block Jacobi iterations, it is essential for the blocking to be cognizant of the matrix structure.}, doi = {https://doi.org/10.1016/j.jpdc.2018.04.017}, author = {Edmond Chow and Hartwig Anzt and Jennifer Scott and Jack Dongarra} } @conference {1234, title = {Variable-Size Batched Condition Number Calculation on GPUs}, booktitle = {SBAC-PAD}, year = {2018}, month = {2018-09}, address = {Lyon, France}, url = {https://ieeexplore.ieee.org/document/8645907}, author = {Hartwig Anzt and Jack Dongarra and Goran Flegar and Thomas Gruetzmacher} } @inproceedings {998, title = {Batched Gauss-Jordan Elimination for Block-Jacobi Preconditioner Generation on GPUs}, journal = {Proceedings of the 8th International Workshop on Programming Models and Applications for Multicores and Manycores}, year = {2017}, month = {2017-02}, pages = {1{\textendash}10}, publisher = {ACM}, address = {New York, NY, USA}, abstract = {In this paper, we design and evaluate a routine for the efficient generation of block-Jacobi preconditioners on graphics processing units (GPUs). Concretely, to exploit the architecture of the graphics accelerator, we develop a batched Gauss-Jordan elimination CUDA kernel for matrix inversion that embeds an implicit pivoting technique and handles the entire inversion process in the GPU registers. 
In addition, we integrate extraction and insertion CUDA kernels to rapidly set up the block-Jacobi preconditioner. Our experiments compare the performance of our implementation against a sequence of batched routines from the MAGMA library realizing the inversion via the LU factorization with partial pivoting. Furthermore, we evaluate the costs of different strategies for the block-Jacobi extraction and insertion steps, using a variety of sparse matrices from the SuiteSparse matrix collection. Finally, we assess the efficiency of the complete block-Jacobi preconditioner generation in the context of an iterative solver applied to a set of computational science problems, and quantify its benefits over a scalar Jacobi preconditioner.}, keywords = {block-Jacobi preconditioner, Gauss-Jordan elimination, graphics processing units (GPUs), iterative methods, matrix inversion, sparse linear systems}, isbn = {978-1-4503-4883-6}, doi = {10.1145/3026937.3026940}, url = {http://doi.acm.org/10.1145/3026937.3026940}, author = {Hartwig Anzt and Jack Dongarra and Goran Flegar and Enrique S. Quintana-Orti} } @inbook {1167, title = {Bringing High Performance Computing to Big Data Algorithms}, booktitle = {Handbook of Big Data Technologies}, year = {2017}, publisher = {Springer}, organization = {Springer}, isbn = {978-3-319-49339-8}, doi = {10.1007/978-3-319-49340-4}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @article {1163, title = {Flexible Batched Sparse Matrix Vector Product on GPUs}, year = {2017}, month = {2017-11}, publisher = {ScalA{\textquoteright}17: 8th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, address = {Denver, Colorado}, author = {Hartwig Anzt and Collins, Gary and Jack Dongarra and Goran Flegar and Enrique S. 
Quintana-Orti} } @conference {anzt2017flexiblespmv, title = {Flexible Batched Sparse Matrix-Vector Product on GPUs}, booktitle = {8th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA {\textquoteright}17)}, year = {2017}, month = {2017-11}, publisher = {ACM Press}, organization = {ACM Press}, address = {Denver, CO}, abstract = { We propose a variety of batched routines for concurrently processing a large collection of small-size, independent sparse matrix-vector products (SpMV) on graphics processing units (GPUs). These batched SpMV kernels are designed to be flexible in order to handle a batch of matrices which differ in size, nonzero count, and nonzero distribution. Furthermore, they support three most commonly used sparse storage formats: CSR, COO and ELL. Our experimental results on a state-of-the-art GPU reveal performance improvements of up to 25X compared to non-batched SpMV routines.}, doi = {10.1145/3148226.3148230}, author = {Hartwig Anzt and Gary Collins and Jack Dongarra and Goran Flegar and Enrique S. Quintana-Orti} } @techreport {1130, title = {MAGMA-sparse Interface Design Whitepaper}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-05}, year = {2017}, month = {2017-09}, type = {Technical Report}, abstract = {In this report we describe the logic and interface we develop for the MAGMA-sparse library to allow for easy integration as third-party library into a top-level software ecosystem. The design choices are based on extensive consultation with other software library developers, in particular the Trilinos software development team. The interface documentation is at this point not exhaustive, but a first proposal for setting a standard. Although the interface description targets the MAGMA-sparse software module, we hope that the design choices carry beyond this specific library, and are attractive for adoption in other packages. 
This report is not intended as static document, but will be updated over time to reflect the agile software development in the ECP 1.3.3.11 STMS11-PEEKS project.}, author = {Hartwig Anzt and Erik Boman and Jack Dongarra and Goran Flegar and Mark Gates and Mike Heroux and Mark Hoemmen and Jakub Kurzak and Piotr Luszczek and Sivasankaran Rajamanickam and Stanimire Tomov and Stephen Wood and Ichitaro Yamazaki} } @article {1067, title = {Preconditioned Krylov Solvers on GPUs}, journal = {Parallel Computing}, year = {2017}, month = {2017-06}, abstract = {In this paper, we study the effect of enhancing GPU-accelerated Krylov solvers with preconditioners. We consider the BiCGSTAB, CGS, QMR, and IDR(s) Krylov solvers. For a large set of test matrices, we assess the impact of Jacobi and incomplete factorization preconditioning on the solvers{\textquoteright} numerical stability and time-to-solution performance. We also analyze how the use of a preconditioner impacts the choice of the fastest solver.}, keywords = {gpu, ILU, Jacobi, Krylov solvers, Preconditioning}, issn = {01678191}, doi = {10.1016/j.parco.2017.05.006}, url = {http://www.sciencedirect.com/science/article/pii/S0167819117300777}, author = {Hartwig Anzt and Mark Gates and Jack Dongarra and Moritz Kreutzer and Gerhard Wellein and Martin Kohler} } @techreport {1080, title = {Roadmap for the Development of a Linear Algebra Library for Exascale Computing: SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {01, ICL-UT-17-02}, year = {2017}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Aurelien Bouteiller and Anthony Danalis and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Stephen Wood and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan} } @inproceedings {1088, title = {Variable-Size 
Batched Gauss-Huard for Block-Jacobi Preconditioning}, journal = {International Conference on Computational Science (ICCS 2017)}, volume = {108}, year = {2017}, month = {2017-06}, pages = {1783-1792}, publisher = {Procedia Computer Science}, address = {Zurich, Switzerland}, abstract = {In this work we present new kernels for the generation and application of block-Jacobi precon-ditioners that accelerate the iterative solution of sparse linear systems on graphics processing units (GPUs). Our approach departs from the conventional LU factorization and decomposes the diagonal blocks of the matrix using the Gauss-Huard method. When enhanced with column pivoting, this method is as stable as LU with partial/row pivoting. Due to extensive use of GPU registers and integration of implicit pivoting, our variable size batched Gauss-Huard implementation outperforms the batched version of LU factorization. In addition, the application kernel combines the conventional two-stage triangular solve procedure, consisting of a backward solve followed by a forward solve, into a single stage that performs both operations simultaneously.}, doi = {https://doi.org/10.1016/j.procs.2017.05.186}, author = {Hartwig Anzt and Jack Dongarra and Goran Flegar and Enrique S. Quintana-Orti and Andres E. Thomas} } @conference {1160, title = {Variable-Size Batched LU for Small Matrices and Its Integration into Block-Jacobi Preconditioning}, booktitle = {46th International Conference on Parallel Processing (ICPP)}, year = {2017}, month = {2017-08}, publisher = {IEEE}, organization = {IEEE}, address = {Bristol, United Kingdom}, abstract = {We present a set of new batched CUDA kernels for the LU factorization of a large collection of independent problems of different size, and the subsequent triangular solves. All kernels heavily exploit the registers of the graphics processing unit (GPU) in order to deliver high performance for small problems. 
The development of these kernels is motivated by the need for tackling this embarrassingly parallel scenario in the context of block-Jacobi preconditioning that is relevant for the iterative solution of sparse linear systems.}, keywords = {graphics processing units, Jacobian matrices, Kernel, linear systems, Parallel processing, Sparse matrices}, doi = {10.1109/ICPP.2017.18}, url = {http://ieeexplore.ieee.org/abstract/document/8025283/?reload=true}, author = {Hartwig Anzt and Jack Dongarra and Goran Flegar and Enrique S. Quintana-Orti} } @article {dongarra2017extremecomputing, title = {With Extreme Computing, the Rules Have Changed}, journal = {Computing in Science \& Engineering}, volume = {19}, year = {2017}, month = {2017-05}, pages = {52-62}, abstract = {On the eve of exascale computing, traditional wisdom no longer applies. High-performance computing is gone as we know it. This article discusses a range of new algorithmic techniques emerging in the context of exascale computing, many of which defy the common wisdom of high-performance computing and are considered unorthodox, but could turn out to be a necessity in near future.}, doi = {10.1109/MCSE.2017.48}, author = {Jack Dongarra and Stanimire Tomov and Piotr Luszczek and Jakub Kurzak and Mark Gates and Ichitaro Yamazaki and Hartwig Anzt and Azzam Haidar and Ahmad Abdelfattah} } @article {994, title = {Accelerating the Conjugate Gradient Algorithm with GPU in CFD Simulations}, journal = {VECPAR}, year = {2016}, abstract = {This paper illustrates how GPU computing can be used to accelerate computational fluid dynamics (CFD) simulations. For sparse linear systems arising from finite volume discretization, we evaluate and optimize the performance of Conjugate Gradient (CG) routines designed for manycore accelerators and compare against an industrial CPU-based implementation. 
We also investigate how the recent advances in preconditioning, such as iterative Incomplete Cholesky (IC, as symmetric case of ILU) preconditioning, match the requirements for solving real world problems.}, url = {http://hgpu.org/?p=16264}, author = {Hartwig Anzt and Marc Baboulin and Jack Dongarra and Yvan Fournier and Frank Hulsemann and Amal Khabou and Yushan Wang} } @inproceedings {991, title = {Batched Generation of Incomplete Sparse Approximate Inverses on GPUs}, journal = {Proceedings of the 7th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems}, year = {2016}, month = {2016-11}, pages = {49{\textendash}56}, abstract = {Incomplete Sparse Approximate Inverses (ISAI) have recently been shown to be an attractive alternative to exact sparse triangular solves in the context of incomplete factorization preconditioning. In this paper we propose a batched GPU-kernel for the efficient generation of ISAI matrices. Utilizing only thread-local memory allows for computing the ISAI matrix with very small memory footprint. We demonstrate that this strategy is faster than the existing strategy for generating ISAI matrices, and use a large number of test matrices to assess the algorithm{\textquoteright}s efficiency in an iterative solver setting.}, isbn = {978-1-5090-5222-6}, doi = {10.1109/ScalA.2016.11}, author = {Hartwig Anzt and Edmond Chow and Thomas Huckle and Jack Dongarra} } @techreport {988, title = {On block-asynchronous execution on GPUs}, journal = {LAPACK Working Note}, number = {291}, year = {2016}, month = {2016-11}, abstract = {This paper experimentally investigates how GPUs execute instructions when used for general purpose computing (GPGPU). We use a light-weight realizing a vector operation to analyze which vector entries are updated subsequently, and identify regions where parallel execution can be expected. 
The results help us to understand how GPUs operate, and map this operation mode to the mathematical concept of asynchronism. In particular it helps to understand the effects that can occur when implementing a fixed-point method using in-place updates on GPU hardware.}, url = {http://www.netlib.org/lapack/lawnspdf/lawn291.pdf}, author = {Hartwig Anzt and Edmond Chow and Jack Dongarra} } @inproceedings {996, title = {Domain Overlap for Iterative Sparse Triangular Solves on GPUs}, journal = {Software for Exascale Computing - SPPEXA}, volume = {113}, year = {2016}, month = {2016-09}, pages = {527{\textendash}545}, publisher = {Springer International Publishing}, abstract = {Iterative methods for solving sparse triangular systems are an attractive alternative to exact forward and backward substitution if an approximation of the solution is acceptable. On modern hardware, performance benefits are available as iterative methods allow for better parallelization. In this paper, we investigate how block-iterative triangular solves can benefit from using overlap. Because the matrices are triangular, we use {\textquotedblleft}directed{\textquotedblright} overlap, depending on whether the matrix is upper or lower triangular. We enhance a GPU implementation of the block-asynchronous Jacobi method with directed overlap. For GPUs and other cases where the problem must be overdecomposed, i.e., more subdomains and threads than cores, there is a preference in processing or scheduling the subdomains in a specific order, following the dependencies specified by the sparse triangular matrix. For sparse triangular factors from incomplete factorizations, we demonstrate that moderate directed overlap with subdomain scheduling can improve convergence and time-to-solution.}, doi = {10.1007/978-3-319-40528-5_24}, author = {Hartwig Anzt and Edmond Chow and Daniel Szyld and Jack Dongarra}, editor = {Hans-Joachim Bungartz and Philipp Neumann and Wolfgang E. 
Nagel} } @inproceedings {992, title = {Efficiency of General Krylov Methods on GPUs {\textendash} An Experimental Study}, journal = {2016 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2016}, month = {2016-05}, pages = {683-691}, abstract = {This paper compares different Krylov methods based on short recurrences with respect to their efficiency when implemented on GPUs. The comparison includes BiCGSTAB, CGS, QMR, and IDR using different shadow space dimensions. These methods are known for their good convergence characteristics. For a large set of test matrices taken from the University of Florida Matrix Collection, we evaluate the methods{\textquoteright} performance against different target metrics: convergence, number of sparse matrix-vector multiplications, and execution time. We also analyze whether the methods are {\textquotedblleft}orthogonal{\textquotedblright} in terms of problem suitability. We propose best practices for choosing methods in a {\textquotedblleft}black box{\textquotedblright} scenario, where no information about the optimal solver is available.}, keywords = {algorithmic bombardment, BiCGSTAB, CGS, Convergence, Electric breakdown, gpu, graphics processing units, Hardware, IDR(s), Krylov solver, Libraries, linear systems, QMR, Sparse matrices}, doi = {10.1109/IPDPSW.2016.45}, author = {Hartwig Anzt and Jack Dongarra and Moritz Kreutzer and Gerhard Wellein and Martin Kohler} } @conference {937, title = {Efficiency of General Krylov Methods on GPUs {\textendash} An Experimental Study}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper compares different Krylov methods based on short recurrences with respect to their efficiency when implemented on GPUs. The comparison includes BiCGSTAB, CGS, QMR, and IDR using different shadow space dimensions. 
These methods are known for their good convergence characteristics. For a large set of test matrices taken from the University of Florida Matrix Collection, we evaluate the methods{\textquoteright} performance against different target metrics: convergence, number of sparse matrix-vector multiplications, and execution time. We also analyze whether the methods are {\textquotedblleft}orthogonal{\textquotedblright} in terms of problem suitability. We propose best practices for choosing methods in a {\textquotedblleft}black box{\textquotedblright} scenario, where no information about the optimal solver is available.}, keywords = {algorithmic bombardment, BiCGSTAB, CGS, gpu, IDR(s), Krylov solver, QMR}, doi = {10.1109/IPDPSW.2016.45}, author = {Hartwig Anzt and Jack Dongarra and Moritz Kreutzer and Gerhard Wellein and Martin Kohler} } @article {989, title = {Fine-grained Bit-Flip Protection for Relaxation Methods}, journal = {Journal of Computational Science}, year = {2016}, month = {2016-11}, abstract = {Resilience is considered a challenging under-addressed issue that the high performance computing community (HPC) will have to face in order to produce reliable Exascale systems by the beginning of the next decade. As part of a push toward a resilient HPC ecosystem, in this paper we propose an error-resilient iterative solver for sparse linear systems based on stationary component-wise relaxation methods. Starting from a plain implementation of the Jacobi iteration, our approach introduces a low-cost component-wise technique that detects bit-flips, rejecting some component updates, and turning the initial synchronized solver into an asynchronous iteration. 
Our experimental study with sparse incomplete factorizations from a collection of real-world applications, and a practical GPU implementation, exposes the convergence delay incurred by the fault-tolerant implementation and its practical performance.}, keywords = {Bit flips, Fault tolerance, High Performance Computing, iterative solvers, Jacobi method, sparse linear systems}, doi = {https://doi.org/10.1016/j.jocs.2016.11.013}, author = {Hartwig Anzt and Jack Dongarra and Enrique S. Quintana-Orti} } @conference {939, title = {Heterogeneous Streaming}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper introduces a new heterogeneous streaming library called hetero Streams (hStreams). We show how a simple FIFO streaming model can be applied to heterogeneous systems that include manycore coprocessors and multicore CPUs. This model supports concurrency across nodes, among tasks within a node, and between data transfers and computation. We give examples for different approaches, show how the implementation can be layered, analyze overheads among layers, and apply those models to parallelize applications using simple, intuitive interfaces. We compare the features and versatility of hStreams, OpenMP, CUDA Streams1 and OmpSs. We show how the use of hStreams makes it easier for scientists to identify tasks and easily expose concurrency among them, and how it enables tuning experts and runtime systems to tailor execution for different heterogeneous targets. Practical application examples are taken from the field of numerical linear algebra, commercial structural simulation software, and a seismic processing application.}, keywords = {plasma}, author = {Chris J. 
Newburn and Gaurav Bansal and Michael Wood and Luis Crivelli and Judit Planas and Alejandro Duran and Paulo Souza and Leonardo Borges and Piotr Luszczek and Stanimire Tomov and Jack Dongarra and Hartwig Anzt and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Ichitaro Yamazaki and Jesus Labarta} } @article {1472, title = {Linear Algebra Software for Large-Scale Accelerated Multicore Computing}, journal = {Acta Numerica}, volume = {25}, year = {2016}, month = {2016-05}, pages = {1-160}, abstract = {Many crucial scientific computing applications, ranging from national security to medical advances, rely on high-performance linear algebra algorithms and technologies, underscoring their importance and broad impact. Here we present the state-of-the-art design and implementation practices for the acceleration of the predominant linear algebra algorithms on large-scale accelerated multicore systems. Examples are given with fundamental dense linear algebra algorithms {\textendash} from the LU, QR, Cholesky, and LDLT factorizations needed for solving linear systems of equations, to eigenvalue and singular value decomposition (SVD) problems. The implementations presented are readily available via the open-source PLASMA and MAGMA libraries, which represent the next generation modernization of the popular LAPACK library for accelerated multicore systems. To generate the extreme level of parallelism needed for the efficient use of these systems, algorithms of interest are redesigned and then split into well-chosen computational tasks. The task execution is scheduled over the computational components of a hybrid system of multicore CPUs with GPU accelerators and/or Xeon Phi coprocessors, using either static scheduling or light-weight runtime systems. The use of light-weight runtime systems keeps scheduling overheads low, similar to static scheduling, while enabling the expression of parallelism through sequential-like code. 
This simplifies the development effort and allows exploration of the unique strengths of the various hardware components. Finally, we emphasize the development of innovative linear algebra algorithms using three technologies {\textendash} mixed precision arithmetic, batched operations, and asynchronous iterations {\textendash} that are currently of high interest for accelerated multicore systems.}, doi = {10.1017/S0962492916000015}, author = {Ahmad Abdelfattah and Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki and Asim YarKhan} } @article {990, title = {On the performance and energy efficiency of sparse linear algebra on GPUs}, journal = {International Journal of High Performance Computing Applications}, year = {2016}, month = {2016-10}, abstract = {In this paper we unveil some performance and energy efficiency frontiers for sparse computations on GPU-based supercomputers. We compare the resource efficiency of different sparse matrix{\textendash}vector products (SpMV) taken from libraries such as cuSPARSE and MAGMA for GPU and Intel{\textquoteright}s MKL for multicore CPUs, and develop a GPU sparse matrix{\textendash}matrix product (SpMM) implementation that handles the simultaneous multiplication of a sparse matrix with a set of vectors in block-wise fashion. While a typical sparse computation such as the SpMV reaches only a fraction of the peak of current GPUs, we show that the SpMM succeeds in exceeding the memory-bound limitations of the SpMV. We integrate this kernel into a GPU-accelerated Locally Optimal Block Preconditioned Conjugate Gradient (LOBPCG) eigensolver. LOBPCG is chosen as a benchmark algorithm for this study as it combines an interesting mix of sparse and dense linear algebra operations that is typical for complex simulation applications, and allows for hardware-aware optimizations. 
In a detailed analysis we compare the performance and energy efficiency against a multi-threaded CPU counterpart. The reported performance and energy efficiency results are indicative of sparse computations on supercomputers.}, doi = {10.1177/1094342016672081}, url = {http://hpc.sagepub.com/content/early/2016/10/05/1094342016672081.abstract}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @article {995, title = {Updating Incomplete Factorization Preconditioners for Model Order Reduction}, journal = {Numerical Algorithms}, volume = {73}, number = {3}, year = {2016}, month = {2016-02}, pages = {611{\textendash}630}, abstract = {When solving a sequence of related linear systems by iterative methods, it is common to reuse the preconditioner for several systems, and then to recompute the preconditioner when the matrix has changed significantly. Rather than recomputing the preconditioner from scratch, it is potentially more efficient to update the previous preconditioner. Unfortunately, it is not always known how to update a preconditioner, for example, when the preconditioner is an incomplete factorization. A recently proposed iterative algorithm for computing incomplete factorizations, however, is able to exploit an initial guess, unlike existing algorithms for incomplete factorizations. By treating a previous factorization as an initial guess to this algorithm, an incomplete factorization may thus be updated. We use a sequence of problems from model order reduction. 
Experimental results using an optimized GPU implementation show that updating a previous factorization can be inexpensive and effective, making solving sequences of linear systems a potential niche problem for the iterative incomplete factorization algorithm.}, keywords = {key publication}, doi = {10.1007/s11075-016-0110-2}, author = {Hartwig Anzt and Edmond Chow and Jens Saak and Jack Dongarra} } @conference {880, title = {Accelerating Collaborative Filtering for Implicit Feedback Datasets using GPUs}, booktitle = {2015 IEEE International Conference on Big Data (IEEE BigData 2015)}, year = {2015}, month = {2015-11}, publisher = {IEEE}, organization = {IEEE}, address = {Santa Clara, CA}, abstract = {In this paper we accelerate the Alternating Least Squares (ALS) algorithm used for generating product recommendations on the basis of implicit feedback datasets. We approach the algorithm with concepts proven to be successful in High Performance Computing. This includes the formulation of the algorithm as a mix of cache-optimized algorithm-specific kernels and standard BLAS routines, acceleration via graphics processing units (GPUs), use of parallel batched kernels, and autotuning to identify performance winners. For benchmark datasets, the multi-threaded CPU implementation we propose achieves more than a 10 times speedup over the implementations available in the GraphLab and Spark MLlib software packages. For the GPU implementation, the parameters of an algorithm-specific kernel were optimized using a comprehensive autotuning sweep. 
This results in an additional 2 times speedup over our CPU implementation.}, author = {Mark Gates and Hartwig Anzt and Jakub Kurzak and Jack Dongarra} } @conference {914, title = {Accelerating the LOBPCG method on GPUs using a blocked Sparse Matrix Vector Product}, booktitle = {Spring Simulation Multi-Conference 2015 (SpringSim{\textquoteright}15)}, year = {2015}, month = {2015-04}, publisher = {SCS}, organization = {SCS}, address = {Alexandria, VA}, abstract = {This paper presents a heterogeneous CPU-GPU implementation for a sparse iterative eigensolver the Locally Optimal Block Preconditioned Conjugate Gradient (LOBPCG). For the key routine generating the Krylov search spaces via the product of a sparse matrix and a block of vectors, we propose a GPU kernel based on a modi ed sliced ELLPACK format. Blocking a set of vectors and processing them simultaneously accelerates the computation of a set of consecutive SpMVs significantly. Comparing the performance against similar routines from Intel{\textquoteright}s MKL and NVIDIA{\textquoteright}s cuSPARSE library we identify appealing performance improvements. We integrate it into the highly optimized LOBPCG implementation. 
Compared to the BLOBEX CPU implementation running on two eight-core Intel Xeon E5-2690s, we accelerate the computation of a small set of eigenvectors using NVIDIA{\textquoteright}s K40 GPU by typically more than an order of magnitude.}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @article {866, title = {Acceleration of GPU-based Krylov solvers via Data Transfer Reduction}, journal = {International Journal of High Performance Computing Applications}, year = {2015}, author = {Hartwig Anzt and William Sawyer and Stanimire Tomov and Piotr Luszczek and Jack Dongarra} } @conference {912, title = {Adaptive Precision Solvers for Sparse Linear Systems}, booktitle = {3rd International Workshop on Energy Efficient Supercomputing (E2SC {\textquoteright}15)}, year = {2015}, month = {2015-11}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, author = {Hartwig Anzt and Jack Dongarra and Enrique S. Quintana-Orti} } @conference {865, title = {Asynchronous Iterative Algorithm for Computing Incomplete Factorizations on GPUs}, booktitle = {International Supercomputing Conference (ISC 2015)}, year = {2015}, month = {2015-07}, address = {Frankfurt, Germany}, author = {Edmond Chow and Hartwig Anzt and Jack Dongarra} } @conference {857, title = {Energy Efficiency and Performance Frontiers for Sparse Computations on GPU Supercomputers}, booktitle = {Sixth International Workshop on Programming Models and Applications for Multicores and Manycores (PMAM {\textquoteright}15)}, year = {2015}, month = {2015-02}, publisher = {ACM}, organization = {ACM}, address = {San Francisco, CA}, abstract = {In this paper we unveil some energy efficiency and performance frontiers for sparse computations on GPU-based supercomputers. To do this, we consider state-of-the-art implementations of the sparse matrix-vector (SpMV) product in libraries like cuSPARSE, MKL, and MAGMA, and their use in the LOBPCG eigen-solver. 
LOBPCG is chosen as a benchmark for this study as it combines an interesting mix of sparse and dense linear algebra operations with potential for hardware-aware optimizations. Most notably, LOBPCG includes a blocking technique that is a common performance optimization for many applications. In particular, multiple memory-bound SpMV operations are blocked into a SpM-matrix product (SpMM), that achieves significantly higher performance than a sequence of SpMVs. We provide details about the GPU kernels we use for the SpMV, SpMM, and the LOBPCG implementation design, and study performance and energy consumption compared to CPU solutions. While a typical sparse computation like the SpMV reaches only a fraction of the peak of current GPUs, we show that the SpMM achieves up to a 6x performance improvement over the GPU{\textquoteright}s SpMV, and the GPU-accelerated LOBPCG based on this kernel is 3 to 5x faster than multicore CPUs with the same power draw, e.g., a K40 GPU vs. two Sandy Bridge CPUs (16 cores). In practice though, we show that currently available CPU implementations are much slower due to missed optimization opportunities. These performance results translate to similar improvements in energy consumption, and are indicative of today{\textquoteright}s frontiers in energy efficiency and performance for sparse computations on supercomputers.}, isbn = {978-1-4503-3404-4}, doi = {10.1145/2712386.2712387}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @article {982, title = {Experiences in Autotuning Matrix Multiplication for Energy Minimization on GPUs}, journal = {Concurrency and Computation: Practice and Experience}, volume = {27}, year = {2015}, month = {2015-12}, pages = {5096--5113}, abstract = {In this paper, we report extensive results and analysis of autotuning the computationally intensive graphics processing units kernel for dense matrix{\textendash}matrix multiplication in double precision. 
In contrast to traditional autotuning and/or optimization for runtime performance only, we also take the energy efficiency into account. For kernels achieving equal performance, we show significant differences in their energy balance. We also identify the memory throughput as the most influential metric that trades off performance and energy efficiency. As a result, the performance optimal case ends up not being the most efficient kernel in overall resource use.}, keywords = {Autotuning, energy efficiency, hardware accelerators, matrix multiplication, power}, doi = {10.1002/cpe.3516}, url = {http://doi.wiley.com/10.1002/cpe.3516}, author = {Hartwig Anzt and Blake Haugen and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @article {873, title = {Experiences in autotuning matrix multiplication for energy minimization on GPUs}, journal = {Concurrency and Computation: Practice and Experience}, volume = {27}, year = {2015}, month = {2015-12}, pages = {5096--5113}, doi = {10.1002/cpe.3516}, author = {Hartwig Anzt and Blake Haugen and Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @conference {913, title = {GPU-accelerated Co-design of Induced Dimension Reduction: Algorithmic Fusion and Kernel Overlap}, booktitle = {2nd International Workshop on Hardware-Software Co-Design for High Performance Computing}, year = {2015}, month = {2015-11}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, abstract = {In this paper we present an optimized GPU co-design of the Induced Dimension Reduction (IDR) algorithm for solving linear systems. Starting from a baseline implementation based on the generic BLAS routines from the MAGMA software library, we apply optimizations that are based on kernel fusion and kernel overlap. Runtime experiments are used to investigate the benefit of the distinct optimization techniques for different variants of the IDR algorithm. 
A comparison to the reference implementation reveals that the interplay between them can succeed in cutting the overall runtime by up to about one third.}, author = {Hartwig Anzt and Eduardo Ponce and Gregory D. Peterson and Jack Dongarra} } @article {881, title = {Implementation and Tuning of Batched Cholesky Factorization and Solve for NVIDIA GPUs}, journal = {IEEE Transactions on Parallel and Distributed Systems}, number = {1045-9219}, year = {2015}, month = {2015-11}, author = {Jakub Kurzak and Hartwig Anzt and Mark Gates and Jack Dongarra} } @conference {878, title = {Iterative Sparse Triangular Solves for Preconditioning}, booktitle = {EuroPar 2015}, year = {2015}, month = {2015-08}, publisher = {Springer Berlin}, organization = {Springer Berlin}, address = {Vienna, Austria}, abstract = {Sparse triangular solvers are typically parallelized using level scheduling techniques, but parallel eciency is poor on high-throughput architectures like GPUs. We propose using an iterative approach for solving sparse triangular systems when an approximation is suitable. This approach will not work for all problems, but can be successful for sparse triangular matrices arising from incomplete factorizations, where an approximate solution is acceptable. We demonstrate the performance gains that this approach can have on GPUs in the context of solving sparse linear systems with a preconditioned Krylov subspace method. 
We also illustrate the effect of using asynchronous iterations.}, doi = {10.1007/978-3-662-48096-0_50}, url = {http://dx.doi.org/10.1007/978-3-662-48096-0_50}, author = {Hartwig Anzt and Edmond Chow and Jack Dongarra} } @article {1347, title = {MAGMA MIC: Optimizing Linear Algebra for Intel Xeon Phi}, year = {2015}, month = {2015-06}, publisher = {ISC High Performance (ISC15), Intel Booth Presentation}, address = {Frankfurt, Germany}, author = {Hartwig Anzt and Jack Dongarra and Mark Gates and Azzam Haidar and Khairul Kabir and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki} } @conference {893, title = {Random-Order Alternating Schwarz for Sparse Triangular Solves}, booktitle = {2015 SIAM Conference on Applied Linear Algebra (SIAM LA)}, year = {2015}, month = {2015-10}, publisher = {SIAM}, organization = {SIAM}, address = {Atlanta, GA}, abstract = {Block-asynchronous Jacobi is an iteration method where a locally synchronous iteration is embedded in an asynchronous global iteration. The unknowns are partitioned into small subsets, and while the components within the same subset are iterated in Jacobi fashion, no update order in-between the subsets is enforced. The values of the non-local entries remain constant during the local iterations, which can result in slow inter-subset information propagation and slow convergence. Interpreting of the subsets as subdomains allows to transfer the concept of domain overlap typically enhancing the information propagation to block-asynchronous solvers. 
In this talk we explore the impact of overlapping domains to convergence and performance of block-asynchronous Jacobi iterations, and present results obtained by running this solver class on state-of-the-art HPC systems.}, author = {Hartwig Anzt and Edmond Chow and Daniel Szyld and Jack Dongarra} } @conference {911, title = {Tuning Stationary Iterative Solvers for Fault Resilience}, booktitle = {6th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA15)}, year = {2015}, month = {2015-11}, publisher = {ACM}, organization = {ACM}, address = {Austin, TX}, abstract = {As the transistor{\textquoteright}s feature size decreases following Moore{\textquoteright}s Law, hardware will become more prone to permanent, intermittent, and transient errors, increasing the number of failures experienced by applications, and diminishing the confidence of users. As a result, resilience is considered the most difficult under addressed issue faced by the High Performance Computing community. In this paper, we address the design of error resilient iterative solvers for sparse linear systems. Contrary to most previous approaches, based on Krylov subspace methods, for this purpose we analyze stationary component-wise relaxation. Concretely, starting from a plain implementation of the Jacobi iteration, we design a low-cost component-wise technique that elegantly handles bit-flips, turning the initial synchronized solver into an asynchronous iteration. Our experimental study employs sparse incomplete factorizations from several practical applications to expose the convergence delay incurred by the fault-tolerant implementation.}, author = {Hartwig Anzt and Jack Dongarra and Enrique S. 
Quintana-Orti} } @techreport {837, title = {Accelerating the LOBPCG method on GPUs using a blocked Sparse Matrix Vector Product}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-14-731}, year = {2014}, month = {2014-10}, publisher = {University of Tennessee}, abstract = {This paper presents a heterogeneous CPU-GPU algorithm design and optimized implementation for an entire sparse iterative eigensolver {\textendash} the Locally Optimal Block Preconditioned Conjugate Gradient (LOBPCG) {\textendash} starting from low-level GPU data structures and kernels to the higher-level algorithmic choices and overall heterogeneous design. Most notably, the eigensolver leverages the high-performance of a new GPU kernel developed for the simultaneous multiplication of a sparse matrix and a set of vectors (SpMM). This is a building block that serves as a backbone for not only block-Krylov, but also for other methods relying on blocking for acceleration in general. The heterogeneous LOBPCG developed here reveals the potential of this type of eigensolver by highly optimizing all of its components, and can be viewed as a benchmark for other SpMM-dependent applications. Compared to non-blocked algorithms, we show that the performance speedup factor of SpMM vs. SpMV-based algorithms is up to six on GPUs like NVIDIA{\textquoteright}s K40. In particular, a typical SpMV performance range in double precision is 20 to 25 GFlop/s, while the SpMM is in the range of 100 to 120 GFlop/s. Compared to highly-optimized CPU implementations, e.g., the SpMM from MKL on two eight-core Intel Xeon E5-2690s, our kernel is 3 to 5x. faster on a K40 GPU. For comparison to other computational loads, the same GPU to CPU performance acceleration is observed for the SpMV product, as well as dense linear algebra, e.g., matrix-matrix multiplication and factorizations like LU, QR, and Cholesky. Thus, the modeled GPU (vs. CPU) acceleration for the entire solver is also 3 to 5x. 
In practice though, currently available CPU implementations are much slower due to missed optimization opportunities, as we show.}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @conference {812, title = {Hybrid Multi-Elimination ILU Preconditioners on GPUs}, booktitle = {International Heterogeneity in Computing Workshop (HCW), IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Abstract{\textemdash}Iterative solvers for sparse linear systems often benefit from using preconditioners. While there are implementations for many iterative methods that leverage the computing power of accelerators, porting the latest developments in preconditioners to accelerators has been challenging. In this paper we develop a selfadaptive multi-elimination preconditioner for graphics processing units (GPUs). The preconditioner is based on a multi-level incomplete LU factorization and uses a direct dense solver for the bottom-level system. For test matrices from the University of Florida matrix collection, we investigate the influence of handling the triangular solvers in the distinct iteration steps in either single or double precision arithmetic. 
Integrated into a Conjugate Gradient method, we show that our multi-elimination algorithm is highly competitive against popular preconditioners, including multi-colored symmetric Gauss-Seidel relaxation preconditioners, and (multi-colored symmetric) ILU for numerous problems.}, author = {Dimitar Lukarski and Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @techreport {838, title = {Implementing a Sparse Matrix Vector Product for the SELL-C/SELL-C-σ formats on NVIDIA GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-14-727}, year = {2014}, month = {2014-04}, publisher = {University of Tennessee}, abstract = {Numerical methods in sparse linear algebra typically rely on a fast and efficient matrix vector product, as this usually is the backbone of iterative algorithms for solving eigenvalue problems or linear systems. Against the background of a large diversity in the characteristics of high performance computer architectures, it is a challenge to derive a cross-platform efficient storage format along with fast matrix vector kernels. Recently, attention focused on the SELL-C- format, a sliced ELLPACK format enhanced by row-sorting to reduce the fill in when padding rows with zeros. In this paper we propose an additional modification resulting in the padded sliced ELLPACK (SELLP) format, for which we develop a sparse matrix vector CUDA kernel that is able to efficiently exploit the computing power of NVIDIA GPUs. 
We show that the kernel we developed outperforms straight-forward implementations for the widespread CSR and ELLPACK formats, and is highly competitive to the implementations in the highly optimized CUSPARSE library.}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra} } @article {825, title = {Improving the Energy Efficiency of Sparse Linear System Solvers on Multicore and Manycore Systems}, journal = {Philosophical Transactions of the Royal Society A -- Mathematical, Physical and Engineering Sciences}, volume = {372}, year = {2014}, month = {2014-07}, abstract = {While most recent breakthroughs in scientific research rely on complex simulations carried out in large-scale supercomputers, the power draft and energy spent for this purpose is increasingly becoming a limiting factor to this trend. In this paper, we provide an overview of the current status in energy-efficient scientific computing by reviewing different technologies used to monitor power draft as well as power- and energy-saving mechanisms available in commodity hardware. For the particular domain of sparse linear algebra, we analyze the energy efficiency of a broad collection of hardware architectures and investigate how algorithmic and implementation modifications can improve the energy performance of sparse linear system solvers, without negatively impacting their performance.}, keywords = {energy efficiency, graphics processing units, High Performance Computing, iterative solvers, multicore processors, sparse linear systems}, doi = {10.1098/rsta.2013.0279}, author = {Hartwig Anzt and Enrique S. 
Quintana-Orti} } @conference {807, title = {Improving the performance of CA-GMRES on multicores with multiple GPUs}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {The Generalized Minimum Residual (GMRES) method is one of the most widely-used iterative methods for solving nonsymmetric linear systems of equations. In recent years, techniques to avoid communication in GMRES have gained attention because in comparison to floating-point operations, communication is becoming increasingly expensive on modern computers. Since graphics processing units (GPUs) are now becoming a crucial component in computing, we investigate the effectiveness of these techniques on multicore CPUs with multiple GPUs. While we present the detailed performance studies of a matrix powers kernel on multiple GPUs, we particularly focus on orthogonalization strategies that have a great impact on both the numerical stability and performance of GMRES, especially as the matrix becomes sparser or ill-conditioned. We present the experimental results on two eight-core Intel Sandy Bridge CPUs with three NVIDIA Fermi GPUs and demonstrate that significant speedups can be obtained by avoiding communication, either on a GPU or between the GPUs. As part of our study, we investigate several optimization techniques for the GPU kernels that can also be used in other iterative solvers besides GMRES. 
Hence, our studies not only emphasize the importance of avoiding communication on GPUs, but they also provide insight about the effects of these optimization techniques on the performance of the sparse solvers, and may have greater impact beyond GMRES.}, author = {Ichitaro Yamazaki and Hartwig Anzt and Stanimire Tomov and Mark Hoemmen and Jack Dongarra} } @conference {833, title = {Optimizing Krylov Subspace Solvers on Graphics Processing Units}, booktitle = {Fourth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Krylov subspace solvers are often the method of choice when solving sparse linear systems iteratively. At the same time, hardware accelerators such as graphics processing units (GPUs) continue to offer significant floating point performance gains for matrix and vector computations through easy-to-use libraries of computational kernels. However, as these libraries are usually composed of a well optimized but limited set of linear algebra operations, applications that use them often fail to leverage the full potential of the accelerator. In this paper we target the acceleration of the BiCGSTAB solver for GPUs, showing that significant improvement can be achieved by reformulating the method and developing application-specific kernels instead of using the generic CUBLAS library provided by NVIDIA. We propose an implementation that benefits from a significantly reduced number of kernel launches and GPUhost communication events, by means of increased data locality and a simultaneous reduction of multiple scalar products. Using experimental data, we show that, depending on the dominance of the untouched sparse matrix vector products, significant performance improvements can be achieved compared to a reference implementation based on the CUBLAS library. 
We feel that such optimizations are crucial for the subsequent development of highlevel sparse linear algebra libraries.}, author = {Stanimire Tomov and Piotr Luszczek and Ichitaro Yamazaki and Jack Dongarra and Hartwig Anzt and William Sawyer} } @conference {714, title = {Self-Adaptive Multiprecision Preconditioners on Multicore and Manycore Architectures}, booktitle = {VECPAR 2014}, year = {2014}, month = {2014-06}, address = {Eugene, OR}, abstract = {Based on the premise that preconditioners needed for scientific computing are not only required to be robust in the numerical sense, but also scalable for up to thousands of light-weight cores, we argue that this two-fold goal is achieved for the recently developed self-adaptive multi-elimination preconditioner. For this purpose, we revise the underlying idea and analyze the performance of implementations realized in the PARALUTION and MAGMA open-source software libraries on GPU architectures (using either CUDA or OpenCL), Intel{\textquoteright}s Many Integrated Core Architecture, and Intel{\textquoteright}s Sandy Bridge processor. The comparison with other well-established preconditioners like multi-coloured Gauss-Seidel, ILU(0) and multi-colored ILU(0), shows that the twofold goal of a numerically stable cross-platform performant algorithm is achieved.}, author = {Hartwig Anzt and Dimitar Lukarski and Stanimire Tomov and Jack Dongarra} } @article {826, title = {Unveiling the Performance-energy Trade-off in Iterative Linear System Solvers for Multithreaded Processors}, journal = {Concurrency and Computation: Practice and Experience}, volume = {27}, year = {2014}, month = {2014-09}, pages = {885-904}, chapter = {885}, abstract = {In this paper, we analyze the interactions occurring in the triangle performance-power-energy for the execution of a pivotal numerical algorithm, the iterative conjugate gradient (CG) method, on a diverse collection of parallel multithreaded architectures. 
This analysis is especially timely in a decade where the power wall has arisen as a major obstacle to build faster processors. Moreover, the CG method has recently been proposed as a complement to the LINPACK benchmark, as this iterative method is argued to be more archetypical of the performance of today{\textquoteright}s scientific and engineering applications. To gain insights about the benefits of hands-on optimizations we include runtime and energy efficiency results for both out-of-the-box usage relying exclusively on compiler optimizations, and implementations manually optimized for target architectures, that range from general-purpose and digital signal multicore processors to manycore graphics processing units, all representative of current multithreaded systems.}, keywords = {CG, CPUs, energy efficiency, GPUs, low-power architectures}, doi = {10.1002/cpe.3341}, url = {http://dx.doi.org/10.1002/cpe.3341}, author = {Jos{\'e} I. Aliaga and Hartwig Anzt and Maribel Castillo and Juan C. Fern{\'a}ndez and Germ{\'a}n Le{\'o}n and Joaqu{\'\i}n P{\'e}rez and Enrique S. Quintana-Orti} } @article {icl:719, title = {A Block-Asynchronous Relaxation Method for Graphics Processing Units}, journal = {Journal of Parallel and Distributed Computing}, volume = {73}, year = {2013}, month = {2013-12}, pages = {1613{\textendash}1626}, abstract = {In this paper, we analyze the potential of asynchronous relaxation methods on Graphics Processing Units (GPUs). We develop asynchronous iteration algorithms in CUDA and compare them with parallel implementations of synchronous relaxation methods on CPU- or GPU-based systems. For a set of test matrices from UFMC we investigate convergence behavior, performance and tolerance to hardware failure. 
We observe that even for our most basic asynchronous relaxation scheme, the method can efficiently leverage the GPUs computing power and is, despite its lower convergence rate compared to the Gauss{\textendash}Seidel relaxation, still able to provide solution approximations of certain accuracy in considerably shorter time than Gauss{\textendash}Seidel running on CPUs- or GPU-based Jacobi. Hence, it overcompensates for the slower convergence by exploiting the scalability and the good fit of the asynchronous schemes for the highly parallel GPU architectures. Further, enhancing the most basic asynchronous approach with hybrid schemes{\textendash}using multiple iterations within the {\textquoteleft}{\textquoteleft}subdomain{\textquoteright}{\textquoteright} handled by a GPU thread block{\textendash}we manage to not only recover the loss of global convergence but often accelerate convergence of up to two times, while keeping the execution time of a global iteration practically the same. The combination with the advantageous properties of asynchronous iteration methods with respect to hardware failure identifies the high potential of the asynchronous methods for Exascale computing.}, doi = {http://dx.doi.org/10.1016/j.jpdc.2013.05.008}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra and Vincent Heuveline} } @article {icl:697, title = {Block-asynchronous Multigrid Smoothers for GPU-accelerated Systems}, journal = {ICCS 2012}, year = {2012}, month = {2012-06}, address = {Omaha, NE}, author = {Hartwig Anzt and Stanimire Tomov and Mark Gates and Jack Dongarra and Vincent Heuveline} } @article {icl:723, title = {GPU-Accelerated Asynchronous Error Correction for Mixed Precision Iterative Refinement}, journal = {EuroPar 2012 (also LAWN 260)}, year = {2012}, month = {2012-08}, address = {Rhodes Island, Greece}, author = {Hartwig Anzt and Piotr Luszczek and Jack Dongarra and Vincent Heuveline} } @inproceedings {icl:713, title = {Weighted Block-Asynchronous Iteration 
on GPU-Accelerated Systems}, journal = {Tenth International Workshop on Algorithms, Models and Tools for Parallel Computing on Heterogeneous Platforms (Best Paper)}, year = {2012}, month = {2012-08}, address = {Rhodes Island, Greece}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra and Vincent Heuveline} } @article {icl:701, title = {Weighted Block-Asynchronous Relaxation for GPU-Accelerated Systems}, journal = {SIAM Journal on Computing (submitted)}, year = {2012}, month = {2012-03}, author = {Hartwig Anzt and Jack Dongarra and Vincent Heuveline} } @article {icl:661, title = {Block-asynchronous Multigrid Smoothers for GPU-accelerated Systems}, number = {UT-CS-11-689}, year = {2011}, month = {2011-12}, keywords = {magma}, author = {Hartwig Anzt and Stanimire Tomov and Mark Gates and Jack Dongarra and Vincent Heuveline} } @techreport {icl:656, title = {A Block-Asynchronous Relaxation Method for Graphics Processing Units}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-11-687 / LAWN 258}, year = {2011}, month = {2011-11}, keywords = {magma}, author = {Hartwig Anzt and Stanimire Tomov and Jack Dongarra and Vincent Heuveline} } @techreport {icl:662, title = {GPU-Accelerated Asynchronous Error Correction for Mixed Precision Iterative Refinement}, journal = {University of Tennessee Computer Science Technical Report UT-CS-11-690 (also Lawn 260)}, year = {2011}, month = {2011-12}, keywords = {magma}, author = {Hartwig Anzt and Piotr Luszczek and Jack Dongarra and Vincent Heuveline} }