@article{cojean2022ginkgo,
  author   = {Cojean, Terry and Tsai, Yu-Hsiang Mike and Anzt, Hartwig},
  title    = {{Ginkgo}{\textemdash}A Math Library Designed for Platform Portability},
  journal  = {Parallel Computing},
  volume   = {111},
  year     = {2022},
  month    = feb,
  pages    = {102902},
  abstract = {In an era of increasing computer system diversity, the portability of software from one system to another plays a central role. Software portability is important for the software developers as many software projects have a lifetime longer than a specific system, e.g., a supercomputer, and it is important for the domain scientists that realize their scientific application in a software framework and want to be able to run on one or another system. On a high level, there exist two approaches for realizing platform portability: (1) implementing software using a portability layer leveraging any technique which always generates specific kernels from another language or through an interface for running on different architectures; and (2) providing backends for different hardware architectures, with the backends typically differing in how and in which programming language functionality is realized due to using the language of choice for each hardware (e.g., CUDA kernels for NVIDIA GPUs, SYCL (DPC++) kernels to targeting Intel GPUs and other supported hardware, {\textellipsis}). In practice, these two approaches can be combined in applications to leverage their respective strengths. In this paper, we present how we realize portability across different hardware architectures for the Ginkgo library by following the second strategy and the goal to not only port to new hardware architectures but also achieve good performance. We present the Ginkgo library design, separating algorithms from hardware-specific kernels forming the distinct hardware executors, and report our experience when adding execution backends for NVIDIA, AMD, and Intel GPUs. We also present the performance we achieve with this approach for distinct hardware backends.},
  keywords = {AMD, Intel, nVidia, performance portability, Platform Portability, Porting to GPU accelerators},
  issn     = {0167-8191},
  doi      = {10.1016/j.parco.2022.102902},
  url      = {https://www.sciencedirect.com/science/article/pii/S0167819122000096},
}

@misc{anzt2021ginkgo,
  author        = {Anzt, Hartwig and Beams, Natalie and Cojean, Terry and G{\"o}bel, Fritz and Gr{\"u}tzmacher, Thomas and Kashi, Aditya and Nayak, Pratik and Ribizel, Tobias and Tsai, Yuhsiang M.},
  title         = {{Ginkgo}: A Sparse Linear Algebra Library for {HPC}},
  year          = {2021},
  month         = apr,
  howpublished  = {2021 ECP Annual Meeting},
  internal-note = {Title typos in the source export ("Gingko", "Algebrea") corrected to match the actual talk title.},
}

@article{nayak2020schwarz,
  author   = {Nayak, Pratik and Cojean, Terry and Anzt, Hartwig},
  title    = {Evaluating Asynchronous {Schwarz} Solvers on {GPUs}},
  journal  = {International Journal of High Performance Computing Applications},
  year     = {2020},
  month    = aug,
  abstract = {With the commencement of the exascale computing era, we realize that the majority of the leadership supercomputers are heterogeneous and massively parallel. Even a single node can contain multiple co-processors such as GPUs and multiple CPU cores. For example, ORNL{\textquoteright}s Summit accumulates six NVIDIA Tesla V100 GPUs and 42 IBM Power9 cores on each node. Synchronizing across compute resources of multiple nodes can be prohibitively expensive. Hence, it is necessary to develop and study asynchronous algorithms that circumvent this issue of bulk-synchronous computing. In this study, we examine the asynchronous version of the abstract Restricted Additive Schwarz method as a solver. We do not explicitly synchronize, but allow the communication between the sub-domains to be completely asynchronous, thereby removing the bulk synchronous nature of the algorithm. We accomplish this by using the one-sided Remote Memory Access (RMA) functions of the MPI standard. We study the benefits of using such an asynchronous solver over its synchronous counterpart. We also study the communication patterns governed by the partitioning and the overlap between the sub-domains on the global solver. Finally, we show that this concept can render attractive performance benefits over the synchronous counterparts even for a well-balanced problem.},
  keywords = {abstract Schwarz methods, Asynchronous solvers, exascale, GPUs, multicore processors, parallel numerical linear algebra},
  doi      = {10.1177/1094342020946814},
}

@inproceedings{anzt2020ampere,
  author        = {Anzt, Hartwig and Tsai, Yuhsiang M. and Abdelfattah, Ahmad and Cojean, Terry and Dongarra, Jack},
  title         = {Evaluating the Performance of {NVIDIA}{\textquoteright}s {A100} {Ampere} {GPU} for Sparse and Batched Computations},
  booktitle     = {2020 IEEE/ACM Workshop on Performance Modeling, Benchmarking and Simulation of High Performance Computer Systems (PMBS)},
  year          = {2020},
  month         = nov,
  publisher     = {IEEE},
  organization  = {IEEE},
  abstract      = {GPU accelerators have become an important backbone for scientific high performance-computing, and the performance advances obtained from adopting new GPU hardware are significant. In this paper we take a first look at NVIDIA{\textquoteright}s newest server-line GPU, the A100 architecture, part of the Ampere generation. Specifically, we assess its performance for sparse and batch computations, as these routines are relied upon in many scientific applications, and compare to the p},
  keywords      = {Batched linear algebra, NVIDIA A100 GPU, sparse linear algebra, Sparse Matrix Vector Product},
  internal-note = {Abstract appears truncated mid-sentence in the source export -- verify against the published paper.},
}

@article{anzt2020joss,
  author   = {Anzt, Hartwig and Cojean, Terry and Chen, Yen-Chen and Goebel, Fritz and Gruetzmacher, Thomas and Nayak, Pratik and Ribizel, Tobias and Tsai, Yu-Hsiang},
  title    = {{Ginkgo}: A High Performance Numerical Linear Algebra Library},
  journal  = {Journal of Open Source Software},
  volume   = {5},
  year     = {2020},
  month    = aug,
  abstract = {Ginkgo is a production-ready sparse linear algebra library for high performance computing on GPU-centric architectures with a high level of performance portability and focuses on software sustainability. The library focuses on solving sparse linear systems and accommodates a large variety of matrix formats, state-of-the-art iterative (Krylov) solvers and preconditioners, which make the library suitable for a variety of scientific applications. Ginkgo supports many architectures such as multi-threaded CPU, NVIDIA GPUs, and AMD GPUs. The heavy use of modern C++ features simplifies the addition of new executor paradigms and algorithmic functionality without introducing significant performance overhead. Solving linear systems is usually one of the most computationally and memory intensive aspects of any application. Hence there has been a significant amount of effort in this direction with software libraries such as UMFPACK (Davis, 2004) and CHOLMOD (Chen, Davis, Hager, \& Rajamanickam, 2008) for solving linear systems with direct methods and PETSc (Balay et al., 2020), Trilinos ({\textquotedblleft}The Trilinos Project Website,{\textquotedblright} 2020), Eigen (Guennebaud, Jacob, \& others, 2010) and many more to solve linear systems with iterative methods. With Ginkgo, we aim to ensure high performance while not compromising portability. Hence, we provide very efficient low level kernels optimized for different architectures and separate these kernels from the algorithms thereby ensuring extensibility and ease of use. Ginkgo is also a part of the xSDK effort (Bartlett et al., 2017) and available as a Spack (Gamblin et al., 2015) package. xSDK aims to provide infrastructure for and interoperability between a collection of related and complementary software elements to foster rapid and efficient development of scientific applications using High Performance Computing. Within this effort, we provide interoperability with application libraries such as deal.ii (Arndt et al., 2019) and mfem (Anderson et al., 2020). Ginkgo provides wrappers within these two libraries so that they can take advantage of the features of Ginkgo.},
  doi      = {10.21105/joss.02260},
}

@misc{anzt2020node,
  author       = {Anzt, Hartwig and Cojean, Terry and Chen, Yen-Chen and Goebel, Fritz and Gruetzmacher, Thomas and Nayak, Pratik and Ribizel, Tobias and Tsai, Yu-Hsiang and Dongarra, Jack},
  title        = {{Ginkgo}: A Node-Level Sparse Linear Algebra Library for {HPC} (Poster)},
  year         = {2020},
  month        = feb,
  howpublished = {2020 Exascale Computing Project Annual Meeting},
  address      = {Houston, TX},
}

@article{anzt2020loadbalancing,
  author   = {Anzt, Hartwig and Cojean, Terry and Chen, Yen-Chen and Dongarra, Jack and Flegar, Goran and Nayak, Pratik and Tomov, Stanimire and Tsai, Yuhsiang M. and Wang, Weichung},
  title    = {Load-Balancing Sparse Matrix Vector Product Kernels on {GPUs}},
  journal  = {ACM Transactions on Parallel Computing},
  volume   = {7},
  year     = {2020},
  month    = mar,
  abstract = {Efficient processing of Irregular Matrices on Single Instruction, Multiple Data (SIMD)-type architectures is a persistent challenge. Resolving it requires innovations in the development of data formats, computational techniques, and implementations that strike a balance between thread divergence, which is inherent for Irregular Matrices, and padding, which alleviates the performance-detrimental thread divergence but introduces artificial overheads. To this end, in this article, we address the challenge of designing high performance sparse matrix-vector product (SpMV) kernels designed for Nvidia Graphics Processing Units (GPUs). We present a compressed sparse row (CSR) format suitable for unbalanced matrices. We also provide a load-balancing kernel for the coordinate (COO) matrix format and extend it to a hybrid algorithm that stores part of the matrix in SIMD-friendly Ellpack format (ELL) format. The ratio between the ELL- and the COO-part is determined using a theoretical analysis of the nonzeros-per-row distribution. For the over 2,800 test matrices available in the Suite Sparse matrix collection, we compare the performance against SpMV kernels provided by NVIDIA{\textquoteright}s cuSPARSE library and a heavily-tuned sliced ELL (SELL-P) kernel that prevents unnecessary padding by considering the irregular matrices as a combination of matrix blocks stored in ELL format.},
  doi      = {10.1145/3380930},
}

@inproceedings{goebel2020multiprecision,
  author       = {Goebel, Fritz and Anzt, Hartwig and Cojean, Terry and Flegar, Goran and Quintana-Orti, Enrique S.},
  title        = {Multiprecision Block-{Jacobi} for Iterative Triangular Solves},
  booktitle    = {European Conference on Parallel Processing (Euro-Par 2020)},
  year         = {2020},
  month        = aug,
  publisher    = {Springer},
  organization = {Springer},
  abstract     = {Recent research efforts have shown that Jacobi and block-Jacobi relaxation methods can be used as an effective and highly parallel approach for the solution of sparse triangular linear systems arising in the application of ILU-type preconditioners. Simultaneously, a few independent works have focused on designing efficient high performance adaptive-precision block-Jacobi preconditioning (block-diagonal scaling), in the context of the iterative solution of sparse linear systems, on manycore architectures. In this paper, we bridge the gap between relaxation methods based on regular splittings and preconditioners by demonstrating that iterative refinement can be leveraged to construct a relaxation method from the preconditioner. In addition, we exploit this insight to construct a highly-efficient sparse triangular system solver for graphics processors that combines iterative refinement with the block-Jacobi preconditioner available in the Ginkgo library.},
  keywords     = {Block-Jacobi, graphics processing units (GPUs), incomplete factorization preconditioning, multiprecision, sparse linear algebra},
  doi          = {10.1007/978-3-030-57675-2_34},
}

@inproceedings{tsai2020race,
  author       = {Tsai, Yuhsiang M. and Cojean, Terry and Anzt, Hartwig},
  title        = {Sparse Linear Algebra on {AMD} and {NVIDIA} {GPUs}{\textemdash}The Race is On},
  booktitle    = {ISC High Performance},
  year         = {2020},
  month        = jun,
  publisher    = {Springer},
  organization = {Springer},
  abstract     = {Efficiently processing sparse matrices is a central and performance-critical part of many scientific simulation codes. Recognizing the adoption of manycore accelerators in HPC, we evaluate in this paper the performance of the currently best sparse matrix-vector product (SpMV) implementations on high-end GPUs from AMD and NVIDIA. Specifically, we optimize SpMV kernels for the CSR, COO, ELL, and HYB format taking the hardware characteristics of the latest GPU technologies into account. We compare for 2,800 test matrices the performance of our kernels against AMD{\textquoteright}s hipSPARSE library and NVIDIA{\textquoteright}s cuSPARSE library, and ultimately assess how the GPU technologies from AMD and NVIDIA compare in terms of SpMV performance.},
  keywords     = {AMD, GPUs, nVidia, sparse matrix vector product (SpMV)},
  doi          = {10.1007/978-3-030-50743-5_16},
}

@techreport{abdelfattah2020survey,
  author      = {Abdelfattah, Ahmad and Anzt, Hartwig and Boman, Erik and Carson, Erin and Cojean, Terry and Dongarra, Jack and Gates, Mark and Gruetzmacher, Thomas and Higham, Nicholas J. and Li, Sherry and Lindquist, Neil and Liu, Yang and Loe, Jennifer and Luszczek, Piotr and Nayak, Pratik and Pranesh, Sri and Rajamanickam, Siva and Ribizel, Tobias and Smith, Barry and Swirydowicz, Kasia and Thomas, Stephen and Tomov, Stanimire and Tsai, Yaohung and Yamazaki, Ichitaro and Yang, Ulrike Meier},
  title       = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic},
  number      = {15, ICL-UT-20-08},
  year        = {2020},
  month       = jul,
  institution = {University of Tennessee},
  type        = {SLATE Working Notes},
}

@article{1369,
  author  = {Gruetzmacher, Thomas and Cojean, Terry and Flegar, Goran and G{\"o}bel, Fritz and Anzt, Hartwig},
  title   = {A Customized Precision Format Based on Mantissa Segmentation for Accelerating Sparse Linear Algebra},
  journal = {Concurrency and Computation: Practice and Experience},
  volume  = {40319},
  year    = {2019},
  month   = jan,
  issn    = {1532-0626},
  doi     = {10.1002/cpe.5418},
}

@article{1438,
  author   = {Anzt, Hartwig and Cojean, Terry and Kuhn, Eileen},
  title    = {Towards a New Peer Review Concept for Scientific Computing ensuring Technical Quality, Software Sustainability, and Result Reproducibility},
  journal  = {Proceedings in Applied Mathematics and Mechanics},
  volume   = {19},
  year     = {2019},
  month    = nov,
  abstract = {In this position paper we argue for implementing an alternative peer review process for scientific computing contributions that promotes high quality scientific software developments as fully-recognized conference submission. The idea is based on leveraging the code reviewers{\textquoteright} feedback on scientific software contributions to community software developments as a third-party review involvement. Providing open access to this technical review would complement the scientific review of the contribution, efficiently reduce the workload of the undisclosed reviewers, improve the algorithm implementation quality and software sustainability, and ensure full reproducibility of the reported results. Using this process creates incentives to publish scientific algorithms in open source software {\textendash} instead of designing prototype algorithms with the unique purpose of publishing a paper. In addition, the comments and suggestions of the community being archived in the versioning control systems ensure that also community reviewers are receiving credit for the review contributions {\textendash} unlike reviewers in the traditional peer review process. Finally, it reflects the particularity of the scientific computing community using conferences rather than journals as the main publication venue.},
  issn     = {1617-7061},
  doi      = {10.1002/pamm.201900490},
}

@inproceedings{1318,
  author       = {Anzt, Hartwig and Chen, Yen-Chen and Cojean, Terry and Dongarra, Jack and Flegar, Goran and Nayak, Pratik and Quintana-Orti, Enrique S. and Tsai, Yuhsiang M. and Wang, Weichung},
  title        = {Towards Continuous Benchmarking},
  booktitle    = {Platform for Advanced Scientific Computing Conference (PASC 2019)},
  year         = {2019},
  month        = jun,
  publisher    = {ACM Press},
  organization = {ACM Press},
  address      = {Zurich, Switzerland},
  abstract     = {We present an automated performance evaluation framework that enables an automated workflow for testing and performance evaluation of software libraries. Integrating this component into an ecosystem enables sustainable software development, as a community effort, via a web application for interactively evaluating the performance of individual software components. The performance evaluation tool is based exclusively on web technologies, which removes the burden of downloading performance data or installing additional software. We employ this framework for the Ginkgo software ecosystem, but the framework can be used with essentially any software project, including the comparison between different software libraries. The Continuous Integration (CI) framework of Ginkgo is also extended to automatically run a benchmark suite on predetermined HPC systems, store the state of the machine and the environment along with the compiled binaries, and collect results in a publicly accessible performance data repository based on Git. The Ginkgo performance explorer (GPE) can be used to retrieve the performance data from the repository, and visualizes it in a web browser. GPE also implements an interface that allows users to write scripts, archived in a Git repository, to extract particular data, compute particular metrics, and visualize them in many different formats (as specified by the script). The combination of these approaches creates a workflow which enables performance reproducibility and software sustainability of scientific software. In this paper, we present example scripts that extract and visualize performance data for Ginkgo{\textquoteright}s SpMV kernels that allow users to identify the optimal kernel for specific problem characteristics.},
  isbn         = {9781450367707},
  doi          = {10.1145/3324989.3325719},
}