@article{nayak2020evaluating,
  title = {Evaluating Asynchronous Schwarz Solvers on GPUs},
  journal = {The International Journal of High Performance Computing Applications},
  year = {2020},
  month = {2020-08},
  abstract = {With the commencement of the exascale computing era, the majority of leadership supercomputers are heterogeneous and massively parallel. Even a single node can contain multiple co-processors such as GPUs and multiple CPU cores. For example, ORNL{\textquoteright}s Summit combines six NVIDIA Tesla V100 GPUs and 42 IBM Power9 cores on each node. Synchronizing across the compute resources of multiple nodes can be prohibitively expensive. Hence, it is necessary to develop and study asynchronous algorithms that circumvent this issue of bulk-synchronous computing. In this study, we examine the asynchronous version of the abstract Restricted Additive Schwarz method as a solver. We do not explicitly synchronize, but allow the communication between the sub-domains to be completely asynchronous, thereby removing the bulk-synchronous nature of the algorithm. We accomplish this by using the one-sided Remote Memory Access (RMA) functions of the MPI standard. We study the benefits of using such an asynchronous solver over its synchronous counterpart. We also study how the partitioning and the overlap between the sub-domains govern the communication patterns and affect the global solver. Finally, we show that this concept can yield attractive performance benefits over the synchronous counterpart even for a well-balanced problem.},
  keywords = {abstract Schwarz methods, asynchronous solvers, exascale, GPUs, multicore processors, parallel numerical linear algebra},
  doi = {10.1177/1094342020946814},
  author = {Pratik Nayak and Terry Cojean and Hartwig Anzt}
}

@conference{tsai2020sparse,
  title = {Sparse Linear Algebra on AMD and NVIDIA GPUs{\textemdash}The Race is On},
  booktitle = {ISC High Performance},
  year = {2020},
  month = {2020-06},
  publisher = {Springer},
  abstract = {Efficiently processing sparse matrices is a central and performance-critical part of many scientific simulation codes. Recognizing the adoption of manycore accelerators in HPC, we evaluate in this paper the performance of the currently best sparse matrix-vector product (SpMV) implementations on high-end GPUs from AMD and NVIDIA. Specifically, we optimize SpMV kernels for the CSR, COO, ELL, and HYB formats, taking the hardware characteristics of the latest GPU technologies into account. We compare, for 2,800 test matrices, the performance of our kernels against AMD{\textquoteright}s hipSPARSE library and NVIDIA{\textquoteright}s cuSPARSE library, and ultimately assess how the GPU technologies from AMD and NVIDIA compare in terms of SpMV performance.},
  keywords = {AMD, GPUs, NVIDIA, sparse matrix-vector product (SpMV)},
  doi = {10.1007/978-3-030-50743-5_16},
  author = {Yuhsiang M. Tsai and Terry Cojean and Hartwig Anzt}
}

@article{1317,
  title = {Toward a Modular Precision Ecosystem for High-Performance Computing},
  journal = {The International Journal of High Performance Computing Applications},
  volume = {33},
  year = {2019},
  month = {2019-11},
  pages = {1069-1078},
  abstract = {With the memory bandwidth of current computer architectures being significantly slower than the (floating point) arithmetic performance, many scientific computations only leverage a fraction of the computational power of today{\textquoteright}s high-performance architectures.
At the same time, memory operations are the primary energy consumer of modern architectures, heavily impacting the resource cost of large-scale applications and the battery life of mobile devices. This article tackles this mismatch between floating point arithmetic throughput and memory bandwidth by advocating a disruptive paradigm change with respect to how data are stored and processed in scientific applications. Concretely, the goal is to radically decouple the data storage format from the processing format and, ultimately, design a {\textquotedblleft}modular precision ecosystem{\textquotedblright} that allows for more flexibility in terms of customized data access. For memory-bound scientific applications, dynamically adapting the memory precision to the numerical requirements allows for attractive resource savings. In this article, we demonstrate the potential of employing a modular precision ecosystem for the block-Jacobi preconditioner and the PageRank algorithm{\textemdash}two applications that are popular in their communities and at the same time characteristic representatives of the fields of numerical linear algebra and data analytics, respectively.},
  keywords = {conjugate gradient, GPUs, Jacobi method, modular precision, multicore processors, PageRank, parallel numerical linear algebra},
  issn = {1094-3420},
  doi = {10.1177/1094342019846547},
  author = {Hartwig Anzt and Goran Flegar and Thomas Gruetzmacher and Enrique S. Quintana-Ort{\'\i}}
}

@article{826,
  title = {Unveiling the Performance-Energy Trade-off in Iterative Linear System Solvers for Multithreaded Processors},
  journal = {Concurrency and Computation: Practice and Experience},
  volume = {27},
  year = {2014},
  month = {2014-09},
  pages = {885-904},
  abstract = {In this paper, we analyze the interactions occurring in the performance-power-energy triangle for the execution of a pivotal numerical algorithm, the iterative conjugate gradient (CG) method, on a diverse collection of parallel multithreaded architectures. This analysis is especially timely in a decade where the power wall has arisen as a major obstacle to building faster processors. Moreover, the CG method has recently been proposed as a complement to the LINPACK benchmark, as this iterative method is argued to be more archetypical of the performance of today{\textquoteright}s scientific and engineering applications. To gain insight into the benefits of hands-on optimizations, we include runtime and energy efficiency results both for out-of-the-box usage relying exclusively on compiler optimizations and for implementations manually optimized for the target architectures, which range from general-purpose and digital signal multicore processors to manycore graphics processing units, all representative of current multithreaded systems.},
  keywords = {CG, CPUs, energy efficiency, GPUs, low-power architectures},
  doi = {10.1002/cpe.3341},
  author = {Jos{\'e} I. Aliaga and Hartwig Anzt and Maribel Castillo and Juan C. Fern{\'a}ndez and Germ{\'a}n Le{\'o}n and Joaqu{\'\i}n P{\'e}rez and Enrique S. Quintana-Ort{\'\i}}
}