% NOTE(review): the first entry originally had an EMPTY citation key; it has been
% assigned the key "nayak2020asynchronous" -- add it to any \cite commands.
% DOIs are stored bare (no https://doi.org/ prefix); months use standard macros.

@string{IJHPCA = {The International Journal of High Performance Computing Applications}}

@article{nayak2020asynchronous,
  title    = {Evaluating Asynchronous {Schwarz} Solvers on {GPUs}},
  journal  = IJHPCA,
  year     = {2020},
  month    = aug,
  abstract = {With the commencement of the exascale computing era, we realize that the majority of the leadership supercomputers are heterogeneous and massively parallel. Even a single node can contain multiple co-processors such as GPUs and multiple CPU cores. For example, ORNL{\textquoteright}s Summit accumulates six NVIDIA Tesla V100 GPUs and 42 IBM Power9 cores on each node. Synchronizing across compute resources of multiple nodes can be prohibitively expensive. Hence, it is necessary to develop and study asynchronous algorithms that circumvent this issue of bulk-synchronous computing. In this study, we examine the asynchronous version of the abstract Restricted Additive Schwarz method as a solver. We do not explicitly synchronize, but allow the communication between the sub-domains to be completely asynchronous, thereby removing the bulk synchronous nature of the algorithm. We accomplish this by using the one-sided Remote Memory Access (RMA) functions of the MPI standard. We study the benefits of using such an asynchronous solver over its synchronous counterpart. We also study the communication patterns governed by the partitioning and the overlap between the sub-domains on the global solver. Finally, we show that this concept can render attractive performance benefits over the synchronous counterparts even for a well-balanced problem.},
  keywords = {abstract Schwarz methods, Asynchronous solvers, exascale, GPUs, multicore processors, parallel numerical linear algebra},
  doi      = {10.1177/1094342020946814},
  author   = {Nayak, Pratik and Cojean, Terry and Anzt, Hartwig}
}

@article{1317,
  title    = {Toward a Modular Precision Ecosystem for High-Performance Computing},
  journal  = IJHPCA,
  volume   = {33},
  year     = {2019},
  month    = nov,
  pages    = {1069--1078},
  abstract = {With the memory bandwidth of current computer architectures being significantly slower than the (floating point) arithmetic performance, many scientific computations only leverage a fraction of the computational power in today{\textquoteright}s high-performance architectures. At the same time, memory operations are the primary energy consumer of modern architectures, heavily impacting the resource cost of large-scale applications and the battery life of mobile devices. This article tackles this mismatch between floating point arithmetic throughput and memory bandwidth by advocating a disruptive paradigm change with respect to how data are stored and processed in scientific applications. Concretely, the goal is to radically decouple the data storage format from the processing format and, ultimately, design a {\textquotedblleft}modular precision ecosystem{\textquotedblright} that allows for more flexibility in terms of customized data access. For memory-bounded scientific applications, dynamically adapting the memory precision to the numerical requirements allows for attractive resource savings. In this article, we demonstrate the potential of employing a modular precision ecosystem for the block-Jacobi preconditioner and the PageRank algorithm{\textemdash}two applications that are popular in the communities and at the same characteristic representatives for the field of numerical linear algebra and data analytics, respectively.},
  keywords = {conjugate gradient, GPUs, Jacobi method, Modular precision, multicore processors, PageRank, parallel numerical linear algebra},
  issn     = {1094-3420},
  doi      = {10.1177/1094342019846547},
  author   = {Anzt, Hartwig and Flegar, Goran and Gruetzmacher, Thomas and Quintana-Orti, Enrique S.}
}

@article{825,
  title    = {Improving the Energy Efficiency of Sparse Linear System Solvers on Multicore and Manycore Systems},
  journal  = {Philosophical Transactions of the Royal Society A -- Mathematical, Physical and Engineering Sciences},
  volume   = {372},
  year     = {2014},
  month    = jul,
  abstract = {While most recent breakthroughs in scientific research rely on complex simulations carried out in large-scale supercomputers, the power draft and energy spent for this purpose is increasingly becoming a limiting factor to this trend. In this paper, we provide an overview of the current status in energy-efficient scientific computing by reviewing different technologies used to monitor power draft as well as power- and energy-saving mechanisms available in commodity hardware. For the particular domain of sparse linear algebra, we analyze the energy efficiency of a broad collection of hardware architectures and investigate how algorithmic and implementation modifications can improve the energy performance of sparse linear system solvers, without negatively impacting their performance.},
  keywords = {energy efficiency, graphics processing units, High Performance Computing, iterative solvers, multicore processors, sparse linear systems},
  doi      = {10.1098/rsta.2013.0279},
  author   = {Anzt, Hartwig and Quintana-Orti, Enrique S.}
}