@techreport{p1673r3,
  title = {P1673R3: A Free Function Linear Algebra Interface Based on the BLAS},
  journal = {ISO JTC1 SC22 WG21},
  number = {P1673R3},
  year = {2021},
  month = {2021-04},
  publisher = {ISO},
  type = {standard},
  abstract = {We believe this proposal is complementary to P1385, a proposal for a C++ Standard linear algebra library that introduces matrix and vector classes and overloaded arithmetic operators. In fact, we think that our proposal would make a natural foundation for a library like what P1385 proposes. However, a free function interface -- which clearly separates algorithms from data structures -- more naturally allows for a richer set of operations such as what the BLAS provides. A natural extension of the present proposal would include accepting P1385{\textquoteright}s matrix and vector objects as input for the algorithms proposed here. A straightforward way to do that would be for P1385{\textquoteright}s matrix and vector objects to make views of their data available as basic_mdspan.},
  keywords = {C++, linear algebra},
  url = {http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2021/p1673r3.pdf},
  author = {Mark Hoemmen and Daisy Hollman and Christian Trott and Daniel Sunderland and Nevin Liber and Li-Ta Lo and Damien Lebrun-Grandie and Graham Lopez and Peter Caday and Sarah Knepper and Piotr Luszczek and Timothy Costa}
}
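As a concrete illustration of the design point in the entry above -- algorithms as free functions over non-owning views, so that owning matrix classes like P1385's interoperate by exposing a view of their data -- here is a minimal C++ sketch. It is not the proposal's actual API: matrix_view is a hypothetical stand-in for basic_mdspan, and matrix_vector_product stands in for the proposed free functions.

#include <cstddef>
#include <vector>

// Hypothetical non-owning 2-D view; a stand-in for basic_mdspan<double, 2>.
struct matrix_view {
  double* data;
  std::size_t rows, cols;
  double operator()(std::size_t i, std::size_t j) const { return data[i * cols + j]; }
};

// Free-function algorithm: it depends only on the view, not on any
// particular owning container. y = A * x, with x of length A.cols.
void matrix_vector_product(matrix_view A, const double* x, double* y) {
  for (std::size_t i = 0; i < A.rows; ++i) {
    double sum = 0.0;
    for (std::size_t j = 0; j < A.cols; ++j) sum += A(i, j) * x[j];
    y[i] = sum;
  }
}

// An owning matrix class (as P1385 proposes) participates simply by
// exposing a view of its storage.
class owning_matrix {
  std::vector<double> storage_;
  std::size_t rows_, cols_;
public:
  owning_matrix(std::size_t r, std::size_t c) : storage_(r * c), rows_(r), cols_(c) {}
  matrix_view view() { return {storage_.data(), rows_, cols_}; }
};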
@article{1192,
  title = {Production Implementations of Pipelined \& Communication-Avoiding Iterative Linear Solvers},
  year = {2018},
  month = {2018-03},
  publisher = {SIAM Conference on Parallel Processing for Scientific Computing},
  address = {Tokyo, Japan},
  author = {Mark Hoemmen and Ichitaro Yamazaki}
}

@techreport{1275,
  title = {Software-Defined Events (SDEs) in MAGMA-Sparse},
  journal = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-18-12},
  year = {2018},
  month = {2018-12},
  publisher = {University of Tennessee},
  author = {Heike Jagode and Anthony Danalis and Hartwig Anzt and Ichitaro Yamazaki and Mark Hoemmen and Erik Boman and Stanimire Tomov and Jack Dongarra}
}

@techreport{1204,
  title = {Solver Interface \& Performance on Cori},
  journal = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-18-05},
  year = {2018},
  month = {2018-06},
  publisher = {University of Tennessee},
  author = {Hartwig Anzt and Ichitaro Yamazaki and Mark Hoemmen and Erik Boman and Jack Dongarra}
}

@article{1131,
  title = {Comparing performance of s-step and pipelined GMRES on distributed-memory multicore CPUs},
  year = {2017},
  month = {2017-07},
  publisher = {SIAM Annual Meeting},
  address = {Pittsburgh, Pennsylvania},
  author = {Ichitaro Yamazaki and Mark Hoemmen and Piotr Luszczek and Jack Dongarra}
}

@inproceedings{1011,
  title = {Improving Performance of GMRES by Reducing Communication and Pipelining Global Collectives},
  journal = {Proceedings of the 18th IEEE International Workshop on Parallel and Distributed Scientific and Engineering Computing (PDSEC 2017), Best Paper Award},
  year = {2017},
  month = {2017-06},
  address = {Orlando, FL},
  abstract = {We compare the performance of pipelined and s-step GMRES, respectively referred to as l-GMRES and s-GMRES, on distributed multicore CPUs. Compared to standard GMRES, s-GMRES requires fewer all-reduces, while l-GMRES overlaps the all-reduces with computation. To combine the best features of the two algorithms, we propose another variant, (l, t)-GMRES, that not only does fewer global all-reduces than standard GMRES, but also overlaps those all-reduces with other work. We implemented the thread parallelism and communication overlap in two different ways. The first uses nonblocking MPI collectives with thread-parallel computational kernels. The second relies on a shared-memory task scheduler. In our experiments, (l, t)-GMRES performed better than l-GMRES by factors of up to 1.67{\texttimes}. In addition, though we used only 50 nodes, our variant performed up to 1.22{\texttimes} better than s-GMRES when the latency cost became significant, by hiding the all-reduces.},
  doi = {10.1109/IPDPSW.2017.65},
  author = {Ichitaro Yamazaki and Mark Hoemmen and Piotr Luszczek and Jack Dongarra}
}
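The abstract above turns on one mechanism: starting an all-reduce with a nonblocking MPI collective and doing independent work before waiting on it. Here is a minimal sketch, assuming MPI-3's MPI_Iallreduce; the dot-product framing and the choice of overlapped work (an AXPY) are illustrative, not the paper's implementation.

#include <mpi.h>
#include <cstddef>
#include <numeric>
#include <vector>

// Compute a global dot product of x and y while overlapping the all-reduce
// with independent local work (here z += alpha * x). All vectors are
// assumed to have the same local length on each rank.
double overlapped_dot(const std::vector<double>& x,
                      const std::vector<double>& y,
                      std::vector<double>& z,
                      double alpha,
                      MPI_Comm comm) {
  // Local partial dot product on this rank.
  double local = std::inner_product(x.begin(), x.end(), y.begin(), 0.0);
  double global = 0.0;
  MPI_Request req;
  // Start the all-reduce without blocking (an MPI-3 nonblocking collective).
  MPI_Iallreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, comm, &req);
  // Overlap: do work that does not depend on the reduction result.
  for (std::size_t i = 0; i < z.size(); ++i) z[i] += alpha * x[i];
  // Complete the reduction before using the global value.
  MPI_Wait(&req, MPI_STATUS_IGNORE);
  return global;
}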
@techreport{1130,
  title = {MAGMA-sparse Interface Design Whitepaper},
  journal = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-17-05},
  year = {2017},
  month = {2017-09},
  type = {Technical Report},
  abstract = {In this report we describe the logic and interface we develop for the MAGMA-sparse library to allow for easy integration as a third-party library into a top-level software ecosystem. The design choices are based on extensive consultation with other software library developers, in particular the Trilinos software development team. The interface documentation is at this point not exhaustive, but rather a first proposal for setting a standard. Although the interface description targets the MAGMA-sparse software module, we hope that the design choices carry beyond this specific library and are attractive for adoption in other packages. This report is not intended as a static document, but will be updated over time to reflect the agile software development in the ECP 1.3.3.11 STMS11-PEEKS project.},
  author = {Hartwig Anzt and Erik Boman and Jack Dongarra and Goran Flegar and Mark Gates and Mike Heroux and Mark Hoemmen and Jakub Kurzak and Piotr Luszczek and Sivasankaran Rajamanickam and Stanimire Tomov and Stephen Wood and Ichitaro Yamazaki}
}

@conference{816,
  title = {Domain Decomposition Preconditioners for Communication-Avoiding Krylov Methods on a Hybrid CPU/GPU Cluster},
  booktitle = {The International Conference for High Performance Computing, Networking, Storage and Analysis (SC 14)},
  year = {2014},
  month = {2014-11},
  publisher = {IEEE},
  organization = {IEEE},
  address = {New Orleans, LA},
  author = {Ichitaro Yamazaki and Sivasankaran Rajamanickam and Eric G. Boman and Mark Hoemmen and Michael A. Heroux and Stanimire Tomov}
}

@conference{807,
  title = {Improving the performance of CA-GMRES on multicores with multiple GPUs},
  booktitle = {IPDPS 2014},
  year = {2014},
  month = {2014-05},
  publisher = {IEEE},
  organization = {IEEE},
  address = {Phoenix, AZ},
  abstract = {The Generalized Minimum Residual (GMRES) method is one of the most widely used iterative methods for solving nonsymmetric linear systems of equations. In recent years, techniques to avoid communication in GMRES have gained attention because, in comparison to floating-point operations, communication is becoming increasingly expensive on modern computers. Since graphics processing units (GPUs) are now becoming a crucial component in computing, we investigate the effectiveness of these techniques on multicore CPUs with multiple GPUs. While we present detailed performance studies of a matrix powers kernel on multiple GPUs, we particularly focus on orthogonalization strategies that have a great impact on both the numerical stability and performance of GMRES, especially as the matrix becomes sparser or ill-conditioned. We present experimental results on two eight-core Intel Sandy Bridge CPUs with three NVIDIA Fermi GPUs and demonstrate that significant speedups can be obtained by avoiding communication, either on a GPU or between the GPUs. As part of our study, we investigate several optimization techniques for the GPU kernels that can also be used in other iterative solvers besides GMRES. Hence, our studies not only emphasize the importance of avoiding communication on GPUs, but also provide insight into the effects of these optimization techniques on the performance of sparse solvers, and may have an impact beyond GMRES.},
  author = {Ichitaro Yamazaki and Hartwig Anzt and Stanimire Tomov and Mark Hoemmen and Jack Dongarra}
}
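For readers unfamiliar with the matrix powers kernel mentioned in the abstract above, a minimal sequential sketch of the s-step idea follows; the CsrMatrix type and function names are illustrative, not the paper's GPU implementation. CA-GMRES generates the basis [v, Av, ..., A^s v] with no normalization or communication between the sparse matrix-vector products, then orthogonalizes the whole block at once; that batching is the source of the communication savings.

#include <cstddef>
#include <vector>

// Minimal CSR sparse matrix; field names are illustrative.
struct CsrMatrix {
  std::size_t n;                           // number of rows (square matrix)
  std::vector<std::size_t> row_ptr, col_idx;
  std::vector<double> val;
};

// y = A * x
void spmv(const CsrMatrix& A, const std::vector<double>& x, std::vector<double>& y) {
  for (std::size_t i = 0; i < A.n; ++i) {
    double sum = 0.0;
    for (std::size_t k = A.row_ptr[i]; k < A.row_ptr[i + 1]; ++k)
      sum += A.val[k] * x[A.col_idx[k]];
    y[i] = sum;
  }
}

// Generate the s-step Krylov basis V = [v, Av, ..., A^s v]. No dot products
// or normalization occur between the SpMVs; the caller orthogonalizes the
// whole block afterward (e.g., with a tall-skinny QR).
std::vector<std::vector<double>> matrix_powers(const CsrMatrix& A,
                                               const std::vector<double>& v,
                                               int s) {
  std::vector<std::vector<double>> V;
  V.push_back(v);
  for (int j = 0; j < s; ++j) {
    std::vector<double> w(A.n);
    spmv(A, V.back(), w);
    V.push_back(std::move(w));
  }
  return V;
}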