% Bibliography entries, one field per line. Citation keys are unchanged.
% Conventions used below:
%   - month uses the standard three-letter macros (unquoted)
%   - page ranges use a double hyphen
%   - doi holds the bare identifier, without a resolver URL prefix
%   - acronyms and proper nouns in titles are brace-protected against recasing

% Journal article (2018): GPU implementation of the IDR solver.
@article{1219,
  author   = {Anzt, Hartwig and Kreutzer, Moritz and Ponce, Eduardo and Peterson, Gregory D. and Wellein, Gerhard and Dongarra, Jack},
  title    = {Optimization and Performance Evaluation of the {IDR} Iterative {Krylov} Solver on {GPUs}},
  journal  = {The International Journal of High Performance Computing Applications},
  volume   = {32},
  number   = {2},
  pages    = {220--230},
  year     = {2018},
  month    = mar,
  doi      = {10.1177/1094342016646844},
  keywords = {co-design, gpu, Induced dimension reduction (IDR), kernel fusion, kernel overlap, roofline performance model},
  abstract = {In this paper, we present an optimized GPU implementation for the induced dimension reduction algorithm. We improve data locality, combine it with an efficient sparse matrix vector kernel, and investigate the potential of overlapping computation with communication as well as the possibility of concurrent kernel execution. A comprehensive performance evaluation is conducted using a suitable performance model. The analysis reveals efficiency of up to 90\%, which indicates that the implementation achieves performance close to the theoretically attainable bound.},
}

% Workshop paper (2015): GPU co-design of IDR via kernel fusion and overlap.
% NOTE(review): address presumably records the workshop venue rather than the
% publisher's city -- confirm before relying on it for formatting.
@inproceedings{913,
  author       = {Anzt, Hartwig and Ponce, Eduardo and Peterson, Gregory D. and Dongarra, Jack},
  title        = {{GPU}-accelerated Co-design of Induced Dimension Reduction: Algorithmic Fusion and Kernel Overlap},
  booktitle    = {2nd International Workshop on Hardware-Software Co-Design for High Performance Computing},
  year         = {2015},
  month        = nov,
  publisher    = {ACM},
  organization = {ACM},
  address      = {Austin, TX},
  abstract     = {In this paper we present an optimized GPU co-design of the Induced Dimension Reduction (IDR) algorithm for solving linear systems. Starting from a baseline implementation based on the generic BLAS routines from the MAGMA software library, we apply optimizations that are based on kernel fusion and kernel overlap. Runtime experiments are used to investigate the benefit of the distinct optimization techniques for different variants of the IDR algorithm. A comparison to the reference implementation reveals that the interplay between them can succeed in cutting the overall runtime by up to about one third.},
}

% Journal article (2012): CUDA to OpenCL portability.
@article{icl:725,
  author  = {Du, Peng and Weber, Rick and Luszczek, Piotr and Tomov, Stanimire and Peterson, Gregory D. and Dongarra, Jack},
  title   = {From {CUDA} to {OpenCL}: Towards a Performance-portable Solution for Multi-platform {GPU} Programming},
  journal = {Parallel Computing},
  volume  = {38},
  number  = {8},
  pages   = {391--407},
  year    = {2012},
  month   = aug,
}

% Conference paper (2012). The venue was previously stored in a journal field
% on an article entry; it now lives in booktitle, with the award remark in note.
% NOTE(review): venue is kept as the acronym given in the original record --
% expand to the full proceedings title once confirmed.
@inproceedings{icl:686,
  author    = {Kasichayanula, Kiran and Terpstra, Dan and Luszczek, Piotr and Tomov, Stanimire and Moore, Shirley and Peterson, Gregory D.},
  title     = {Power Aware Computing on {GPUs}},
  booktitle = {SAAHPC {\textquoteright}12},
  year      = {2012},
  month     = jul,
  address   = {Argonne, IL},
  note      = {Best Paper Award},
  keywords  = {magma},
}

% Conference paper (2008). The proceedings title goes in booktitle, which is
% the field standard styles read for this entry type.
@inproceedings{icl:440,
  author    = {Dongarra, Jack and Moore, Shirley and Peterson, Gregory D. and Tomov, Stanimire and Allred, Jeff and Natoli, Vincent and Richie, David},
  title     = {Exploring New Architectures in Accelerating {CFD} for {Air Force} Applications},
  booktitle = {Proceedings of the {DoD} {HPCMP} User Group Conference},
  year      = {2008},
  month     = jan,
  address   = {Seattle, Washington},
  keywords  = {magma},
}