% Workshop paper (ACM, 2017): batched Gauss-Jordan elimination kernel used to
% build block-Jacobi preconditioners on GPUs. The venue is stored in booktitle,
% the field that the inproceedings entry type requires.
@inproceedings{998,
  author    = {Anzt, Hartwig and Dongarra, Jack and Flegar, Goran and Quintana-Orti, Enrique S.},
  title     = {Batched {Gauss-Jordan} Elimination for Block-{Jacobi} Preconditioner Generation on {GPUs}},
  booktitle = {Proceedings of the 8th International Workshop on Programming Models and Applications for Multicores and Manycores},
  year      = {2017},
  month     = feb,
  pages     = {1--10},
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-4883-6},
  doi       = {10.1145/3026937.3026940},
  url       = {http://doi.acm.org/10.1145/3026937.3026940},
  keywords  = {block-Jacobi preconditioner, Gauss-Jordan elimination, graphics processing units (GPUs), iterative methods, matrix inversion, sparse linear systems},
  abstract  = {In this paper, we design and evaluate a routine for the efficient generation of block-Jacobi preconditioners on graphics processing units (GPUs). Concretely, to exploit the architecture of the graphics accelerator, we develop a batched Gauss-Jordan elimination CUDA kernel for matrix inversion that embeds an implicit pivoting technique and handles the entire inversion process in the GPU registers. In addition, we integrate extraction and insertion CUDA kernels to rapidly set up the block-Jacobi preconditioner. Our experiments compare the performance of our implementation against a sequence of batched routines from the MAGMA library realizing the inversion via the LU factorization with partial pivoting. Furthermore, we evaluate the costs of different strategies for the block-Jacobi extraction and insertion steps, using a variety of sparse matrices from the SuiteSparse matrix collection. Finally, we assess the efficiency of the complete block-Jacobi preconditioner generation in the context of an iterative solver applied to a set of computational science problems, and quantify its benefits over a scalar Jacobi preconditioner.},
}