% Journal article (Parallel Computing, vol. 81): variable-size batched Gauss-Jordan elimination for block-Jacobi preconditioning on GPUs.
@article{1220,
	title = {Variable-Size Batched {Gauss-Jordan} Elimination for Block-{Jacobi} Preconditioning on Graphics Processors},
	journal = {Parallel Computing},
	volume = {81},
	year = {2019},
	month = jan,
	pages = {131--146},
	abstract = {In this work, we address the efficient realization of block-Jacobi preconditioning on graphics processing units (GPUs). This task requires the solution of a collection of small and independent linear systems. To fully realize this implementation, we develop a variable-size batched matrix inversion kernel that uses Gauss-Jordan elimination (GJE) along with a variable-size batched matrix{\textendash}vector multiplication kernel that transforms the linear systems{\textquoteright} right-hand sides into the solution vectors. Our kernels make heavy use of the increased register count and the warp-local communication associated with newer GPU architectures. Moreover, in the matrix inversion, we employ an implicit pivoting strategy that migrates the workload (i.e., operations) to the place where the data resides instead of moving the data to the executing cores. We complement the matrix inversion with extraction and insertion strategies that allow the block-Jacobi preconditioner to be set up rapidly. The experiments on NVIDIA{\textquoteright}s K40 and P100 architectures reveal that our variable-size batched matrix inversion routine outperforms the CUDA basic linear algebra subroutine (cuBLAS) library functions that provide the same (or even less) functionality. We also show that the preconditioner setup and preconditioner application cost can be somewhat offset by the faster convergence of the iterative solver.},
	keywords = {Batched algorithms, Block-Jacobi, Gauss{\textendash}Jordan elimination, Graphics processor, matrix inversion, sparse linear systems},
	doi = {10.1016/j.parco.2017.12.006},
	author = {Anzt, Hartwig and Dongarra, Jack and Flegar, Goran and Quintana-Orti, Enrique S.}
}
% Conference paper (SBAC-PAD 2018, Lyon): variable-size batched condition number calculation on GPUs.
@inproceedings{1234,
	title = {Variable-Size Batched Condition Number Calculation on {GPUs}},
	booktitle = {SBAC-PAD},
	year = {2018},
	month = sep,
	address = {Lyon, France},
	url = {https://ieeexplore.ieee.org/document/8645907},
	author = {Anzt, Hartwig and Dongarra, Jack and Flegar, Goran and Gruetzmacher, Thomas}
}
% Conference paper (ICCS 2017, Zurich), published in Procedia Computer Science vol. 108: variable-size batched Gauss-Huard for block-Jacobi preconditioning.
@inproceedings{1088,
	title = {Variable-Size Batched {Gauss-Huard} for Block-{Jacobi} Preconditioning},
	booktitle = {International Conference on Computational Science ({ICCS} 2017)},
	series = {Procedia Computer Science},
	volume = {108},
	year = {2017},
	month = jun,
	pages = {1783--1792},
	address = {Zurich, Switzerland},
	abstract = {In this work we present new kernels for the generation and application of block-Jacobi preconditioners that accelerate the iterative solution of sparse linear systems on graphics processing units (GPUs). Our approach departs from the conventional LU factorization and decomposes the diagonal blocks of the matrix using the Gauss-Huard method. When enhanced with column pivoting, this method is as stable as LU with partial/row pivoting. Due to extensive use of GPU registers and integration of implicit pivoting, our variable size batched Gauss-Huard implementation outperforms the batched version of LU factorization. In addition, the application kernel combines the conventional two-stage triangular solve procedure, consisting of a backward solve followed by a forward solve, into a single stage that performs both operations simultaneously.},
	doi = {10.1016/j.procs.2017.05.186},
	author = {Anzt, Hartwig and Dongarra, Jack and Flegar, Goran and Quintana-Orti, Enrique S. and Thomas, Andres E.}
}
% Conference paper (ICPP 2017, Bristol): variable-size batched LU for small matrices and its use in block-Jacobi preconditioning.
@inproceedings{1160,
	title = {Variable-Size Batched {LU} for Small Matrices and Its Integration into Block-{Jacobi} Preconditioning},
	booktitle = {46th International Conference on Parallel Processing ({ICPP})},
	year = {2017},
	month = aug,
	publisher = {IEEE},
	address = {Bristol, United Kingdom},
	abstract = {We present a set of new batched CUDA kernels for the LU factorization of a large collection of independent problems of different size, and the subsequent triangular solves. All kernels heavily exploit the registers of the graphics processing unit (GPU) in order to deliver high performance for small problems. The development of these kernels is motivated by the need for tackling this embarrassingly parallel scenario in the context of block-Jacobi preconditioning that is relevant for the iterative solution of sparse linear systems.},
	keywords = {graphics processing units, Jacobian matrices, Kernel, linear systems, Parallel processing, Sparse matrices},
	doi = {10.1109/ICPP.2017.18},
	url = {https://ieeexplore.ieee.org/document/8025283},
	author = {Anzt, Hartwig and Dongarra, Jack and Flegar, Goran and Quintana-Orti, Enrique S.}
}