@article {1209, title = {Batched One-Sided Factorizations of Tiny Matrices Using GPUs: Challenges and Countermeasures}, journal = {Journal of Computational Science}, volume = {26}, year = {2018}, month = {2018-05}, pages = {226{\textendash}236}, abstract = {The use of batched matrix computations recently gained a lot of interest for applications, where the same operation is applied to many small independent matrices. The batched computational pattern is frequently encountered in applications of data analytics, direct/iterative solvers and preconditioners, computer vision, astrophysics, and more, and often requires specific designs for vectorization and extreme parallelism to map well on today{\textquoteright}s high-end many-core architectures. This has led to the development of optimized software for batch computations, and to an ongoing community effort to develop standard interfaces for batched linear algebra software. Furthering these developments, we present GPU design and optimization techniques for high-performance batched one-sided factorizations of millions of tiny matrices (of size 32 and less). We quantify the effects and relevance of different techniques in order to select the best-performing LU, QR, and Cholesky factorization designs. While we adapt common optimization techniques, such as optimal memory traffic, register blocking, and concurrency control, we also show that a different mindset and techniques are needed when matrices are tiny, and in particular, sub-vector/warp in size. The proposed routines are part of the MAGMA library and deliver significant speedups compared to their counterparts in currently available vendor-optimized libraries. Notably, we tune the developments for the newest V100 GPU from NVIDIA to show speedups of up to 11.8{\texttimes}.}, keywords = {batch computation, GPU computing, matrix factorization}, doi = {https://doi.org/10.1016/j.jocs.2018.01.005}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} }