@article {1260, title = {Analysis and Design Techniques towards High-Performance and Energy-Efficient Dense Linear Solvers on GPUs}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {29}, year = {2018}, month = {2018-12}, pages = {2700{\textendash}2712}, abstract = {Graphics Processing Units (GPUs) are widely used in accelerating dense linear solvers. The matrix factorizations, which dominate the runtime for these solvers, are often designed using a hybrid scheme, where GPUs perform trailing matrix updates, while the CPUs perform the panel factorizations. Consequently, hybrid solutions require high-end CPUs and optimized CPU software in order to deliver high performance. Furthermore, they lack the energy efficiency inherent for GPUs due to the use of less energy-efficient CPUs, as well as CPU-GPU communications. This paper presents analysis and design techniques that overcome the shortcomings of the hybrid algorithms, and allow the design of high-performance and energy-efficient dense LU and Cholesky factorizations that use GPUs only. The full GPU solution eliminates the need for a high-end CPU and optimized CPU software, which leads to a better energy efficiency. We discuss different design choices, and introduce optimized GPU kernels for panel factorizations. The developed solutions achieve 90+ percent of the performance of optimized hybrid solutions, while improving the energy efficiency by 50 percent. They outperform the vendor library by 30-50 percent in single precision, and 15-50 percent in double precision. We also show that hybrid designs trail the proposed solutions in performance when optimized CPU software is not available.}, keywords = {Dense linear solvers, energy efficiency, GPU computing}, doi = {10.1109/TPDS.2018.2842785}, author = {Ahmad Abdelfattah and Azzam Haidar and Stanimire Tomov and Jack Dongarra} }