% Conference paper (SpringSim'15): GPU acceleration of the LOBPCG eigensolver
% via a blocked sparse matrix-vector product kernel.
@inproceedings{914,
  author       = {Anzt, Hartwig and Tomov, Stanimire and Dongarra, Jack},
  title        = {Accelerating the {LOBPCG} method on {GPUs} using a blocked Sparse Matrix Vector Product},
  booktitle    = {Spring Simulation Multi-Conference 2015 (SpringSim'15)},
  year         = {2015},
  month        = apr,
  publisher    = {SCS},
  organization = {SCS},
  address      = {Alexandria, VA},
  abstract     = {This paper presents a heterogeneous CPU-GPU implementation for a sparse iterative eigensolver, the Locally Optimal Block Preconditioned Conjugate Gradient (LOBPCG). For the key routine generating the Krylov search spaces via the product of a sparse matrix and a block of vectors, we propose a GPU kernel based on a modified sliced ELLPACK format. Blocking a set of vectors and processing them simultaneously accelerates the computation of a set of consecutive SpMVs significantly. Comparing the performance against similar routines from Intel's MKL and NVIDIA's cuSPARSE library we identify appealing performance improvements. We integrate it into the highly optimized LOBPCG implementation. Compared to the BLOBEX CPU implementation running on two eight-core Intel Xeon E5-2690s, we accelerate the computation of a small set of eigenvectors using NVIDIA's K40 GPU by typically more than an order of magnitude.},
}