% Conference paper on a GPU-accelerated LOBPCG eigensolver (SpringSim'15).
% NOTE(review): "BLOBEX" in the abstract is presumably the BLOPEX package;
% confirm against the published paper before changing it.
@inproceedings{914,
	author = {Anzt, Hartwig and Tomov, Stanimire and Dongarra, Jack},
	title = {Accelerating the {LOBPCG} method on {GPUs} using a blocked Sparse Matrix Vector Product},
	booktitle = {Spring Simulation Multi-Conference 2015 (SpringSim{\textquoteright}15)},
	year = {2015},
	month = apr,
	publisher = {SCS},
	organization = {SCS},
	address = {Alexandria, VA},
	abstract = {This paper presents a heterogeneous CPU-GPU implementation for a sparse iterative eigensolver, the Locally Optimal Block Preconditioned Conjugate Gradient (LOBPCG). For the key routine generating the Krylov search spaces via the product of a sparse matrix and a block of vectors, we propose a GPU kernel based on a modified sliced ELLPACK format. Blocking a set of vectors and processing them simultaneously accelerates the computation of a set of consecutive SpMVs significantly. Comparing the performance against similar routines from Intel{\textquoteright}s MKL and NVIDIA{\textquoteright}s cuSPARSE library we identify appealing performance improvements. We integrate it into the highly optimized LOBPCG implementation. Compared to the BLOBEX CPU implementation running on two eight-core Intel Xeon E5-2690s, we accelerate the computation of a small set of eigenvectors using NVIDIA{\textquoteright}s K40 GPU by typically more than an order of magnitude.}
}