@techreport {689,
	title = {Dynamically balanced synchronization-avoiding LU factorization with multicore and GPUs},
	journal = {University of Tennessee Computer Science Technical Report},
	number = {ut-cs-13-713},
	year = {2013},
	month = {2013-07},
	abstract = {Graphics processing units (GPUs) brought huge performance improvements in the scientific and numerical fields. We present an efficient hybrid CPU/GPU computing approach that is portable, dynamically and efficiently balances the workload between the CPUs and the GPUs, and avoids data transfer bottlenecks that are frequently present in numerical algorithms. Our approach determines the amount of initial work to assign to the CPUs before the execution, and then dynamically balances workloads during the execution. Then, we present a theoretical model to guide the choice of the initial amount of work for the CPUs. The validation of our model allows our approach to self-adapt on any architecture using the manufacturer{\textquoteright}s characteristics of the underlying machine. We illustrate our method for the LU factorization. For this case, we show that the use of our approach combined with a communication avoiding LU algorithm is efficient. For example, our experiments on high-end hybrid CPU/GPU systems show that our dynamically balanced synchronization-avoiding LU is both multicore and GPU scalable. Comparisons with state-of-the-art libraries like MKL (for multicore) and MAGMA (for hybrid systems) are provided, demonstrating significant performance improvements. The approach is applicable to other linear algebra algorithms. The scheduling mechanisms and tuning models can be incorporated into respectively dynamic runtime systems/schedulers and autotuning frameworks for hybrid CPU/MIC/GPU architectures.},
	author = {Simplice Donfack and Stanimire Tomov and Jack Dongarra}
}