@conference {887, title = {Flexible Linear Algebra Development and Scheduling with Cholesky Factorization}, booktitle = {17th IEEE International Conference on High Performance Computing and Communications}, year = {2015}, month = {2015-08}, address = {Newark, NJ}, abstract = {Modern high performance computing environments are composed of networks of compute nodes that often contain a variety of heterogeneous compute resources, such as multicore-CPUs, GPUs, and coprocessors. One challenge faced by domain scientists is how to efficiently use all these distributed, heterogeneous resources. In order to use the GPUs effectively, the workload parallelism needs to be much greater than the parallelism for a multicore-CPU. On the other hand, a Xeon Phi coprocessor will work most effectively with degree of parallelism between GPUs and multicore-CPUs. Additionally, effectively using distributed memory nodes brings out another level of complexity where the workload must be carefully partitioned over the nodes. In this work we are using a lightweight runtime environment to handle many of the complexities in such distributed, heterogeneous systems. The runtime environment uses task-superscalar concepts to enable the developer to write serial code while providing parallel execution. The task-programming model allows the developer to write resource-specialization code, so that each resource gets the appropriate sized workload-grain. Our task programming abstraction enables the developer to write a single algorithm that will execute efficiently across the distributed heterogeneous machine. We demonstrate the effectiveness of our approach with performance results for dense linear algebra applications, specifically the Cholesky factorization.}, author = {Azzam Haidar and Asim YarKhan and Chongxiao Cao and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} }