% Publications co-authored by Fengguang Song, newest first (2015 down to 2004).
% Citation keys are kept exactly as exported so existing \cite commands keep working.
%
% Conventions used in this file:
%   - conference papers are "inproceedings" with the venue in booktitle;
%   - technical reports carry institution / type / number;
%   - month uses the predefined three-letter macros, unquoted;
%   - page ranges use a double hyphen;
%   - acronyms and proper nouns in titles are brace-protected as whole words.
%
% NOTE(review): the export stored months as YYYY-MM. They were converted
% mechanically. Several entries carry "jan" (icl:432, icl:386, icl:329, icl:334,
% icl:319); this may be a database default rather than the real month -- confirm
% before relying on it.

@article{925,
  author   = {Song, Fengguang and Dongarra, Jack},
  title    = {A Scalable Approach to Solving Dense Linear Algebra Problems on Hybrid {CPU-GPU} Systems},
  journal  = {Concurrency and Computation: Practice and Experience},
  volume   = {27},
  pages    = {3702--3723},
  year     = {2015},
  month    = sep,
  doi      = {10.1002/cpe.3403},
  keywords = {dense linear algebra, distributed dataflow scheduling, heterogeneous HPC systems, runtime systems},
  abstract = {Aiming to fully exploit the computing power of all CPUs and all graphics processing units (GPUs) on hybrid CPU-GPU systems to solve dense linear algebra problems, we design a class of heterogeneous tile algorithms to maximize the degree of parallelism, to minimize the communication volume, and to accommodate the heterogeneity between CPUs and GPUs. The new heterogeneous tile algorithms are executed upon our decentralized dynamic scheduling runtime system, which schedules a task graph dynamically and transfers data between compute nodes automatically. The runtime system uses a new distributed task assignment protocol to solve data dependencies between tasks without any coordination between processing units. By overlapping computation and communication through dynamic scheduling, we are able to attain scalable performance for the double-precision Cholesky factorization and QR factorization. Our approach demonstrates a performance comparable to Intel MKL on shared-memory multicore systems and better performance than both vendor (e.g., Intel MKL) and open source libraries (e.g., StarPU) in the following three environments: heterogeneous clusters with GPUs, conventional clusters without GPUs, and shared-memory systems with multiple GPUs.},
}

@inproceedings{835,
  author    = {Song, Fengguang and Dongarra, Jack},
  title     = {Scaling Up Matrix Computations on Shared-Memory Manycore Systems with 1000 {CPU} Cores},
  booktitle = {International Conference on Supercomputing},
  pages     = {333--342},
  year      = {2014},
  month     = jun,
  publisher = {ACM},
  address   = {Munich, Germany},
  isbn      = {978-1-4503-2642-1},
  doi       = {10.1145/2597652.2597670},
  abstract  = {While the growing number of cores per chip allows researchers to solve larger scientific and engineering problems, the parallel efficiency of the deployed parallel software starts to decrease. This unscalability problem happens to both vendor-provided and open-source software and wastes CPU cycles and energy. By expecting CPUs with hundreds of cores to be imminent, we have designed a new framework to perform matrix computations for massively many cores. Our performance analysis on manycore systems shows that the unscalability bottleneck is related to Non-Uniform Memory Access (NUMA): memory bus contention and remote memory access latency. To overcome the bottleneck, we have designed NUMA-aware tile algorithms with the help of a dynamic scheduling runtime system to minimize NUMA memory accesses. The main idea is to identify the data that is, either read a number of times or written once by a thread resident on a remote NUMA node, then utilize the runtime system to conduct data caching and movement between different NUMA nodes. Based on the experiments with QR factorizations, we demonstrate that our framework is able to achieve great scalability on a 48-core AMD Opteron system (e.g., parallel efficiency drops only 3\% from one core to 48 cores). We also deploy our framework to an extreme-scale shared-memory SGI machine which has 1024 CPU cores and runs a single Linux operating system image. Our framework continues to scale well, and can outperform the vendor-optimized Intel MKL library by up to 750\%.},
}

@inproceedings{icl:669,
  author    = {Song, Fengguang and Tomov, Stanimire and Dongarra, Jack},
  title     = {Enabling and Scaling Matrix Computations on Heterogeneous Multi-Core and {Multi-GPU} Systems},
  booktitle = {26th {ACM} International Conference on Supercomputing ({ICS} 2012)},
  year      = {2012},
  month     = jun,
  publisher = {ACM},
  address   = {San Servolo Island, Venice, Italy},
  keywords  = {magma},
}

@inproceedings{icl:681,
  author    = {Song, Fengguang and Dongarra, Jack},
  title     = {A Scalable Framework for Heterogeneous {GPU-Based} Clusters},
  booktitle = {The 24th {ACM} Symposium on Parallelism in Algorithms and Architectures ({SPAA} 2012)},
  year      = {2012},
  month     = jun,
  publisher = {ACM},
  address   = {Pittsburgh, PA, USA},
  keywords  = {magma},
}

@techreport{icl:628,
  author      = {Song, Fengguang and Tomov, Stanimire and Dongarra, Jack},
  title       = {Efficient Support for Matrix Computations on Heterogeneous Multi-core and {Multi-GPU} Architectures},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-11-668},
  year        = {2011},
  month       = jun,
  note        = {Also {LAWN} 250},
  keywords    = {magma, plasma},
}

% NOTE(review): the export held the report number in the volume field as an
% en dash followed by "10-653". It is reconstructed here as UT-CS-10-653 to
% match the numbering pattern of the sibling reports -- confirm.
@techreport{icl:530,
  author      = {Song, Fengguang and Ltaief, Hatem and Hadri, Bilel and Dongarra, Jack},
  title       = {Scalable Tile Communication-Avoiding {QR} Factorization on Multicore Cluster Systems},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-10-653},
  year        = {2010},
  month       = apr,
  keywords    = {plasma},
}

@inproceedings{icl:559,
  author    = {Song, Fengguang and Ltaief, Hatem and Hadri, Bilel and Dongarra, Jack},
  title     = {Scalable Tile Communication-Avoiding {QR} Factorization on Multicore Cluster Systems},
  booktitle = {International Conference for High Performance Computing, Networking, Storage, and Analysis ({SC} '10)},
  year      = {2010},
  month     = nov,
  publisher = {ACM SIGARCH / IEEE Computer Society},
  address   = {New Orleans, LA},
  keywords  = {plasma},
}

@inproceedings{icl:479,
  author    = {Song, Fengguang and Moore, Shirley and Dongarra, Jack},
  title     = {Analytical Modeling and Optimization for Affinity Based Thread Scheduling on Multicore Systems},
  booktitle = {{IEEE} Cluster 2009},
  year      = {2009},
  month     = aug,
  address   = {New Orleans},
  keywords  = {gridpac, mumi},
}

@inproceedings{icl:493,
  author    = {Song, Fengguang and YarKhan, Asim and Dongarra, Jack},
  title     = {Dynamic Task Scheduling for Linear Algebra Algorithms on Distributed-Memory Multicore Systems},
  booktitle = {International Conference for High Performance Computing, Networking, Storage, and Analysis ({SC} '09)},
  year      = {2009},
  month     = nov,
  address   = {Portland, OR},
  keywords  = {mumi, plasma},
}

% NOTE(review): the series field was added so the bare volume number renders
% sensibly; it is not in the export -- confirm that volume 5544 is the LNCS
% volume for ICCS 2009.
@inproceedings{icl:501,
  author    = {Song, Fengguang and Moore, Shirley and Dongarra, Jack},
  title     = {A Scalable Non-blocking Multicast Scheme for Distributed {DAG} Scheduling},
  booktitle = {The International Conference on Computational Science 2009 ({ICCS} 2009)},
  series    = {Lecture Notes in Computer Science},
  volume    = {5544},
  pages     = {195--204},
  year      = {2009},
  month     = may,
  address   = {Baton Rouge, LA},
  keywords  = {plasma},
}

@techreport{icl:432,
  author      = {Song, Fengguang and Moore, Shirley and Dongarra, Jack},
  title       = {Analytical Modeling for Affinity-Based Thread Scheduling on Multicore Platforms},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-08-626},
  year        = {2008},
  month       = jan,
}

% NOTE(review): same title and authors as icl:319 (the 2006 workshop paper);
% this looks like the published proceedings version -- confirm whether both
% entries are wanted. The export's month value "2008-00" was not a valid month
% and has been dropped.
@inproceedings{icl:417,
  author    = {Hernandez, Oscar and Song, Fengguang and Chapman, Barbara and Dongarra, Jack and Mohr, Bernd and Moore, Shirley and Wolf, Felix},
  title     = {Performance Instrumentation and Compiler Optimizations for {MPI/OpenMP} Applications},
  booktitle = {{OpenMP} Shared Memory Parallel Programming},
  series    = {Lecture Notes in Computer Science},
  volume    = {4315},
  year      = {2008},
  publisher = {Springer Berlin / Heidelberg},
}

@inproceedings{icl:367,
  author    = {Song, Fengguang and Moore, Shirley and Dongarra, Jack},
  title     = {Feedback-Directed Thread Scheduling with Memory Considerations},
  booktitle = {{IEEE} International Symposium on High Performance Distributed Computing},
  year      = {2007},
  month     = jun,
  address   = {Monterey Bay, CA},
}

@inproceedings{icl:386,
  author    = {Song, Fengguang and Moore, Shirley and Dongarra, Jack},
  title     = {{L2} Cache Modeling for Scientific Applications on Chip Multi-Processors},
  booktitle = {Proceedings of the 2007 International Conference on Parallel Processing},
  year      = {2007},
  month     = jan,
  publisher = {IEEE Computer Society},
  address   = {Xi'an, China},
}

@inproceedings{icl:329,
  author    = {Song, Fengguang and Dongarra, Jack and Moore, Shirley},
  title     = {Experiments with {Strassen's} Algorithm: From Sequential to Parallel},
  booktitle = {18th {IASTED} International Conference on Parallel and Distributed Computing and Systems ({PDCS} 2006)},
  year      = {2006},
  month     = jan,
  address   = {Dallas, Texas},
  note      = {Submitted},
}

@techreport{icl:334,
  author      = {Song, Fengguang and Moore, Shirley and Dongarra, Jack},
  title       = {Modeling of {L2} Cache Behavior for Thread-Parallel Scientific Programs on Chip Multi-Processors},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-06-583},
  year        = {2006},
  month       = jan,
}

@inproceedings{icl:319,
  author    = {Hernandez, Oscar and Song, Fengguang and Chapman, Barbara and Dongarra, Jack and Mohr, Bernd and Moore, Shirley and Wolf, Felix},
  title     = {Performance Instrumentation and Compiler Optimizations for {MPI/OpenMP} Applications},
  booktitle = {Second International Workshop on {OpenMP}},
  year      = {2006},
  month     = jan,
  address   = {Reims, France},
  keywords  = {kojak},
}

@inproceedings{icl:288,
  author    = {Bhatia, Nikhil and Song, Fengguang and Wolf, Felix and Dongarra, Jack and Mohr, Bernd and Moore, Shirley},
  title     = {Automatic Experimental Analysis of Communication Patterns in Virtual Topologies},
  booktitle = {Proceedings of the International Conference on Parallel Processing},
  year      = {2005},
  month     = jun,
  publisher = {IEEE Computer Society},
  address   = {Oslo, Norway},
  keywords  = {kojak},
}

@inproceedings{icl:233,
  author    = {Song, Fengguang and Wolf, Felix and Bhatia, Nikhil and Dongarra, Jack and Moore, Shirley},
  title     = {An Algebra for Cross-Experiment Performance Analysis},
  booktitle = {2004 International Conference on Parallel Processing ({ICPP-04})},
  year      = {2004},
  month     = aug,
  address   = {Montreal, Quebec, Canada},
  keywords  = {kojak},
}

@inproceedings{icl:239,
  author    = {Mucci, Phil and Dongarra, Jack and Kufrin, Rick and Moore, Shirley and Song, Fengguang and Wolf, Felix},
  title     = {Automating the Large-Scale Collection and Analysis of Performance},
  booktitle = {5th {LCI} International Conference on Linux Clusters: The {HPC} Revolution},
  year      = {2004},
  month     = may,
  address   = {Austin, Texas},
  keywords  = {kojak, papi},
}

% NOTE(review): institution expanded from the "ICL" prefix of the report
% series -- confirm the preferred institutional name.
@techreport{icl:196,
  author      = {Song, Fengguang and Wolf, Felix},
  title       = {{CUBE} User Manual},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  type        = {{ICL} Technical Report},
  number      = {ICL-UT-04-01},
  year        = {2004},
  month       = feb,
  keywords    = {kojak},
}