@techreport {1465, title = {Asynchronous SGD for DNN Training on Shared-Memory Parallel Architectures}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-04}, year = {2020}, month = {2020-03}, publisher = {University of Tennessee, Knoxville}, abstract = {We present a parallel asynchronous Stochastic Gradient Descent algorithm for shared memory architectures. Unlike previous asynchronous algorithms, we consider the case where the gradient updates are not particularly sparse. In the context of the MagmaDNN framework, we compare the parallel efficiency of the asynchronous implementation with that of the traditional synchronous implementation. Tests are performed for training deep neural networks on multicore CPUs and GPU devices.}, keywords = {Asynchronous iterative methods, Deep learning, gpu, multicore CPU, Stochastic Gradient Descent}, author = {Florent Lopez and Edmond Chow and Stanimire Tomov and Jack Dongarra} } @conference {1481, title = {heFFTe: Highly Efficient FFT for Exascale}, booktitle = {International Conference on Computational Science (ICCS 2020)}, year = {2020}, month = {2020-06}, address = {Amsterdam, Netherlands}, abstract = {Exascale computing aspires to meet the increasing demands from large scientific applications. Software targeting exascale is typically designed for heterogeneous architectures; hence, it is important not only to develop well-designed software, but also to make it aware of the hardware architecture and to efficiently exploit its power. Currently, several diverse applications, such as those that are part of the Exascale Computing Project (ECP) in the United States, rely on efficient computation of the Fast Fourier Transform (FFT). In this context, we present the design and implementation of the heFFTe (Highly Efficient FFT for Exascale) library, which targets the upcoming exascale supercomputers. We provide highly (linearly) scalable GPU kernels that achieve more than 40{\texttimes} speedup with respect to local kernels from state-of-the-art CPU libraries, and over 2{\texttimes} speedup for the whole FFT computation. A communication model for parallel FFTs is also provided to analyze the bottleneck for large-scale problems. We show experiments obtained on the Summit supercomputer at Oak Ridge National Laboratory, using up to 24,576 IBM Power9 cores and 6,144 NVIDIA V100 GPUs.}, keywords = {exascale, FFT, gpu, scalable algorithm}, doi = {https://doi.org/10.1007/978-3-030-50371-0_19}, author = {Alan Ayala and Stanimire Tomov and Azzam Haidar and Jack Dongarra} } @conference {, title = {High-Order Finite Element Method using Standard and Device-Level Batch GEMM on GPUs}, booktitle = {2020 IEEE/ACM 11th Workshop on Latest Advances in Scalable Algorithms for Large-Scale Systems (ScalA)}, year = {2020}, month = {2020-11}, publisher = {IEEE}, organization = {IEEE}, abstract = {We present new GPU implementations of the tensor contractions arising from basis-related computations for high-order finite element methods. We consider both tensor and non-tensor bases. In the case of tensor bases, we introduce new kernels based on a series of fused device-level matrix multiplications (GEMMs), specifically designed to utilize the fast memory of the GPU. For non-tensor bases, we develop a tuned framework for choosing standard batch-BLAS GEMMs that will maximize performance across groups of elements. The implementations are included in a backend of the libCEED library.
We present benchmark results for the diffusion and mass operators using libCEED integration through the MFEM finite element library and compare to those of the previously best-performing GPU backends for stand-alone basis computations. In tensor cases, we see improvements of approximately 10{\textendash}30\% in some cases, particularly for higher basis orders. For the non-tensor tests, the new batch-GEMMs implementation is twice as fast as what was previously available for basis function order greater than five and more than approximately $10^5$ degrees of freedom in the mesh; a speedup of up to ten times is seen for eighth-order basis functions.}, keywords = {Batched linear algebra, finite elements, gpu, high-order methods, matrix-free FEM, Tensor contractions}, author = {Natalie Beams and Ahmad Abdelfattah and Stanimire Tomov and Jack Dongarra and Tzanio Kolev and Yohann Dudouit} } @article {1466, title = {Reducing the Amount of Out-of-Core Data Access for GPU-Accelerated Randomized SVD}, journal = {Concurrency and Computation: Practice and Experience}, year = {2020}, month = {2020-04}, keywords = {Divide and conquer, gpu, out-of-core computation, Singular value decomposition}, doi = {https://doi.org/10.1002/cpe.5754}, author = {Yuechao Lu and Ichitaro Yamazaki and Fumihiko Ino and Yasuyuki Matsushita and Stanimire Tomov and Jack Dongarra} } @article {1385, title = {GPUDirect MPI Communications and Optimizations to Accelerate FFTs on Exascale Systems}, journal = {EuroMPI{\textquoteright}19 Posters, Zurich, Switzerland}, number = {ICL-UT-19-06}, year = {2019}, month = {2019-09}, publisher = {ICL}, type = {Extended Abstract}, abstract = {Fast Fourier transforms (FFTs) are used in applications ranging from molecular dynamics and spectrum estimation to machine learning, fast convolution and correlation, signal modulation, wireless multimedia applications, and others. However, FFTs are memory bound, and therefore, to accelerate them, it is crucial to avoid and optimize the FFTs{\textquoteright} communications. To this end, we present a 3-D FFT design for distributed graphics processing unit (GPU) systems that: (1) efficiently uses GPUs{\textquoteright} high bandwidth, (2) reduces global communications algorithmically, when possible, and (3) employs GPUDirect technologies as well as MPI optimizations in the development of high-performance FFTs for large-scale GPU-accelerated systems. We show that these developments and optimizations lead to very good strong scalability and a performance that is close to 90\% of the theoretical peak.}, keywords = {CUDA-Aware MPI, ECP, FFT, FFT-ECP, gpu, GPUDirect}, author = {Hejer Shaiek and Stanimire Tomov and Alan Ayala and Azzam Haidar and Jack Dongarra} } @article {1437, title = {Parallel Selection on GPUs}, journal = {Parallel Computing}, volume = {91}, year = {2019}, month = {2020-03}, abstract = {We present a novel parallel selection algorithm for GPUs capable of handling single rank selection (single selection) and multiple rank selection (multiselection). The algorithm requires no assumptions on the input data distribution, and has a much lower recursion depth compared to many state-of-the-art algorithms. We implement the algorithm for different GPU generations, always leveraging the respectively-available low-level communication features, and assess the performance on server-line hardware.
The computational complexity of our SampleSelect algorithm is comparable to that of specialized algorithms designed for {\textendash} and exploiting the characteristics of {\textendash} {\textquotedblleft}pleasant{\textquotedblright} data distributions. At the same time, as the proposed SampleSelect algorithm does not work on the actual element values but only on the element ranks, it is robust to the input data and can complete significantly faster for adversarial data distributions. We also address the use case of approximate selection by designing a variant that radically reduces the computational cost while preserving high approximation accuracy.}, keywords = {approximate selection, gpu, kth order statistics, multiselection, parallel selection algorithm}, doi = {https://doi.org/10.1016/j.parco.2019.102588}, url = {https://www.sciencedirect.com/science/article/pii/S0167819119301796}, author = {Tobias Ribizel and Hartwig Anzt} } @article {1161, title = {Accelerating the SVD Two Stage Bidiagonal Reduction and Divide and Conquer Using GPUs}, journal = {Parallel Computing}, volume = {74}, year = {2018}, month = {2018-05}, pages = {3{\textendash}18}, abstract = {The increasing gap between memory bandwidth and computation speed motivates the choice of algorithms that take full advantage of today{\textquoteright}s high performance computers. For dense matrices, the classic algorithm for the singular value decomposition (SVD) uses a one stage reduction to bidiagonal form, which is limited in performance by the memory bandwidth. To overcome this limitation, a two stage reduction to bidiagonal form has been gaining popularity. It first reduces the matrix to band form using high performance Level 3 BLAS, then reduces the band matrix to bidiagonal form. As accelerators such as GPUs and co-processors are becoming increasingly widespread in high-performance computing, a question of great interest to many SVD users is how much the employment of a two stage reduction, as well as other current best practices in GPU computing, can accelerate this important routine. To answer this question, we have developed an accelerated SVD employing a two stage reduction to bidiagonal form and a number of other algorithms that are highly optimized for GPUs. Notably, we also parallelize and accelerate the divide and conquer algorithm used to solve the subsequent bidiagonal SVD. By accelerating all phases of the SVD algorithm, we provide a significant speedup compared to existing multi-core and GPU-based SVD implementations.
In particular, using a P100 GPU, we demonstrate a performance of up to 804 Gflop/s in double precision arithmetic to compute the full SVD of a 20k {\texttimes} 20k matrix in 90 seconds, which is 8.9{\texttimes} faster than MKL on two 10-core Intel Haswell E5-2650 v3 CPUs, 3.7{\texttimes} faster than the multi-core PLASMA two stage version, and 2.6{\texttimes} faster than the previously accelerated one stage MAGMA version.}, keywords = {2-stage, accelerator, Divide and conquer, gpu, Singular value decomposition, SVD}, issn = {01678191}, doi = {10.1016/j.parco.2017.10.004}, url = {https://www.sciencedirect.com/science/article/pii/S0167819117301758}, author = {Mark Gates and Stanimire Tomov and Jack Dongarra} } @article {1263, title = {Computational Benefit of GPU Optimization for Atmospheric Chemistry Modeling}, journal = {Journal of Advances in Modeling Earth Systems}, volume = {10}, year = {2018}, month = {2018-08}, pages = {1952{\textendash}1969}, abstract = {Global chemistry-climate models are computationally burdened as the chemical mechanisms become more complex and realistic. Optimization for graphics processing units (GPUs) may make longer global simulations with regional detail possible, but few studies have explored the potential benefit for atmospheric chemistry modeling. Hence, in this study, the second-order Rosenbrock solver of the chemistry module of CAM4-Chem is ported to the GPU to gauge the potential speed-up. We find that on the CPU, the fastest performance is achieved using the Intel compiler with a block interleaved memory layout. Different combinations of compiler and memory layout lead to a ~11.02{\texttimes} difference in computational time. In contrast, the GPU version performs best when using a combination of a fully interleaved memory layout with block size equal to the warp size, CUDA streams for independent kernels, and constant memory. Moreover, the most efficient data transfer between CPU and GPU is gained by allocating the memory contiguously during the data initialization on the GPU. Compared to one CPU core, the speed-up of using one GPU alone reaches a factor of ~11.7{\texttimes} for the computation alone and ~3.82{\texttimes} when the data transfer between CPU and GPU is considered. Using one GPU alone is also generally faster than the multithreaded implementation for 16 CPU cores in a compute node and the single-source solution (OpenACC). The best performance is achieved by the hybrid CPU/GPU implementation, but rescheduling the workload among the CPU cores is required before the practical CAM4-Chem simulation.}, keywords = {compiler, CUDA, data transfer, gpu, hybrid, memory layout}, doi = {https://doi.org/10.1029/2018MS001276}, author = {Jian Sun and Joshua Fu and John Drake and Qingzhao Zhu and Azzam Haidar and Mark Gates and Stanimire Tomov and Jack Dongarra} } @article {1219, title = {Optimization and Performance Evaluation of the IDR Iterative Krylov Solver on GPUs}, journal = {The International Journal of High Performance Computing Applications}, volume = {32}, number = {2}, year = {2018}, month = {2018-03}, pages = {220{\textendash}230}, abstract = {In this paper, we present an optimized GPU implementation of the induced dimension reduction (IDR) algorithm. We improve data locality, combine it with an efficient sparse matrix-vector kernel, and investigate the potential of overlapping computation with communication as well as the possibility of concurrent kernel execution.
A comprehensive performance evaluation is conducted using a suitable performance model. The analysis reveals an efficiency of up to 90\%, which indicates that the implementation achieves performance close to the theoretically attainable bound.}, keywords = {co-design, gpu, Induced dimension reduction (IDR), kernel fusion, kernel overlap, roofline performance model}, doi = {https://doi.org/10.1177/1094342016646844}, author = {Hartwig Anzt and Moritz Kreutzer and Eduardo Ponce and Gregory D. Peterson and Gerhard Wellein and Jack Dongarra} } @article {1067, title = {Preconditioned Krylov Solvers on GPUs}, journal = {Parallel Computing}, year = {2017}, month = {2017-06}, abstract = {In this paper, we study the effect of enhancing GPU-accelerated Krylov solvers with preconditioners. We consider the BiCGSTAB, CGS, QMR, and IDR(s) Krylov solvers. For a large set of test matrices, we assess the impact of Jacobi and incomplete factorization preconditioning on the solvers{\textquoteright} numerical stability and time-to-solution performance. We also analyze how the use of a preconditioner impacts the choice of the fastest solver.}, keywords = {gpu, ILU, Jacobi, Krylov solvers, Preconditioning}, issn = {01678191}, doi = {10.1016/j.parco.2017.05.006}, url = {http://www.sciencedirect.com/science/article/pii/S0167819117300777}, author = {Hartwig Anzt and Mark Gates and Jack Dongarra and Moritz Kreutzer and Gerhard Wellein and Martin Kohler} } @conference {937, title = {Efficiency of General Krylov Methods on GPUs {\textendash} An Experimental Study}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper compares different Krylov methods based on short recurrences with respect to their efficiency when implemented on GPUs. The comparison includes BiCGSTAB, CGS, QMR, and IDR using different shadow space dimensions. These methods are known for their good convergence characteristics.
For a large set of test matrices taken from the University of Florida Matrix Collection, we evaluate the methods{\textquoteright} performance against different target metrics: convergence, number of sparse matrix-vector multiplications, and execution time. We also analyze whether the methods are {\textquotedblleft}orthogonal{\textquotedblright} in terms of problem suitability. We propose best practices for choosing methods in a {\textquotedblleft}black box{\textquotedblright} scenario, where no information about the optimal solver is available.}, keywords = {algorithmic bombardment, BiCGSTAB, CGS, gpu, IDR(s), Krylov solver, QMR}, doi = {10.1109/IPDPSW.2016.45}, author = {Hartwig Anzt and Jack Dongarra and Moritz Kreutzer and Gerhard Wellein and Martin Kohler} } @conference {941, title = {GPU-Aware Non-contiguous Data Movement In Open MPI}, booktitle = {25th International Symposium on High-Performance Parallel and Distributed Computing (HPDC{\textquoteright}16)}, year = {2016}, month = {2016-06}, publisher = {ACM}, organization = {ACM}, address = {Kyoto, Japan}, abstract = {
Due to better parallel density and power efficiency, GPUs have become more popular for use in scientific applications. Many of these applications are based on the ubiquitous Message Passing Interface (MPI) programming paradigm and take advantage of non-contiguous memory layouts to exchange data between processes. However, support for efficient non-contiguous data movement for GPU-resident data is still in its infancy, imposing a negative impact on overall application performance.
To address this shortcoming, we present a solution that takes advantage of the inherent parallelism in the datatype packing and unpacking operations. We developed a close integration between Open MPI{\textquoteright}s stack-based datatype engine, NVIDIA{\textquoteright}s Unified Memory Architecture, and GPUDirect capabilities. In this design, the datatype packing and unpacking operations are offloaded onto the GPU and handled by specialized GPU kernels, while the CPU remains the driver for data movements between nodes. By incorporating our design into the Open MPI library, we have shown significantly better performance for non-contiguous GPU-resident data transfers on both shared- and distributed-memory machines.
}, keywords = {datatype, gpu, hybrid architecture, MPI, non-contiguous data}, doi = {http://dx.doi.org/10.1145/2907294.2907317}, author = {Wei Wu and George Bosilca and Rolf vandeVaart and Sylvain Jeaugey and Jack Dongarra} } @conference {942, title = {High-Performance Tensor Contractions for GPUs}, booktitle = {International Conference on Computational Science (ICCS{\textquoteright}16)}, year = {2016}, month = {2016-06}, address = {San Diego, CA}, abstract = {We present a computational framework for high-performance tensor contractions on GPUs. High performance is difficult to obtain using existing libraries, especially for many independent contractions where each contraction is very small, e.g., sub-vector/warp in size. However, using our framework to batch contractions, combined with application-specific knowledge, we demonstrate close-to-peak performance. In particular, to accelerate large-scale tensor-formulated high-order finite element method (FEM) simulations, which is the main focus and motivation for this work, we represent contractions as tensor index reordering plus matrix-matrix multiplications (GEMMs). This is a key factor in achieving algorithmically many-fold acceleration (vs. not using it) due to the possible reuse of data loaded in fast memory. In addition to using this context knowledge, we design tensor data-structures, tensor algebra interfaces, and new tensor contraction algorithms and implementations to achieve 90+\% of a theoretically derived peak on GPUs. On a K40c GPU, for example, for contractions resulting in GEMMs on square matrices of size 8, we are 2.8{\texttimes} faster than CUBLAS, and 8.5{\texttimes} faster than MKL on 16 cores of Intel Xeon E5-2670 (Sandy Bridge) 2.60GHz CPUs. Finally, we apply autotuning and code generation techniques to simplify tuning and provide an architecture-aware, user-friendly interface.}, keywords = {Applications, Batched linear algebra, FEM, gpu, Tensor contractions, Tensor HPC}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @conference {841, title = {Hierarchical DAG scheduling for Hybrid Distributed Systems}, booktitle = {29th IEEE International Parallel \& Distributed Processing Symposium (IPDPS)}, year = {2015}, month = {2015-05}, publisher = {IEEE}, organization = {IEEE}, address = {Hyderabad, India}, abstract = {Accelerator-enhanced computing platforms have drawn a lot of attention due to their massive peak computational capacity. Despite significant advances in the programming interfaces to such hybrid architectures, traditional programming paradigms struggle to map the resulting multi-dimensional heterogeneity and to express algorithm parallelism, resulting in sub-optimal effective performance. Task-based programming paradigms have the capability to alleviate some of the programming challenges on distributed hybrid many-core architectures. In this paper, we take this concept a step further by showing that the potential of task-based programming paradigms can be greatly increased with minimal modification of the underlying runtime combined with the right algorithmic changes.
We propose two novel recursive algorithmic variants for one-sided factorizations and describe the changes to the PaRSEC task-scheduling runtime to build a framework where the task granularity is dynamically adjusted to adapt the degree of available parallelism and kernel efficiency to runtime conditions. Based on an extensive set of results, we show that, for one-sided factorizations, i.e., Cholesky and QR, a carefully written algorithm, supported by an adaptive task-based runtime, is capable of reaching a degree of performance and scalability never achieved before in distributed hybrid environments.}, keywords = {dense linear algebra, gpu, heterogeneous architecture, PaRSEC runtime}, author = {Wei Wu and Aurelien Bouteiller and George Bosilca and Mathieu Faverge and Jack Dongarra} } @article {936, title = {Parallel Programming Models for Dense Linear Algebra on Heterogeneous Systems}, journal = {Supercomputing Frontiers and Innovations}, volume = {2}, number = {4}, year = {2015}, month = {2015-10}, abstract = {We present a review of the current best practices in parallel programming models for dense linear algebra (DLA) on heterogeneous architectures. We consider multicore CPUs, stand-alone manycore coprocessors, GPUs, and combinations of these. Of interest is the evolution of the programming models for DLA libraries {\textendash} in particular, the evolution from the popular LAPACK and ScaLAPACK libraries to their modernized counterparts PLASMA (for multicore CPUs) and MAGMA (for heterogeneous architectures), as well as other programming models and libraries. Besides providing insights into the programming techniques of the libraries considered, we outline our view of the current strengths and weaknesses of their programming models {\textendash} especially with regard to hardware trends and the ease of programming high-performance numerical software that current applications need {\textendash} in order to motivate work and future directions for the next generation of parallel programming models for high-performance linear algebra libraries on heterogeneous systems.}, keywords = {dense linear algebra, gpu, HPC, Multicore, plasma, Programming models, runtime}, doi = {10.14529/jsfi1504}, author = {Maksims Abalenkovs and Ahmad Abdelfattah and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Ichitaro Yamazaki and Asim YarKhan} } @article {758, title = {A Novel Hybrid CPU-GPU Generalized Eigensolver for Electronic Structure Calculations Based on Fine Grained Memory Aware Tasks}, journal = {International Journal of High Performance Computing Applications}, volume = {28}, year = {2014}, month = {2014-05}, pages = {196-209}, chapter = {196}, abstract = {The adoption of hybrid CPU{\textendash}GPU nodes in traditional supercomputing platforms such as the Cray XK6 opens acceleration opportunities for electronic structure calculations in materials science and chemistry applications, where medium-sized generalized eigenvalue problems must be solved many times. These eigenvalue problems are too small to effectively solve on distributed systems, but can benefit from the massive computing power concentrated on a single-node, hybrid CPU{\textendash}GPU system. However, hybrid systems call for the development of new algorithms that efficiently exploit the heterogeneity and massive parallelism not just of GPUs, but of multicore/manycore CPUs as well.
Addressing these demands, we developed a generalized eigensolver featuring novel algorithms of increased computational intensity (compared with the standard algorithms), decomposition of the computation into fine-grained memory-aware tasks, and their hybrid execution. The resulting eigensolvers are state-of-the-art in high-performance computing, significantly outperforming existing libraries. We describe the algorithm and analyze its performance impact on applications of interest when different fractions of eigenvectors are needed by the host electronic structure code.}, keywords = {Eigensolver, electronic structure calculations, generalized eigensolver, gpu, high performance, hybrid, Multicore, two-stage}, doi = {10.1177/1094342013502097}, author = {Azzam Haidar and Raffaele Solc{\`a} and Mark Gates and Stanimire Tomov and Thomas C. Schulthess and Jack Dongarra} } @conference {709, title = {Taking Advantage of Hybrid Systems for Sparse Direct Solvers via Task-Based Runtimes}, booktitle = {23rd International Heterogeneity in Computing Workshop, IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {The ongoing hardware evolution exhibits an escalation in the number, as well as in the heterogeneity, of the computing resources. The pressure to maintain reasonable levels of performance and portability forces application developers to leave the traditional programming paradigms and explore alternative solutions. PaStiX is a parallel sparse direct solver based on a dynamic scheduler for modern hierarchical architectures. In this paper, we study the replacement of the highly specialized internal scheduler in PaStiX by two generic runtime frameworks: PaRSEC and StarPU. The task graph of the factorization step is made available to the two runtimes, providing them with the opportunity to optimize it in order to maximize the algorithm efficiency for a predefined execution environment. A comparative study of the performance of the PaStiX solver with the three schedulers {\textendash} the native PaStiX, StarPU, and PaRSEC schedulers {\textendash} in different execution contexts is performed. The analysis highlights the similarities, from a performance point of view, between the different runtime systems. These results demonstrate that these generic DAG-based runtimes provide a uniform and portable programming interface across heterogeneous environments and are, therefore, a sustainable solution for hybrid environments.}, keywords = {DAG based runtime, gpu, Multicore, Sparse linear solver}, author = {Xavier Lacoste and Mathieu Faverge and Pierre Ramet and Samuel Thibault and George Bosilca} } @article {icl:693, title = {LU Factorization with Partial Pivoting for a Multicore System with Accelerators}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {24}, year = {2013}, month = {2013-08}, pages = {1613-1621}, chapter = {1613}, abstract = {LU factorization with partial pivoting is a canonical numerical procedure and the main component of the high-performance LINPACK benchmark. This paper presents an implementation of the algorithm for a hybrid, shared-memory system with standard CPU cores and GPU accelerators. The difficulty of implementing the algorithm for such a system lies in the disproportion between the computational power of the CPUs and that of the GPUs, and in the meager bandwidth of the communication link between their memory systems.
An additional challenge comes from the memory-bound and synchronization-rich nature of the panel factorization component of the block LU algorithm, imposed by the use of partial pivoting. These challenges are tackled with the use of a data layout geared toward complex memory hierarchies, autotuning of GPU kernels, fine-grain parallelization of memory-bound CPU operations, and dynamic scheduling of tasks to different devices. Performance in excess of one TeraFLOPS is achieved using four AMD Magny-Cours CPUs and four NVIDIA Fermi GPUs.}, keywords = {accelerator, Gaussian elimination, gpu, lu factorization, manycore, Multicore, partial pivoting, plasma}, doi = {http://doi.ieeecomputersociety.org/10.1109/TPDS.2012.242}, author = {Jakub Kurzak and Piotr Luszczek and Jack Dongarra} } @article {755, title = {Soft Error Resilient QR Factorization for Hybrid System with GPGPU}, journal = {Journal of Computational Science}, volume = {4}, year = {2013}, month = {2013-11}, pages = {457{\textendash}464}, abstract = {General-purpose graphics processing units (GPGPUs) are increasingly deployed for scientific computing due to their performance advantages over CPUs. As a consequence, fault tolerance has become a more serious concern than it was when GPGPUs were used exclusively for graphics applications. Using GPUs and CPUs together in a hybrid computing system increases flexibility and performance, but also increases the possibility of the computations being affected by soft errors, for example, in the form of bit flips. In this work, we propose a soft error resilient algorithm for QR factorization on such hybrid systems. Our contributions include: (1) a checkpointing and recovery mechanism for the left factor Q whose performance is scalable on hybrid systems; (2) optimized Givens rotation utilities on GPGPUs to efficiently reduce an upper Hessenberg matrix to upper triangular form for the protection of the right factor R; and (3) a recovery algorithm based on QR update on GPGPUs. Experimental results show that our fault-tolerant QR factorization can successfully detect and recover from soft errors in the entire matrix with little overhead on hybrid systems with GPGPUs.}, keywords = {gpgpu, gpu, magma}, doi = {http://dx.doi.org/10.1016/j.jocs.2013.01.004}, author = {Peng Du and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} }