@conference {961, title = {Hessenberg Reduction with Transient Error Resilience on GPU-Based Hybrid Architectures}, booktitle = { 30th IEEE International Parallel \& Distributed Processing Symposium (IPDPS)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {Graphics Processing Units (GPUs) have been seeing widespread adoption in the field of scientific computing, owing to the performance gains provided on computation-intensive applications. In this paper, we present the design and implementation of a Hessenberg reduction algorithm immune to simultaneous soft-errors, capable of taking advantage of hybrid GPU-CPU platforms. These soft-errors are detected and corrected on the fly, preventing the propagation of the error to the rest of the data. Our design is at the intersection between several fault tolerant techniques and employs the algorithm-based fault tolerance technique, diskless checkpointing, and reverse computation to achieve its goal. By utilizing the idle time of the CPUs, and by overlapping both host-side and GPU-side workloads, we minimize the resilience overhead. Experimental results have validated our design decisions as our algorithm introduced less than 2\% performance overhead compared to the optimized, but fault-prone, hybrid Hessenberg reduction.}, author = {Yulu Jia and Piotr Luszczek and Jack Dongarra} } @conference {939, title = {Heterogeneous Streaming}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper introduces a new heterogeneous streaming library called hetero Streams (hStreams). We show how a simple FIFO streaming model can be applied to heterogeneous systems that include manycore coprocessors and multicore CPUs. This model supports concurrency across nodes, among tasks within a node, and between data transfers and computation. We give examples for different approaches, show how the implementation can be layered, analyze overheads among layers, and apply those models to parallelize applications using simple, intuitive interfaces. We compare the features and versatility of hStreams, OpenMP, CUDA Streams1 and OmpSs. We show how the use of hStreams makes it easier for scientists to identify tasks and easily expose concurrency among them, and how it enables tuning experts and runtime systems to tailor execution for different heterogeneous targets. Practical application examples are taken from the field of numerical linear algebra, commercial structural simulation software, and a seismic processing application.}, keywords = {plasma}, author = {Chris J. Newburn and Gaurav Bansal and Michael Wood and Luis Crivelli and Judit Planas and Alejandro Duran and Paulo Souza and Leonardo Borges and Piotr Luszczek and Stanimire Tomov and Jack Dongarra and Hartwig Anzt and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Ichitaro Yamazaki and Jesus Labarta} } @article {829, title = {HPC Programming on Intel Many-Integrated-Core Hardware with MAGMA Port to Xeon Phi}, journal = {Scientific Programming}, volume = {23}, year = {2015}, month = {2015-01}, abstract = {This paper presents the design and implementation of several fundamental dense linear algebra (DLA) algorithms for multicore with Intel Xeon Phi Coprocessors. In particular, we consider algorithms for solving linear systems. Further, we give an overview of the MAGMA MIC library, an open source, high performance library that incorporates the developments presented, and in general provides to heterogeneous architectures of multicore with coprocessors the DLA functionality of the popular LAPACK library. The LAPACK-compliance simplifies the use of the MAGMA MIC library in applications, while providing them with portably performant DLA. High performance is obtained through use of the high-performance BLAS, hardware-specific tuning, and a hybridization methodology where we split the algorithm into computational tasks of various granularities. Execution of those tasks is properly scheduled over the heterogeneous hardware components by minimizing data movements and mapping algorithmic requirements to the architectural strengths of the various heterogeneous hardware components. Our methodology and programming techniques are incorporated into the MAGMA MIC API, which abstracts the application developer from the specifics of the Xeon Phi architecture and is therefore applicable to algorithms beyond the scope of DLA.}, keywords = {communication and computation overlap, dynamic runtime scheduling using dataflow dependences, hardware accelerators and coprocessors, Intel Xeon Phi processor, Many Integrated Cores, numerical linear algebra}, issn = {1058-9244}, doi = {10.3233/SPR-140404}, author = {Azzam Haidar and Jack Dongarra and Khairul Kabir and Mark Gates and Piotr Luszczek and Stanimire Tomov and Yulu Jia} }