@conference {, title = {Elastic deep learning through resilient collective operations}, booktitle = {SC-W 2023: Workshops of The International Conference on High Performance Computing, Network, Storage, and Analysis}, year = {2023}, month = {2023-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, abstract = {We present a robust solution that incorporates fault tolerance and elastic scaling capabilities for distributed deep learning. Taking advantage of MPI resilient capabilities, a.k.a. User-Level Failure Mitigation (ULFM), this novel approach promotes efficient and lightweight failure management and encourages smooth scaling in volatile computational settings. The proposed ULFM MPI-centered mechanism outperforms the only officially supported elastic learning framework, Elastic Horovod (using Gloo and NCCL), by a significant factor. These results reinforce the capability of the MPI extension to deal with resiliency, and promote ULFM as an effective technique for fault management, minimizing downtime, and thereby enhancing the overall performance of distributed applications, in particular elastic training in high-performance computing (HPC) environments and machine learning applications.}, isbn = {9798400707858}, doi = {10.1145/3624062.3626080}, url = {https://dl.acm.org/doi/abs/10.1145/3624062.3626080}, author = {Li, Jiali and Bosilca, George and Bouteiller, Aur{\'e}lien and Nicolae, Bogdan} } @conference {, title = {Preconditioners for Batched Iterative Linear Solvers on GPUs}, booktitle = {Smoky Mountains Computational Sciences and Engineering Conference}, volume = {169075}, year = {2023}, month = {2023-01}, pages = {38 - 53}, publisher = {Springer Nature Switzerland}, organization = {Springer Nature Switzerland}, abstract = {Batched iterative solvers can be an attractive alternative to batched direct solvers if the linear systems allow for fast convergence. In non-batched settings, iterative solvers are often enhanced with sophisticated preconditioners to improve convergence. In this paper, we develop preconditioners for batched iterative solvers that improve the iterative solver convergence without incurring detrimental resource overhead, while preserving much of the iterative solver flexibility. We detail the design and implementation considerations, present a user-friendly interface to the batched preconditioners, and demonstrate the convergence and runtime benefits over non-preconditioned batched iterative solvers on state-of-the-art GPUs for a variety of benchmark problems from finite difference stencil matrices, the Suitesparse matrix collection and a computational chemistry application.}, isbn = {978-3-031-23605-1}, doi = {10.1007/978-3-031-23606-8_3}, url = {https://link.springer.com/chapter/10.1007/978-3-031-23606-8_3}, author = {Aggarwal, Isha and Nayak, Pratik and Kashi, Aditya and Anzt, Hartwig}, editor = {Kothe, Doug and Geist, Al and Pophale, Swaroop and Liu, Hong and Parete-Koon, Suzanne} } @conference {, title = {Batched sparse iterative solvers on GPU for the collision operator for fusion plasma simulations}, booktitle = {2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2022}, month = {2022-07}, publisher = {IEEE}, organization = {IEEE}, address = {Lyon, France}, abstract = {Batched linear solvers, which solve many small related but independent problems, are important in several applications. 
This is increasingly the case for highly parallel processors such as graphics processing units (GPUs), which need a substantial amount of work to keep them operating efficiently, and solving smaller problems one-by-one is not an option. Because of the small size of each problem, the task of coming up with a parallel partitioning scheme and mapping the problem to hardware is not trivial. In recent history, significant attention has been given to batched dense linear algebra. However, there is also an interest in utilizing sparse iterative solvers in a batched form, and this presents further challenges. An example use case is found in a gyrokinetic Particle-In-Cell (PIC) code used for modeling magnetically confined fusion plasma devices. The collision operator has been identified as a bottleneck, and a proxy app has been created for facilitating optimizations and porting to GPUs. The current collision kernel linear solver does not run on the GPU, which is a major bottleneck. As these matrices are well-conditioned, batched iterative sparse solvers are an attractive option. A batched sparse iterative solver capability has recently been developed in the Ginkgo library. In this paper, we describe how the software architecture can be used to develop an efficient solution for the XGC collision proxy app. Comparisons for the solve times on NVIDIA V100 and A100 GPUs and AMD MI100 GPUs with one dual-socket Intel Xeon Skylake CPU node with 40 OpenMP threads are presented for matrices representative of those required in the collision kernel of XGC. The results suggest that Ginkgo{\textquoteright}s batched sparse iterative solvers are well suited for efficient utilization of the GPU for this problem, and the performance portability of Ginkgo in conjunction with Kokkos (used within XGC as the heterogeneous programming model) allows seamless execution for exascale-oriented heterogeneous architectures at the various leadership supercomputing facilities.}, doi = {10.1109/IPDPS53621.2022.00024}, url = {https://ieeexplore.ieee.org/document/9820663}, author = {Kashi, Aditya and Nayak, Pratik and Kulkarni, Dhruva and Scheinberg, Aaron and Lin, Paul and Anzt, Hartwig} } @conference {, title = {Generalized Flow-Graph Programming Using Template Task-Graphs: Initial Implementation and Assessment}, booktitle = {2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2022}, month = {2022-07}, publisher = {IEEE}, organization = {IEEE}, address = {Lyon, France}, abstract = {We present and evaluate TTG, a novel programming model and its C++ implementation that, by marrying the ideas of control and data flowgraph programming, supports compact specification and efficient distributed execution of dynamic and irregular applications. Programming interfaces that support task-based execution often only support shared memory parallel environments; a few support distributed memory environments, either by discovering the entire DAG of tasks on all processes, or by introducing explicit communications. The first approach limits scalability, while the second increases the complexity of programming. We demonstrate how TTG can address these issues without sacrificing scalability or programmability by providing higher-level abstractions than conventionally provided by task-centric programming systems, without impeding the ability of these runtimes to manage task creation and execution as well as data and resource management efficiently. 
TTG supports distributed memory execution over 2 different task runtimes, PaRSEC and MADNESS. Performance of four paradigmatic applications (in graph analytics, dense and block-sparse linear algebra, and numerical integrodifferential calculus) with various degrees of irregularity implemented in TTG is illustrated on large distributed-memory platforms and compared to the state-of-the-art implementations.}, doi = {10.1109/IPDPS53621.2022.00086}, url = {https://ieeexplore.ieee.org/abstract/document/9820613}, author = {Schuchart, Joseph and Nookala, Poornima and Javanmard, Mohammad Mahdi and Herault, Thomas and Valeev, Edward F. and George Bosilca and Harrison, Robert J.} } @article {, title = {Ginkgo: A Modern Linear Operator Algebra Framework for High Performance Computing}, journal = {ACM Transactions on Mathematical Software}, volume = {48}, year = {2022}, month = {2022-03}, pages = {1 - 33}, abstract = {In this article, we present Ginkgo, a modern C++ math library for scientific high performance computing. While classical linear algebra libraries act on matrix and vector objects, Ginkgo{\textquoteright}s design principle abstracts all functionality as {\textquotedblleft}linear operators,{\textquotedblright} motivating the notation of a {\textquotedblleft}linear operator algebra library.{\textquotedblright} Ginkgo{\textquoteright}s current focus is oriented toward providing sparse linear algebra functionality for high performance graphics processing unit (GPU) architectures, but given the library design, this focus can be easily extended to accommodate other algorithms and hardware architectures. We introduce this sophisticated software architecture that separates core algorithms from architecture-specific backends and provide details on extensibility and sustainability measures. We also demonstrate Ginkgo{\textquoteright}s usability by providing examples on how to use its functionality inside the MFEM and deal.ii finite element ecosystems. Finally, we offer a practical demonstration of Ginkgo{\textquoteright}s high performance on state-of-the-art GPU architectures.}, issn = {0098-3500}, doi = {10.1145/3480935}, url = {https://dl.acm.org/doi/10.1145/3480935}, author = {Anzt, Hartwig and Cojean, Terry and Flegar, Goran and G{\"o}bel, Fritz and Gr{\"u}tzmacher, Thomas and Nayak, Pratik and Ribizel, Tobias and Tsai, Yuhsiang Mike and Quintana-Ort{\'\i}, Enrique S} } @inproceedings {, title = {{Integrating process, control-flow, and data resiliency layers using a hybrid Fenix/Kokkos approach}}, journal = {2022 IEEE International Conference on Cluster Computing (CLUSTER 2022)}, year = {2022}, month = {2022-09}, address = {Heidelberg, Germany}, keywords = {checkpointing, Fault tolerance, Fenix, HPC, Kokkos, MPI-ULFM, resilience}, url = {https://hal.archives-ouvertes.fr/hal-03772536}, author = {Whitlock, Matthew and Morales, Nicolas and George Bosilca and Bouteiller, Aur{\'e}lien and Nicolae, Bogdan and Teranishi, Keita and Giem, Elisabeth and Sarkar, Vivek} } @conference {, title = {Pushing the Boundaries of Small Tasks: Scalable Low-Overhead Data-Flow Programming in TTG}, booktitle = {2022 IEEE International Conference on Cluster Computing (CLUSTER)}, year = {2022}, month = {2022-09}, publisher = {IEEE}, organization = {IEEE}, address = {Heidelberg, Germany}, abstract = {Shared memory parallel programming models strive to provide low-overhead execution environments. 
Task-based programming models, in particular, are well-suited to cope with the ubiquitous multi- and many-core systems since they allow applications to express all available concurrency to a scheduler, which is tasked with exploiting the available hardware resources. There is a general consensus that atomic operations should be preferred over locks and mutexes to avoid inter-thread serialization and the resulting loss in efficiency. However, even atomic operations may serialize threads if not used judiciously. In this work, we will discuss several optimizations applied to TTG and the underlying PaRSEC runtime system aiming at removing contentious atomic operations to reduce the overhead of task management to a few hundred clock cycles. The result is an optimized data-flow programming system that seamlessly scales from a single node to distributed execution and which is able to compete with OpenMP in shared memory.}, keywords = {Dataflow graph, Hardware, Instruction sets, Memory management, PaRSEC, parallel programming, runtime, scalability, Task analysis, task-based programming, Template Task Graph, TTG}, doi = {10.1109/CLUSTER51413.2022.00026}, url = {https://ieeexplore.ieee.org/document/9912704/}, author = {Schuchart, Joseph and Nookala, Poornima and Herault, Thomas and Valeev, Edward F. and George Bosilca} } @conference {, title = {A Python Library for Matrix Algebra on GPU and Multicore Architectures}, booktitle = {2022 IEEE 19th International Conference on Mobile Ad Hoc and Smart Systems (MASS)}, year = {2022}, month = {2022-12}, publisher = {IEEE}, organization = {IEEE}, address = {Denver, CO}, doi = {10.1109/MASS56207.2022.00121}, url = {https://ieeexplore.ieee.org/document/9973474/}, author = {Nance, Delario and Stanimire Tomov and Wong, Kwai} } @inproceedings {, title = {Reshaping Geostatistical Modeling and Prediction for Extreme-Scale Environmental Applications}, journal = {2022 International Conference for High Performance Computing, Networking, Storage and Analysis (SC22)}, year = {2022}, month = {2022-11}, publisher = {IEEE Press}, address = {Dallas, TX}, abstract = {We extend the capability of space-time geostatistical modeling using algebraic approximations, illustrating application-expected accuracy worthy of double precision from majority low-precision computations and low-rank matrix approximations. We exploit the mathematical structure of the dense covariance matrix whose inverse action and determinant are repeatedly required in Gaussian log-likelihood optimization. Geostatistics augments first-principles modeling approaches for the prediction of environmental phenomena given the availability of measurements at a large number of locations; however, traditional Cholesky-based approaches grow cubically in complexity, gating practical extension to continental and global datasets now available. We combine the linear algebraic contributions of mixed-precision and low-rank computations within a tile-based Cholesky solver with on-demand casting of precisions and dynamic runtime support from PaRSEC to orchestrate tasks and data movement. 
Our adaptive approach scales on various systems and leverages the Fujitsu A64FX nodes of Fugaku to achieve up to 12X performance speedup against the highly optimized dense Cholesky implementation.}, keywords = {climate/weather prediction, dynamic runtime systems, high performance computing, low-rank matrix approximations, mixed-precision computations, space-time geospatial statistics, Task-based programming models}, isbn = {9784665454445}, url = {https://dl.acm.org/doi/abs/10.5555/3571885.3571888}, author = {Cao, Qinglei and Abdulah, Sameh and Rabab Alomairy and Pei, Yu and Pratik Nag and George Bosilca and Dongarra, Jack and Genton, Marc G. and Keyes, David and Ltaief, Hatem and Sun, Ying} } @article {, title = {Callback-based completion notification using MPI Continuations}, journal = {Parallel Computing}, volume = {21238566}, year = {2021}, month = {Jan-05-2021}, pages = {102793}, abstract = {Asynchronous programming models (APM) are gaining more and more traction, allowing applications to expose the available concurrency to a runtime system tasked with coordinating the execution. While MPI has long provided support for multi-threaded communication and nonblocking operations, it falls short of adequately supporting APMs as correctly and efficiently handling MPI communication in different models is still a challenge. We have previously proposed an extension to the MPI standard providing operation completion notifications using callbacks, so-called MPI Continuations. This interface is flexible enough to accommodate a wide range of different APMs. In this paper, we present an extension to the previously described interface that allows for finer control of the behavior of the MPI Continuations interface. We then present some of our first experiences in using the interface in the context of different applications, including the NAS parallel benchmarks, the PaRSEC task-based runtime system, and a load-balancing scheme within an adaptive mesh refinement solver called ExaHyPE. We show that the interface, implemented inside Open MPI, enables low-latency, high-throughput completion notifications that outperform solutions implemented in the application space.}, keywords = {MPI, MPI Continuations, OmpSs, OpenMP, parsec, TAMPI, Task-based programming models}, issn = {01678191}, doi = {10.1016/j.parco.2021.102793}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0167819121000466?via\%3Dihub}, author = {Schuchart, Joseph and Samfass, Philipp and Niethammer, Christoph and Gracia, Jos{\'e} and George Bosilca} } @article {, title = {Exploiting Block Structures of KKT Matrices for Efficient Solution of Convex Optimization Problems}, journal = {IEEE Access}, year = {2021}, doi = {10.1109/ACCESS.2021.3106054}, author = {Iqbal, Zafar and Nooshabadi, Saeid and Yamazaki, Ichitaro and Stanimire Tomov and Jack Dongarra} } @article {, title = {Ginkgo: A Sparse Linear Algebra Library for HPC}, year = {2021}, month = {2021-04}, publisher = {2021 ECP Annual Meeting}, author = {Hartwig Anzt and Natalie Beams and Terry Cojean and Fritz G{\"o}bel and Thomas Gr{\"u}tzmacher and Aditya Kashi and Pratik Nayak and Tobias Ribizel and Yuhsiang M. 
Tsai} } @article {, title = {Materials fingerprinting classification}, journal = {Computer Physics Communications}, year = {2021}, month = {Jan-05-2021}, pages = {108019}, abstract = {Significant progress in many classes of materials could be made with the availability of experimentally-derived large datasets composed of atomic identities and three-dimensional coordinates. Methods for visualizing the local atomic structure, such as atom probe tomography (APT), which routinely generate datasets comprised of millions of atoms, are an important step in realizing this goal. However, state-of-the-art APT instruments generate noisy and sparse datasets that provide information about elemental type, but obscure atomic structures, thus limiting their subsequent value for materials discovery. The application of a materials fingerprinting process, a machine learning algorithm coupled with topological data analysis, provides an avenue by which here-to-fore unprecedented structural information can be extracted from an APT dataset. As a proof of concept, the material fingerprint is applied to high-entropy alloy APT datasets containing body-centered cubic (BCC) and face-centered cubic (FCC) crystal structures. A local atomic configuration centered on an arbitrary atom is assigned a topological descriptor, with which it can be characterized as a BCC or FCC lattice with near perfect accuracy, despite the inherent noise in the dataset. This successful identification of a fingerprint is a crucial first step in the development of algorithms which can extract more nuanced information, such as chemical ordering, from existing datasets of complex materials.}, keywords = {Atom probe tomography, High entropy alloy, Machine Learning, Materials discovery, Topological data analysis}, issn = {00104655}, doi = {10.1016/j.cpc.2021.108019}, url = {https://linkinghub.elsevier.com/retrieve/pii/S0010465521001314}, author = {Spannaus, Adam and Law, Kody J.H. and Piotr Luszczek and Nasrin, Farzana and Micucci, Cassie Putman and Liaw, Peter K. and Santodonato, Louis J. and Keffer, David J. and Maroulas, Vasileios} } @conference {, title = {Quo Vadis MPI RMA? Towards a More Efficient Use of MPI One-Sided Communication}, booktitle = {EuroMPI{\textquoteright}21}, year = {2021}, address = {Garching, Munich Germany}, abstract = { The MPI standard has long included one-sided communication abstractions through the MPI Remote Memory Access (RMA) interface. Unfortunately, the MPI RMA chapter in the 4.0 version of the MPI standard still contains both well-known and lesser known short-comings for both implementations and users, which lead to potentially non-optimal usage patterns. In this paper, we identify a set of issues and propose ways for applications to better express anticipated usage of RMA routines, allowing the MPI implementation to better adapt to the application{\textquoteright}s needs. In order to increase the flexibility of the RMA interface, we add the capability to duplicate windows, allowing access to the same resources encapsulated by a window using different configurations. In the same vein, we introduce the concept of MPI memory handles, meant to provide life-time guarantees on memory attached to dynamic windows, removing the overhead currently present in using dynamically exposed memory. We will show that our extensions provide improved accumulate latencies, reduced overheads for multi-threaded flushes, and allow for zero overhead dynamic memory window usage. 
}, keywords = {Memory Handles, MPI, MPI-RMA, RDMA}, url = {https://arxiv.org/abs/2111.08142}, author = {Schuchart, Joseph and Niethammer, Christoph and Gracia, Jos{\'e} and George Bosilca} } @conference {, title = {DeepFreeze: Towards Scalable Asynchronous Checkpointing of Deep Learning Models}, booktitle = {20th IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID)}, year = {2020}, month = {2020-05}, publisher = {IEEE}, organization = {IEEE}, address = {Melbourne, VIC, Australia}, abstract = {In the age of big data, deep learning has emerged as a powerful tool to extract insight and exploit its value, both in industry and scientific applications. One common pattern emerging in such applications is frequent checkpointing of the state of the learning model during training, needed in a variety of scenarios: analysis of intermediate states to explain features and correlations with training data, exploration strategies involving alternative models that share a common ancestor, knowledge transfer, resilience, etc. However, with increasing size of the learning models and popularity of distributed data-parallel training approaches, simple checkpointing techniques used so far face several limitations: low serialization performance, blocking I/O, stragglers due to the fact that only a single process is involved in checkpointing. This paper proposes a checkpointing technique specifically designed to address the aforementioned limitations, introducing efficient asynchronous techniques to hide the overhead of serialization and I/O, and distribute the load over all participating processes. Experiments with two deep learning applications (CANDLE and ResNet) on a pre-Exascale HPC platform (Theta) shows significant improvement over state-of-art, both in terms of checkpointing duration and runtime overhead.}, doi = {https://doi.org/10.1109/CCGrid49817.2020.00-76}, author = {Bogdan Nicolae and Jiali Li and Justin M. Wozniak and George Bosilca and Matthieu Dorier and Franck Cappello} } @article {, title = {Evaluating Asynchronous Schwarz Solvers on GPUs}, journal = {International Journal of High Performance Computing Applications}, year = {2020}, month = {2020-08}, abstract = {With the commencement of the exascale computing era, we realize that the majority of the leadership supercomputers are heterogeneous and massively parallel. Even a single node can contain multiple co-processors such as GPUs and multiple CPU cores. For example, ORNL{\textquoteright}s Summit accumulates six NVIDIA Tesla V100 GPUs and 42 IBM Power9 cores on each node. Synchronizing across compute resources of multiple nodes can be prohibitively expensive. Hence, it is necessary to develop and study asynchronous algorithms that circumvent this issue of bulk-synchronous computing. In this study, we examine the asynchronous version of the abstract Restricted Additive Schwarz method as a solver. We do not explicitly synchronize, but allow the communication between the sub-domains to be completely asynchronous, thereby removing the bulk synchronous nature of the algorithm. We accomplish this by using the one-sided Remote Memory Access (RMA) functions of the MPI standard. We study the benefits of using such an asynchronous solver over its synchronous counterpart. We also study the communication patterns governed by the partitioning and the overlap between the sub-domains on the global solver. 
Finally, we show that this concept can render attractive performance benefits over the synchronous counterparts even for a well-balanced problem.}, keywords = {abstract Schwarz methods, Asynchronous solvers, exascale, GPUs, multicore processors, parallel numerical linear algebra}, doi = {https://doi.org/10.1177/1094342020946814}, author = {Pratik Nayak and Terry Cojean and Hartwig Anzt} } @article {, title = {Ginkgo: A High Performance Numerical Linear Algebra Library}, journal = {Journal of Open Source Software}, volume = {5}, year = {2020}, month = {2020-08}, abstract = {Ginkgo is a production-ready sparse linear algebra library for high performance computing on GPU-centric architectures with a high level of performance portability and focuses on software sustainability. The library focuses on solving sparse linear systems and accommodates a large variety of matrix formats, state-of-the-art iterative (Krylov) solvers and preconditioners, which make the library suitable for a variety of scientific applications. Ginkgo supports many architectures such as multi-threaded CPU, NVIDIA GPUs, and AMD GPUs. The heavy use of modern C++ features simplifies the addition of new executor paradigms and algorithmic functionality without introducing significant performance overhead. Solving linear systems is usually one of the most computationally and memory intensive aspects of any application. Hence there has been a significant amount of effort in this direction with software libraries such as UMFPACK (Davis, 2004) and CHOLMOD (Chen, Davis, Hager, \& Rajamanickam, 2008) for solving linear systems with direct methods and PETSc (Balay et al., 2020), Trilinos ({\textquotedblleft}The Trilinos Project Website,{\textquotedblright} 2020), Eigen (Guennebaud, Jacob, \& others, 2010) and many more to solve linear systems with iterative methods. With Ginkgo, we aim to ensure high performance while not compromising portability. Hence, we provide very efficient low level kernels optimized for different architectures and separate these kernels from the algorithms thereby ensuring extensibility and ease of use. Ginkgo is also a part of the xSDK effort (Bartlett et al., 2017) and available as a Spack (Gamblin et al., 2015) package. xSDK aims to provide infrastructure for and interoperability between a collection of related and complementary software elements to foster rapid and efficient development of scientific applications using High Performance Computing. Within this effort, we provide interoperability with application libraries such as deal.ii (Arndt et al., 2019) and mfem (Anderson et al., 2020). 
Ginkgo provides wrappers within these two libraries so that they can take advantage of the features of Ginkgo.}, doi = {https://doi.org/10.21105/joss.02260}, author = {Hartwig Anzt and Terry Cojean and Yen-Chen Chen and Fritz Goebel and Thomas Gruetzmacher and Pratik Nayak and Tobias Ribizel and Yu-Hsiang Tsai} } @article {, title = {Ginkgo: A Node-Level Sparse Linear Algebra Library for HPC (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Hartwig Anzt and Terry Cojean and Yen-Chen Chen and Fritz Goebel and Thomas Gruetzmacher and Pratik Nayak and Tobias Ribizel and Yu-Hsiang Tsai and Jack Dongarra} } @article {, title = {How to Build Your Own Deep Neural Network}, year = {2020}, month = {2020-07}, publisher = {PEARC20}, keywords = {AI, Deep Neural Networks, dense linear algebra, HPC, ML}, author = {Kwai Wong and Stanimire Tomov and Daniel Nichols and Rocco Febbo and Florent Lopez and Julian Halloy and Xianfeng Ma} } @article {, title = {Integrating Deep Learning in Domain Science at Exascale (MagmaDNN)}, year = {2020}, month = {2020-12}, publisher = {DOD HPCMP seminar}, address = {virtual}, abstract = {We will present some of the current challenges in the design and integration of deep learning AI with traditional HPC simulations. We evaluate existing packages for readiness to run efficiently deep learning models and applications on large scale HPC systems, identify challenges, and propose new asynchronous parallelization and optimization techniques for current large-scale heterogeneous systems and up-coming exascale systems. These developments, along with existing HPC AI software capabilities, have been integrated in MagmaDNN, an open source HPC deep learning framework. Many deep learning frameworks are targeted towards data scientists and fall short in providing quality integration into existing HPC workflows. This paper discusses the necessities of an HPC deep learning framework and how these can be provided, e.g., as in MagmaDNN, through a deep integration with existing HPC libraries such as MAGMA and its modular memory management, MPI, CuBLAS, CuDNN, MKL, and HIP. Advancements are also illustrated through the use of algorithmic enhancements in reduced and mixed-precision and asynchronous optimization methods. Finally, we present illustrations and potential solutions on enhancing traditional compute and data intensive applications at ORNL and UTK with AI. The approaches and future challenges are illustrated on materials science, imaging, and climate applications.}, author = {Stanimire Tomov and Kwai Wong and Jack Dongarra and Rick Archibald and Edmond Chow and Eduardo D{\textquoteright}Azevedo and Markus Eisenbach and Rocco Febbo and Florent Lopez and Daniel Nichols and Junqi Yin} } @techreport {, title = {Integrating Deep Learning in Domain Sciences at Exascale}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-20-10}, year = {2020}, month = {2020-08}, publisher = {University of Tennessee}, abstract = {This paper presents some of the current challenges in designing deep learning artificial intelligence (AI) and integrating it with traditional high-performance computing (HPC) simulations. 
We evaluate existing packages for their ability to run deep learning models and applications on large-scale HPC systems efficiently, identify challenges, and propose new asynchronous parallelization and optimization techniques for current large-scale heterogeneous systems and upcoming exascale systems. These developments, along with existing HPC AI software capabilities, have been integrated into MagmaDNN, an open-source HPC deep learning framework. Many deep learning frameworks are targeted at data scientists and fall short in providing quality integration into existing HPC workflows. This paper discusses the necessities of an HPC deep learning framework and how those needs can be provided (e.g., as in MagmaDNN) through a deep integration with existing HPC libraries, such as MAGMA and its modular memory management, MPI, CuBLAS, CuDNN, MKL, and HIP. Advancements are also illustrated through the use of algorithmic enhancements in reduced- and mixed-precision, as well as asynchronous optimization methods. Finally, we present illustrations and potential solutions for enhancing traditional compute- and data-intensive applications at ORNL and UTK with AI. The approaches and future challenges are illustrated in materials science, imaging, and climate applications.}, author = {Rick Archibald and Edmond Chow and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Markus Eisenbach and Rocco Febbo and Florent Lopez and Daniel Nichols and Stanimire Tomov and Kwai Wong and Junqi Yin} } @conference {, title = {Integrating Deep Learning in Domain Sciences at Exascale}, booktitle = {2020 Smoky Mountains Computational Sciences and Engineering Conference (SMC 2020)}, year = {2020}, month = {2020-08}, abstract = {This paper presents some of the current challenges in designing deep learning artificial intelligence (AI) and integrating it with traditional high-performance computing (HPC) simulations. We evaluate existing packages for their ability to run deep learning models and applications on large-scale HPC systems efficiently, identify challenges, and propose new asynchronous parallelization and optimization techniques for current large-scale heterogeneous systems and upcoming exascale systems. These developments, along with existing HPC AI software capabilities, have been integrated into MagmaDNN, an open-source HPC deep learning framework. Many deep learning frameworks are targeted at data scientists and fall short in providing quality integration into existing HPC workflows. This paper discusses the necessities of an HPC deep learning framework and how those needs can be provided (e.g., as in MagmaDNN) through a deep integration with existing HPC libraries, such as MAGMA and its modular memory management, MPI, CuBLAS, CuDNN, MKL, and HIP. Advancements are also illustrated through the use of algorithmic enhancements in reduced- and mixed-precision, as well as asynchronous optimization methods. Finally, we present illustrations and potential solutions for enhancing traditional compute- and data-intensive applications at ORNL and UTK with AI. 
The approaches and future challenges are illustrated in materials science, imaging, and climate applications.}, author = {Rick Archibald and Edmond Chow and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Markus Eisenbach and Rocco Febbo and Florent Lopez and Daniel Nichols and Stanimire Tomov and Kwai Wong and Junqi Yin} } @article {, title = {Load-Balancing Sparse Matrix Vector Product Kernels on GPUs}, journal = {ACM Transactions on Parallel Computing}, volume = {7}, year = {2020}, month = {2020-03}, abstract = {Efficient processing of Irregular Matrices on Single Instruction, Multiple Data (SIMD)-type architectures is a persistent challenge. Resolving it requires innovations in the development of data formats, computational techniques, and implementations that strike a balance between thread divergence, which is inherent for Irregular Matrices, and padding, which alleviates the performance-detrimental thread divergence but introduces artificial overheads. To this end, in this article, we address the challenge of designing high performance sparse matrix-vector product (SpMV) kernels designed for Nvidia Graphics Processing Units (GPUs). We present a compressed sparse row (CSR) format suitable for unbalanced matrices. We also provide a load-balancing kernel for the coordinate (COO) matrix format and extend it to a hybrid algorithm that stores part of the matrix in the SIMD-friendly Ellpack (ELL) format. The ratio between the ELL- and the COO-part is determined using a theoretical analysis of the nonzeros-per-row distribution. For the over 2,800 test matrices available in the Suite Sparse matrix collection, we compare the performance against SpMV kernels provided by NVIDIA{\textquoteright}s cuSPARSE library and a heavily-tuned sliced ELL (SELL-P) kernel that prevents unnecessary padding by considering the irregular matrices as a combination of matrix blocks stored in ELL format.}, doi = {https://doi.org/10.1145/3380930}, author = {Hartwig Anzt and Terry Cojean and Chen Yen-Chen and Jack Dongarra and Goran Flegar and Pratik Nayak and Stanimire Tomov and Yuhsiang M. Tsai and Weichung Wang} } @techreport {, title = {A Survey of Numerical Methods Utilizing Mixed Precision Arithmetic}, journal = {SLATE Working Notes}, number = {15, ICL-UT-20-08}, year = {2020}, month = {2020-07}, publisher = {University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Erik Boman and Erin Carson and Terry Cojean and Jack Dongarra and Mark Gates and Thomas Gruetzmacher and Nicholas J. Higham and Sherry Li and Neil Lindquist and Yang Liu and Jennifer Loe and Piotr Luszczek and Pratik Nayak and Sri Pranesh and Siva Rajamanickam and Tobias Ribizel and Barry Smith and Kasia Swirydowicz and Stephen Thomas and Stanimire Tomov and Yaohung Tsai and Ichitaro Yamazaki and Ulrike Meier Yang} } @conference {, title = {The Template Task Graph (TTG) - An Emerging Practical Dataflow Programming Paradigm for Scientific Simulation at Extreme Scale}, booktitle = {2020 IEEE/ACM 5th International Workshop on Extreme Scale Programming Models and Middleware (ESPM2)}, year = {2020}, month = {2020-11}, publisher = {IEEE}, organization = {IEEE}, abstract = {We describe TESSE, an emerging general-purpose, open-source software ecosystem that attacks the twin challenges of programmer productivity and portable performance for advanced scientific applications on modern high-performance computers. 
TESSE builds upon and extends the PaRSEC DAG/dataflow runtime with a new Domain Specific Language (DSL) and new integration capabilities. Motivating this work is our belief that such a dataflow model, perhaps with applications composed in domain specific languages, can overcome many of the challenges faced by a wide variety of irregular applications that are poorly served by current programming and execution models. Two such applications from many-body physics and applied mathematics are briefly explored. This paper focuses upon the Template Task Graph (TTG), which is TESSE{\textquoteright}s main C++ API that provides a powerful work/data-flow programming model. Algorithms on spatial trees, block-sparse tensors, and wave fronts are used to illustrate the API and associated concepts, as well as to compare with related approaches.}, keywords = {dag, dataflow, exascale, graph, High-performance computing, workflow}, doi = {https://doi.org/10.1109/ESPM251964.2020.00011}, author = {George Bosilca and Robert Harrison and Thomas Herault and Mohammad Mahdi Javanmard and Poornima Nookala and Edward Valeev} } @techreport {1398, title = {A Collection of Presentations from the BDEC2 Workshop in Kobe, Japan}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-19-09}, year = {2019}, month = {2019-02}, publisher = {University of Tennessee, Knoxville}, author = {Rosa M. Badia and Micah Beck and Fran{\c c}ois Bodin and Taisuke Boku and Franck Cappello and Alok Choudhary and Carlos Costa and Ewa Deelman and Nicola Ferrier and Katsuki Fujisawa and Kohei Fujita and Maria Girone and Geoffrey Fox and Shantenu Jha and Yoshinari Kameda and Christian Kniep and William Kramer and James Lin and Kengo Nakajima and Yiwei Qiu and Kishore Ramachandran and Glenn Ricart and Kim Serradell and Dan Stanzione and Lin Gan and Martin Swany and Christine Sweeney and Alex Szalay and Christine Kirkpatrick and Kenton McHenry and Alainna White and Steve Tuecke and Ian Foster and Joe Mambretti and William M. Tang and Michela Taufer and Miguel V{\'a}zquez} } @article {1366, title = {MagmaDNN 0.2 High-Performance Data Analytics for Manycore GPUs and CPUs}, year = {2019}, month = {2019-01}, publisher = {University of Tennessee}, doi = {10.13140/RG.2.2.14906.64961}, author = {Lucien Ng and Sihan Chen and Alex Gessinger and Daniel Nichols and Sophia Cheng and Anu Meenasorna and Kwai Wong and Stanimire Tomov and Azzam Haidar and Eduardo D{\textquoteright}Azevedo and Jack Dongarra} } @conference {1326, title = {MagmaDNN: Accelerated Deep Learning Using MAGMA}, booktitle = {Practice and Experience in Advanced Research Computing (PEARC {\textquoteright}19)}, year = {2019}, month = {2019-07}, publisher = {ACM}, organization = {ACM}, address = {Chicago, IL}, author = {Daniel Nichols and Kwai Wong and Stanimire Tomov and Lucien Ng and Sihan Chen and Alex Gessinger} } @conference {1324, title = {MagmaDNN: Towards High-Performance Data Analytics and Machine Learning for Data-Driven Scientific Computing}, booktitle = {ISC High Performance}, year = {2019}, month = {2019-06}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Frankfurt, Germany}, abstract = {In this paper, we present work towards the development of a new data analytics and machine learning (ML) framework, called MagmaDNN. 
Our main goal is to provide scalable, high-performance data analytics and ML solutions for scientific applications running on current and upcoming heterogeneous many-core GPU-accelerated architectures. To this end, since many of the functionalities needed are based on standard linear algebra (LA) routines, we designed MagmaDNN to derive its performance power from the MAGMA library. The close integration provides the fundamental (scalable high-performance) LA routines available in MAGMA as a backend to MagmaDNN. We present some design issues for performance and scalability that are specific to ML using Deep Neural Networks (DNN), as well as the MagmaDNN designs towards overcoming them. In particular, MagmaDNN uses well-established HPC techniques from the area of dense LA, including task-based parallelization, DAG representations, scheduling, mixed-precision algorithms, asynchronous solvers, and autotuned hyperparameter optimization. We illustrate these techniques and their incorporation and use to outperform other frameworks currently available.}, doi = {https://doi.org/10.1007/978-3-030-34356-9_37}, author = {Daniel Nichols and Natalie-Sofia Tomov and Frank Betancourt and Stanimire Tomov and Kwai Wong and Jack Dongarra} } @conference {1327, title = {OpenDIEL: A Parallel Workflow Engine and Data Analytics Framework}, booktitle = {Practice and Experience in Advanced Research Computing (PEARC {\textquoteright}19)}, year = {2019}, month = {2019-07}, publisher = {ACM}, organization = {ACM}, address = {Chicago, IL}, author = {Frank Betancourt and Kwai Wong and Efosa Asemota and Quindell Marshall and Daniel Nichols and Stanimire Tomov} } @conference {1318, title = {Towards Continuous Benchmarking}, booktitle = {Platform for Advanced Scientific Computing Conference (PASC 2019)}, year = {2019}, month = {2019-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Zurich, Switzerland}, abstract = {We present an automated performance evaluation framework that enables an automated workflow for testing and performance evaluation of software libraries. Integrating this component into an ecosystem enables sustainable software development, as a community effort, via a web application for interactively evaluating the performance of individual software components. The performance evaluation tool is based exclusively on web technologies, which removes the burden of downloading performance data or installing additional software. We employ this framework for the Ginkgo software ecosystem, but the framework can be used with essentially any software project, including the comparison between different software libraries. The Continuous Integration (CI) framework of Ginkgo is also extended to automatically run a benchmark suite on predetermined HPC systems, store the state of the machine and the environment along with the compiled binaries, and collect results in a publicly accessible performance data repository based on Git. The Ginkgo performance explorer (GPE) can be used to retrieve the performance data from the repository and visualize it in a web browser. GPE also implements an interface that allows users to write scripts, archived in a Git repository, to extract particular data, compute particular metrics, and visualize them in many different formats (as specified by the script). The combination of these approaches creates a workflow which enables performance reproducibility and software sustainability of scientific software. 
In this paper, we present example scripts that extract and visualize performance data for Ginkgo{\textquoteright}s SpMV kernels that allow users to identify the optimal kernel for specific problem characteristics.}, isbn = {9781450367707}, doi = {https://doi.org/10.1145/3324989.3325719}, author = {Hartwig Anzt and Yen Chen Chen and Terry Cojean and Jack Dongarra and Goran Flegar and Pratik Nayak and Enrique S. Quintana-Orti and Yuhsiang M. Tsai and Weichung Wang} } @conference {, title = {Towards Portable Online Prediction of Network Utilization Using MPI-Level Monitoring}, booktitle = {2019 European Conference on Parallel Processing (Euro-Par 2019)}, year = {2019}, month = {2019-08}, publisher = {Springer}, organization = {Springer}, address = {G{\"o}ttingen, Germany}, abstract = {Stealing network bandwidth helps a variety of HPC runtimes and services to run additional operations in the background without negatively affecting the applications. A key ingredient to make this possible is an accurate prediction of the future network utilization, enabling the runtime to plan the background operations in advance, such as to avoid competing with the application for network bandwidth. In this paper, we propose a portable deep learning predictor that only uses the information available through MPI introspection to construct a recurrent sequence-to-sequence neural network capable of forecasting network utilization. We leverage the fact that most HPC applications exhibit periodic behaviors to enable predictions far into the future (at least the length of a period). Our online approach does not have an initial training phase; it continuously improves itself during application execution without incurring significant computational overhead. Experimental results show better accuracy and lower computational overhead compared with the state-of-the-art on two representative applications.}, doi = {https://doi.org/10.1007/978-3-030-29400-7_4}, author = {Shu-Mei Tseng and Bogdan Nicolae and George Bosilca and Emmanuel Jeannot and Aparna Chandramowlishwaran and Franck Cappello} } @conference {, title = {Understanding Scalability and Fine-Grain Parallelism of Synchronous Data Parallel Training}, booktitle = {2019 IEEE/ACM Workshop on Machine Learning in High Performance Computing Environments (MLHPC)}, year = {2019}, month = {2019-11}, publisher = {IEEE}, organization = {IEEE}, address = {Denver, CO}, abstract = {In the age of big data, deep learning has emerged as a powerful tool to extract insight and exploit its value, both in industry and scientific applications. With increasing complexity of learning models and amounts of training data, data-parallel approaches based on frequent all-reduce synchronization steps are increasingly popular. Despite the fact that high-performance computing (HPC) technologies have been designed to address such patterns efficiently, the behavior of data-parallel approaches on HPC platforms is not well understood. To address this issue, in this paper we study the behavior of Horovod, a popular data-parallel approach that relies on MPI, on Theta, a pre-Exascale machine at Argonne National Laboratory. Using two representative applications, we explore two aspects: (1) how performance and scalability are affected by important parameters such as number of nodes, number of workers, threads per node, batch size; (2) how computational phases are interleaved with all-reduce communication phases at fine granularity and what consequences this interleaving has in terms of potential bottlenecks. 
Our findings show that pipelining of back-propagation, gradient reduction, and weight updates mitigates the effects of stragglers during all-reduce only partially. Furthermore, there can be significant delays between weight updates, which can be leveraged to mask the overhead of additional background operations that are coupled with the training.}, doi = {https://doi.org/10.1109/MLHPC49564.2019.00006}, author = {Jiali Li and Bogdan Nicolae and Justin M. Wozniak and George Bosilca} } @article {1268, title = {Autotuning in High-Performance Computing Applications}, journal = {Proceedings of the IEEE}, volume = {106}, year = {2018}, month = {2018-11}, pages = {2068{\textendash}2083}, abstract = {Autotuning refers to the automatic generation of a search space of possible implementations of a computation that are evaluated through models and/or empirical measurement to identify the most desirable implementation. Autotuning has the potential to dramatically improve the performance portability of petascale and exascale applications. To date, autotuning has been used primarily in high-performance applications through tunable libraries or previously tuned application code that is integrated directly into the application. This paper draws on the authors{\textquoteright} extensive experience applying autotuning to high-performance applications, describing both successes and future challenges. If autotuning is to be widely used in the HPC community, researchers must address the software engineering challenges, manage configuration overheads, and continue to demonstrate significant performance gains and portability across architectures. In particular, tools that configure the application must be integrated into the application build process so that tuning can be reapplied as the application and target architectures evolve.}, keywords = {High-performance computing, performance tuning, programming systems}, doi = {10.1109/JPROC.2018.2841200}, author = {Prasanna Balaprakash and Jack Dongarra and Todd Gamblin and Mary Hall and Jeffrey Hollingsworth and Boyana Norris and Richard Vuduc} } @article {1231, title = {A Survey of MPI Usage in the US Exascale Computing Project}, journal = {Concurrency and Computation: Practice and Experience}, year = {2018}, month = {2018-09}, type = {Special Issue}, abstract = {The Exascale Computing Project (ECP) is currently the primary effort in the United States focused on developing {\textquotedblleft}exascale{\textquotedblright} levels of computing capabilities, including hardware, software, and applications. In order to obtain a more thorough understanding of how the software projects under the ECP are using, and planning to use, the Message Passing Interface (MPI), and help guide the work of our own project within the ECP, we created a survey. Of the 97 ECP projects active at the time the survey was distributed, we received 77 responses, 56 of which reported that their projects were using MPI. This paper reports the results of that survey for the benefit of the broader community of MPI developers.}, keywords = {exascale, MPI}, doi = {https://doi.org/10.1002/cpe.4851}, author = {David E. Bernholdt and Swen Boehm and George Bosilca and Manjunath Gorentla Venkata and Ryan E. Grant and Thomas Naughton and Howard P. Pritchard and Martin Schulz and Geoffroy R. 
Vallee} } @article {1337, title = {MagmaDNN {\textendash} High-Performance Data Analytics for Manycore GPUs and CPUs}, year = {2017}, month = {2017-12}, publisher = {2017 Summer Research Experiences for Undergraduate (REU), Presentation}, address = {Knoxville, TN}, author = {Lucien Ng and Kwai Wong and Azzam Haidar and Stanimire Tomov and Jack Dongarra} } @article {1086, title = {Structure-aware Linear Solver for Realtime Convex Optimization for Embedded Systems}, journal = {IEEE Embedded Systems Letters}, volume = {9}, year = {2017}, month = {2017-05}, pages = {61{\textendash}64}, abstract = {With the increasing sophistication in the use of optimization algorithms such as deep learning on embedded systems, the convex optimization solvers on embedded systems have found widespread use. This letter presents a novel linear solver technique to reduce the run-time of convex optimization solver by using the property that some parameters are fixed during the solution iterations of a solve instance. Our experimental results show that the run-time can be reduced by two orders of magnitude.}, keywords = {Karush Kuhn Tucker (KKT), Realtime embedded convex optimization solver}, doi = {10.1109/LES.2017.2700401}, url = {http://ieeexplore.ieee.org/document/7917357/}, author = {Ichitaro Yamazaki and Saeid Nooshabadi and Stanimire Tomov and Jack Dongarra} } @inproceedings {996, title = {Domain Overlap for Iterative Sparse Triangular Solves on GPUs}, journal = {Software for Exascale Computing - SPPEXA}, volume = {113}, year = {2016}, month = {2016-09}, pages = {527{\textendash}545}, publisher = {Springer International Publishing}, abstract = {Iterative methods for solving sparse triangular systems are an attractive alternative to exact forward and backward substitution if an approximation of the solution is acceptable. On modern hardware, performance benefits are available as iterative methods allow for better parallelization. In this paper, we investigate how block-iterative triangular solves can benefit from using overlap. Because the matrices are triangular, we use {\textquotedblleft}directed{\textquotedblright} overlap, depending on whether the matrix is upper or lower triangular. We enhance a GPU implementation of the block-asynchronous Jacobi method with directed overlap. For GPUs and other cases where the problem must be overdecomposed, i.e., more subdomains and threads than cores, there is a preference in processing or scheduling the subdomains in a specific order, following the dependencies specified by the sparse triangular matrix. For sparse triangular factors from incomplete factorizations, we demonstrate that moderate directed overlap with subdomain scheduling can improve convergence and time-to-solution.}, doi = {10.1007/978-3-319-40528-5_24}, author = {Hartwig Anzt and Edmond Chow and Daniel Szyld and Jack Dongarra}, editor = {Hans-Joachim Bungartz and Philipp Neumann and Wolfgang E. Nagel} } @conference {939, title = {Heterogeneous Streaming}, booktitle = {The Sixth International Workshop on Accelerators and Hybrid Exascale Systems (AsHES), IPDPS 2016}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This paper introduces a new heterogeneous streaming library called hetero Streams (hStreams). We show how a simple FIFO streaming model can be applied to heterogeneous systems that include manycore coprocessors and multicore CPUs. 
This model supports concurrency across nodes, among tasks within a node, and between data transfers and computation. We give examples for different approaches, show how the implementation can be layered, analyze overheads among layers, and apply those models to parallelize applications using simple, intuitive interfaces. We compare the features and versatility of hStreams, OpenMP, CUDA Streams, and OmpSs. We show how the use of hStreams makes it easier for scientists to identify tasks and easily expose concurrency among them, and how it enables tuning experts and runtime systems to tailor execution for different heterogeneous targets. Practical application examples are taken from the field of numerical linear algebra, commercial structural simulation software, and a seismic processing application.}, keywords = {plasma}, author = {Chris J. Newburn and Gaurav Bansal and Michael Wood and Luis Crivelli and Judit Planas and Alejandro Duran and Paulo Souza and Leonardo Borges and Piotr Luszczek and Stanimire Tomov and Jack Dongarra and Hartwig Anzt and Mark Gates and Azzam Haidar and Yulu Jia and Khairul Kabir and Ichitaro Yamazaki and Jesus Labarta} } @techreport {972, title = {High Performance Realtime Convex Solver for Embedded Systems}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-16-745}, year = {2016}, month = {2016-10}, abstract = {Convex optimization solvers for embedded systems find widespread use. This letter presents a novel technique to reduce the run-time of the decomposition of the KKT matrix for the convex optimization solver for an embedded system, by two orders of magnitude. We use the property that although the KKT matrix changes, some of its block sub-matrices are fixed during the solution iterations and the associated solving instances.}, keywords = {KKT, Realtime embedded convex optimization solver}, author = {Ichitaro Yamazaki and Saeid Nooshabadi and Stanimire Tomov and Jack Dongarra} } @article {706, title = {Analyzing PAPI Performance on Virtual Machines}, journal = {VMWare Technical Journal}, volume = {Winter 2013}, year = {2014}, month = {2014-01}, abstract = {Performance Application Programming Interface (PAPI) aims to provide a consistent interface for measuring performance events using the performance counter hardware available on the CPU as well as available software performance events and off-chip hardware. Without PAPI, a user may be forced to search through specific processor documentation to discover the name of processor performance events. These names can change from model to model and vendor to vendor. PAPI simplifies this process by providing a consistent interface and a set of processor-agnostic preset events. Software engineers can use data collected through source-code instrumentation using the PAPI interface to examine the relation between software performance and performance events. PAPI can also be used within many high-level performance-monitoring utilities such as TAU, Vampir, and Score-P. VMware{\textregistered} ESXi{\texttrademark} and KVM have both added support within the last year for virtualizing performance counters. This article compares results measuring the performance of five real-world applications included in the Mantevo Benchmarking Suite in a VMware virtual machine, a KVM virtual machine, and on bare metal. 
By examining these results, it will be shown that PAPI provides accurate performance counts in a virtual machine environment.}, url = {https://labs.vmware.com/vmtj/analyzing-papi-performance-on-virtual-machines}, author = {John Nelson} } @techreport {692, title = {Analyzing PAPI Performance on Virtual Machines}, journal = {ICL Technical Report}, number = {ICL-UT-13-02}, year = {2013}, month = {2013-08}, abstract = {Over the last ten years, virtualization techniques have become much more widely popular as a result of fast and cheap processors. Virtualization provides many benefits making it appealing for testing environments. Encapsulating configurations is a huge motivator for wanting to do performance testing on virtual machines. Provisioning, a technique that is used by FutureGrid, is also simplified using virtual machines. Virtual machines enable portability among heterogeneous systems while providing an identical configuration within the guest operating system. My work in ICL has focused on using PAPI inside of virtual machines. There were two main areas of focus throughout my research. The first originated because of anomalous results of the HPC Challenge Benchmark reported in a paper submitted by ICL [3] in which the order of input sizes tested impacted run time on virtual machines but not on bare metal. A discussion of this anomaly will be given in section II along with a discussion of timers used in virtual machines. The second area of focus was exploring the recently implemented support by KVM (Kernel-based Virtual Machine) and VMware for guest OS level performance counters. A discussion of application tests run to observe the behavior of event counts measured in a virtual machine as well as a discussion of information learned pertinent to event measurement will be given in section III.}, author = {John Nelson} } @conference {687, title = {Diagnosis and Optimization of Application Prefetching Performance}, booktitle = {Proceedings of the 27th ACM International Conference on Supercomputing (ICS {\textquoteright}13)}, year = {2013}, month = {2013-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Eugene, Oregon, USA}, abstract = {Hardware prefetchers are effective at recognizing streaming memory access patterns and at moving data closer to the processing units to hide memory latency. However, hardware prefetchers can track only a limited number of data streams due to finite hardware resources. In this paper, we introduce the term streaming concurrency to characterize the number of parallel, logical data streams in an application. We present a simulation algorithm for understanding the streaming concurrency at any point in an application, and we show that this metric is a good predictor of the number of memory requests initiated by streaming prefetchers. Next, we try to understand the causes behind poor prefetching performance. We identified four prefetch unfriendly conditions and we show how to classify an application{\textquoteright}s memory references based on these conditions. We evaluated our analysis using the SPEC CPU2006 benchmark suite. We selected two benchmarks with unfavorable access patterns and transformed them to improve their prefetching effectiveness. 
Results show that making applications more prefetcher-friendly can yield meaningful performance gains.}, isbn = {9781450321303}, doi = {10.1145/2464996.2465014}, url = {http://dl.acm.org/citation.cfm?doid=2464996.2465014}, author = {Gabriel Marin and Colin McCurdy and Jeffrey Vetter}, editor = {Allen D. Malony and Nemirovsky, Mario and Midkiff, Sam} } @article {1382, title = {PAPI 5: Measuring Power, Energy, and the Cloud}, year = {2013}, month = {2013-04}, publisher = {2013 IEEE International Symposium on Performance Analysis of Systems and Software}, address = {Austin, TX}, author = {Vincent Weaver and Dan Terpstra and Heike McCraw and Matt Johnson and Kiran Kasichayanula and James Ralph and John Nelson and Phil Mucci and Tushar Mohan and Shirley Moore} } @conference {686, title = {Toward a scalable multi-GPU eigensolver via compute-intensive kernels and efficient communication}, booktitle = {Proceedings of the 27th ACM International Conference on Supercomputing (ICS {\textquoteright}13)}, year = {2013}, month = {2013-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Eugene, Oregon, USA}, abstract = {The enormous gap between the high-performance capabilities of GPUs and the slow interconnect between them has made the development of numerical software that is scalable across multiple GPUs extremely challenging. We describe a successful methodology for addressing these challenges---starting from our algorithm design, through kernel optimization and tuning, to our programming model---in the development of a scalable high-performance tridiagonal reduction algorithm for the symmetric eigenvalue problem. This is a fundamental linear algebra problem with many engineering and physics applications. We use a combination of a task-based approach to parallelism and a new algorithmic design to achieve high performance. The goal of the new design is to increase the computational intensity of the major compute kernels and to reduce synchronization and data transfers between GPUs. This may increase the number of flops, but the increase is offset by the more efficient execution and reduced data transfers. Our performance results are the best available, providing an enormous performance boost compared to current state-of-the-art solutions. In particular, our software scales up to 1070 Gflop/s using 16 Intel E5-2670 cores and eight M2090 GPUs, compared to 45 Gflop/s achieved by the optimized Intel Math Kernel Library (MKL) using only the 16 CPU cores.}, keywords = {eigenvalue, gpu communication, gpu computation, heterogeneous programming model, performance, reduction to tridiagonal, singular value decomposition, task parallelism}, isbn = {9781450321303}, doi = {10.1145/2464996.2465438}, url = {http://dl.acm.org/citation.cfm?doid=2464996.2465438}, author = {Azzam Haidar and Mark Gates and Stanimire Tomov and Jack Dongarra}, editor = {Allen D.
Malony and Nemirovsky, Mario and Midkiff, Sam} } @article {icl:690, title = {An Implementation of the Tile QR Factorization for a GPU and Multiple CPUs}, journal = {Applied Parallel and Scientific Computing}, volume = {7133}, year = {2012}, month = {2012-00}, pages = {248-257}, author = {Jakub Kurzak and Rajib Nath and Peng Du and Jack Dongarra}, editor = {Kristj{\'a}n J{\'o}nasson} } @article {icl:688, title = {PAPI-V: Performance Monitoring for Virtual Machines}, journal = {CloudTech-HPC 2012}, year = {2012}, month = {2012-09}, address = {Pittsburgh, PA}, abstract = {This paper describes extensions to the PAPI hardware counter library for virtual environments, called PAPI-V. The extensions support timing routines, I/O measurements, and processor counters. The PAPI-V extensions will allow application and tool developers to use a familiar interface to obtain relevant hardware performance monitoring information in virtual environments.}, keywords = {papi}, doi = {10.1109/ICPPW.2012.29}, author = {Matt Johnson and Heike McCraw and Shirley Moore and Phil Mucci and John Nelson and Dan Terpstra and Vincent M Weaver and Tushar Mohan} } @inproceedings {icl:673, title = {Correlated Set Coordination in Fault Tolerant Message Logging Protocols}, journal = {Proceedings of 17th International Conference, Euro-Par 2011, Part II}, volume = {6853}, year = {2011}, month = {2011-08}, pages = {51-64}, publisher = {Springer}, address = {Bordeaux, France}, keywords = {ftmpi}, author = {Aurelien Bouteiller and Thomas Herault and George Bosilca and Jack Dongarra}, editor = {Emmanuel Jeannot and Raymond Namyst and Jean Roman} } @article {icl:653, title = {A Hybridization Methodology for High-Performance Linear Algebra Software for GPUs}, journal = {in GPU Computing Gems, Jade Edition}, volume = {2}, year = {2011}, month = {2011-00}, pages = {473-484}, publisher = {Elsevier}, keywords = {magma, morse}, author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Hatem Ltaeif and Raymond Namyst and Samuel Thibault and Stanimire Tomov}, editor = {Wen-mei W. Hwu} } @article {icl:646, title = {Impact of Kernel-Assisted MPI Communication over Scientific Applications: CPMD and FFTW}, journal = {18th EuroMPI}, year = {2011}, month = {2011-09}, pages = {247-254}, publisher = {Springer}, address = {Santorini, Greece}, keywords = {dague}, author = {Teng Ma and Aurelien Bouteiller and George Bosilca and Jack Dongarra}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @article {icl:643, title = {The International Exascale Software Project Roadmap}, journal = {International Journal of High Performance Computing}, volume = {25}, number = {1}, year = {2011}, month = {2011-01}, pages = {3-60}, abstract = {Over the last 20 years, the open-source community has provided more and more software on which the world{\textquoteright}s high-performance computing systems depend for performance and productivity. The community has invested millions of dollars and years of effort to build key components. However, although the investments in these separate software elements have been tremendously valuable, a great deal of productivity has also been lost because of the lack of planning, coordination, and key integration of technologies necessary to make them work together smoothly and efficiently, both within individual petascale systems and between different systems. 
It seems clear that this completely uncoordinated development model will not provide the software needed to support the unprecedented parallelism required for peta/exascale computation on millions of cores, or the flexibility required to exploit new hardware models and features, such as transactional memory, speculative execution, and graphics processing units. This report describes the work of the community to prepare for the challenges of exascale computing, ultimately combining their efforts in a coordinated International Exascale Software Project.}, doi = {10.1177/1094342010391989}, author = {Jack Dongarra and Pete Beckman and Terry Moore and Patrick Aerts and Giovanni Aloisio and Jean-Claude Andre and David Barkai and Jean-Yves Berthou and Taisuke Boku and Bertrand Braunschweig and Franck Cappello and Barbara Chapman and Xuebin Chi and Alok Choudhary and Sudip Dosanjh and Thom Dunning and Sandro Fiore and Al Geist and Bill Gropp and Robert Harrison and Mark Hereld and Michael Heroux and Adolfy Hoisie and Koh Hotta and Zhong Jin and Yutaka Ishikawa and Fred Johnson and Sanjay Kale and Richard Kenway and David Keyes and Bill Kramer and Jesus Labarta and Alain Lichnewsky and Thomas Lippert and Bob Lucas and Barney MacCabe and Satoshi Matsuoka and Paul Messina and Peter Michielse and Bernd Mohr and Matthias S. Mueller and Wolfgang E. Nagel and Hiroshi Nakashima and Michael E. Papka and Dan Reed and Mitsuhisa Sato and Ed Seidel and John Shalf and David Skinner and Marc Snir and Thomas Sterling and Rick Stevens and Fred Streitz and Bob Sugar and Shinji Sumimoto and William Tang and John Taylor and Rajeev Thakur and Anne Trefethen and Mateo Valero and Aad van der Steen and Jeffrey Vetter and Peg Williams and Robert Wisniewski and Kathy Yelick} } @article {icl:647, title = {OMPIO: A Modular Software Architecture for MPI I/O}, journal = {18th EuroMPI}, year = {2011}, month = {2011-09}, pages = {81-89}, publisher = {Springer}, address = {Santorini, Greece}, author = {Mohamad Chaarawi and Edgar Gabriel and Rainer Keller and Richard L. Graham and George Bosilca and Jack Dongarra}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @inproceedings {icl:632, title = {Optimizing Symmetric Dense Matrix-Vector Multiplication on GPUs}, journal = {ACM/IEEE Conference on Supercomputing (SC{\textquoteright}11)}, year = {2011}, month = {2011-11}, address = {Seattle, WA}, keywords = {magma}, author = {Rajib Nath and Stanimire Tomov and Tingxing Dong and Jack Dongarra} } @inproceedings {icl:674, title = {Scalable Runtime for MPI: Efficiently Building the Communication Infrastructure}, journal = {Proceedings of Recent Advances in the Message Passing Interface - 18th European MPI Users{\textquoteright} Group Meeting, EuroMPI 2011}, volume = {6960}, year = {2011}, month = {2011-09}, pages = {342-344}, publisher = {Springer}, address = {Santorini, Greece}, keywords = {ftmpi}, author = {George Bosilca and Thomas Herault and Pierre Lemariner and Jack Dongarra and A. Rezmerita}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @article {icl:546, title = {Accelerating GPU Kernels for Dense Linear Algebra}, journal = {Proc.
of VECPAR{\textquoteright}10}, year = {2010}, month = {2010-06}, address = {Berkeley, CA}, keywords = {magma}, author = {Rajib Nath and Stanimire Tomov and Jack Dongarra} } @article {icl:547, title = {Accelerating the Reduction to Upper Hessenberg, Tridiagonal, and Bidiagonal Forms through Hybrid GPU-Based Computing}, journal = {Parallel Computing}, volume = {36}, number = {12}, year = {2010}, month = {2010-00}, pages = {645-654}, keywords = {magma}, author = {Stanimire Tomov and Rajib Nath and Jack Dongarra} } @article {1364, title = {Autotuning Dense Linear Algebra Libraries on GPUs}, year = {2010}, month = {2010-06}, publisher = {Sixth International Workshop on Parallel Matrix Algorithms and Applications (PMAA 2010)}, address = {Basel, Switzerland}, author = {Rajib Nath and Stanimire Tomov and Emmanuel Agullo and Jack Dongarra} } @inbook {854, title = {Blas for GPUs}, booktitle = {Scientific Computing with Multicore and Accelerators}, series = {Chapman \& Hall/CRC Computational Science}, year = {2010}, publisher = {CRC Press}, organization = {CRC Press}, chapter = {4}, address = {Boca Raton, Florida}, isbn = {9781439825365}, author = {Rajib Nath and Stanimire Tomov and Jack Dongarra} } @inproceedings {icl:523, title = {Dense Linear Algebra Solvers for Multicore with GPU Accelerators}, journal = {Parallel Distributed Processing, Workshops and Phd Forum (IPDPSW), 2010 IEEE International Symposium on}, year = {2010}, pages = {1-8}, address = {Atlanta, GA}, abstract = {Solving dense linear systems of equations is a fundamental problem in scientific computing. Numerical simulations involving complex systems represented in terms of unknown variables and relations between them often lead to linear systems of equations that must be solved as fast as possible. We describe current efforts toward the development of these critical solvers in the area of dense linear algebra (DLA) for multicore with GPU accelerators. We describe how to code/develop solvers to effectively use the high computing power available in these new and emerging hybrid architectures. The approach taken is based on hybridization techniques in the context of Cholesky, LU, and QR factorizations. We use a high-level parallel programming model and leverage existing software infrastructure, e.g. optimized BLAS for CPU and GPU, and LAPACK for sequential CPU processing. Included also are architecture and algorithm-specific optimizations for standard solvers as well as mixed-precision iterative refinement solvers. The new algorithms, depending on the hardware configuration and routine parameters, can lead to orders of magnitude acceleration when compared to the same algorithms on standard multicore architectures that do not contain GPU accelerators. 
The newly developed DLA solvers are integrated and freely available through the MAGMA library.}, doi = {10.1109/IPDPSW.2010.5470941}, author = {Stanimire Tomov and Rajib Nath and Hatem Ltaeif and Jack Dongarra} } @techreport {icl:600, title = {EZTrace: a generic framework for performance analysis}, journal = {ICL Technical Report}, number = {ICL-UT-11-01}, year = {2010}, month = {2010-12}, author = {Jack Dongarra and Mathieu Faverge and Yutaka Ishikawa and Raymond Namyst and Fran{\c c}ois Rue and Fran{\c c}ois Trahay} } @techreport {icl:585, title = {Faster, Cheaper, Better - A Hybridization Methodology to Develop Linear Algebra Software for GPUs}, journal = {LAPACK Working Note}, number = {230}, year = {2010}, month = {2010-00}, keywords = {magma, morse}, author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Hatem Ltaeif and Raymond Namyst and Samuel Thibault and Stanimire Tomov} } @article {icl:526, title = {Hybrid Multicore Cholesky Factorization with Multiple GPU Accelerators}, journal = {IEEE Transactions on Parallel and Distributed Systems (submitted)}, year = {2010}, month = {2010-03}, keywords = {magma, plasma}, author = {Hatem Ltaeif and Stanimire Tomov and Rajib Nath and Jack Dongarra} } @techreport {icl:548, title = {An Improved MAGMA GEMM for Fermi GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-10-655 (also LAPACK working note 227)}, year = {2010}, month = {2010-07}, keywords = {magma}, author = {Rajib Nath and Stanimire Tomov and Jack Dongarra} } @article {icl:582, title = {An Improved MAGMA GEMM for Fermi GPUs}, journal = {International Journal of High Performance Computing}, volume = {24}, number = {4}, year = {2010}, month = {2010-00}, pages = {511-515}, keywords = {magma}, author = {Rajib Nath and Stanimire Tomov and Jack Dongarra} } @article {icl:521, title = {A Scalable High Performant Cholesky Factorization for Multicore with GPU Accelerators}, journal = {Proc. of VECPAR{\textquoteright}10 (to appear)}, year = {2010}, month = {2010-06}, address = {Berkeley, CA}, keywords = {magma, plasma}, author = {Hatem Ltaeif and Stanimire Tomov and Rajib Nath and Peng Du and Jack Dongarra} } @article {1362, title = {Scheduling Cholesky Factorization on Multicore Architectures with GPU Accelerators}, year = {2010}, month = {2010-07}, publisher = {2010 Symposium on Application Accelerators in High-Performance Computing (SAAHPC{\textquoteright}10), Poster}, address = {Knoxville, TN}, author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Hatem Ltaeif and Raymond Namyst and Rajib Nath and Jean Roman and Samuel Thibault and Stanimire Tomov} } @article {icl:576, title = {Trace-based Performance Analysis for the Petascale Simulation Code FLASH}, journal = {International Journal of High Performance Computing Applications (to appear)}, year = {2010}, month = {2010-00}, author = {Heike Jagode and Andreas Knuepfer and Jack Dongarra and Matthias Jurenz and Matthias S. Mueller and Wolfgang E. Nagel} } @article {icl:507, title = {Computational Science {\textendash} ICCS 2009, Proceedings of the 9th International Conference}, journal = {Lecture Notes in Computer Science: Theoretical Computer Science and General Issues}, volume = {-}, number = {5544-5545}, year = {2009}, month = {2009-05}, address = {Baton Rouge, LA}, editor = {Gabrielle Allen and Jaros{\l}aw Nabrzyski and E. Seidel and Geert Dick van Albada and Jack Dongarra and Peter M.
Sloot} } @article {icl:497, title = {I/O Performance Analysis for the Petascale Simulation Code FLASH}, journal = {ISC{\textquoteright}09}, year = {2009}, month = {2009-06}, address = {Hamburg, Germany}, keywords = {test}, author = {Heike Jagode and Shirley Moore and Dan Terpstra and Jack Dongarra and Andreas Knuepfer and Matthias Jurenz and Matthias S. Mueller and Wolfgang E. Nagel} } @inproceedings {icl:602, title = {Modeling the Office of Science Ten Year Facilities Plan: The PERI Architecture Tiger Team}, journal = {SciDAC 2009, Journal of Physics: Conference Series}, volume = {180(2009)012039}, year = {2009}, month = {2009-07}, publisher = {IOP Publishing}, address = {San Diego, California}, keywords = {test}, author = {Bronis R. de Supinski and Sadaf Alam and David Bailey and Laura Carrington and Chris Daley and Anshu Dubey and Todd Gamblin and Dan Gunter and Paul D. Hovland and Heike Jagode and Karen Karavanic and Gabriel Marin and John Mellor-Crummey and Shirley Moore and Boyana Norris and Leonid Oliker and Catherine Olschanowsky and Philip C. Roth and Martin Schulz and Sameer Shende and Allan Snavely} } @inproceedings {icl:512, title = {A Note on Auto-tuning GEMM for GPUs}, journal = {9th International Conference on Computational Science (ICCS 2009)}, number = {5544-5545}, year = {2009}, month = {2009-05}, pages = {884-892}, address = {Baton Rouge, LA}, doi = {10.1007/978-3-642-01970-8_89}, author = {Yinan Li and Jack Dongarra and Stanimire Tomov}, editor = {Gabrielle Allen and Jaros{\l}aw Nabrzyski and E. Seidel and Geert Dick van Albada and Jack Dongarra and Peter M. Sloot} } @article {1352, title = {Numerical Linear Algebra on Emerging Architectures: The PLASMA and MAGMA Projects}, year = {2009}, month = {2009-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC09)}, address = {Portland, OR}, author = {Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaeif and Piotr Luszczek and Rajib Nath and Stanimire Tomov and Asim YarKhan and Vasily Volkov} } @article {1365, title = {Numerical Linear Algebra on Hybrid Architectures: Recent Developments in the MAGMA Project}, year = {2009}, month = {2009-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC09)}, address = {Portland, Oregon}, author = {Rajib Nath and Jack Dongarra and Stanimire Tomov and Hatem Ltaeif and Peng Du} } @techreport {icl:475, title = {Trace-based Performance Analysis for the Petascale Simulation Code FLASH}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-09-01}, year = {2009}, month = {2009-04}, keywords = {test}, author = {Heike Jagode and Andreas Knuepfer and Jack Dongarra and Matthias Jurenz and Matthias S. Mueller and Wolfgang E. 
Nagel} } @inproceedings {icl:519, title = {VGrADS: Enabling e-Science Workflows on Grids and Clouds with Fault Tolerance}, journal = {SC{\textquoteright}09 The International Conference for High Performance Computing, Networking, Storage and Analysis (to appear)}, year = {2009}, month = {2009-00}, address = {Portland, OR}, keywords = {grads}, author = {Lavanya Ramakrishan and Daniel Nurmi and Anirban Mandal and Charles Koelbel and Dennis Gannon and Mark Huang and Yang-Suk Kee and Graziano Obertelli and Kiran Thyagaraja and Rich Wolski and Asim YarKhan and Dmitrii Zagorodnov} } @inproceedings {icl:440, title = {Exploring New Architectures in Accelerating CFD for Air Force Applications}, journal = {Proceedings of the DoD HPCMP User Group Conference}, year = {2008}, month = {2008-01}, address = {Seattle, Washington}, keywords = {magma}, author = {Jack Dongarra and Shirley Moore and Gregory D. Peterson and Stanimire Tomov and Jeff Allred and Vincent Natoli and David Richie} } @article {icl:409, title = {High Performance GridRPC Middleware}, journal = {Recent developments in Grid Technology and Applications}, year = {2008}, month = {2008-00}, publisher = {Nova Science Publishers}, keywords = {netsolve}, author = {Yves Caniou and Eddy Caron and Frederic Desprez and Hidemoto Nakada and Yoshio Tanaka and Keith Seymour}, editor = {George A. Gravvanis and John P. Morrison and Hamid R. Arabnia and D. A. Power} } @conference {icl:298, title = {Performance Profiling and Analysis of DoD Applications using PAPI and TAU}, booktitle = {Proceedings of DoD HPCMP UGC 2005}, year = {2005}, month = {2005-06}, publisher = {IEEE}, organization = {IEEE}, address = {Nashville, TN}, keywords = {papi}, author = {Shirley Moore and David Cronk and Felix Wolf and Avi Purkayastha and Patricia J. Teller and Robert Araiza and Gabriela Aguilera and Jamie Nava} } @article {icl:236, title = {Cray X1 Evaluation Status Report}, journal = {Oak Ridge National Laboratory Report}, volume = {/-2004/13}, year = {2004}, month = {2004-01}, author = {Pratul Agarwal and R. A. Alexander and E. Apra and Satish Balay and Arthur S. Bland and James Colgan and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Tom Dunigan and Mark Fahey and Al Geist and M. Gordon and Robert Harrison and Dinesh Kaushik and M. Krishnakumar and Piotr Luszczek and Tony Mezzacapa and Jeff Nichols and Jarek Nieplocha and Leonid Oliker and T. Packwood and M. Pindzola and Thomas C. Schulthess and Jeffrey Vetter and James B White and T. Windus and Patrick H. Worley and Thomas Zacharia} } @article {icl:92, title = {Active Netlib: An Active Mathematical Software Collection for Inquiry-based Computational Science and Engineering Education}, journal = {Journal of Digital Information special issue on Interactivity in Digital Libraries}, volume = {2}, number = {4}, year = {2002}, month = {2002-00}, keywords = {activenetlib, rib}, author = {Shirley Moore and A.J. 
Baker and Jack Dongarra and Christian Halloy and Chung Ng} } @techreport {icl:97, title = {GridRPC: A Remote Procedure Call API for Grid Computing}, journal = {ICL Technical Report}, number = {ICL-UT-02-06}, year = {2002}, month = {2002-11}, author = {Keith Seymour and Hidemoto Nakada and Satoshi Matsuoka and Jack Dongarra and Craig Lee and Henri Casanova} } @inproceedings {icl:187, title = {Overview of GridRPC: A Remote Procedure Call API for Grid Computing}, journal = {Proceedings of the Third International Workshop on Grid Computing}, year = {2002}, month = {2002-01}, pages = {274-278}, author = {Keith Seymour and Hidemoto Nakada and Satoshi Matsuoka and Jack Dongarra and Craig Lee and Henri Casanova}, editor = {Manish Parashar} }