% Journal article in Physical Chemistry Chemical Physics.
@article{slattery2024quasinewton,
  title = {Economical {Quasi-Newton} Unitary Optimization of Electronic Orbitals},
  journal = {Physical Chemistry Chemical Physics},
  year = {2024},
  internal-note = {Source record gave month 2023-12, which conflicts with year 2024; month left out until the date is confirmed},
  abstract = {We present an efficient quasi-Newton orbital solver optimized to reduce the number of gradient evaluations and other computational steps of comparable cost. The solver optimizes orthogonal orbitals by sequences of unitary rotations generated by the (preconditioned) limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) algorithm equipped with trust-region step restriction. The low-rank structure of the L-BFGS inverse Hessian is exploited when solving the trust-region problem. The efficiency of the proposed {\textquoteleft}{\textquoteleft}Quasi-Newton Unitary Optimization with Trust-Region{\textquoteright}{\textquoteright} (QUOTR) solver is compared to that of the standard Roothaan-Hall approach accelerated by the Direct Inversion of Iterative Subspace (DIIS), and other exact and approximate Newton solvers for mean-field (Hartree-Fock and Kohn-Sham) problems.},
  issn = {1463-9076},
  doi = {10.1039/D3CP05557D},
  url = {http://pubs.rsc.org/en/Content/ArticleLanding/2024/CP/D3CP05557D},
  author = {Slattery, Samuel A. and Surjuse, Kshitijkumar A. and Peterson, Charles and Penchoff, Deborah A. and Valeev, Edward}
}

% Conference paper in volume 13387 of Lecture Notes in Computer Science.
% NOTE(review): booktitle holds the series name; the proceedings title is not in this record. Confirm it and move the series name to a series field.
@inproceedings{thiyagalingam2023benchmarking,
  title = {{AI} Benchmarking for Science: Efforts from the {MLCommons Science Working Group}},
  booktitle = {Lecture Notes in Computer Science},
  volume = {13387},
  year = {2023},
  month = jan,
  pages = {47--64},
  publisher = {Springer International Publishing},
  abstract = {With machine learning (ML) becoming a transformative tool for science, the scientific community needs a clear catalogue of ML techniques, and their relative benefits on various scientific problems, if they were to make significant advances in science using AI. 
Although this comes under the purview of benchmarking, conventional benchmarking initiatives are focused on performance, and as such, science, often becomes a secondary criteria. In this paper, we describe a community effort from a working group, namely, MLCommons Science Working Group, in developing science-specific AI benchmarking for the international scientific community. Since the inception of the working group in 2020, the group has worked very collaboratively with a number of national laboratories, academic institutions and industries, across the world, and has developed four science-specific AI benchmarks. We will describe the overall process, the resulting benchmarks along with some initial results. We foresee that this initiative is likely to be very transformative for the AI for Science, and for performance-focused communities.},
  isbn = {978-3-031-23219-0},
  doi = {10.1007/978-3-031-23220-6_4},
  url = {https://link.springer.com/chapter/10.1007/978-3-031-23220-6_4},
  author = {Thiyagalingam, Jeyan and von Laszewski, Gregor and Yin, Junqi and Emani, Murali and Papay, Juri and Barrett, Gregg and Luszczek, Piotr and Tsaris, Aristeidis and Kirkpatrick, Christine and Wang, Feiyi and Gibbs, Tom and Vishwanath, Venkatram and Shankar, Mallikarjun and Fox, Geoffrey and Hey, Tony},
  editor = {Anzt, Hartwig and Bienz, Amanda and Luszczek, Piotr and Baboulin, Marc}
}

% Journal article in Journal of Chemical Theory and Computation, volume 19.
@article{valeev2023realspace,
  title = {Direct Determination of Optimal Real-Space Orbitals for Correlated Electronic Structure of Molecules},
  journal = {Journal of Chemical Theory and Computation},
  volume = {19},
  year = {2023},
  month = oct,
  pages = {7230--7241},
  abstract = {We demonstrate how to determine numerically nearly exact orthonormal orbitals that are optimal for the evaluation of the energy of arbitrary (correlated) states of atoms and molecules by minimization of the energy Lagrangian. 
Orbitals are expressed in real space using a multiresolution spectral element basis that is refined adaptively to achieve the user-specified target precision while avoiding the ill-conditioning issues that plague AO basis set expansions traditionally used for correlated models of molecular electronic structure. For light atoms, the orbital solver, in conjunction with a variational electronic structure model [selected Configuration Interaction (CI)] provides energies of comparable precision to a state-of-the-art atomic CI solver. The computed electronic energies of atoms and molecules are significantly more accurate than the counterparts obtained with the orbital sets of the same rank expanded in Gaussian AO bases, and can be determined even when linear dependence issues preclude the use of the AO bases. It is feasible to optimize more than 100 fully correlated numerical orbitals on a single computer node, and significant room exists for additional improvement. These findings suggest that real-space orbital representations might be the preferred alternative to AO representations for high-end models of correlated electronic states of molecules and materials.},
  issn = {1549-9618},
  doi = {10.1021/acs.jctc.3c00732},
  url = {https://pubs.acs.org/doi/10.1021/acs.jctc.3c00732},
  author = {Valeev, Edward F. and Harrison, Robert J. and Holmes, Adam A. and Peterson, Charles C. and Penchoff, Deborah A.}
}

% INRIA research report RR-9502. The record carries an institution and a report
% number but no journal, so it is typed as a technical report.
@techreport{benoit2023bandwidth,
  title = {Revisiting {I/O} bandwidth-sharing strategies for {HPC} applications},
  number = {RR-9502},
  year = {2023},
  month = mar,
  institution = {INRIA},
  abstract = {This work revisits I/O bandwidth-sharing strategies for HPC applications. When several applications post concurrent I/O operations, well-known approaches include serializing these operations (First-Come First-Served) or fair-sharing the bandwidth across them (FairShare). 
Another recent approach, I/O-Sets, assigns priorities to the applications, which are classified into different sets based upon the average length of their iterations. We introduce several new bandwidth-sharing strategies, some of them simple greedy algorithms, and some of them more complicated to implement, and we compare them with existing ones. Our new strategies do not rely on any a-priori knowledge of the behavior of the applications, such as the length of work phases, the volume of I/O operations, or some expected periodicity. We introduce a rigorous framework, namely steady-state windows, which enables to derive bounds on the competitive ratio of all bandwidth-sharing strategies for three different objectives: minimum yield, platform utilization, and global efficiency. To the best of our knowledge, this work is the first to provide a quantitative assessment of the online competitiveness of any bandwidth-sharing strategy. This theory-oriented assessment is complemented by a comprehensive set of simulations, based upon both synthetic and realistic traces. The main conclusion is that our simple and low-complexity greedy strategies significantly outperform First-Come First-Served, FairShare and I/O-Sets, and we recommend that the I/O community implements them for further assessment.},
  keywords = {bandwidth sharing, HPC applications, I/O, scheduling strategy},
  url = {https://hal.inria.fr/hal-04038011},
  author = {Anne Benoit and Thomas Herault and Lucas Perotin and Yves Robert and Frederic Vivien}
}

% Workshop paper (FTXS workshop).
@inproceedings{barbut2023checkpoint,
  title = {When to checkpoint at the end of a fixed-length reservation?},
  booktitle = {Fault Tolerance for HPC at eXtreme Scales (FTXS) Workshop},
  year = {2023},
  month = aug,
  address = {Denver, United States},
  abstract = {This work considers an application executing for a fixed duration, namely the length of the reservation that it has been granted. 
The checkpoint duration is a stochastic random variable that obeys some well-known probability distribution law. The question is when to take a checkpoint towards the end of the execution, so that the expectation of the work done is maximized. We address two scenarios. In the first scenario, a checkpoint can be taken at any time; despite its simplicity, this natural problem has not been considered yet (to the best of our knowledge). We provide the optimal solution for a variety of probability distribution laws modeling checkpoint duration. The second scenario is more involved: the application is a linear workflow consisting of a chain of tasks with IID stochastic execution times, and a checkpoint can be taken only at the end of a task. First, we introduce a static strategy where we compute the optimal number of tasks before the application checkpoints at the beginning of the execution. Then, we design a dynamic strategy that decides whether to checkpoint or to continue executing at the end of each task. We instantiate this second scenario with several examples of probability distribution laws for task durations.},
  url = {https://inria.hal.science/hal-04215554},
  author = {Quentin Barbut and Anne Benoit and Thomas Herault and Yves Robert and Frederic Vivien}
}

% Conference paper (IC3-2022). The proceedings title belongs in booktitle,
% which is the field this entry type requires.
@inproceedings{benoit2022youngdaly,
  title = {Checkpointing {\`a} la {Young/Daly}: An Overview},
  booktitle = {IC3-2022: Proceedings of the 2022 Fourteenth International Conference on Contemporary Computing},
  year = {2022},
  month = aug,
  pages = {701--710},
  publisher = {ACM Press},
  address = {Noida, India},
  abstract = {The Young/Daly formula provides an approximation of the optimal checkpoint period for a parallel application executing on a supercomputing platform. The Young/Daly formula was originally designed for preemptible tightly-coupled applications. 
We provide some background and survey various application scenarios to assess the usefulness and limitations of the formula.},
  isbn = {9781450396752},
  doi = {10.1145/3549206.3549328},
  url = {https://dl.acm.org/doi/fullHtml/10.1145/3549206.3549328},
  author = {Anne Benoit and Yishu Du and Thomas Herault and Loris Marchal and Guillaume Pallez and Lucas Perotin and Yves Robert and Hongyang Sun and Frederic Vivien}
}

% Workshop paper (PAW-ATM 2022).
@inproceedings{herault2022composition,
  title = {Composition of Algorithmic Building Blocks in Template Task Graphs},
  booktitle = {2022 IEEE/ACM Parallel Applications Workshop: Alternatives To MPI+X (PAW-ATM)},
  year = {2022},
  internal-note = {Source record gave month 2023-01, which conflicts with year 2022; month left out until the date is confirmed},
  publisher = {IEEE},
  address = {Dallas, TX, USA},
  doi = {10.1109/PAW-ATM56565.2022.00008},
  url = {https://ieeexplore.ieee.org/document/10024647/},
  author = {Herault, Thomas and Schuchart, Joseph and Valeev, Edward F. and Bosilca, George}
}

% Journal article in Journal of Radioanalytical and Nuclear Chemistry.
@article{penchoff2022lanthanum,
  title = {Evaluations of molecular modeling and machine learning for predictive capabilities in binding of lanthanum and actinium with carboxylic acids},
  journal = {Journal of Radioanalytical and Nuclear Chemistry},
  year = {2022},
  month = dec,
  issn = {0236-5731},
  doi = {10.1007/s10967-022-08620-7},
  url = {https://rdcu.be/c2lGj},
  author = {Penchoff, Deborah A. and Peterson, Charles C. and Wrancher, Eleigha M. and Bosilca, George and Harrison, Robert J. and Valeev, Edward F. and Benny, Paul D.}
}

% Conference paper (IPDPS 2022).
@inproceedings{schuchart2022flowgraph,
  title = {Generalized Flow-Graph Programming Using Template Task-Graphs: Initial Implementation and Assessment},
  booktitle = {2022 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  year = {2022},
  month = jul,
  publisher = {IEEE},
  address = {Lyon, France},
  abstract = {We present and evaluate TTG, a novel programming model and its C++ implementation that by marrying the ideas of control and data flowgraph programming supports compact specification and efficient distributed execution of dynamic and irregular applications. 
Programming interfaces that support task-based execution often only support shared memory parallel environments; a few support distributed memory environments, either by discovering the entire DAG of tasks on all processes, or by introducing explicit communications. The first approach limits scalability, while the second increases the complexity of programming. We demonstrate how TTG can address these issues without sacrificing scalability or programmability by providing higher-level abstractions than conventionally provided by task-centric programming systems, without impeding the ability of these runtimes to manage task creation and execution as well as data and resource management efficiently. TTG supports distributed memory execution over 2 different task runtimes, PaRSEC and MADNESS. Performance of four paradigmatic applications (in graph analytics, dense and block-sparse linear algebra, and numerical integrodifferential calculus) with various degrees of irregularity implemented in TTG is illustrated on large distributed-memory platforms and compared to the state-of-the-art implementations.},
  doi = {10.1109/IPDPS53621.2022.00086},
  url = {https://ieeexplore.ieee.org/abstract/document/9820613},
  author = {Schuchart, Joseph and Nookala, Poornima and Javanmard, Mohammad Mahdi and Herault, Thomas and Valeev, Edward F. and Bosilca, George and Harrison, Robert J.}
}

% Conference paper (IEEE CLUSTER 2022).
@inproceedings{schuchart2022smalltasks,
  title = {Pushing the Boundaries of Small Tasks: Scalable Low-Overhead Data-Flow Programming in {TTG}},
  booktitle = {2022 IEEE International Conference on Cluster Computing (CLUSTER)},
  year = {2022},
  month = sep,
  publisher = {IEEE},
  address = {Heidelberg, Germany},
  abstract = {Shared memory parallel programming models strive to provide low-overhead execution environments. 
Task-based programming models, in particular, are well-suited to cope with the ubiquitous multi- and many-core systems since they allow applications to express all available concurrency to a scheduler, which is tasked with exploiting the available hardware resources. It is general consensus that atomic operations should be preferred over locks and mutexes to avoid inter-thread serialization and the resulting loss in efficiency. However, even atomic operations may serialize threads if not used judiciously. In this work, we will discuss several optimizations applied to TTG and the underlying PaRSEC runtime system aiming at removing contentious atomic operations to reduce the overhead of task management to a few hundred clock cycles. The result is an optimized data-flow programming system that seamlessly scales from a single node to distributed execution and which is able to compete with OpenMP in shared memory.},
  keywords = {Dataflow graph, Hardware, Instruction sets, Memory management, PaRSEC, parallel programming, runtime, scalability, Task analysis, task-based programming, Template Task Graph, TTG},
  doi = {10.1109/CLUSTER51413.2022.00026},
  url = {https://ieeexplore.ieee.org/document/9912704/},
  author = {Schuchart, Joseph and Nookala, Poornima and Herault, Thomas and Valeev, Edward F. and Bosilca, George}
}

% Conference paper (IPDPS 2021).
@inproceedings{herault2021blocksparse,
  title = {Distributed-Memory {Multi-GPU} Block-Sparse Tensor Contraction for Electronic Structure},
  booktitle = {35th IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2021)},
  year = {2021},
  month = may,
  publisher = {IEEE},
  address = {Portland, OR},
  abstract = {Many domains of scientific simulation (chemistry, condensed matter physics, data science) increasingly eschew dense tensors for block-sparse tensors, sometimes with additional structure (recursive hierarchy, rank sparsity, etc.). 
Distributed-memory parallel computation with block-sparse tensorial data is paramount to minimize the time-to-solution (e.g., to study dynamical problems or for real-time analysis) and to accommodate problems of realistic size that are too large to fit into the host/device memory of a single node equipped with accelerators. Unfortunately, computation with such irregular data structures is a poor match to the dominant imperative, bulk-synchronous parallel programming model. In this paper, we focus on the critical element of block-sparse tensor algebra, namely binary tensor contraction, and report on an efficient and scalable implementation using the task-focused PaRSEC runtime. High performance of the block-sparse tensor contraction on the Summit supercomputer is demonstrated for synthetic data as well as for real data involved in electronic structure simulations of unprecedented size.},
  keywords = {block-sparse matrix multiplication, distributed-memory, Electronic structure, multi-GPU node, parsec, tensor contraction},
  url = {https://hal.inria.fr/hal-02970659/document},
  author = {Thomas Herault and Yves Robert and George Bosilca and Robert Harrison and Cannada Lewis and Edward Valeev and Jack Dongarra}
}

% Work-in-progress conference paper (RTSS). The symposium name belongs in
% booktitle, which is the field this entry type requires.
@inproceedings{gao2021dropping,
  title = {Evaluating Task Dropping Strategies for Overloaded Real-Time Systems (Work-In-Progress)},
  booktitle = {42nd Real Time Systems Symposium (RTSS)},
  year = {2021},
  publisher = {IEEE Computer Society Press},
  author = {Yiqin Gao and Guillaume Pallez and Yves Robert and Frederic Vivien}
}

% Chapter 1 of a book; the chapter has its own title and authors, so it is
% typed as a contribution to a collection.
@incollection{penchoff2021hpc,
  title = {An Introduction to High Performance Computing and Its Intersection with Advances in Modeling Rare Earth Elements and Actinides},
  booktitle = {Rare Earth Elements and Actinides: Progress in Computational Science Applications},
  volume = {1388},
  year = {2021},
  month = oct,
  pages = {3--53},
  publisher = {American Chemical Society},
  chapter = {1},
  address = {Washington, DC},
  abstract = {Computationally 
driven solutions in nuclear and radiochemistry heavily depend on efficient modeling of Rare Earth Elements (REEs) and actinides. Accurate modeling of REEs and actinides faces challenges stemming from limitations from an imbalanced hardware-software ecosystem and its implications on inefficient use of High Performance Computing (HPC). This chapter provides a historical perspective on the evolution of HPC hardware, its intersectionality with domain sciences, the importance of benchmarks for performance, and an overview of challenges and advances in modeling REEs and actinides. This chapter intends to provide an introduction for researchers at the intersection of scientific computing, software development for HPC, and applied computational modeling of REEs and actinides. The chapter is structured in five sections. First, the Introduction includes subsections focusing on the Importance of REEs and Actinides (1.1), Hardware, Software, and the HPC Ecosystem (1.2), and Electronic Structure Modeling of REEs and Actinides (1.3). Second, a section in High Performance Computing focuses on the TOP500 (2.1), HPC Performance (2.2), HPC Benchmarks: Processing, Bandwidth, and Latency (2.3), and HPC Benchmarks and their Relationship to Chemical Modeling (2.4). Third, the Software Challenges and Advances focus on NWChem/NWChemEx (3.1), MADNESS (3.2), and MPQC (3.3). The fourth section provides a short overview of Artificial Intelligence in HPC applications relevant to nuclear and radiochemistry. The fifth section illustrates A Protocol to Evaluate Complexation Preferences in Separations of REEs and Actinides through Computational Modeling.},
  keywords = {actinide, Computational modeling, HPC, REE},
  isbn = {9780841298255},
  eisbn = {9780841298248},
  doi = {10.1021/bk-2021-1388.ch001},
  url = {https://pubs.acs.org/doi/10.1021/bk-2021-1388.ch001},
  author = {Deborah A. Penchoff and Edward Valeev and Heike Jagode and Piotr Luszczek and Anthony Danalis and George Bosilca and Robert J. 
Harrison and Jack Dongarra and Theresa L. Windus}
}

% Report issued by the Advanced Scientific Computing Advisory Committee.
% The issuing body is stored in institution, the field a technical report requires.
@techreport{hendrickson2020fourdecades,
  title = {{ASCR@40}: Four Decades of {Department of Energy} Leadership in Advanced Scientific Computing Research},
  year = {2020},
  month = aug,
  institution = {Advanced Scientific Computing Advisory Committee (ASCAC), US Department of Energy},
  url = {https://computing.llnl.gov/misc/ASCR@40-Highlights.pdf},
  author = {Bruce Hendrickson and Paul Messina and Buddy Bland and Jackie Chen and Phil Colella and Eli Dart and Jack Dongarra and Thom Dunning and Ian Foster and Richard Gerber and Rachel Harken and Wendy Huntoon and Bill Johnston and John Sarrao and Jeff Vetter}
}

% Companion report from the DOE Office of Advanced Scientific Computing Research.
@techreport{hendrickson2020highlights,
  title = {{ASCR@40}: Highlights and Impacts of {ASCR}{\textquoteright}s Programs},
  year = {2020},
  month = jun,
  institution = {US Department of Energy{\textquoteright}s Office of Advanced Scientific Computing Research},
  abstract = {The Office of Advanced Scientific Computing Research (ASCR) sits within the Office of Science in the Department of Energy (DOE). Per their web pages, {\textquotedblleft}the mission of the ASCR program is to discover, develop, and deploy computational and networking capabilities to analyze, model, simulate, and predict complex phenomena important to the DOE.{\textquotedblright} This succinct statement encompasses a wide range of responsibilities for computing and networking facilities; for procuring, deploying, and operating high performance computing, networking, and storage resources; for basic research in mathematics and computer science; for developing and sustaining a large body of software; and for partnering with organizations across the Office of Science and beyond. While its mission statement may seem very contemporary, the roots of ASCR are quite deep{\textemdash}long predating the creation of DOE. Applied mathematics and advanced computing were both elements of the Theoretical Division of the Manhattan Project. 
In the early 1950s, the Manhattan Project scientist and mathematician John von Neumann, then a commissioner for the AEC (Atomic Energy Commission), advocated for the creation of a Mathematics program to support the continued development and applications of digital computing. Los Alamos National Laboratory (LANL) scientist John Pasta created such a program to fund researchers at universities and AEC laboratories. Under several organizational name changes, this program has persisted ever since, and would eventually grow to become ASCR.},
  doi = {10.2172/1631812},
  url = {https://www.osti.gov/servlets/purl/1631812},
  author = {Bruce Hendrickson and Paul Messina and Buddy Bland and Jackie Chen and Phil Colella and Eli Dart and Jack Dongarra and Thom Dunning and Ian Foster and Richard Gerber and Rachel Harken and Wendy Huntoon and Bill Johnston and John Sarrao and Jeff Vetter}
}

% ECP milestone report archived on Zenodo.
% NOTE(review): Zenodo is the hosting repository and is stored as institution
% only because a technical report requires that field; confirm the issuing body.
@techreport{kolev2020ceed,
  title = {{CEED} {ECP} Milestone Report: Improve Performance and Capabilities of {CEED}-Enabled {ECP} Applications on {Summit/Sierra}},
  type = {ECP Milestone Report},
  year = {2020},
  month = may,
  institution = {Zenodo},
  doi = {10.5281/zenodo.3860804},
  url = {https://doi.org/10.5281/zenodo.3860804},
  author = {Kolev, Tzanio and Fischer, Paul and Abdelfattah, Ahmad and Ananthan, Shreyas and Barra, Valeria and Beams, Natalie and Bleile, Ryan and Brown, Jed and Carson, Robert and Camier, Jean-Sylvain and Churchfield, Matthew and Dobrev, Veselin and Dongarra, Jack and Dudouit, Yohann and Karakus, Ali and Kerkemeier, Stefan and Lan, YuHsiang and Medina, David and Merzari, Elia and Min, Misun and Parker, Scott and Ratnayaka, Thilina and Smith, Cameron and Sprague, Michael and Stitt, Thomas and Thompson, Jeremy and Tomboulides, Ananias and Tomov, Stanimire and Tomov, Vladimir and Vargas, Arturo and Warburton, Tim and Weiss, Kenneth}
}

% Conference paper (ICPP 2020).
@inproceedings{han2020energy,
  title = {Energy-Aware Strategies for Reliability-Oriented Real-Time Task Allocation on Heterogeneous Platforms}, 
booktitle = {49th International Conference on Parallel Processing (ICPP 2020)},
  year = {2020},
  publisher = {ACM Press},
  address = {Edmonton, AB, Canada},
  author = {Li Han and Yiqin Gao and Jing Liu and Yves Robert and Frederic Vivien}
}

% Conference paper (RTSS 2019). The numeric citation key is kept unchanged so
% existing citations keep resolving.
@inproceedings{1372,
  title = {Improved Energy-Aware Strategies for Periodic Real-Time Tasks under Reliability Constraints},
  booktitle = {40th IEEE Real-Time Systems Symposium (RTSS 2019)},
  year = {2020},
  month = feb,
  publisher = {IEEE Press},
  address = {York, UK},
  author = {Li Han and Louis-Claude Canon and Jing Liu and Yves Robert and Frederic Vivien}
}

% Workshop paper (ESPM2 2020).
@inproceedings{bosilca2020ttg,
  title = {The Template Task Graph ({TTG}) - An Emerging Practical Dataflow Programming Paradigm for Scientific Simulation at Extreme Scale},
  booktitle = {2020 IEEE/ACM 5th International Workshop on Extreme Scale Programming Models and Middleware (ESPM2)},
  year = {2020},
  month = nov,
  publisher = {IEEE},
  abstract = {We describe TESSE, an emerging general-purpose, open-source software ecosystem that attacks the twin challenges of programmer productivity and portable performance for advanced scientific applications on modern high-performance computers. TESSE builds upon and extends the ParsecDAG/-dataflow runtime with a new Domain Specific Languages (DSL) and new integration capabilities. Motivating this work is our belief that such a dataflow model, perhaps with applications composed in domain specific languages, can overcome many of the challenges faced by a wide variety of irregular applications that are poorly served by current programming and execution models. Two such applications from many-body physics and applied mathematics are briefly explored. This paper focuses upon the Template Task Graph (TTG), which is TESSE{\textquoteright}s main C++ Api that provides a powerful work/data-flow programming model. 
Algorithms on spatial trees, block-sparse tensors, and wave fronts are used to illustrate the API and associated concepts, as well as to compare with related approaches.},
  keywords = {dag, dataflow, exascale, graph, High-performance computing, workflow},
  doi = {10.1109/ESPM251964.2020.00011},
  author = {George Bosilca and Robert Harrison and Thomas Herault and Mohammad Mahdi Javanmard and Poornima Nookala and Edward Valeev}
}

% ICL technical report ICL-UT-19-09. The report series label is stored in type
% and the issuing university in institution, as a technical report requires.
@techreport{1398,
  title = {A Collection of Presentations from the {BDEC2} Workshop in {Kobe, Japan}},
  type = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-19-09},
  year = {2019},
  month = feb,
  institution = {University of Tennessee, Knoxville},
  author = {Rosa M. Badia and Micah Beck and Fran{\c c}ois Bodin and Taisuke Boku and Franck Cappello and Alok Choudhary and Carlos Costa and Ewa Deelman and Nicola Ferrier and Katsuki Fujisawa and Kohei Fujita and Maria Girone and Geoffrey Fox and Shantenu Jha and Yoshinari Kameda and Christian Kniep and William Kramer and James Lin and Kengo Nakajima and Yiwei Qiu and Kishore Ramachandran and Glenn Ricart and Kim Serradell and Dan Stanzione and Lin Gan and Martin Swany and Christine Sweeney and Alex Szalay and Christine Kirkpatrick and Kenton McHenry and Alainna White and Steve Tuecke and Ian Foster and Joe Mambretti and William M. Tang and Michela Taufer and Miguel V{\'a}zquez}
}

% ICL technical report ICL-UT-19-10.
@techreport{1399,
  title = {A Collection of White Papers from the {BDEC2} Workshop in {Poznan, Poland}},
  type = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-19-10},
  year = {2019},
  month = may,
  institution = {University of Tennessee, Knoxville},
  author = {Gabriel Antoniu and Alexandru Costan and Ovidiu Marcu and Maria S. P{\'e}rez and Nenad Stojanovic and Rosa M. 
Badia and Miguel V{\'a}zquez and Sergi Girona and Micah Beck and Terry Moore and Piotr Luszczek and Ezra Kissel and Martin Swany and Geoffrey Fox and Vibhatha Abeykoon and Selahattin Akkas and Kannan Govindarajan and Gurhan Gunduz and Supun Kamburugamuve and Niranda Perera and Ahmet Uyar and Pulasthi Wickramasinghe and Chathura Widanage and Maria Girone and Toshihiro Hanawa and Richard Moreno and Ariel Oleksiak and Martin Swany and Ryousei Takano and M. P. van Haarlem and J. van Leeuwen and J. B. R. Oonk and T. Shimwell and L. V. E. Koopmans}
}

% ICL technical report ICL-UT-19-13. The report series label is stored in type
% and the issuing university in institution, as a technical report requires.
% The generational suffix of "Wyatt II" uses the "Last, Jr, First" form so that
% "II" is not parsed as the surname.
@techreport{1408,
  title = {A Collection of White Papers from the {BDEC2} Workshop in {San Diego, CA}},
  type = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-19-13},
  year = {2019},
  month = oct,
  institution = {University of Tennessee},
  author = {Ilkay Altintas and Kyle Marcus and Volkan Vural and Shweta Purawat and Daniel Crawl and Gabriel Antoniu and Alexandru Costan and Ovidiu Marcu and Prasanna Balaprakash and Rongqiang Cao and Yangang Wang and Franck Cappello and Robert Underwood and Sheng Di and Justin M. Wozniak and Jon C. Calhoun and Cong Xu and Antonio Lain and Paolo Faraboschi and Nic Dube and Dejan Milojicic and Balazs Gerofi and Maria Girone and Viktor Khristenko and Tony Hey and Ezra Kissel and Yu Liu and Richard Loft and Pekka Manninen and Sebastian von Alfthan and Takemasa Miyoshi and Bruno Raffin and Olivier Richard and Denis Trystram and Maryam Rahnemoonfar and Robin Murphy and Joel Saltz and Kentaro Sano and Rupak Roy and Kento Sato and Jian Guo and Jens Domke and Weikuan Yu and Takaki Hatsui and Yasumasa Joti and Alex Szalay and William M. Tang and Wyatt, II, Michael R. and Michela Taufer and Todd Gamblin and Stephen Herbein and Adam Moody and Dong H. 
Ahn and Rich Wolski and Chandra Krintz and Fatih Bakir and Wei-tsung Lin and Gareth George}
}

% Journal article (IJHPCA 33). The issue number is taken from the duplicate
% record that follows this entry.
@article{1314,
  title = {A Generic Approach to Scheduling and Checkpointing Workflows},
  journal = {International Journal of High Performance Computing Applications},
  volume = {33},
  number = {6},
  year = {2019},
  month = nov,
  pages = {1255--1274},
  keywords = {checkpoint, fail-stop error, resilience, workflow},
  doi = {10.1177/1094342019866891},
  author = {Li Han and Valentin Le F{\`e}vre and Louis-Claude Canon and Yves Robert and Frederic Vivien}
}

% NOTE(review): this record repeats entry 1314 (same title, journal, volume and
% pages). It is kept so no data is lost; consider removing it.
@article{han2019generic,
  title = {A Generic Approach to Scheduling and Checkpointing Workflows},
  journal = {International Journal of High Performance Computing Applications},
  volume = {33},
  number = {6},
  year = {2019},
  pages = {1255--1274},
  author = {Han, Li and Le F{\`e}vre, Valentin and Canon, Louis-Claude and Robert, Yves and Vivien, Fr{\'e}d{\'e}ric}
}

% Conference paper (IEEE Cluster 2019).
@inproceedings{1339,
  title = {Scheduling Independent Stochastic Tasks on Heterogeneous Cloud Platforms},
  booktitle = {IEEE Cluster 2019},
  year = {2019},
  month = sep,
  publisher = {IEEE Computer Society Press},
  address = {Albuquerque, New Mexico},
  author = {Yiqin Gao and Louis-Claude Canon and Yves Robert and Frederic Vivien}
}

% Journal article (IJHPCA 34).
@article{1315,
  title = {Scheduling Independent Stochastic Tasks under Deadline and Budget Constraints},
  journal = {International Journal of High Performance Computing Applications},
  volume = {34},
  year = {2019},
  month = jun,
  pages = {246--264},
  abstract = {This article discusses scheduling strategies for the problem of maximizing the expected number of tasks that can be executed on a cloud platform within a given budget and under a deadline constraint. The execution times of tasks follow independent and identically distributed probability laws. The main questions are how many processors to enroll and whether and when to interrupt tasks that have been executing for some time. 
We provide complexity results and an asymptotically optimal strategy for the problem instance with discrete probability distributions and without deadline. We extend the latter strategy for the general case with continuous distributions and a deadline and we design an efficient heuristic which is shown to outperform standard approaches when running simulations for a variety of useful distribution laws.},
  doi = {10.1177/1094342019852135},
  author = {Louis-Claude Canon and Aur{\'e}lie Kong Win Chang and Yves Robert and Frederic Vivien}
}

% Journal article (Proceedings of the IEEE 106).
@article{1268,
  title = {Autotuning in High-Performance Computing Applications},
  journal = {Proceedings of the IEEE},
  volume = {106},
  year = {2018},
  month = nov,
  pages = {2068--2083},
  abstract = {Autotuning refers to the automatic generation of a search space of possible implementations of a computation that are evaluated through models and/or empirical measurement to identify the most desirable implementation. Autotuning has the potential to dramatically improve the performance portability of petascale and exascale applications. To date, autotuning has been used primarily in high-performance applications through tunable libraries or previously tuned application code that is integrated directly into the application. This paper draws on the authors{\textquoteright} extensive experience applying autotuning to high-performance applications, describing both successes and future challenges. If autotuning is to be widely used in the HPC community, researchers must address the software engineering challenges, manage configuration overheads, and continue to demonstrate significant performance gains and portability across architectures. 
In particular, tools that configure the application must be integrated into the application build process so that tuning can be reapplied as the application and target architectures evolve.},
  keywords = {High-performance computing, performance tuning programming systems},
  doi = {10.1109/JPROC.2018.2841200},
  author = {Prasanna Balaprakash and Jack Dongarra and Todd Gamblin and Mary Hall and Jeffrey Hollingsworth and Boyana Norris and Richard Vuduc}
}

% Technical report: the issuing body goes in `institution` (a required field) and the
% report series in `type`; standard styles ignore `journal` and `publisher` for this entry type.
@techreport{1194,
  title = {Bidiagonal SVD Computation via an Associated Tridiagonal Eigenproblem},
  type = {LAPACK Working Note},
  number = {LAWN 295, ICL-UT-18-02},
  year = {2018},
  month = {2018-04},
  institution = {University of Tennessee},
  abstract = {In this paper, we present an algorithm for the singular value decomposition (SVD) of a bidiagonal matrix by means of the eigenpairs of an associated symmetric tridiagonal matrix. The algorithm is particularly suited for the computation of a subset of singular values and corresponding vectors. We focus on a sequential version of the algorithm, and discuss special cases and implementation details. We use a large set of bidiagonal matrices to assess the accuracy of the implementation in single and double precision, as well as to identify potential shortcomings. We show that the algorithm can be up to three orders of magnitude faster than existing algorithms, which are limited to the computation of a full SVD. We also show time comparisons of an implementation that uses the strategy discussed in the paper as a building block for the computation of the SVD of general matrices.},
  author = {Osni Marques and James Demmel and Paulo B. Vasconcelos}
}

@article{1211,
  title = {Big Data and Extreme-Scale Computing: Pathways to Convergence - Toward a Shaping Strategy for a Future Software and Data Ecosystem for Scientific Inquiry},
  journal = {The International Journal of High Performance Computing Applications},
  volume = {32},
  year = {2018},
  month = {2018-07},
  pages = {435--479},
  abstract = {Over the past four years, the Big Data and Exascale Computing (BDEC) project organized a series of five international workshops that aimed to explore the ways in which the new forms of data-centric discovery introduced by the ongoing revolution in high-end data analysis (HDA) might be integrated with the established, simulation-centric paradigm of the high-performance computing (HPC) community. Based on those meetings, we argue that the rapid proliferation of digital data generators, the unprecedented growth in the volume and diversity of the data they generate, and the intense evolution of the methods for analyzing and using that data are radically reshaping the landscape of scientific computing. The most critical problems involve the logistics of wide-area, multistage workflows that will move back and forth across the computing continuum, between the multitude of distributed sensors, instruments and other devices at the networks edge, and the centralized resources of commercial clouds and HPC centers. We suggest that the prospects for the future integration of technological infrastructures and research ecosystems need to be considered at three different levels. First, we discuss the convergence of research applications and workflows that establish a research paradigm that combines both HPC and HDA, where ongoing progress is already motivating efforts at the other two levels.
Second, we offer an account of some of the problems involved with creating a converged infrastructure for peripheral environments, that is, a shared infrastructure that can be deployed throughout the network in a scalable manner to meet the highly diverse requirements for processing, communication, and buffering/storage of massive data workflows of many different scientific domains. Third, we focus on some opportunities for software ecosystem convergence in big, logically centralized facilities that execute large-scale simulations and models and/or perform large-scale data analytics. We close by offering some conclusions and recommendations for future investment and policy review.},
  doi = {10.1177/1094342018778123},
  author = {Mark Asch and Terry Moore and Rosa M. Badia and Micah Beck and Pete Beckman and Thierry Bidot and Fran{\c c}ois Bodin and Franck Cappello and Alok Choudhary and Bronis R. de Supinski and Ewa Deelman and Jack Dongarra and Anshu Dubey and Geoffrey Fox and Haohuan Fu and Sergi Girona and Michael Heroux and Yutaka Ishikawa and Kate Keahey and David Keyes and William T. Kramer and Jean-Fran{\c c}ois Lavignon and Yutong Lu and Satoshi Matsuoka and Bernd Mohr and St{\'e}phane Requena and Joel Saltz and Thomas Schulthess and Rick Stevens and Martin Swany and Alexander Szalay and William Tang and Ga{\"e}l Varoquaux and Jean-Pierre Vilotte and Robert W. Wisniewski and Zhiwei Xu and Igor Zacharov}
}

@article{1187,
  title = {Checkpointing Workflows for Fail-Stop Errors},
  journal = {IEEE Transactions on Computers},
  volume = {67},
  year = {2018},
  month = {2018-08},
  pages = {1105--1120},
  abstract = {We consider the problem of orchestrating the execution of workflow applications structured as Directed Acyclic Graphs (DAGs) on parallel computing platforms that are subject to fail-stop failures. The objective is to minimize expected overall execution time, or makespan. A solution to this problem consists of a schedule of the workflow tasks on the available processors and of a decision of which application data to checkpoint to stable storage, so as to mitigate the impact of processor failures. To address this challenge, we consider a restricted class of graphs, Minimal Series-Parallel Graphs (M-SPGS), which is relevant to many real-world workflow applications. For this class of graphs, we propose a recursive list-scheduling algorithm that exploits the M-SPG structure to assign sub-graphs to individual processors, and uses dynamic programming to decide how to checkpoint these sub-graphs. We assess the performance of our algorithm for production workflow configurations, comparing it to an approach in which all application data is checkpointed and an approach in which no application data is checkpointed. Results demonstrate that our algorithm outperforms both the former approach, because of lower checkpointing overhead, and the latter approach, because of better resilience to failures.},
  keywords = {checkpoint, fail-stop error, resilience, workflow},
  url = {http://ieeexplore.ieee.org/document/8279499/},
  author = {Li Han and Louis-Claude Canon and Henri Casanova and Yves Robert and Frederic Vivien}
}

% Technical report: the issuing body goes in `institution` (a required field) and the
% report series in `type`; standard styles ignore `journal` and `publisher` for this entry type.
@techreport{1397,
  title = {A Collection of White Papers from the BDEC2 Workshop in Bloomington, IN},
  type = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-18-15},
  year = {2018},
  month = {2018-11},
  institution = {University of Tennessee, Knoxville},
  author = {James Ahrens and Christopher M. Biwer and Alexandru Costan and Gabriel Antoniu and Maria S. P{\'e}rez and Nenad Stojanovic and Rosa Badia and Oliver Beckstein and Geoffrey Fox and Shantenu Jha and Micah Beck and Terry Moore and Sunita Chandrasekaran and Carlos Costa and Thierry Deutsch and Luigi Genovese and Tarek El-Ghazawi and Ian Foster and Dennis Gannon and Toshihiro Hanawa and Tevfik Kosar and William Kramer and Madhav V. Marathe and Christopher L.
Barrett and Takemasa Miyoshi and Alex Pothen and Ariful Azad and Judy Qiu and Bo Peng and Ravi Teja and Sahil Tyagi and Chathura Widanage and Jon Koskey and Maryam Rahnemoonfar and Umakishore Ramachandran and Miles Deegan and William Tang and Osamu Tatebe and Michela Taufer and Michel Cuende and Ewa Deelman and Trilce Estrada and Rafael Ferreira Da Silva and Harrel Weinstein and Rodrigo Vargas and Miwako Tsuji and Kevin G. Yager and Wanling Gao and Jianfeng Zhan and Lei Wang and Chunjie Luo and Daoyi Zheng and Xu Wen and Rui Ren and Chen Zheng and Xiwen He and Hainan Ye and Haoning Tang and Zheng Cao and Shujie Zhang and Jiahui Dai}
}

% The proceedings title belongs in `booktitle` (a required field for this entry type);
% standard styles ignore `journal` here.
@inproceedings{1306,
  title = {Evaluating Contexts in OpenSHMEM-X Reference Implementation},
  booktitle = {OpenSHMEM and Related Technologies. Big Compute and Big Data Convergence},
  year = {2018},
  pages = {50--62},
  publisher = {Springer International Publishing},
  address = {Cham},
  abstract = {Many-core processors are now ubiquitous in supercomputing. This evolution pushes toward the adoption of mixed models in which cores are exploited with threading models (and related programming abstractions, such as OpenMP), while communication between distributed memory domains employ a communication Application Programming Interface (API). OpenSHMEM is a partitioned global address space communication specification that exposes one-sided and synchronization operations. As the threaded semantics of OpenSHMEM are being fleshed out by its standardization committee, it is important to assess the soundness of the proposed concepts. This paper implements and evaluate the {\textquoteleft}{\textquoteleft}context{\textquoteright}{\textquoteright} extension in relation to threaded operations. We discuss the implementation challenges of the context and the associated API in OpenSHMEM-X. We then evaluate its performance in threaded situations on the Infiniband network using micro-benchmarks and the Random Access benchmark and see that adding communication contexts significantly improves message rate achievable by the executing multi-threaded PEs.},
  isbn = {978-3-319-73814-7},
  doi = {10.1007/978-3-319-73814-7_4},
  author = {Aurelien Bouteiller and Pophale, Swaroop and Swen Boehm and Baker, Matthew B. and Manjunath Gorentla Venkata},
  editor = {Manjunath Gorentla Venkata and Imam, Neena and Pophale, Swaroop}
}

@conference{1215,
  title = {A Generic Approach to Scheduling and Checkpointing Workflows},
  booktitle = {The 47th International Conference on Parallel Processing (ICPP 2018)},
  year = {2018},
  month = {2018-08},
  publisher = {IEEE Computer Society Press},
  organization = {IEEE Computer Society Press},
  address = {Eugene, OR},
  abstract = {This work deals with scheduling and checkpointing strategies to execute scientific workflows on failure-prone large-scale platforms. To the best of our knowledge, this work is the first to target fail-stop errors for arbitrary workflows. Most previous work addresses soft errors, which corrupt the task being executed by a processor but do not cause the entire memory of that processor to be lost, contrarily to fail-stop errors. We revisit classical mapping heuristics such as HEFT and MinMin and complement them with several checkpointing strategies. The objective is to derive an efficient trade-off between checkpointing every task (CkptAll), which is an overkill when failures are rare events, and checkpointing no task (CkptNone), which induces dramatic re-execution overhead even when only a few failures strike during execution. Contrarily to previous work, our approach applies to arbitrary workflows, not just special classes of dependence graphs such as M-SPGs (Minimal Series-Parallel Graphs).
Extensive experiments report significant gain over both CkptAll and CkptNone, for a wide variety of workflows.},
  author = {Li Han and Valentin Le F{\`e}vre and Louis-Claude Canon and Yves Robert and Frederic Vivien}
}

% Technical report: the issuing body goes in `institution` (a required field) and the
% report series in `type`; standard styles ignore `journal` and `publisher` for this entry type.
@techreport{1207,
  title = {Initial Integration and Evaluation of SLATE Parallel BLAS in LATTE},
  type = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-18-07},
  year = {2018},
  month = {2018-06},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  author = {Asim YarKhan and Gerald Ragghianti and Jack Dongarra and Marc Cawkwell and Danny Perez and Arthur Voter}
}

% NOTE(review): volume equals the year in entry 1199, which looks like an export artifact; confirm against the publisher record.
@article{1199,
  title = {Investigating Power Capping toward Energy-Efficient Scientific Applications},
  journal = {Concurrency Computation: Practice and Experience},
  volume = {2018},
  year = {2018},
  month = {2018-04},
  pages = {1--14},
  abstract = {The emergence of power efficiency as a primary constraint in processor and system design poses new challenges concerning power and energy awareness for numerical libraries and scientific applications. Power consumption also plays a major role in the design of data centers, which may house petascale or exascale-level computing systems. At these extreme scales, understanding and improving the energy efficiency of numerical libraries and their related applications becomes a crucial part of the successful implementation and operation of the computing system. In this paper, we study and investigate the practice of controlling a compute system{\textquoteright}s power usage, and we explore how different power caps affect the performance of numerical algorithms with different computational intensities. Further, we determine the impact, in terms of performance and energy usage, that these caps have on a system running scientific applications. This analysis will enable us to characterize the types of algorithms that benefit most from these power management schemes. Our experiments are performed using a set of representative kernels and several popular scientific benchmarks. We quantify a number of power and performance measurements and draw observations and conclusions that can be viewed as a roadmap to achieving energy efficiency in the design and execution of scientific algorithms.},
  keywords = {energy efficiency, High Performance Computing, Intel Xeon Phi, Knights landing, papi, performance analysis, Performance Counters, power efficiency},
  doi = {10.1002/cpe.4485},
  url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4485},
  author = {Azzam Haidar and Heike Jagode and Phil Vaccaro and Asim YarKhan and Stanimire Tomov and Jack Dongarra}
}

@article{1231,
  title = {A Survey of MPI Usage in the US Exascale Computing Project},
  journal = {Concurrency Computation: Practice and Experience},
  year = {2018},
  month = {2018-09},
  type = {Special Issue},
  abstract = {The Exascale Computing Project (ECP) is currently the primary effort in the United States focused on developing {\textquotedblleft}exascale{\textquotedblright} levels of computing capabilities, including hardware, software, and applications. In order to obtain a more thorough understanding of how the software projects under the ECP are using, and planning to use the Message Passing Interface (MPI), and help guide the work of our own project within the ECP, we created a survey. Of the 97 ECP projects active at the time the survey was distributed, we received 77 responses, 56 of which reported that their projects were using MPI. This paper reports the results of that survey for the benefit of the broader community of MPI developers.},
  keywords = {exascale, MPI},
  doi = {10.1002/cpe.4851},
  author = {David E. Bernholdt and Swen Boehm and George Bosilca and Manjunath Gorentla Venkata and Ryan E. Grant and Thomas Naughton and Howard P. Pritchard and Martin Schulz and Geoffroy R.
Vallee}
}

% Technical report: the issuing body goes in `institution` (a required field) and the
% report series in `type`; standard styles ignore `journal` and `publisher` for this entry type.
@techreport{1280,
  title = {Tensor Contraction on Distributed Hybrid Architectures using a Task-Based Runtime System},
  type = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-18-13},
  year = {2018},
  month = {2018-12},
  institution = {University of Tennessee},
  abstract = {The needs for predictive simulation of electronic structure in chemistry and materials science calls for fast/reduced-scaling formulations of quantum n-body methods that replace the traditional dense tensors with element-, block-, rank-, and block-rank-sparse (data-sparse) tensors. The resulting, highly irregular data structures are a poor match to imperative, bulk-synchronous parallel programming style due to the dynamic nature of the problem and to the lack of clear domain decomposition to guarantee a fair load-balance. TESSE runtime and the associated programming model aim to support performance-portable composition of applications involving irregular and dynamically changing data. In this paper we report an implementation of irregular dense tensor contraction in a paradigmatic electronic structure application based on the TESSE extension of PaRSEC, a distributed hybrid task runtime system, and analyze the resulting performance on a distributed memory cluster of multi-GPU nodes. Unprecedented strong scaling and promising efficiency indicate a viable future for task-based programming of complete production-quality reduced scaling models of electronic structure.},
  author = {George Bosilca and Damien Genet and Robert Harrison and Thomas Herault and Mohammad Mahdi Javanmard and Chong Peng and Edward Valeev}
}

@conference{1099,
  title = {Assuming failure independence: are we right to be wrong?},
  booktitle = {The 3rd International Workshop on Fault Tolerant Systems (FTS)},
  year = {2017},
  month = {2017-09},
  publisher = {IEEE},
  organization = {IEEE},
  address = {Honolulu, Hawaii},
  abstract = {This paper revisits the failure temporal independence hypothesis which is omnipresent in the analysis of resilience methods for HPC. We explain why a previous approach is incorrect, and we propose a new method to detect failure cascades, i.e., series of non-independent consecutive failures. We use this new method to assess whether public archive failure logs contain failure cascades. Then we design and compare several cascade-aware checkpointing algorithms to quantify the maximum gain that could be obtained, and we report extensive simulation results with archive and synthetic failure logs. Altogether, there are a few logs that contain cascades, but we show that the gain that can be achieved from this knowledge is not significant. The conclusion is that we can wrongly, but safely, assume failure independence!},
  author = {Guillaume Aupy and Yves Robert and Frederic Vivien}
}

@conference{1098,
  title = {Checkpointing Workflows for Fail-Stop Errors},
  booktitle = {IEEE Cluster},
  year = {2017},
  month = {2017-09},
  publisher = {IEEE},
  organization = {IEEE},
  address = {Honolulu, Hawaii},
  abstract = {We consider the problem of orchestrating the execution of workflow applications structured as Directed Acyclic Graphs (DAGs) on parallel computing platforms that are subject to fail-stop failures. The objective is to minimize expected overall execution time, or makespan.
A solution to this problem consists of a schedule of the workflow tasks on the available processors and of a decision of which application data to checkpoint to stable storage, so as to mitigate the impact of processor failures. For general DAGs this problem is hopelessly intractable. In fact, given a solution, computing its expected makespan is still a difficult problem. To address this challenge, we consider a restricted class of graphs, Minimal Series-Parallel Graphs (M-SPGS). It turns out that many real-world workflow applications are naturally structured as M-SPGS. For this class of graphs, we propose a recursive list-scheduling algorithm that exploits the M-SPG structure to assign sub-graphs to individual processors, and uses dynamic programming to decide which tasks in these sub-graphs should be checkpointed. Furthermore, it is possible to efficiently compute the expected makespan for the solution produced by this algorithm, using a first-order approximation of task weights and existing evaluation algorithms for 2-state probabilistic DAGs. We assess the performance of our algorithm for production workflow configurations, comparing it to (i) an approach in which all application data is checkpointed, which corresponds to the standard way in which most production workflows are executed today; and (ii) an approach in which no application data is checkpointed. Our results demonstrate that our algorithm strikes a good compromise between these two approaches, leading to lower checkpointing overhead than the former and to better resilience to failure than the latter.},
  author = {Li Han and Louis-Claude Canon and Henri Casanova and Yves Robert and Frederic Vivien}
}

% The DOI is stored bare: no "DOI:" label and no resolver prefix.
@conference{1168,
  title = {The Design and Performance of Batched BLAS on Modern High-Performance Computing Systems},
  booktitle = {International Conference on Computational Science (ICCS 2017)},
  year = {2017},
  month = {2017-06},
  publisher = {Elsevier},
  organization = {Elsevier},
  address = {Z{\"u}rich, Switzerland},
  abstract = {A current trend in high-performance computing is to decompose a large linear algebra problem into batches containing thousands of smaller problems, that can be solved independently, before collating the results. To standardize the interface to these routines, the community is developing an extension to the BLAS standard (the batched BLAS), enabling users to perform thousands of small BLAS operations in parallel whilst making efficient use of their hardware. We discuss the benefits and drawbacks of the current batched BLAS proposals and perform a number of experiments, focusing on a general matrix-matrix multiplication (GEMM), to explore their affect on the performance. In particular we analyze the effect of novel data layouts which, for example, interleave the matrices in memory to aid vectorization and prefetching of data. Utilizing these modifications our code outperforms both MKL and CuBLAS by up to 6 times on the self-hosted Intel KNL (codenamed Knights Landing) and Kepler GPU architectures, for large numbers of double precision GEMM operations using matrices of size 2 {\texttimes} 2 to 20 {\texttimes} 20.},
  keywords = {Batched BLAS, BLAS, High-performance computing, Memory management, Parallel processing, Scientific computing},
  doi = {10.1016/j.procs.2017.05.138},
  author = {Jack Dongarra and Sven Hammarling and Nicholas J.
Higham and Samuel Relton and Pedro Valero-Lara and Mawussi Zounon}
}

@conference{1100,
  title = {Efficient Communications in Training Large Scale Neural Networks},
  booktitle = {ACM MultiMedia Workshop 2017},
  year = {2017},
  month = {2017-10},
  publisher = {ACM},
  organization = {ACM},
  address = {Mountain View, CA},
  abstract = {We consider the problem of how to reduce the cost of communication that is required for the parallel training of a neural network. The state-of-the-art method, Bulk Synchronous Parallel Stochastic Gradient Descent (BSP-SGD), requires many collective communication operations, like broadcasts of parameters or reductions for sub-gradient aggregations, which for large messages quickly dominates overall execution time and limits parallel scalability. To address this problem, we develop a new technique for collective operations, referred to as Linear Pipelining (LP). It is tuned to the message sizes that arise in BSP-SGD, and works effectively on multi-GPU systems. Theoretically, the cost of LP is invariant to P, where P is the number of GPUs, while the cost of more conventional Minimum Spanning Tree (MST) scales like O(logP). LP also demonstrate up to 2x faster bandwidth than Bidirectional Exchange (BE) techniques that are widely adopted by current MPI implementations. We apply these collectives to BSP-SGD, showing that the proposed implementations reduce communication bottlenecks in practice while preserving the attractive convergence properties of BSP-SGD.},
  author = {Yiyang Zhao and Linnan Wan and Wei Wu and George Bosilca and Richard Vuduc and Jinmian Ye and Wenqi Tang and Zenglin Xu}
}

% Technical report: the issuing body goes in `institution` (a required field) and the
% report series in `type`; standard styles ignore `journal` and `publisher` for this entry type.
@techreport{1171,
  title = {POMPEI: Programming with OpenMP4 for Exascale Investigations},
  type = {Innovative Computing Laboratory Technical Report},
  number = {ICL-UT-17-09},
  year = {2017},
  month = {2017-12},
  institution = {University of Tennessee},
  abstract = {The objective of the Programming with OpenMP4 for Exascale Investigations (POMPEI) project is to explore new task-based programming techniques together with data structure centric programming for scientific applications to harness the potential of extreme-scale systems. Tasking is a well established by now approach on such systems as it has been used successfully to handle their large-scale parallelism and heterogeneity, which are leading challenges on the way to exascale computing. The approach is to harness the latest features of OpenMP4.5 and OpenACC2.5 to design abstractions shared among tasks and mapped efficiently to data-structure driven programming paradigms.
This technical report describes the approach, along with its reference implementation and results for dense linear algebra algorithms.},
  author = {Jack Dongarra and Azzam Haidar and Oscar Hernandez and Stanimire Tomov and Manjunath Gorentla Venkata}
}

@conference{1134,
  title = {Power-aware Computing: Measurement, Control, and Performance Analysis for Intel Xeon Phi},
  booktitle = {2017 IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}17), Best Paper Finalist},
  year = {2017},
  month = {2017-09},
  publisher = {IEEE},
  organization = {IEEE},
  address = {Waltham, MA},
  abstract = {The emergence of power efficiency as a primary constraint in processor and system designs poses new challenges concerning power and energy awareness for numerical libraries and scientific applications. Power consumption also plays a major role in the design of data centers in particular for peta- and exa- scale systems. Understanding and improving the energy efficiency of numerical simulation becomes very crucial. We present a detailed study and investigation toward controlling power usage and exploring how different power caps affect the performance of numerical algorithms with different computational intensities, and determine the impact and correlation with performance of scientific applications. Our analyses is performed using a set of representatives kernels, as well as many highly used scientific benchmarks. We quantify a number of power and performance measurements, and draw observations and conclusions that can be viewed as a roadmap toward achieving energy efficiency computing algorithms.},
  doi = {10.1109/HPEC.2017.8091085},
  author = {Azzam Haidar and Heike Jagode and Asim YarKhan and Phil Vaccaro and Stanimire Tomov and Jack Dongarra}
}

% Booth presentation with no journal: typed as misc, with the venue in `howpublished`
% so that standard styles print it.
@misc{1338,
  title = {Power-Aware HPC on Intel Xeon Phi KNL Processors},
  howpublished = {ISC High Performance (ISC17), Intel Booth Presentation},
  year = {2017},
  month = {2017-06},
  address = {Frankfurt, Germany},
  author = {Azzam Haidar and Heike Jagode and Asim YarKhan and Phil Vaccaro and Stanimire Tomov and Jack Dongarra}
}

@conference{941,
  title = {GPU-Aware Non-contiguous Data Movement In Open MPI},
  booktitle = {25th International Symposium on High-Performance Parallel and Distributed Computing (HPDC{\textquoteright}16)},
  year = {2016},
  month = {2016-06},
  publisher = {ACM},
  organization = {ACM},
  address = {Kyoto, Japan},
  abstract = {

Due to better parallel density and power efficiency, GPUs have become more popular for use in scientific applications. Many of these applications are based on the ubiquitous Message Passing Interface (MPI) programming paradigm, and take advantage of non-contiguous memory layouts to exchange data between processes. However, support for efficient non-contiguous data movements for GPU-resident data is still in its infancy, imposing a negative impact on the overall application performance.

To address this shortcoming, we present a solution where we take advantage of the inherent parallelism in the datatype packing and unpacking operations. We developed a close integration between Open MPI{\textquoteright}s stack-based datatype engine, NVIDIA{\textquoteright}s Unified Memory Architecture and GPUDirect capabilities. In this design the datatype packing and unpacking operations are offloaded onto the GPU and handled by specialized GPU kernels, while the CPU remains the driver for data movements between nodes. By incorporating our design into the Open MPI library we have shown significantly better performance for non-contiguous GPU-resident data transfers on both shared and distributed memory machines.

},
  keywords = {datatype, gpu, hybrid architecture, MPI, non-contiguous data},
  doi = {10.1145/2907294.2907317},
  author = {Wei Wu and George Bosilca and Rolf vandeVaart and Sylvain Jeaugey and Jack Dongarra}
}

% Conference talk with no journal: typed as misc, with the venue in `howpublished`
% so that standard styles print it.
@misc{1343,
  title = {A Standard for Batched BLAS Routines},
  howpublished = {17th SIAM Conference on Parallel Processing for Scientific Computing (SIAM PP16)},
  year = {2016},
  month = {2016-04},
  address = {Paris, France},
  author = {Pedro Valero-Lara and Jack Dongarra and Azzam Haidar and Samuel D. Relton and Stanimire Tomov and Mawussi Zounon}
}

% The proceedings title belongs in `booktitle` (a required field for this entry type);
% standard styles ignore `journal` here.
@inproceedings{1308,
  title = {Surviving Errors with OpenSHMEM},
  booktitle = {OpenSHMEM and Related Technologies. Enhancing OpenSHMEM for Hybrid Environments},
  year = {2016},
  pages = {66--81},
  publisher = {Springer International Publishing},
  address = {Baltimore, MD, USA},
  abstract = {Unexpected error conditions stem from a variety of underlying causes, including resource exhaustion, network failures, hardware failures, or program errors. As the scale of HPC systems continues to grow, so does the probability of encountering a condition that causes a failure; meanwhile, error recovery and run-through failure management are becoming mature, and interoperable HPC programming paradigms are beginning to feature advanced error management. As a result from these developments, it becomes increasingly desirable to gracefully handle error conditions in OpenSHMEM.
In this paper, we present the design and rationale behind an extension of the OpenSHMEM API that can (1) notify user code of unexpected erroneous conditions, (2) permit customized user response to errors without incurring overhead on an error-free execution path, (3) propagate the occurrence of an error condition to all Processing Elements, and (4) consistently close the erroneous epoch in order to resume the application.},
  isbn = {978-3-319-50995-2},
  author = {Aurelien Bouteiller and George Bosilca and Manjunath Gorentla Venkata},
  editor = {Manjunath Gorentla Venkata and Imam, Neena and Pophale, Swaroop and Mintz, Tiffany M.}
}

% The proceedings title belongs in `booktitle` (a required field for this entry type);
% standard styles ignore `journal` here.
@inproceedings{1310,
  title = {From MPI to OpenSHMEM: Porting LAMMPS},
  booktitle = {OpenSHMEM and Related Technologies. Experiences, Implementations, and Technologies},
  year = {2015},
  pages = {121--137},
  publisher = {Springer International Publishing},
  address = {Annapolis, MD, USA},
  abstract = {This work details the opportunities and challenges of porting a Petascale, MPI-based application ---LAMMPS--- to OpenSHMEM. We investigate the major programming challenges stemming from the differences in communication semantics, address space organization, and synchronization operations between the two programming models. This work provides several approaches to solve those challenges for representative communication patterns in LAMMPS, e.g., by considering group synchronization, peer{\textquoteright}s buffer status tracking, and unpacked direct transfer of scattered data. The performance of LAMMPS is evaluated on the Titan HPC system at ORNL. The OpenSHMEM implementations are compared with MPI versions in terms of both strong and weak scaling. The results outline that OpenSHMEM provides a rich semantic to implement scalable scientific applications. In addition, the experiments demonstrate that OpenSHMEM can compete with, and often improve on, the optimized MPI implementation.},
  isbn = {978-3-319-26428-8},
  doi = {10.1007/978-3-319-26428-8_8},
  author = {Tang, Chunyan and Aurelien Bouteiller and Thomas Herault and Manjunath Gorentla Venkata and George Bosilca},
  editor = {Manjunath Gorentla Venkata and Shamis, Pavel and Imam, Neena and M. Graham Lopez}
}

@inproceedings{1309,
  title = {UCX: An Open Source Framework for HPC Network APIs and Beyond},
  booktitle = {2015 IEEE 23rd Annual Symposium on High-Performance Interconnects},
  year = {2015},
  month = {Aug},
  pages = {40--43},
  publisher = {IEEE},
  address = {Santa Clara, CA, USA},
  abstract = {This paper presents Unified Communication X (UCX), a set of network APIs and their implementations for high throughput computing. UCX comes from the combined effort of national laboratories, industry, and academia to design and implement a high-performing and highly-scalable network stack for next generation applications and systems. UCX design provides the ability to tailor its APIs and network functionality to suit a wide variety of application domains and hardware. We envision these APIs to satisfy the networking needs of many programming models such as Message Passing Interface (MPI), OpenSHMEM, Partitioned Global Address Space (PGAS) languages, task-based paradigms and I/O bound applications. To evaluate the design we implement the APIs and protocols, and measure the performance of overhead-critical network primitives fundamental for implementing many parallel programming models and system libraries. Our results show that the latency, bandwidth, and message rate achieved by the portable UCX prototype is very close to that of the underlying driver. With UCX, we achieved a message exchange latency of 0.89 us, a bandwidth of 6138.5 MB/s, and a message rate of 14 million messages per second.
As far as we know, this is the highest bandwidth and message rate achieved by any network stack (publicly known) on this hardware.}, keywords = {application program interfaces, Bandwidth, Electronics packaging, Hardware, high throughput computing, highly-scalable network stack, HPC, HPC network APIs, I/O bound applications, Infiniband, input-output programs, Libraries, Memory management, message passing, message passing interface, Middleware, MPI, open source framework, OpenSHMEM, parallel programming, parallel programming models, partitioned global address space languages, PGAS, PGAS languages, Programming, protocols, public domain software, RDMA, system libraries, task-based paradigms, UCX, Unified Communication X}, isbn = {978-1-4673-9160-3}, doi = {10.1109/HOTI.2015.13}, author = {P. Shamis and Manjunath Gorentla Venkata and M. Graham Lopez and M. B. Baker and O. Hernandez and Y. Itigin and M. Dubman and G. Shainer and R. L. Graham and L. Liss and Y. Shahar and S. Potluri and D. Rossetti and D. Becker and D. Poole and C. Lamb and S. Kumar and C. Stunkel and George Bosilca and Aurelien Bouteiller} } @article {icl:702, title = {BlackjackBench: Portable Hardware Characterization with Automated Results Analysis}, journal = {The Computer Journal}, year = {2013}, month = {2013-03}, abstract = {DARPA{\textquoteright}s AACE project aimed to develop Architecture Aware Compiler Environments. Such a compiler automatically characterizes the targeted hardware and optimizes the application codes accordingly. We present the BlackjackBench suite, a collection of portable micro-benchmarks that automate system characterization, plus statistical analysis techniques for interpreting the results. The BlackjackBench benchmarks discover the effective sizes and speeds of the hardware environment rather than the often unattainable peak values. We aim at hardware characteristics that can be observed by running executables generated by existing compilers from standard C codes. 
We characterize the memory hierarchy, including cache sharing and non-uniform memory access characteristics of the system, properties of the processing cores affecting the instruction execution speed and the length of the operating system scheduler time slot. We show how these features of modern multicores can be discovered programmatically. We also show how the features could potentially interfere with each other resulting in incorrect interpretation of the results, and how established classification and statistical analysis techniques can reduce experimental noise and aid automatic interpretation of results. We show how effective hardware metrics from our probes allow guided tuning of computational kernels that outperform an autotuning library further tuned by the hardware vendor.}, keywords = {hardware characterization, micro-benchmarks, statistical analysis}, doi = {10.1093/comjnl/bxt057}, author = {Anthony Danalis and Piotr Luszczek and Gabriel Marin and Jeffrey Vetter and Jack Dongarra} } @techreport {684, title = {On the Combination of Silent Error Detection and Checkpointing}, journal = {UT-CS-13-710}, year = {2013}, month = {2013-06}, publisher = {University of Tennessee Computer Science Technical Report}, abstract = {In this paper, we revisit traditional checkpointing and rollback recovery strategies, with a focus on silent data corruption errors. Contrarily to fail-stop failures, such latent errors cannot be detected immediately, and a mechanism to detect them must be provided. We consider two models: (i) errors are detected after some delays following a probability distribution (typically, an Exponential distribution); (ii) errors are detected through some verification mechanism. In both cases, we compute the optimal period in order to minimize the waste, i.e., the fraction of time where nodes do not perform useful computations. 
In practice, only a fixed number of checkpoints can be kept in memory, and the first model may lead to an irrecoverable failure. In this case, we compute the minimum period required for an acceptable risk. For the second model, there is no risk of irrecoverable failure, owing to the verification mechanism, but the corresponding overhead is included in the waste. Finally, both models are instantiated using realistic scenarios and application/architecture parameters.}, keywords = {checkpointing, error recovery, High-performance computing, silent data corruption, verification}, url = {http://www.netlib.org/lapack/lawnspdf/lawn278.pdf}, author = {Guillaume Aupy and Anne Benoit and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @conference {687, title = {Diagnosis and Optimization of Application Prefetching Performance}, booktitle = {Proceedings of the 27th ACM International Conference on Supercomputing (ICS {\textquoteright}13)}, year = {2013}, month = {2013-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Eugene, Oregon, USA}, abstract = {Hardware prefetchers are effective at recognizing streaming memory access patterns and at moving data closer to the processing units to hide memory latency. However, hardware prefetchers can track only a limited number of data streams due to finite hardware resources. In this paper, we introduce the term streaming concurrency to characterize the number of parallel, logical data streams in an application. We present a simulation algorithm for understanding the streaming concurrency at any point in an application, and we show that this metric is a good predictor of the number of memory requests initiated by streaming prefetchers. Next, we try to understand the causes behind poor prefetching performance. We identified four prefetch unfriendly conditions and we show how to classify an application{\textquoteright}s memory references based on these conditions. 
We evaluated our analysis using the SPEC CPU2006 benchmark suite. We selected two benchmarks with unfavorable access patterns and transformed them to improve their prefetching effectiveness. Results show that making applications more prefetcher friendly can yield meaningful performance gains.}, isbn = {9781450321303}, doi = {10.1145/2464996.2465014}, url = {http://dl.acm.org/citation.cfm?doid=2464996.2465014}, author = {Gabriel Marin and Colin McCurdy and Jeffrey Vetter}, editor = {Allen D. Malony and Nemirovsky, Mario and Midkiff, Sam} } @inbook {762, title = {Keeneland: Computational Science Using Heterogeneous GPU Computing}, booktitle = {Contemporary High Performance Computing: From Petascale Toward Exascale}, series = {CRC Computational Science Series}, year = {2013}, publisher = {Taylor and Francis}, organization = {Taylor and Francis}, chapter = {7}, address = {Boca Raton, FL}, abstract = {The Keeneland Project is a five year Track 2D grant awarded by the National Science Foundation (NSF) under solicitation NSF 08-573 in August 2009 for the development and deployment of an innovative high performance computing system. The Keeneland project is led by the Georgia Institute of Technology (Georgia Tech) in collaboration with the University of Tennessee at Knoxville, National Institute of Computational Sciences, and Oak Ridge National Laboratory.}, author = {Jeffrey Vetter and Richard Glassbrook and Karsten Schwan and Sudha Yalamanchili and Mitch Horton and Ada Gavrilovska and Magda Slawinska and Jack Dongarra and Jeremy Meredith and Philip Roth and Kyle Spafford and Stanimire Tomov and John Wynkoop} } @article {748, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {Concurrency and Computation: Practice and Experience}, year = {2013}, month = {2013-11}, abstract = {In this paper, we present a unified model for several well-known checkpoint/restart protocols. 
The proposed model is generic enough to encompass both extremes of the checkpoint/restart space, from coordinated approaches to a variety of uncoordinated checkpoint strategies (with message logging). We identify a set of crucial parameters, instantiate them, and compare the expected efficiency of the fault tolerant protocols, for a given application/platform pair. We then propose a detailed analysis of several scenarios, including some of the most powerful currently available high performance computing platforms, as well as anticipated Exascale designs. The results of this analytical comparison are corroborated by a comprehensive set of simulations. Altogether, they outline comparative behaviors of checkpoint strategies at very large scale, thereby providing insight that is hardly accessible to direct experimentation.}, doi = {10.1002/cpe.3173}, author = {George Bosilca and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @article {icl:684, title = {Divide and Conquer on Hybrid GPU-Accelerated Multicore Systems}, journal = {SIAM Journal on Scientific Computing}, volume = {34}, number = {2}, year = {2012}, month = {2012-04}, pages = {C70-C82}, keywords = {magma}, author = {Christof Voemel and Stanimire Tomov and Jack Dongarra} } @article {icl:706, title = {HPC Challenge: Design, History, and Implementation Highlights}, journal = {On the Road to Exascale Computing: Contemporary Architectures in High Performance Computing (to appear)}, year = {2012}, month = {2012-00}, publisher = {Chapman \& Hall/CRC Press}, author = {Jack Dongarra and Piotr Luszczek}, editor = {Jeffrey Vetter} } @techreport {icl:716, title = {Unified Model for Assessing Checkpointing Protocols at Extreme-Scale}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 269)}, number = {UT-CS-12-697}, year = {2012}, month = {2012-06}, author = {George Bosilca 
and Aurelien Bouteiller and Elisabeth Brunet and Franck Cappello and Jack Dongarra and Amina Guermouche and Thomas Herault and Yves Robert and Frederic Vivien and Dounia Zaidouni} } @inproceedings {icl:607, title = {3-D parallel frequency-domain visco-acoustic wave modelling based on a hybrid direct/iterative solver}, journal = {73rd EAGE Conference \& Exhibition incorporating SPE EUROPEC 2011, Vienna, Austria, 23-26 May}, year = {2011}, month = {2011-00}, author = {Azzam Haidar and Luc Giraud and Hafedh Ben-Hadj-Ali and Florent Sourbier and St{\'e}phane Operto and Jean Virieux} } @inproceedings {icl:591, title = {BlackjackBench: Hardware Characterization with Portable Micro-Benchmarks and Automatic Statistical Analysis of Results}, journal = {IEEE International Parallel and Distributed Processing Symposium (submitted)}, year = {2011}, month = {2011-05}, address = {Anchorage, AK}, author = {Anthony Danalis and Piotr Luszczek and Gabriel Marin and Jeffrey Vetter and Jack Dongarra} } @article {icl:643, title = {The International Exascale Software Project Roadmap}, journal = {International Journal of High Performance Computing}, volume = {25}, number = {1}, year = {2011}, month = {2011-01}, pages = {3-60}, abstract = {Over the last 20 years, the open-source community has provided more and more software on which the world{\textquoteright}s high-performance computing systems depend for performance and productivity. The community has invested millions of dollars and years of effort to build key components. However, although the investments in these separate software elements have been tremendously valuable, a great deal of productivity has also been lost because of the lack of planning, coordination, and key integration of technologies necessary to make them work together smoothly and efficiently, both within individual petascale systems and between different systems. 
It seems clear that this completely uncoordinated development model will not provide the software needed to support the unprecedented parallelism required for peta/exascale computation on millions of cores, or the flexibility required to exploit new hardware models and features, such as transactional memory, speculative execution, and graphics processing units. This report describes the work of the community to prepare for the challenges of exascale computing, ultimately combining their efforts in a coordinated International Exascale Software Project.}, doi = {10.1177/1094342010391989}, author = {Jack Dongarra and Pete Beckman and Terry Moore and Patrick Aerts and Giovanni Aloisio and Jean-Claude Andre and David Barkai and Jean-Yves Berthou and Taisuke Boku and Bertrand Braunschweig and Franck Cappello and Barbara Chapman and Xuebin Chi and Alok Choudhary and Sudip Dosanjh and Thom Dunning and Sandro Fiore and Al Geist and Bill Gropp and Robert Harrison and Mark Hereld and Michael Heroux and Adolfy Hoisie and Koh Hotta and Zhong Jin and Yutaka Ishikawa and Fred Johnson and Sanjay Kale and Richard Kenway and David Keyes and Bill Kramer and Jesus Labarta and Alain Lichnewsky and Thomas Lippert and Bob Lucas and Barney MacCabe and Satoshi Matsuoka and Paul Messina and Peter Michielse and Bernd Mohr and Matthias S. Mueller and Wolfgang E. Nagel and Hiroshi Nakashima and Michael E. 
Papka and Dan Reed and Mitsuhisa Sato and Ed Seidel and John Shalf and David Skinner and Marc Snir and Thomas Sterling and Rick Stevens and Fred Streitz and Bob Sugar and Shinji Sumimoto and William Tang and John Taylor and Rajeev Thakur and Anne Trefethen and Mateo Valero and Aad van der Steen and Jeffrey Vetter and Peg Williams and Robert Wisniewski and Kathy Yelick} } @article {vetter2011keeneland, title = {Keeneland: Bringing Heterogeneous GPU Computing to the Computational Science Community}, journal = {IEEE Computing in Science \& Engineering}, volume = {13}, year = {2011}, month = {2011-08}, pages = {90-95}, abstract = {The Keeneland project{\textquoteright}s goal is to develop and deploy an innovative, GPU-based high-performance computing system for the NSF computational science community.}, keywords = {Benchmark testing, Computational modeling, Computer architecture, Graphics processing unit, Hardware, Random access memory, Scientific computing}, doi = {10.1109/MCSE.2011.83}, author = {Jeffrey Vetter and Richard Glassbrook and Jack Dongarra and Karsten Schwan and Bruce Loftis and Stephen McNally and Jeremy Meredith and James Rogers and Philip Roth and Kyle Spafford and Sudhakar Yalamanchili} } @article {icl:603, title = {Three-dimensional parallel frequency-domain visco-acoustic wave modelling based on a hybrid direct/iterative solver.}, journal = {To appear in Geophysical Prospecting journal.}, year = {2011}, month = {2011-00}, author = {Florent Sourbier and Azzam Haidar and Luc Giraud and Hafedh Ben-Hadj-Ali and St{\'e}phane Operto and Jean Virieux} } @article {icl:639, title = {Divide \& Conquer on Hybrid GPU-Accelerated Multicore Systems}, journal = {SIAM Journal on Scientific Computing (submitted)}, year = {2010}, month = {2010-08}, keywords = {magma}, author = {Christof Voemel and Stanimire Tomov and Jack Dongarra} } @article {icl:568, title = {Accelerating Time-To-Solution for Computational Science and Engineering}, journal = {SciDAC Review}, year = 
{2009}, month = {2009-00}, author = {James Demmel and Jack Dongarra and Armando Fox and Sam Williams and Vasily Volkov and Katherine Yelick} } @inproceedings {icl:474, title = {A Holistic Approach for Performance Measurement and Analysis for Petascale Applications}, journal = {ICCS 2009 Joint Workshop: Tools for Program Development and Analysis in Computational Science and Software Engineering for Large-Scale Computing}, volume = {2009}, year = {2009}, month = {2009-05}, pages = {686-695}, publisher = {Springer-Verlag Berlin Heidelberg 2009}, address = {Baton Rouge, Louisiana}, keywords = {point, test}, author = {Heike Jagode and Jack Dongarra and Sadaf Alam and Jeffrey Vetter and W. Spear and Allen D. Malony}, editor = {Gabrielle Allen} } @article {icl:481, title = {The International Exascale Software Project: A Call to Cooperative Action by the Global High Performance Community}, journal = {International Journal of High Performance Computing Applications (to appear)}, year = {2009}, month = {2009-07}, author = {Jack Dongarra and Pete Beckman and Patrick Aerts and Franck Cappello and Thomas Lippert and Satoshi Matsuoka and Paul Messina and Terry Moore and Rick Stevens and Anne Trefethen and Mateo Valero} } @article {1352, title = {Numerical Linear Algebra on Emerging Architectures: The PLASMA and MAGMA Projects}, year = {2009}, month = {2009-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC09)}, address = {Portland, OR}, author = {Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaeif and Piotr Luszczek and Rajib Nath and Stanimire Tomov and Asim YarKhan and Vasily Volkov} } @article {icl:451, title = {DARPA{\textquoteright}s HPCS Program: History, Models, Tools, Languages}, journal = {in Advances in Computers}, volume = {72}, year = {2008}, month = {2008-01}, publisher = {Elsevier}, author = {Jack Dongarra and Robert Graybill and 
William Harrod and Robert Lucas and Ewing Lusk and Piotr Luszczek and Janice McMahon and Allan Snavely and Jeffrey Vetter and Katherine Yelick and Sadaf Alam and Roy Campbell and Laura Carrington and Tzu-Yi Chen and Omid Khalili and Jeremy Meredith and Mustafa Tikir}, editor = {M. Zelkowitz} } @article {1353, title = {Enhancing the Performance of Dense Linear Algebra Solvers on GPUs (in the MAGMA Project)}, year = {2008}, month = {2008-11}, publisher = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC08)}, address = {Austin, TX}, author = {Marc Baboulin and James Demmel and Jack Dongarra and Stanimire Tomov and Vasily Volkov} } @inproceedings {icl:416, title = {Interior State Computation of Nano Structures}, journal = {PARA 2008, 9th International Workshop on State-of-the-Art in Scientific and Parallel Computing}, year = {2008}, month = {2008-05}, address = {Trondheim, Norway}, author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang} } @inproceedings {icl:455, title = {Matrix Product on Heterogeneous Master Worker Platforms}, journal = {2008 PPoPP Conference}, year = {2008}, month = {2008-01}, address = {Salt Lake City, Utah}, author = {Jack Dongarra and Jean-Francois Pineau and Yves Robert and Frederic Vivien} } @article {icl:504, title = {Revisiting Matrix Product on Master-Worker Platforms}, journal = {International Journal of Foundations of Computer Science (IJFCS)}, volume = {19}, number = {6}, year = {2008}, month = {2008-12}, pages = {1317-1336}, author = {Jack Dongarra and Jean-Francois Pineau and Yves Robert and Zhiao Shi and Frederic Vivien} } @article {icl:447, title = {State-of-the-Art Eigensolvers for Electronic Structure Calculations of Large Scale Nano-Systems}, journal = {Journal of Computational Physics}, volume = {227}, number = {15}, year = {2008}, month = {2008-01}, pages = {7113-7124}, author = {Christof Voemel and 
Stanimire Tomov and Osni Marques and Andrew Canning and Lin-Wang Wang and Jack Dongarra} } @inproceedings {icl:374, title = {Optimal Routing in Binomial Graph Networks}, booktitle = {The International Conference on Parallel and Distributed Computing, applications and Technologies (PDCAT)}, year = {2007}, month = {2007-12}, publisher = {IEEE Computer Society}, address = {Adelaide, Australia}, keywords = {ftmpi}, author = {Thara Angskun and George Bosilca and Brad Vander Zanden and Jack Dongarra} } @article {icl:371, title = {Revisiting Matrix Product on Master-Worker Platforms}, journal = {International Journal of Foundations of Computer Science (IJFCS) (accepted)}, year = {2007}, month = {2007-00}, author = {Jack Dongarra and Jean-Francois Pineau and Yves Robert and Zhiao Shi and Frederic Vivien} } @article {icl:401, title = {The Use of Bulk States to Accelerate the Band Edge State Calculation of a Semiconductor Quantum Dot}, journal = {Journal of Computational Physics}, volume = {223}, year = {2007}, month = {2007-00}, pages = {774-782}, author = {Christof Voemel and Stanimire Tomov and Lin-Wang Wang and Osni Marques and Jack Dongarra} } @inproceedings {icl:325, title = {Performance evaluation of eigensolvers in nano-structure computations}, booktitle = {IEEE/ACM Proceedings of HPCNano SC06 (to appear)}, year = {2006}, month = {2006-01}, keywords = {doe-nano}, author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang} } @article {icl:327, title = {Predicting the electronic properties of 3D, million-atom semiconductor nanostructure architectures}, journal = {Journal of Physics: Conference Series}, volume = {46}, year = {2006}, month = {2006-01}, pages = {292-298}, keywords = {DOE_NANO}, doi = {10.1088/1742-6596/46/1/040}, author = {Alex Zunger and Alberto Franceschetti and Gabriel Bester and Wesley B. Jones and Kwiseon Kim and Peter A. 
Graf and Lin-Wang Wang and Andrew Canning and Osni Marques and Christof Voemel and Jack Dongarra and Julien Langou and Stanimire Tomov} } @article {icl:370, title = {Prospectus for the Next LAPACK and ScaLAPACK Libraries}, journal = {PARA 2006}, year = {2006}, month = {2006-06}, address = {Umea, Sweden}, author = {James Demmel and Jack Dongarra and B. Parlett and William Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye Li and Osni Marques and Jason E. Riedy and Christof Voemel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov} } @article {icl:332, title = {Self Adapting Numerical Software SANS Effort}, journal = {IBM Journal of Research and Development}, volume = {50}, number = {2/3}, year = {2006}, month = {2006-01}, pages = {223-238}, keywords = {gco}, author = {George Bosilca and Zizhong Chen and Jack Dongarra and Victor Eijkhout and Graham Fagg and Erika Fuentes and Julien Langou and Piotr Luszczek and Jelena Pjesivac-Grbovic and Keith Seymour and Haihang You and Sathish Vadhiyar} } @inproceedings {icl:324, title = {Towards bulk based preconditioning for quantum dot computations}, booktitle = {IEEE/ACM Proceedings of HPCNano SC06 (to appear)}, year = {2006}, month = {2006-01}, keywords = {doe-nano}, author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang} } @article {icl:326, title = {The use of bulk states to accelerate the band edge state calculation of a semiconductor quantum dot}, journal = {Journal of Computational Physics (submitted)}, year = {2006}, month = {2006-01}, keywords = {doe-nano}, author = {Christof Voemel and Stanimire Tomov and Lin-Wang Wang and Osni Marques and Jack Dongarra} } @article {icl:244, title = {Self Adaptivity in Grid Computing}, journal = {Concurrency and Computation: Practice and Experience, Special Issue: Grid Performance}, volume = {17}, number = {2-4}, year = 
{2005}, month = {2005-00}, pages = {235-257}, keywords = {netsolve, sans}, author = {Sathish Vadhiyar and Jack Dongarra}, editor = {John Gurd and Anthony Hey and Juri Papay and Graham Riley} } @techreport {icl:261, title = {Towards an Accurate Model for Collective Communications}, journal = {ICL Technical Report}, number = {ICL-UT-05-03}, year = {2005}, month = {2005-01}, author = {Sathish Vadhiyar and Graham Fagg and Jack Dongarra} } @article {icl:236, title = {Cray X1 Evaluation Status Report}, journal = {Oak Ridge National Laboratory Report}, volume = {ORNL/TM-2004/13}, year = {2004}, month = {2004-01}, author = {Pratul Agarwal and R. A. Alexander and E. Apra and Satish Balay and Arthur S. Bland and James Colgan and Eduardo D{\textquoteright}Azevedo and Jack Dongarra and Tom Dunigan and Mark Fahey and Al Geist and M. Gordon and Robert Harrison and Dinesh Kaushik and M. Krishnakumar and Piotr Luszczek and Tony Mezzacapa and Jeff Nichols and Jarek Nieplocha and Leonid Oliker and T. Packwood and M. Pindzola and Thomas C. Schulthess and Jeffrey Vetter and James B White and T. Windus and Patrick H. 
Worley and Thomas Zacharia} } @inproceedings {icl:238, title = {Self Adapting Linear Algebra Algorithms and Software}, journal = {IEEE Proceedings (to appear)}, year = {2004}, month = {2004-00}, keywords = {salsa, sans}, author = {James Demmel and Jack Dongarra and Victor Eijkhout and Erika Fuentes and Antoine Petitet and Rich Vuduc and Clint Whaley and Katherine Yelick} } @article {icl:167, title = {Towards an Accurate Model for Collective Communications}, journal = {International Journal of High Performance Applications, Special Issue: Automatic Performance Tuning}, volume = {18}, number = {1}, year = {2004}, month = {2004-01}, pages = {159-167}, keywords = {lacsi}, author = {Sathish Vadhiyar and Graham Fagg and Jack Dongarra} } @article {icl:133, title = {GrADSolve - A Grid-based RPC System for Remote Invocation of Parallel Software}, journal = {Journal of Parallel and Distributed Computing (submitted)}, year = {2003}, month = {2003-03}, keywords = {grads}, author = {Sathish Vadhiyar and Jack Dongarra} } @inproceedings {icl:172, title = {GrADSolve - RPC for High Performance Computing on the Grid}, journal = {Lecture Notes in Computer Science, Proceedings of the 9th International Euro-Par Conference}, volume = {2790}, year = {2003}, month = {2003-01}, pages = {394-403}, publisher = {Springer-Verlag, Berlin}, address = {Klagenfurt, Austria}, keywords = {netsolve}, doi = {10.1007/978-3-540-45209-6_58}, author = {Sathish Vadhiyar and Jack Dongarra and Asim YarKhan}, editor = {Harald Kosch and Laszlo Boszormenyi and Hermann Hellwagner} } @article {icl:143, title = {NetSolve: Past, Present, and Future - A Look at a Grid Enabled Server}, journal = {Making the Global Infrastructure a Reality}, year = {2003}, month = {2003-00}, publisher = {Wiley Publishing}, keywords = {netsolve}, author = {Sudesh Agrawal and Jack Dongarra and Keith Seymour and Sathish Vadhiyar}, editor = {Francine Berman and Geoffrey Fox and Anthony Hey} } @inproceedings {icl:126, title = {A 
Performance Oriented Migration Framework for the Grid}, journal = {Proceedings of the 3rd International Symposium on Cluster Computing and the Grid}, year = {2003}, month = {2003-05}, pages = {130-137}, address = {Tokyo, Japan}, keywords = {grads}, author = {Sathish Vadhiyar} } @article {icl:138, title = {Scheduling in the Grid Application Development Software Project}, journal = {Resource Management in the Grid}, year = {2003}, month = {2003-03}, publisher = {Kluwer Publishers}, keywords = {grads}, author = {Holly Dail and Otto Sievert and Francine Berman and Henri Casanova and Asim YarKhan and Sathish Vadhiyar and Jack Dongarra and Chuang Liu and Lingyun Yang and Dave Angulo and Ian Foster} } @article {icl:135, title = {Self Adaptability in Grid Computing}, journal = {Concurrency: Practice and Experience (submitted)}, year = {2003}, month = {2003-03}, keywords = {sans}, author = {Sathish Vadhiyar and Jack Dongarra} } @article {icl:139, title = {The Semantic Conference Organizer}, journal = {Statistical Data Mining and Knowledge Discovery}, year = {2003}, month = {2003-00}, publisher = {CRC Press}, keywords = {netsolve}, author = {Kevin Heinrich and Michael Berry and Jack Dongarra and Sathish Vadhiyar}, editor = {Hamparsum Bozdogan} } @article {icl:132, title = {SRS - A Framework for Developing Malleable and Migratable Parallel Software}, journal = {Parallel Processing Letters}, volume = {13}, number = {2}, year = {2003}, month = {2003-06}, pages = {291-312}, keywords = {grads}, author = {Sathish Vadhiyar and Jack Dongarra} } @inproceedings {icl:93, title = {Deploying Parallel Numerical Library Routines to Cluster Computing in a Self Adapting Fashion}, journal = {Parallel Computing: Advances and Current Issues:Proceedings of the International Conference ParCo2001}, year = {2002}, month = {2002-01}, publisher = {Imperial College Press}, address = {London, England}, keywords = {lfc, sans}, author = {Kenneth Roche and Jack Dongarra}, editor = {Gerhard R. 
Joubert and Almerica Murli and Frans Peters and Marco Vanneschi} }

% Conventions used by the entries below:
%   * month holds one of the standard three-letter macros (jan ... dec); an entry
%     whose source record had no usable month simply has no month field.
%   * page ranges use a double hyphen.
%   * conference papers name their proceedings in booktitle; technical reports
%     name their series in type.
%   * acronyms and product names in titles are braced so that sentence-casing
%     styles do not lowercase them.

% NOTE(review): same authors and journal as icl:223 (2001, vol. 9, no. 4) and a
% near-identical title; this looks like a pre-publication record of that paper.
% Confirm before citing both.
@article{icl:10,
  title   = {An Iterative Solver Benchmark},
  journal = {Scientific Programming},
  year    = {2002},
  note    = {To appear},
  author  = {Jack Dongarra and Victor Eijkhout and Henk van der Vorst}
}

@inproceedings{icl:94,
  title     = {A Metascheduler For The Grid},
  booktitle = {Proceedings of the 11th IEEE International Symposium on High Performance Distributed Computing (HPDC 2002)},
  year      = {2002},
  month     = jul,
  pages     = {343--351},
  publisher = {IEEE Computer Society},
  address   = {Edinburgh, Scotland},
  keywords  = {grads},
  author    = {Sathish Vadhiyar and Jack Dongarra}
}

@article{icl:101,
  title    = {Middleware for the Use of Storage in Communication},
  journal  = {Parallel Computing},
  volume   = {28},
  number   = {12},
  year     = {2002},
  month    = aug,
  pages    = {1773--1788},
  keywords = {netsolve},
  author   = {Micah Beck and Dorian Arnold and Alessandro Bassi and Francine Berman and Henri Casanova and Jack Dongarra and Terry Moore and Graziano Obertelli and James Plank and Martin Swany and Sathish Vadhiyar and Rich Wolski}
}

@inproceedings{icl:79,
  title     = {Toward a Framework for Preparing and Executing Adaptive Grid Programs},
  booktitle = {International Parallel and Distributed Processing Symposium: IPDPS 2002 Workshops},
  year      = {2002},
  month     = apr,
  pages     = {0171},
  address   = {Fort Lauderdale, FL},
  keywords  = {grads},
  author    = {Ken Kennedy and John Mellor-Crummey and Keith Cooper and Linda Torczon and Francine Berman and Andrew Chien and Dave Angulo and Ian Foster and Dennis Gannon and Lennart Johnsson and Carl Kesselman and Jack Dongarra and Sathish Vadhiyar}
}

% NOTE(review): the required institution field is not present in the source
% record for this report; add it once the issuing body is confirmed.
@techreport{icl:96,
  title    = {Users{\textquoteright} Guide to {NetSolve} v1.4.1},
  type     = {ICL Technical Report},
  number   = {ICL-UT-02-05},
  year     = {2002},
  month    = jun,
  keywords = {netsolve},
  author   = {Sudesh Agrawal and Dorian Arnold and Susan Blackford and Jack Dongarra and Michelle Miller and Kiran Sagi and Zhiao Shi and Keith Seymour and Sathish Vadhiyar}
}

@article{icl:87,
  title    = {On the Convergence of Computational and Data Grids},
  journal  = {Parallel Processing Letters},
  volume   = {11},
  number   = {2-3},
  year     = {2001},
  month    = jan,
  pages    = {187--202},
  keywords = {netsolve},
  author   = {Dorian Arnold and Sathish Vadhiyar and Jack Dongarra}
}

@article{icl:223,
  title   = {Iterative Solver Benchmark ({LAPACK} Working Note 152)},
  journal = {Scientific Programming},
  volume  = {9},
  number  = {4},
  year    = {2001},
  pages   = {223--231},
  author  = {Jack Dongarra and Victor Eijkhout and Henk van der Vorst}
}

% NOTE(review): the journal value is a submission status plus a conference name,
% not a journal, and address is not used by article styles. The author list
% matches icl:101 and the title contains that entry's title, so this may be an
% earlier version of the same work. Confirm, then pick a fitting entry type.
@article{icl:4,
  title    = {Logistical Computing and Internetworking: Middleware for the Use of Storage in Communication},
  journal  = {submitted to SC2001},
  year     = {2001},
  month    = nov,
  address  = {Denver, Colorado},
  keywords = {netsolve},
  author   = {Micah Beck and Dorian Arnold and Alessandro Bassi and Francine Berman and Henri Casanova and Jack Dongarra and Terry Moore and Graziano Obertelli and James Plank and Martin Swany and Sathish Vadhiyar and Rich Wolski}
}

@article{icl:89,
  title    = {Numerical Libraries and The Grid},
  journal  = {International Journal of High Performance Applications and Supercomputing},
  volume   = {15},
  number   = {4},
  year     = {2001},
  month    = jan,
  pages    = {359--374},
  keywords = {grads},
  author   = {Antoine Petitet and Susan Blackford and Jack Dongarra and Brett Ellis and Graham Fagg and Kenneth Roche and Sathish Vadhiyar}
}

@techreport{icl:21,
  title       = {Numerical Libraries and The Grid: The {Grads} Experiments with {ScaLAPACK}},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-01-460},
  year        = {2001},
  month       = jan,
  keywords    = {grads, scalapack},
  author      = {Antoine Petitet and Susan Blackford and Jack Dongarra and Brett Ellis and Graham Fagg and Kenneth Roche and Sathish Vadhiyar}
}

@inproceedings{icl:78,
  title     = {Performance Modeling for Self Adapting Collective Communications for {MPI}},
  booktitle = {LACSI Symposium 2001},
  year      = {2001},
  month     = oct,
  address   = {Santa Fe, NM},
  keywords  = {ftmpi},
  author    = {Sathish Vadhiyar and Graham Fagg and Jack Dongarra}
}

@inproceedings{icl:48,
  title     = {Automatically Tuned Collective Communications},
  booktitle = {Proceedings of SuperComputing 2000 (SC{\textquoteright}2000)},
  year      = {2000},
  month     = nov,
  address   = {Dallas, TX},
  keywords  = {ftmpi},
  author    = {Sathish Vadhiyar and Graham Fagg and Jack Dongarra}
}

@article{icl:57,
  title   = {Algorithmic Issues on Heterogeneous Computing Platforms},
  journal = {Parallel Processing Letters},
  volume  = {9},
  number  = {2},
  year    = {1999},
  month   = jan,
  pages   = {197--213},
  author  = {Pierre Boulet and Jack Dongarra and Fabrice Rastello and Yves Robert and Frederic Vivien}
}

@article{icl:58,
  title   = {Static Tiling for Heterogeneous Computing Platforms},
  journal = {Parallel Computing},
  volume  = {25},
  number  = {5},
  year    = {1999},
  month   = jan,
  pages   = {547--568},
  author  = {Pierre Boulet and Jack Dongarra and Yves Robert and Frederic Vivien}
}

% The doi field holds the bare identifier; styles add the resolver prefix.
@book{1468,
  title        = {Numerical Linear Algebra for High-Performance Computers},
  series       = {Software, Environments and Tools},
  year         = {1998},
  publisher    = {SIAM},
  organization = {SIAM},
  abstract     = {This book presents a unified treatment of recently developed techniques and current understanding about solving systems of linear equations and large scale eigenvalue problems on high-performance computers. It provides a rapid introduction to the world of vector and parallel processing for these linear algebra applications. Topics include major elements of advanced-architecture computers and their performance, recent algorithmic development, and software for direct solution of dense matrix problems, direct solution of sparse systems of equations, iterative solution of sparse systems of equations, and solution of large sparse eigenvalue problems. This book supersedes the SIAM publication Solving Linear Systems on Vector and Shared Memory Computers, which appeared in 1990. The new book includes a considerable amount of new material in addition to incorporating a substantial revision of existing text.},
  doi          = {10.1137/1.9780898719611},
  author       = {Jack Dongarra and Iain Duff and Danny Sorensen and Henk van der Vorst}
}