@inproceedings {barry2023memory, title = {Memory Traffic and Complete Application Profiling with PAPI Multi-Component Measurements}, booktitle = {2023 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2023}, month = {2023-08}, publisher = {IEEE}, address = {St. Petersburg, Florida}, abstract = {Some of the most important categories of performance events count the data traffic between the processing cores and the main memory. However, since these counters are not core-private, applications require elevated privileges to access them. PAPI offers a component that can access this information on IBM systems through the Performance Co-Pilot (PCP); however, doing so adds an indirection layer that involves querying the PCP daemon. This paper performs a quantitative study of the accuracy of the measurements obtained through this component on the Summit supercomputer. We use two linear algebra kernels---a generalized matrix multiply, and a modified matrix-vector multiply---as benchmarks and a distributed, GPU-accelerated 3D-FFT mini-app (using cuFFT) to compare the measurements obtained through the PAPI PCP component against the expected values across different problem sizes. We also compare our measurements against an in-house machine with a very similar architecture to Summit, where elevated privileges allow PAPI to access the hardware counters directly (without using PCP) to show that measurements taken via PCP are as accurate as those taken directly. 
Finally, using both QMCPACK and the 3D-FFT, we demonstrate the diverse hardware activities that can be monitored simultaneously via PAPI hardware components.}, keywords = {GPU power, High Performance Computing, network traffic, papi, performance analysis, Performance Counters}, doi = {10.1109/IPDPSW59300.2023.00070}, url = {https://ieeexplore.ieee.org/document/10196656}, author = {Daniel Barry and Heike Jagode and Anthony Danalis and Jack Dongarra} } @article {, title = {Memory Traffic and Complete Application Profiling with PAPI Multi-Component Measurements}, year = {2023}, month = {2023-05}, publisher = {28th HIPS Workshop}, address = {St. Petersburg, FL}, author = {Daniel Barry and Heike Jagode and Anthony Danalis and Jack Dongarra} } @inbook {, title = {Performance Application Programming Interface}, booktitle = {Accelerated Computing with HIP}, year = {2022}, month = {2022-12}, publisher = {Sun, Baruah and Kaeli}, organization = {Sun, Baruah and Kaeli}, isbn = {B0BR8KSS7K}, url = {https://a.co/d/0DoG5as}, author = {Anthony Danalis and Heike Jagode} } @inbook {, title = {An Introduction to High Performance Computing and Its Intersection with Advances in Modeling Rare Earth Elements and Actinides}, booktitle = {Rare Earth Elements and Actinides: Progress in Computational Science Applications}, volume = {1388}, year = {2021}, month = {2021-10}, pages = {3-53}, publisher = {American Chemical Society}, organization = {American Chemical Society}, chapter = {1}, address = {Washington, DC}, abstract = {Computationally driven solutions in nuclear and radiochemistry heavily depend on efficient modeling of Rare Earth Elements (REEs) and actinides. Accurate modeling of REEs and actinides faces challenges stemming from limitations from an imbalanced hardware-software ecosystem and its implications on inefficient use of High Performance Computing (HPC). 
This chapter provides a historical perspective on the evolution of HPC hardware, its intersectionality with domain sciences, the importance of benchmarks for performance, and an overview of challenges and advances in modeling REEs and actinides. This chapter intends to provide an introduction for researchers at the intersection of scientific computing, software development for HPC, and applied computational modeling of REEs and actinides. The chapter is structured in five sections. First, the Introduction includes subsections focusing on the Importance of REEs and Actinides (1.1), Hardware, Software, and the HPC Ecosystem (1.2), and Electronic Structure Modeling of REEs and Actinides (1.3). Second, a section in High Performance Computing focuses on the TOP500 (2.1), HPC Performance (2.2), HPC Benchmarks: Processing, Bandwidth, and Latency (2.3), and HPC Benchmarks and their Relationship to Chemical Modeling (2.4). Third, the Software Challenges and Advances focus on NWChem/NWChemEx (3.1), MADNESS (3.2), and MPQC (3.3). The fourth section provides a short overview of Artificial Intelligence in HPC applications relevant to nuclear and radiochemistry. The fifth section illustrates A Protocol to Evaluate Complexation Preferences in Separations of REEs and Actinides through Computational Modeling.}, keywords = {actinide, Computational modeling, HPC, REE}, isbn = {ISBN13: 9780841298255 eISBN: 9780841298248}, doi = {10.1021/bk-2021-1388.ch001}, url = {https://pubs.acs.org/doi/10.1021/bk-2021-1388.ch001}, author = {Deborah A. Penchoff and Edward Valeev and Heike Jagode and Piotr Luszczek and Anthony Danalis and George Bosilca and Robert J. Harrison and Jack Dongarra and Theresa L. 
Windus} } @conference {, title = {Effortless Monitoring of Arithmetic Intensity with PAPI{\textquoteright}s Counter Analysis Toolkit}, booktitle = {13th International Workshop on Parallel Tools for High Performance Computing}, year = {2020}, month = {2020-09}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Dresden, Germany}, abstract = {With exascale computing forthcoming, performance metrics such as memory traffic and arithmetic intensity are increasingly important for codes that heavily utilize numerical kernels. Performance metrics in different CPU architectures can be monitored by reading the occurrences of various hardware events. However, from architecture to architecture, it becomes more and more unclear which native performance events are indexed by which event names, making it difficult for users to understand what specific events actually measure. This ambiguity seems particularly true for events related to hardware that resides beyond the compute core, such as events related to memory traffic. Still, traffic to memory is a necessary characteristic for determining arithmetic intensity. To alleviate this difficulty, PAPI{\textquoteright}s Counter Analysis Toolkit measures the occurrences of events through a series of benchmarks, allowing its users to discover the high-level meaning of native events. 
We (i) leverage the capabilities of the Counter Analysis Toolkit to identify the names of hardware events for reading and writing bandwidth utilization in addition to floating-point operations, (ii) measure the occurrences of the events they index during the execution of important numerical kernels, and (iii) verify their identities by comparing these occurrence patterns to the expected arithmetic intensity of the numerical kernels.}, author = {Daniel Barry and Anthony Danalis and Heike Jagode} } @article {, title = {Exa-PAPI: The Exascale Performance API with Modern C++}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @techreport {1457, title = {Formulation of Requirements for New PAPI++ Software Package: Part I: Survey Results}, journal = {PAPI++ Working Notes}, number = {1, ICL-UT-20-02}, year = {2020}, month = {2020-01}, publisher = {Innovative Computing Laboratory, University of Tennessee Knoxville}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @inbook {1213, title = {Interoperable Convergence of Storage, Networking, and Computation}, booktitle = {Advances in Information and Communication: Proceedings of the 2019 Future of Information and Communication Conference (FICC)}, number = {2}, year = {2020}, pages = {667-690}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {In every form of digital store-and-forward communication, intermediate forwarding nodes are computers, with attendant memory and processing resources. This has inevitably stimulated efforts to create a wide-area infrastructure that goes beyond simple store-and-forward to create a platform that makes more general and varied use of the potential of this collection of increasingly powerful nodes. Historically, these efforts predate the advent of globally routed packet networking. 
The desire for a converged infrastructure of this kind has only intensified over the last 30 years, as memory, storage, and processing resources have increased in both density and speed while simultaneously decreasing in cost. Although there is a general consensus that it should be possible to define and deploy such a dramatically more capable wide-area platform, a great deal of investment in research prototypes has yet to produce a credible candidate architecture. Drawing on technical analysis, historical examples, and case studies, we present an argument for the hypothesis that in order to realize a distributed system with the kind of convergent generality and deployment scalability that might qualify as "future-defining," we must build it from a small set of simple, generic, and limited abstractions of the low level resources (processing, storage and network) of its intermediate nodes.}, keywords = {active networks, distributed cloud, distributed processing, distributed storage, edge computing, network convergence, network layering, scalability}, isbn = {978-3-030-12385-7}, author = {Micah Beck and Terry Moore and Piotr Luszczek and Anthony Danalis}, editor = {Kohei Arai and Rahul Bhatia} } @article {dongarra2020papiex, title = {Performance Application Programming Interface for Extreme-Scale Environments (PAPI-EX) (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 NSF Cyberinfrastructure for Sustained Scientific Innovation (CSSI) Principal Investigator Meeting}, address = {Seattle, WA}, author = {Jack Dongarra and Heike Jagode and Anthony Danalis and Daniel Barry and Vince Weaver} } @article {jagode2020pulse, title = {PULSE: PAPI Unifying Layer for Software-Defined Events (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 NSF Cyberinfrastructure for Sustained Scientific Innovation (CSSI) Principal Investigator Meeting}, address = {Seattle, WA}, author = {Heike Jagode and Anthony Danalis} } @techreport {jagode2020roadmap, title = {Roadmap for Refactoring Classic PAPI to PAPI++: Part II: 
Formulation of Roadmap Based on Survey Results}, journal = {PAPI++ Working Notes}, number = {2, ICL-UT-20-09}, year = {2020}, month = {2020-07}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Heike Jagode and Anthony Danalis and Damien Genet} } @conference {1449, title = {Characterization of Power Usage and Performance in Data-Intensive Applications using MapReduce over MPI}, booktitle = {2019 International Conference on Parallel Computing (ParCo2019)}, year = {2019}, month = {2019-09}, address = {Prague, Czech Republic}, author = {Joshua Davis and Tao Gao and Sunita Chandrasekaran and Heike Jagode and Anthony Danalis and Pavan Balaji and Jack Dongarra and Michela Taufer} } @conference {1379, title = {Counter Inspection Toolkit: Making Sense out of Hardware Performance Events}, booktitle = {11th International Workshop on Parallel Tools for High Performance Computing}, year = {2019}, month = {2019-02}, publisher = {Cham, Switzerland: Springer}, organization = {Cham, Switzerland: Springer}, address = {Dresden, Germany}, abstract = {Hardware counters play an essential role in understanding the behavior of performance-critical applications, and inform any effort to identify opportunities for performance optimization. However, because modern hardware is becoming increasingly complex, the number of counters that are offered by the vendors increases and, in some cases, so does their complexity. In this paper we present a toolkit that aims to assist application developers invested in performance analysis by automatically categorizing and disambiguating performance counters. We present and discuss the set of microbenchmarks and analyses that we developed as part of our toolkit. 
We explain why they work and discuss the non-obvious reasons why some of our early benchmarks and analyses did not work in an effort to share with the rest of the community the wisdom we acquired from negative results.}, doi = {https://doi.org/10.1007/978-3-030-11987-4_2}, author = {Anthony Danalis and Heike Jagode and H Hanumantharayappa and Sangamesh Ragate and Jack Dongarra} } @article {1387, title = {Does your tool support PAPI SDEs yet?}, year = {2019}, month = {2019-07}, publisher = {13th Scalable Tools Workshop}, address = {Tahoe City, CA}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @article {1377, title = {PAPI Software-Defined Events for in-Depth Performance Analysis}, journal = {The International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = {2019-11}, pages = {1113-1127}, abstract = {The methodology and standardization layer provided by the Performance Application Programming Interface (PAPI) has played a vital role in application profiling for almost two decades. It has enabled sophisticated performance analysis tool designers and performance-conscious scientists to gain insights into their applications by simply instrumenting their code using a handful of PAPI functions that {\textquotedblleft}just work{\textquotedblright} across different hardware components. In the past, PAPI development had focused primarily on hardware-specific performance metrics. However, the rapidly increasing complexity of software infrastructure poses new measurement and analysis challenges for the developers of large-scale applications. In particular, acquiring information regarding the behavior of libraries and runtimes{\textemdash}used by scientific applications{\textemdash}requires low-level binary instrumentation, or APIs specific to each library and runtime. No uniform API for monitoring events that originate from inside the software stack has emerged. 
In this article, we present our efforts to extend PAPI{\textquoteright}s role so that it becomes the de facto standard for exposing performance-critical events, which we refer to as software-defined events (SDEs), from different software layers. Upgrading PAPI with SDEs enables monitoring of both types of performance events{\textemdash}hardware- and software-related events{\textemdash}in a uniform way, through the same consistent PAPI. The goal of this article is threefold. First, we motivate the need for SDEs and describe our design decisions regarding the functionality we offer through PAPI{\textquoteright}s new SDE interface. Second, we illustrate how SDEs can be utilized by different software packages, specifically, by showcasing their use in the numerical linear algebra library MAGMA-Sparse, the tensor algebra library TAMM that is part of the NWChem suite, and the compiler-based performance analysis tool Byfl. Third, we provide a performance analysis of the overhead that results from monitoring SDEs and discuss the trade-offs between overhead and functionality.}, url = {https://doi.org/10.1177/1094342019846287}, author = {Heike Jagode and Anthony Danalis and Hartwig Anzt and Jack Dongarra} } @article {1386, title = {PAPI{\textquoteright}s new Software-Defined Events for in-depth Performance Analysis}, year = {2019}, month = {2019-09}, publisher = {13th Parallel Tools Workshop}, address = {Dresden, Germany}, abstract = {One of the most recent developments of the Performance API (PAPI) is the addition of Software-Defined Events (SDE). PAPI has successfully served the role of the abstraction and unification layer for hardware performance counters for the past two decades. This talk presents our effort to extend this role to encompass performance critical information that does not originate in hardware, but rather in critical software layers, such as libraries and runtime systems. 
Our overall objective is to enable monitoring of both types of performance events, hardware- and software-related events, in a uniform way, through one consistent PAPI interface. Performance analysts will be able to form a complete picture of the entire application performance without learning new instrumentation primitives. In this talk, we outline PAPI{\textquoteright}s new SDE API and showcase the usefulness of SDE through its employment in software layers as diverse as the math library MAGMA, the dataflow runtime PaRSEC, and the state-of-the-art chemistry application NWChem. We outline the process of instrumenting these software packages and highlight the performance information that can be acquired with SDEs.}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @conference {1378, title = {Software-Defined Events through PAPI}, booktitle = {2019 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2019}, month = {2019-05}, publisher = {IEEE}, organization = {IEEE}, address = {Rio de Janeiro, Brazil}, abstract = {PAPI has been used for almost two decades as an abstraction and standardization layer for profiling hardware-specific performance metrics. However, application developers-and profiling software packages-are quite often interested in information beyond hardware counters, such as the behavior of libraries used by the software that is being profiled. So far, accessing this information has required interfacing directly with the libraries on a case-by-case basis, or low-level binary instrumentation. In this paper, we introduce the new Software-Defined Event (SDE) component of PAPI which aims to enable PAPI to serve as an abstraction and standardization layer for events that originate in software layers as well. Extending PAPI to include SDEs enables monitoring of both types of performance events-hardware-and software-related events-in a uniform way, through the same consistent PAPI interface. 
Furthermore, implementing SDE as a PAPI component means that the new API is aimed only at the library developers who wish to export events from within their libraries. The API for reading PAPI events-both hardware and software-remains the same, so all legacy codes and tools that use PAPI will not only continue to work, but they will automatically be able to read SDEs wherever those are available. The goal of this paper is threefold. First, we outline our design decisions regarding the functionality we offer through the new SDE interface, and offer simple examples of usage. Second, we illustrate how those events can be utilized by different software packages, specifically, by showcasing their use in the task-based runtime PaRSEC, and the HPCG supercomputing benchmark. Third, we provide a thorough performance analysis of the overhead that results from monitoring different types of SDEs, and showcase the negligible overhead of using PAPI SDE even in cases of extremely heavy use.}, doi = {https://doi.org/10.1109/IPDPSW.2019.00069}, author = {Anthony Danalis and Heike Jagode and Thomas Herault and Piotr Luszczek and Jack Dongarra} } @article {1390, title = {Understanding Native Event Semantics}, year = {2019}, month = {2019-04}, publisher = {9th JLESC Workshop}, address = {Knoxville, TN}, author = {Anthony Danalis and Heike Jagode and Daniel Barry and Jack Dongarra} } @conference {1380, title = {What it Takes to keep PAPI Instrumental for the HPC Community}, booktitle = {1st Workshop on Sustainable Scientific Software (CW3S19)}, year = {2019}, month = {2019-07}, address = {Collegeville, Minnesota}, url = {https://collegeville.github.io/CW3S19/WorkshopResources/WhitePapers/JagodeHeike_CW3S19_papi.pdf}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @article {1388, title = {What it Takes to keep PAPI Instrumental for the HPC Community}, year = {2019}, month = {2019-07}, publisher = {The 2019 Collegeville Workshop on Sustainable Scientific Software 
(CW3S19)}, address = {Collegeville, MN}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @article {1389, title = {Is your scheduling good? How would you know?}, year = {2019}, month = {2019-06}, publisher = {14th Scheduling for Large Scale Systems Workshop}, address = {Bordeaux, France}, abstract = {Optimal scheduling is a goal that can rarely be achieved, even in purely theoretical contexts where the nuanced behavior of complex hardware and software systems can be abstracted away, and simplified assumptions can be made. In real runtime systems, task schedulers are usually designed based on intuitions about optimal design and heuristics such as minimizing idle time and load imbalance, as well as maximizing data locality and reuse. This harsh reality is due in part to the very crude tools designers of task scheduling systems have at their disposal for assessing the quality of their assumptions. Examining hardware behavior{\textemdash}such as cache reuse{\textemdash}through counters rarely leads to improvement in scheduler design, and quite often the runtime designers are left with total execution time as their only guiding mechanism. In this talk we will discuss new methods for illuminating the dark corners of task scheduling on real hardware. We will present our work on extending PAPI{\textemdash}which has long been the de facto standard for accessing hardware events{\textemdash}so that it can be used to access software events. 
We will focus specifically on the impact this work can have on runtime systems with dynamic schedulers, and discuss illustrative examples.}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @article {1212, title = {Accelerating NWChem Coupled Cluster through dataflow-based Execution}, journal = {The International Journal of High Performance Computing Applications}, volume = {32}, year = {2018}, month = {2018-07}, pages = {540--551}, type = {Journal Article}, chapter = {540}, abstract = {Numerical techniques used for describing many-body systems, such as the Coupled Cluster methods (CC) of the quantum chemistry package NWCHEM, are of extreme interest to the computational chemistry community in fields such as catalytic reactions, solar energy, and bio-mass conversion. In spite of their importance, many of these computationally intensive algorithms have traditionally been thought of in a fairly linear fashion, or are parallelized in coarse chunks. In this paper, we present our effort of converting the NWCHEM{\textquoteright}s CC code into a dataflow-based form that is capable of utilizing the task scheduling system PARSEC (Parallel Runtime Scheduling and Execution Controller): a software package designed to enable high-performance computing at scale. We discuss the modularity of our approach and explain how the PARSEC-enabled dataflow version of the subroutines seamlessly integrate into the NWCHEM codebase. Furthermore, we argue how the CC algorithms can be easily decomposed into finer-grained tasks (compared with the original version of NWCHEM); and how data distribution and load balancing are decoupled and can be tuned independently. 
We demonstrate performance acceleration by more than a factor of two in the execution of the entire CC component of NWCHEM, concluding that the utilization of dataflow-based execution for CC methods enables more efficient and scalable computation.}, keywords = {CCSD, dag, dataflow, NWChem, parsec, ptg, tasks}, doi = {10.1177/1094342016672543}, url = {http://journals.sagepub.com/doi/10.1177/1094342016672543}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @article {1201, title = {Evaluation of Dataflow Programming Models for Electronic Structure Theory}, journal = {Concurrency and Computation: Practice and Experience: Special Issue on Parallel and Distributed Algorithms}, volume = {2018}, year = {2018}, month = {2018-05}, pages = {1{\textendash}20}, abstract = {Dataflow programming models have been growing in popularity as a means to deliver a good balance between performance and portability in the post-petascale era. In this paper, we evaluate different dataflow programming models for electronic structure methods and compare them in terms of programmability, resource utilization, and scalability. In particular, we evaluate two programming paradigms for expressing scientific applications in a dataflow form: (1) explicit dataflow, where the dataflow is specified explicitly by the developer, and (2) implicit dataflow, where a task scheduling runtime derives the dataflow using per-task data-access information embedded in a serial program. We discuss our findings and present a thorough experimental analysis using methods from the NWChem quantum chemistry application as our case study, and OpenMP, StarPU, and PaRSEC as the task-based runtimes that enable the different forms of dataflow execution. 
Furthermore, we derive an abstract model to explore the limits of the different dataflow programming paradigms.}, keywords = {CCSD, coupled cluster methods, dataflow, NWChem, OpenMP, parsec, StarPU, task-based runtime}, doi = {https://doi.org/10.1002/cpe.4490}, author = {Heike Jagode and Anthony Danalis and Reazul Hoque and Mathieu Faverge and Jack Dongarra} } @article {1393, title = {PAPI: Counting outside the Box}, year = {2018}, month = {2018-04}, publisher = {8th JLESC Meeting}, address = {Barcelona, Spain}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @article {1391, title = {PAPI{\textquoteright}s New Software-Defined Events for In-Depth Performance Analysis}, year = {2018}, month = {2018-09}, publisher = {CCDSC 2018: Workshop on Clusters, Clouds, and Data for Scientific Computing}, address = {Lyon, France}, abstract = {One of the most recent developments of the Performance API (PAPI) is the addition of Software-Defined Events (SDE). PAPI has successfully served the role of the abstraction and unification layer for hardware performance counters for over a decade. This talk presents our effort to extend this role to encompass performance critical information that does not originate in hardware, but rather in critical software layers, such as libraries and runtime systems. Our overall objective is to enable monitoring of both types of performance events, hardware- and software-related events, in a uniform way, through one consistent PAPI interface. Performance analysts will be able to form a complete picture of the entire application performance without learning new instrumentation primitives. In this talk, we outline PAPI{\textquoteright}s new SDE API and showcase the usefulness of SDE through its employment in software layers as diverse as the math library MAGMA, the dataflow runtime PaRSEC, and the state-of-the-art chemistry application NWChem. 
We outline the process of instrumenting these software packages and highlight the performance information that can be acquired with SDEs.}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @techreport {1275, title = {Software-Defined Events (SDEs) in MAGMA-Sparse}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-12}, year = {2018}, month = {2018-12}, publisher = {University of Tennessee}, author = {Heike Jagode and Anthony Danalis and Hartwig Anzt and Ichitaro Yamazaki and Mark Hoemmen and Erik Boman and Stanimire Tomov and Jack Dongarra} } @article {1392, title = {Software-Defined Events through PAPI for In-Depth Analysis of Application Performance}, year = {2018}, month = {2018-07}, publisher = {5th Platform for Advanced Scientific Computing Conference (PASC18)}, address = {Basel, Switzerland}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @article {999, title = {Accelerating NWChem Coupled Cluster through Dataflow-Based Execution}, journal = {The International Journal of High Performance Computing Applications}, year = {2017}, month = {2017-01}, pages = {1{\textendash}13}, abstract = {Numerical techniques used for describing many-body systems, such as the Coupled Cluster methods (CC) of the quantum chemistry package NWChem, are of extreme interest to the computational chemistry community in fields such as catalytic reactions, solar energy, and bio-mass conversion. In spite of their importance, many of these computationally intensive algorithms have traditionally been thought of in a fairly linear fashion, or are parallelized in coarse chunks. In this paper, we present our effort of converting the NWChem{\textquoteright}s CC code into a dataflow-based form that is capable of utilizing the task scheduling system PaRSEC (Parallel Runtime Scheduling and Execution Controller): a software package designed to enable high-performance computing at scale. 
We discuss the modularity of our approach and explain how the PaRSEC-enabled dataflow version of the subroutines seamlessly integrate into the NWChem codebase. Furthermore, we argue how the CC algorithms can be easily decomposed into finer-grained tasks (compared with the original version of NWChem); and how data distribution and load balancing are decoupled and can be tuned independently. We demonstrate performance acceleration by more than a factor of two in the execution of the entire CC component of NWChem, concluding that the utilization of dataflow-based execution for CC methods enables more efficient and scalable computation.}, keywords = {CCSD, dag, dataflow, NWChem, parsec, ptg, tasks}, doi = {10.1177/1094342016672543}, url = {http://journals.sagepub.com/doi/10.1177/1094342016672543}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @techreport {1080, title = {Roadmap for the Development of a Linear Algebra Library for Exascale Computing: SLATE: Software for Linear Algebra Targeting Exascale}, journal = {SLATE Working Notes}, number = {01, ICL-UT-17-02}, year = {2017}, month = {2017-06}, publisher = {Innovative Computing Laboratory, University of Tennessee}, type = {SLATE Working Notes}, author = {Ahmad Abdelfattah and Hartwig Anzt and Aurelien Bouteiller and Anthony Danalis and Jack Dongarra and Mark Gates and Azzam Haidar and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Stephen Wood and Panruo Wu and Ichitaro Yamazaki and Asim YarKhan} } @inproceedings {980, title = {Power Management and Event Verification in PAPI}, journal = {Tools for High Performance Computing 2015: Proceedings of the 9th International Workshop on Parallel Tools for High Performance Computing, September 2015, Dresden, Germany}, year = {2016}, pages = {41--51}, publisher = {Springer International Publishing}, address = {Dresden, Germany}, abstract = {For more than a decade, the PAPI performance monitoring library has helped to implement the familiar maxim attributed to Lord Kelvin: {\textquotedblleft}If you cannot measure it, you cannot improve it.{\textquotedblright} Widely deployed and widely used, PAPI provides a generic, portable interface for the hardware performance counters available on all modern CPUs and some other components of interest that are scattered across the chip and system. Recent and radical changes in processor and system design{\textemdash}systems that combine multicore CPUs and accelerators, shared and distributed memory, PCI-express and other interconnects{\textemdash}as well as the emergence of power efficiency as a primary design constraint, and reduced data movement as a primary programming goal, pose new challenges and bring new opportunities to PAPI. We discuss new developments of PAPI that allow for multiple sources of performance data to be measured simultaneously via a common software interface. Specifically, a new PAPI component that controls power is discussed. We explore the challenges of shared hardware counters that include system-wide measurements in existing multicore architectures. We conclude with an exploration of future directions for the PAPI interface. 
}, isbn = {978-3-319-39589-0}, doi = {https://doi.org/10.1007/978-3-319-39589-0_4}, author = {Heike Jagode and Asim YarKhan and Anthony Danalis and Jack Dongarra} } @conference {962, title = {Search Space Generation and Pruning System for Autotuners}, booktitle = {30th IEEE International Parallel \& Distributed Processing Symposium (IPDPS)}, year = {2016}, month = {2016-05}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {This work tackles two simultaneous challenges faced by autotuners: the ease of describing a complex, multidimensional search space, and the speed of evaluating that space, while applying a multitude of pruning constraints. This article presents a declarative notation for describing a search space and a translation system for conversion to a standard C code for fast and multithreaded, as necessary, evaluation. The notation is Python-based and thus simple in syntax and easy to assimilate by the user interested in tuning rather than learning a new programming language. A large number of dimensions and a large number of pruning constraints may be expressed with little effort. The system is discussed in the context of autotuning the canonical matrix multiplication kernel for NVIDIA GPUs, where the search space has 15 dimensions and involves application of 10 complex pruning constraints. 
The speed of evaluation is compared against generators created using imperative programming style in various scripting and compiled languages.}, author = {Piotr Luszczek and Mark Gates and Jakub Kurzak and Anthony Danalis and Jack Dongarra} } @conference {921, title = {Accelerating NWChem Coupled Cluster through dataflow-based Execution}, booktitle = {11th International Conference on Parallel Processing and Applied Mathematics (PPAM 2015)}, year = {2015}, month = {2015-09}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Krakow, Poland}, abstract = {Numerical techniques used for describing many-body systems, such as the Coupled Cluster methods (CC) of the quantum chemistry package NWChem, are of extreme interest to the computational chemistry community in fields such as catalytic reactions, solar energy, and bio-mass conversion. In spite of their importance, many of these computationally intensive algorithms have traditionally been thought of in a fairly linear fashion, or are parallelised in coarse chunks. In this paper, we present our effort of converting the NWChem{\textquoteright}s CC code into a dataflow-based form that is capable of utilizing the task scheduling system PaRSEC (Parallel Runtime Scheduling and Execution Controller) {\textendash} a software package designed to enable high performance computing at scale. We discuss the modularity of our approach and explain how the PaRSEC-enabled dataflow version of the subroutines seamlessly integrate into the NWChem codebase. Furthermore, we argue how the CC algorithms can be easily decomposed into finer grained tasks (compared to the original version of NWChem); and how data distribution and load balancing are decoupled and can be tuned independently. 
We demonstrate performance acceleration by more than a factor of two in the execution of the entire CC component of NWChem, concluding that the utilization of dataflow-based execution for CC methods enables more efficient and scalable computation.}, keywords = {CCSD, dag, dataflow, NWChem, parsec, ptg, tasks}, author = {Heike Jagode and Anthony Danalis and George Bosilca and Jack Dongarra} } @conference {915, title = {PaRSEC in Practice: Optimizing a Legacy Chemistry Application through Distributed Task-Based Execution}, booktitle = {2015 IEEE International Conference on Cluster Computing}, year = {2015}, month = {2015-09}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {Task-based execution has been growing in popularity as a means to deliver a good balance between performance and portability in the post-petascale era. The Parallel Runtime Scheduling and Execution Control (PARSEC) framework is a task-based runtime system that we designed to achieve high performance computing at scale. PARSEC offers a programming paradigm that is different than what has been traditionally used to develop large scale parallel scientific applications. In this paper, we discuss the use of PARSEC to convert a part of the Coupled Cluster (CC) component of the Quantum Chemistry package NWCHEM into a task-based form. We explain how we organized the computation of the CC methods in individual tasks with explicitly defined data dependencies between them and re-integrated the modified code into NWCHEM. We present a thorough performance evaluation and demonstrate that the modified code outperforms the original by more than a factor of two. 
We also compare the performance of different variants of the modified code and explain the different behaviors that lead to the differences in performance.}, keywords = {dag, parsec, ptg, tasks}, author = {Anthony Danalis and Heike Jagode and George Bosilca and Jack Dongarra} } @article {821, title = {An Efficient Distributed Randomized Algorithm for Solving Large Dense Symmetric Indefinite Linear Systems}, journal = {Parallel Computing}, volume = {40}, year = {2014}, month = {2014-07}, pages = {213-223}, abstract = {Randomized algorithms are gaining ground in high-performance computing applications as they have the potential to outperform deterministic methods, while still providing accurate results. We propose a randomized solver for distributed multicore architectures to efficiently solve large dense symmetric indefinite linear systems that are encountered, for instance, in parameter estimation problems or electromagnetism simulations. The contribution of this paper is to propose efficient kernels for applying random butterfly transformations and a new distributed implementation combined with a runtime (PaRSEC) that automatically adjusts data structures, data mappings, and the scheduling as systems scale up. Both the parallel distributed solver and the supporting runtime environment are innovative. To our knowledge, the randomization approach associated with this solver has never been used in public domain software for symmetric indefinite systems. The underlying runtime framework allows seamless data mapping and task scheduling, mapping its capabilities to the underlying hardware features of heterogeneous distributed architectures. 
The performance of our software is similar to that obtained for symmetric positive definite systems, but requires only half the execution time and half the amount of data storage of a general dense solver.}, keywords = {Distributed linear algebra solvers, LDLT factorization, PaRSEC runtime, plasma, Randomized algorithms, Symmetric indefinite systems}, doi = {10.1016/j.parco.2013.12.003}, author = {Marc Baboulin and Dulceneia Becker and George Bosilca and Anthony Danalis and Jack Dongarra} } @conference {764, title = {Power Monitoring with PAPI for Extreme Scale Architectures and Dataflow-based Programming Models}, booktitle = {2014 IEEE International Conference on Cluster Computing}, number = {ICL-UT-14-04}, year = {2014}, month = {2014-09}, publisher = {IEEE}, organization = {IEEE}, address = {Madrid, Spain}, abstract = {For more than a decade, the PAPI performance-monitoring library has provided a clear, portable interface to the hardware performance counters available on all modern CPUs and other components of interest (e.g., GPUs, network, and I/O systems). Most major end-user tools that application developers use to analyze the performance of their applications rely on PAPI to gain access to these performance counters. One of the critical road-blockers on the way to larger, more complex high performance systems, has been widely identified as being the energy efficiency constraints. With modern extreme scale machines having hundreds of thousands of cores, the ability to reduce power consumption for each CPU at the software level becomes critically important, both for economic and environmental reasons. In order for PAPI to continue playing its well established role in HPC, it is pressing to provide valuable performance data that not only originates from within the processing cores but also delivers insight into the power consumption of the system as a whole. 
An extensive effort has been made to extend the Performance API to support power monitoring capabilities for various platforms. This paper provides detailed information about three components that allow power monitoring on the Intel Xeon Phi and Blue Gene/Q. Furthermore, we discuss the integration of PAPI in PARSEC {\textendash} a task-based dataflow-driven execution engine {\textendash} enabling hardware performance counter and power monitoring at true task granularity.}, doi = {10.1109/CLUSTER.2014.6968672}, author = {Heike McCraw and James Ralph and Anthony Danalis and Jack Dongarra} } @conference {871, title = {PTG: An Abstraction for Unhindered Parallelism}, booktitle = {International Workshop on Domain-Specific Languages and High-Level Frameworks for High Performance Computing (WOLFHPC)}, year = {2014}, month = {2014-11}, publisher = {IEEE Press}, organization = {IEEE Press}, address = {New Orleans, LA}, abstract = {

Increased parallelism and use of heterogeneous computing resources is now an established trend in High Performance Computing (HPC), a trend that, looking forward to Exascale, seems bound to intensify. Despite the evolution of hardware over the past decade, the programming paradigm of choice was invariably derived from Coarse Grain Parallelism with explicit data movements. We argue that message passing has remained the de facto standard in HPC because, until now, the ever increasing challenges that application developers had to address to create efficient portable applications remained manageable for expert programmers.

Data-flow based programming is an alternative approach with significant potential. In this paper, we discuss the Parameterized Task Graph (PTG) abstraction and present the specialized input language that we use to specify PTGs in our data-flow task-based runtime system, PaRSEC. This language and the corresponding execution model are in contrast with the execution model of explicit message passing as well as the model of alternative task based runtime systems. The Parameterized Task Graph language decouples the expression of the parallelism in the algorithm from the control-flow ordering, load balance, and data distribution. Thus, programs are more adaptable and map more efficiently on challenging hardware, as well as maintain portability across diverse architectures. To support these claims, we discuss the different challenges of HPC programming and how PaRSEC can address them, and we demonstrate that in today{\textquoteright}s large scale supercomputers, PaRSEC can significantly outperform state-of-the-art MPI applications and libraries, a trend that will increase with future architectural evolution.

}, keywords = {dte, parsec, plasma}, author = {Anthony Danalis and George Bosilca and Aurelien Bouteiller and Thomas Herault and Jack Dongarra} } @conference {768, title = {Utilizing Dataflow-based Execution for Coupled Cluster Methods}, booktitle = {2014 IEEE International Conference on Cluster Computing}, number = {ICL-UT-14-02}, year = {2014}, month = {2014-09}, publisher = {IEEE}, organization = {IEEE}, address = {Madrid, Spain}, abstract = {Computational chemistry comprises one of the driving forces of High Performance Computing. In particular, many-body methods, such as Coupled Cluster (CC) methods of the quantum chemistry package NWCHEM, are of particular interest for the applied chemistry community. Harnessing large fractions of the processing power of modern large scale computing platforms has become increasingly difficult. With the increase in scale, complexity, and heterogeneity of modern platforms, traditional programming models fail to deliver the expected performance scalability. On our way to Exascale and with these extremely hybrid platforms, dataflow-based programming models may be the only viable way for achieving and maintaining computation at scale. In this paper, we discuss a dataflow-based programming model and its applicability to NWCHEM{\textquoteright}s CC methods. Our dataflow version of the CC kernels breaks down the algorithm into fine-grained tasks with explicitly defined data dependencies. As a result, many of the traditional synchronization points can be eliminated, allowing for a dynamic reshaping of the execution based on the ongoing availability of computational resources. 
We build this experiment using PARSEC {\textendash} a task-based dataflow-driven execution engine {\textendash} that enables efficient task scheduling on distributed systems, providing a desirable portability layer for application developers.}, author = {Heike McCraw and Anthony Danalis and George Bosilca and Jack Dongarra and Karol Kowalski and Theresa Windus} } @article {icl:702, title = {BlackjackBench: Portable Hardware Characterization with Automated Results Analysis}, journal = {The Computer Journal}, year = {2013}, month = {2013-03}, abstract = {DARPA{\textquoteright}s AACE project aimed to develop Architecture Aware Compiler Environments. Such a compiler automatically characterizes the targeted hardware and optimizes the application codes accordingly. We present the BlackjackBench suite, a collection of portable micro-benchmarks that automate system characterization, plus statistical analysis techniques for interpreting the results. The BlackjackBench benchmarks discover the effective sizes and speeds of the hardware environment rather than the often unattainable peak values. We aim at hardware characteristics that can be observed by running executables generated by existing compilers from standard C codes. We characterize the memory hierarchy, including cache sharing and non-uniform memory access characteristics of the system, properties of the processing cores affecting the instruction execution speed and the length of the operating system scheduler time slot. We show how these features of modern multicores can be discovered programmatically. We also show how the features could potentially interfere with each other resulting in incorrect interpretation of the results, and how established classification and statistical analysis techniques can reduce experimental noise and aid automatic interpretation of results. 
We show how effective hardware metrics from our probes allow guided tuning of computational kernels that outperform an autotuning library further tuned by the hardware vendor.}, keywords = {hardware characterization, micro-benchmarks, statistical analysis}, doi = {10.1093/comjnl/bxt057}, author = {Anthony Danalis and Piotr Luszczek and Gabriel Marin and Jeffrey Vetter and Jack Dongarra} } @article {icl:698, title = {Dense Linear Algebra on Distributed Heterogeneous Hardware with a Symbolic DAG Approach}, journal = {Scalable Computing and Communications: Theory and Practice}, year = {2013}, month = {2013-03}, pages = {699-735}, publisher = {John Wiley \& Sons}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Thomas Herault and Piotr Luszczek and Jack Dongarra}, editor = {Samee Khan and Lin-Wang Wang and Albert Zomaya} } @article {749, title = {PaRSEC: Exploiting Heterogeneity to Enhance Scalability}, journal = {IEEE Computing in Science and Engineering}, volume = {15}, year = {2013}, month = {2013-11}, pages = {36-45}, abstract = {New high-performance computing system designs with steeply escalating processor and core counts, burgeoning heterogeneity and accelerators, and increasingly unpredictable memory access times call for dramatically new programming paradigms. These new approaches must react and adapt quickly to unexpected contentions and delays, and they must provide the execution environment with sufficient intelligence and flexibility to rearrange the execution to improve resource utilization.}, doi = {10.1109/MCSE.2013.98}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Thomas Herault and Jack Dongarra} } @inbook {695, title = {Scalable Dense Linear Algebra on Heterogeneous Hardware}, booktitle = {HPC: Transition Towards Exascale Processing, in the series Advances in Parallel Computing}, year = {2013}, abstract = {Abstract. 
Design of systems exceeding 1 Pflop/s and the push toward 1 Eflop/s, forced a dramatic shift in hardware design. Various physical and engineering constraints resulted in introduction of massive parallelism and functional hybridization with the use of accelerator units. This paradigm change brings about a serious challenge for application developers, as the management of multicore proliferation and heterogeneity rests on software. And it is reasonable to expect, that this situation will not change in the foreseeable future. This chapter presents a methodology of dealing with this issue in three common scenarios. In the context of shared-memory multicore installations, we show how high performance and scalability go hand in hand, when the well-known linear algebra algorithms are recast in terms of Direct Acyclic Graphs (DAGs), which are then transparently scheduled at runtime inside the Parallel Linear Algebra Software for Multicore Architectures (PLASMA) project. Similarly, Matrix Algebra on GPU and Multicore Architectures (MAGMA) schedules DAG-driven computations on multicore processors and accelerators. 
Finally, Distributed PLASMA (DPLASMA), takes the approach to distributed-memory machines with the use of automatic dependence analysis and the Direct Acyclic Graph Engine (DAGuE) to deliver high performance at the scale of many thousands of cores.}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Thomas Herault and Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Jack Dongarra} } @article {icl:670, title = {DAGuE: A generic distributed DAG Engine for High Performance Computing.}, journal = {Parallel Computing}, volume = {38}, number = {1-2}, year = {2012}, month = {2012-00}, pages = {27-51}, publisher = {Elsevier}, keywords = {dague, parsec}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Thomas Herault and Pierre Lemariner and Jack Dongarra} } @techreport {icl:683, title = {An efficient distributed randomized solver with application to large dense linear systems}, journal = {ICL Technical Report}, number = {ICL-UT-12-02}, year = {2012}, month = {2012-07}, keywords = {dague, dplasma, parsec, plasma}, author = {Marc Baboulin and Dulceneia Becker and George Bosilca and Anthony Danalis and Jack Dongarra} } @conference {icl:699, title = {From Serial Loops to Parallel Execution on Distributed Systems}, booktitle = {International European Conference on Parallel and Distributed Computing (Euro-Par {\textquoteright}12)}, year = {2012}, month = {2012-08}, address = {Rhodes, Greece}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Thomas Herault and Jack Dongarra} } @inproceedings {icl:591, title = {BlackjackBench: Hardware Characterization with Portable Micro-Benchmarks and Automatic Statistical Analysis of Results}, journal = {IEEE International Parallel and Distributed Processing Symposium (submitted)}, year = {2011}, month = {2011-05}, address = {Anchorage, AK}, author = {Anthony Danalis and Piotr Luszczek and Gabriel Marin and Jeffrey Vetter and Jack Dongarra} } @inproceedings {icl:675, 
title = {DAGuE: A Generic Distributed DAG Engine for High Performance Computing}, journal = {Proceedings of the Workshops of the 25th IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2011 Workshops)}, year = {2011}, month = {2011-00}, pages = {1151-1158}, publisher = {IEEE}, address = {Anchorage, Alaska, USA}, keywords = {dague, parsec}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Thomas Herault and Pierre Lemariner and Jack Dongarra} } @inproceedings {icl:676, title = {Flexible Development of Dense Linear Algebra Algorithms on Massively Parallel Architectures with DPLASMA}, journal = {Proceedings of the Workshops of the 25th IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2011 Workshops)}, year = {2011}, month = {2011-05}, pages = {1432-1441}, publisher = {IEEE}, address = {Anchorage, Alaska, USA}, keywords = {dague, dplasma, parsec}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemariner and Hatem Ltaeif and Piotr Luszczek and Asim YarKhan and Jack Dongarra} } @article {icl:646, title = {Impact of Kernel-Assisted MPI Communication over Scientific Applications: CPMD and FFTW}, journal = {18th EuroMPI}, year = {2011}, month = {2011-09}, pages = {247-254}, publisher = {Springer}, address = {Santorini, Greece}, keywords = {dague}, author = {Teng Ma and Aurelien Bouteiller and George Bosilca and Jack Dongarra}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @article {icl:647, title = {OMPIO: A Modular Software Architecture for MPI I/O}, journal = {18th EuroMPI}, year = {2011}, month = {2011-09}, pages = {81-89}, publisher = {Springer}, address = {Santorini, Greece}, author = {Mohamad Chaarawi and Edgar Gabriel and Rainer Keller and Richard L. 
Graham and George Bosilca and Jack Dongarra}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @inproceedings {icl:674, title = {Scalable Runtime for MPI: Efficiently Building the Communication Infrastructure}, journal = {Proceedings of Recent Advances in the Message Passing Interface - 18th European MPI Users{\textquoteright} Group Meeting, EuroMPI 2011}, volume = {6960}, year = {2011}, month = {2011-09}, pages = {342-344}, publisher = {Springer}, address = {Santorini, Greece}, keywords = {ftmpi}, author = {George Bosilca and Thomas Herault and Pierre Lemariner and Jack Dongarra and A. Rezmerita}, editor = {Yiannis Cotronis and Anthony Danalis and Dimitrios S. Nikolopoulos and Jack Dongarra} } @techreport {icl:528, title = {DAGuE: A generic distributed DAG engine for high performance computing}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-10-01}, year = {2010}, month = {2010-04}, keywords = {dague}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Thomas Herault and Pierre Lemariner and Jack Dongarra} } @techreport {icl:563, title = {Distributed Dense Numerical Linear Algebra Algorithms on Massively Parallel Architectures: DPLASMA}, journal = {University of Tennessee Computer Science Technical Report, UT-CS-10-660}, year = {2010}, month = {2010-09}, keywords = {dague, dplasma, parsec, plasma}, author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemariner and Hatem Ltaeif and Piotr Luszczek and Asim YarKhan and Jack Dongarra} } @techreport {icl:529, title = {Distributed-Memory Task Execution and Dependence Tracking within DAGuE and the DPLASMA Project}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-10-02}, year = {2010}, month = {2010-00}, keywords = {dague, plasma}, author = {George Bosilca and Aurelien 
Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemariner and Hatem Ltaeif and Piotr Luszczek and Asim YarKhan and Jack Dongarra} } @inproceedings {icl:503, title = {MPI-aware Compiler Optimizations for Improving Communication-Computation Overlap}, journal = {Proceedings of the 23rd annual International Conference on Supercomputing (ICS {\textquoteright}09)}, year = {2009}, month = {2009-06}, pages = {316-325}, publisher = {ACM}, address = {Yorktown Heights, NY, USA}, author = {Anthony Danalis and Lori Pollock and Martin Swany and John Cavazos} }