@inproceedings {, title = {Memory Traffic and Complete Application Profiling with PAPI Multi-Component Measurements}, journal = {2023 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2023}, month = {2023-08}, publisher = {IEEE}, address = {St. Petersburg, Florida}, abstract = {Some of the most important categories of performance events count the data traffic between the processing cores and the main memory. However, since these counters are not core-private, applications require elevated privileges to access them. PAPI offers a component that can access this information on IBM systems through the Performance Co-Pilot (PCP); however, doing so adds an indirection layer that involves querying the PCP daemon. This paper performs a quantitative study of the accuracy of the measurements obtained through this component on the Summit supercomputer. We use two linear algebra kernels---a generalized matrix multiply and a modified matrix-vector multiply---as benchmarks, and a distributed, GPU-accelerated 3D-FFT mini-app (using cuFFT), to compare the measurements obtained through the PAPI PCP component against the expected values across different problem sizes. We also compare our measurements against an in-house machine with a very similar architecture to Summit, where elevated privileges allow PAPI to access the hardware counters directly (without using PCP), to show that measurements taken via PCP are as accurate as those taken directly. Finally, using both QMCPACK and the 3D-FFT, we demonstrate the diverse hardware activities that can be monitored simultaneously via PAPI hardware components.}, keywords = {GPU power, High Performance Computing, network traffic, papi, performance analysis, Performance Counters}, doi = {10.1109/IPDPSW59300.2023.00070}, url = {https://ieeexplore.ieee.org/document/10196656}, author = {Daniel Barry and Heike Jagode and Anthony Danalis and Jack Dongarra} } @article {, title = {Memory Traffic and Complete Application Profiling with PAPI Multi-Component Measurements}, year = {2023}, month = {2023-05}, publisher = {28th HIPS Workshop}, address = {St. Petersburg, FL}, author = {Daniel Barry and Heike Jagode and Anthony Danalis and Jack Dongarra} } @inbook {, title = {Performance Application Programming Interface}, booktitle = {Accelerated Computing with HIP}, year = {2022}, month = {2022-12}, publisher = {Sun, Baruah and Kaeli}, organization = {Sun, Baruah and Kaeli}, isbn = {B0BR8KSS7K}, url = {https://a.co/d/0DoG5as}, author = {Anthony Danalis and Heike Jagode} } @inbook {, title = {Effortless Monitoring of Arithmetic Intensity with PAPI{\textquoteright}s Counter Analysis Toolkit}, booktitle = {Tools for High Performance Computing 2018/2019}, year = {2021}, pages = {195{\textendash}218}, publisher = {Springer}, organization = {Springer}, abstract = {With exascale computing forthcoming, performance metrics such as memory traffic and arithmetic intensity are increasingly important for codes that heavily utilize numerical kernels. Performance metrics in different CPU architectures can be monitored by reading the occurrences of various hardware events. However, from architecture to architecture, it becomes more and more unclear which native performance events are indexed by which event names, making it difficult for users to understand what specific events actually measure. This ambiguity seems particularly true for events related to hardware that resides beyond the compute core, such as events related to memory traffic.
Still, traffic to memory is a necessary characteristic for determining arithmetic intensity. To alleviate this difficulty, PAPI{\textquoteright}s Counter Analysis Toolkit measures the occurrences of events through a series of benchmarks, allowing its users to discover the high-level meaning of native events. We (i) leverage the capabilities of the Counter Analysis Toolkit to identify the names of hardware events for reading and writing bandwidth utilization in addition to floating-point operations, (ii) measure the occurrences of the events they index during the execution of important numerical kernels, and (iii) verify their identities by comparing these occurrence patterns to the expected arithmetic intensity of the numerical kernels.}, isbn = {978-3-030-66057-4}, doi = {10.1007/978-3-030-66057-4_11}, author = {Daniel Barry and Danalis, Anthony and Heike Jagode} } @inbook {, title = {An Introduction to High Performance Computing and Its Intersection with Advances in Modeling Rare Earth Elements and Actinides}, booktitle = {Rare Earth Elements and Actinides: Progress in Computational Science Applications}, volume = {1388}, year = {2021}, month = {2021-10}, pages = {3-53}, publisher = {American Chemical Society}, organization = {American Chemical Society}, chapter = {1}, address = {Washington, DC}, abstract = {Computationally driven solutions in nuclear and radiochemistry heavily depend on efficient modeling of Rare Earth Elements (REEs) and actinides. Accurate modeling of REEs and actinides faces challenges stemming from limitations from an imbalanced hardware-software ecosystem and its implications on inefficient use of High Performance Computing (HPC). This chapter provides a historical perspective on the evolution of HPC hardware, its intersectionality with domain sciences, the importance of benchmarks for performance, and an overview of challenges and advances in modeling REEs and actinides. This chapter intends to provide an introduction for researchers at the intersection of scientific computing, software development for HPC, and applied computational modeling of REEs and actinides. The chapter is structured in five sections. First, the Introduction includes subsections focusing on the Importance of REEs and Actinides (1.1), Hardware, Software, and the HPC Ecosystem (1.2), and Electronic Structure Modeling of REEs and Actinides (1.3). Second, a section in High Performance Computing focuses on the TOP500 (2.1), HPC Performance (2.2), HPC Benchmarks: Processing, Bandwidth, and Latency (2.3), and HPC Benchmarks and their Relationship to Chemical Modeling (2.4). Third, the Software Challenges and Advances focus on NWChem/NWChemEx (3.1), MADNESS (3.2), and MPQC (3.3). The fourth section provides a short overview of Artificial Intelligence in HPC applications relevant to nuclear and radiochemistry. The fifth section illustrates A Protocol to Evaluate Complexation Preferences in Separations of REEs and Actinides through Computational Modeling.}, keywords = {actinide, Computational modeling, HPC, REE}, isbn = {ISBN13: 9780841298255 eISBN: 9780841298248}, doi = {10.1021/bk-2021-1388.ch001}, url = {https://pubs.acs.org/doi/10.1021/bk-2021-1388.ch001}, author = {Deborah A. Penchoff and Edward Valeev and Heike Jagode and Piotr Luszczek and Anthony Danalis and George Bosilca and Robert J. Harrison and Jack Dongarra and Theresa L. 
Windus} } @book {, title = {Lecture Notes in Computer Science: High Performance Computing}, volume = {12761}, year = {2021}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {This book constitutes the refereed post-conference proceedings of 9 workshops held at the 35th International ISC High Performance 2021 Conference, in Frankfurt, Germany, in June-July 2021: Second International Workshop on the Application of Machine Learning Techniques to Computational Fluid Dynamics and Solid Mechanics Simulations and Analysis; HPC-IODC: HPC I/O in the Data Center Workshop; Compiler-assisted Correctness Checking and Performance Optimization for HPC; Machine Learning on HPC Systems; 4th International Workshop on Interoperability of Supercomputing and Cloud Technologies; 2nd International Workshop on Monitoring and Operational Data Analytics; 16th Workshop on Virtualization in High--Performance Cloud Computing; Deep Learning on Supercomputers; 5th International Workshop on In Situ Visualization. The 35 papers included in this volume were carefully reviewed and selected. They cover all aspects of research, development, and application of large-scale, high performance experimental and commercial systems. Topics include high-performance computing (HPC), computer architecture and hardware, programming models, system software, performance analysis and modeling, compiler analysis and optimization techniques, software sustainability, scientific applications, deep learning.}, isbn = {978-3-030-90538-5}, doi = {10.1007/978-3-030-90539-2}, author = {Heike Jagode and Anzt, Hartwig and Ltaief, Hatem and Piotr Luszczek} } @conference {, title = {Effortless Monitoring of Arithmetic Intensity with PAPI{\textquoteright}s Counter Analysis Toolkit}, booktitle = {13th International Workshop on Parallel Tools for High Performance Computing}, year = {2020}, month = {2020-09}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Dresden, Germany}, abstract = {With exascale computing forthcoming, performance metrics such as memory traffic and arithmetic intensity are increasingly important for codes that heavily utilize numerical kernels. Performance metrics in different CPU architectures can be monitored by reading the occurrences of various hardware events. However, from architecture to architecture, it becomes more and more unclear which native performance events are indexed by which event names, making it difficult for users to understand what specific events actually measure. This ambiguity seems particularly true for events related to hardware that resides beyond the compute core, such as events related to memory traffic. Still, traffic to memory is a necessary characteristic for determining arithmetic intensity. To alleviate this difficulty, PAPI{\textquoteright}s Counter Analysis Toolkit measures the occurrences of events through a series of benchmarks, allowing its users to discover the high-level meaning of native events. 
We (i) leverage the capabilities of the Counter Analysis Toolkit to identify the names of hardware events for reading and writing bandwidth utilization in addition to floating-point operations, (ii) measure the occurrences of the events they index during the execution of important numerical kernels, and (iii) verify their identities by comparing these occurrence patterns to the expected arithmetic intensity of the numerical kernels.}, author = {Daniel Barry and Anthony Danalis and Heike Jagode} } @article {, title = {Exa-PAPI: The Exascale Performance API with Modern C++}, year = {2020}, month = {2020-02}, publisher = {2020 Exascale Computing Project Annual Meeting}, address = {Houston, TX}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @techreport {1457, title = {Formulation of Requirements for New PAPI++ Software Package: Part I: Survey Results}, journal = {PAPI++ Working Notes}, number = {1, ICL-UT-20-02}, year = {2020}, month = {2020-01}, publisher = {Innovative Computing Laboratory, University of Tennessee Knoxville}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @article {, title = {Performance Application Programming Interface for Extreme-Scale Environments (PAPI-EX) (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 NSF Cyberinfrastructure for Sustained Scientific Innovation (CSSI) Principal Investigator Meeting}, address = {Seattle, WA}, author = {Jack Dongarra and Heike Jagode and Anthony Danalis and Daniel Barry and Vince Weaver} } @article {, title = {PULSE: PAPI Unifying Layer for Software-Defined Events (Poster)}, year = {2020}, month = {2020-02}, publisher = {2020 NSF Cyberinfrastructure for Sustained Scientific Innovation (CSSI) Principal Investigator Meeting}, address = {Seattle, WA}, author = {Heike Jagode and Anthony Danalis} } @techreport {, title = {Roadmap for Refactoring Classic PAPI to PAPI++: Part II: Formulation of Roadmap Based on Survey Results}, journal = {PAPI++ Working Notes}, number = {2, ICL-UT-20-09}, year = {2020}, month = {2020-07}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Heike Jagode and Anthony Danalis and Damien Genet} } @conference {1449, title = {Characterization of Power Usage and Performance in Data-Intensive Applications using MapReduce over MPI}, booktitle = {2019 International Conference on Parallel Computing (ParCo2019)}, year = {2019}, month = {2019-09}, address = {Prague, Czech Republic}, author = {Joshua Davis and Tao Gao and Sunita Chandrasekaran and Heike Jagode and Anthony Danalis and Pavan Balaji and Jack Dongarra and Michela Taufer} } @conference {1379, title = {Counter Inspection Toolkit: Making Sense out of Hardware Performance Events}, booktitle = {11th International Workshop on Parallel Tools for High Performance Computing}, year = {2019}, month = {2019-02}, publisher = {Cham, Switzerland: Springer}, organization = {Cham, Switzerland: Springer}, address = {Dresden, Germany}, abstract = {Hardware counters play an essential role in understanding the behavior of performance-critical applications, and inform any effort to identify opportunities for performance optimization. However, because modern hardware is becoming increasingly complex, the number of counters that are offered by the vendors increases and, in some cases, so does their complexity. In this paper we present a toolkit that aims to assist application developers invested in performance analysis by automatically categorizing and disambiguating performance counters.
We present and discuss the set of microbenchmarks and analyses that we developed as part of our toolkit. We explain why they work and discuss the non-obvious reasons why some of our early benchmarks and analyses did not work in an effort to share with the rest of the community the wisdom we acquired from negative results.}, doi = {https://doi.org/10.1007/978-3-030-11987-4_2}, author = {Anthony Danalis and Heike Jagode and H Hanumantharayappa and Sangamesh Ragate and Jack Dongarra} } @article {1387, title = {Does your tool support PAPI SDEs yet?}, year = {2019}, month = {2019-07}, publisher = {13th Scalable Tools Workshop}, address = {Tahoe City, CA}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @article {1377, title = {PAPI Software-Defined Events for in-Depth Performance Analysis}, journal = {The International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = {2019-11}, pages = {1113-1127}, abstract = {The methodology and standardization layer provided by the Performance Application Programming Interface (PAPI) has played a vital role in application profiling for almost two decades. It has enabled sophisticated performance analysis tool designers and performance-conscious scientists to gain insights into their applications by simply instrumenting their code using a handful of PAPI functions that {\textquotedblleft}just work{\textquotedblright} across different hardware components. In the past, PAPI development had focused primarily on hardware-specific performance metrics. However, the rapidly increasing complexity of software infrastructure poses new measurement and analysis challenges for the developers of large-scale applications. In particular, acquiring information regarding the behavior of libraries and runtimes{\textemdash}used by scientific applications{\textemdash}requires low-level binary instrumentation, or APIs specific to each library and runtime. No uniform API for monitoring events that originate from inside the software stack has emerged. In this article, we present our efforts to extend PAPI{\textquoteright}s role so that it becomes the de facto standard for exposing performance-critical events, which we refer to as software-defined events (SDEs), from different software layers. Upgrading PAPI with SDEs enables monitoring of both types of performance events{\textemdash}hardware- and software-related events{\textemdash}in a uniform way, through the same consistent PAPI. The goal of this article is threefold. First, we motivate the need for SDEs and describe our design decisions regarding the functionality we offer through PAPI{\textquoteright}s new SDE interface. Second, we illustrate how SDEs can be utilized by different software packages, specifically, by showcasing their use in the numerical linear algebra library MAGMA-Sparse, the tensor algebra library TAMM that is part of the NWChem suite, and the compiler-based performance analysis tool Byfl. 
Third, we provide a performance analysis of the overhead that results from monitoring SDEs and discuss the trade-offs between overhead and functionality.}, url = {https://doi.org/10.1177/1094342019846287}, author = {Heike Jagode and Anthony Danalis and Hartwig Anzt and Jack Dongarra} } @article {1386, title = {PAPI{\textquoteright}s new Software-Defined Events for in-depth Performance Analysis}, year = {2019}, month = {2019-09}, publisher = {13th Parallel Tools Workshop}, address = {Dresden, Germany}, abstract = {One of the most recent developments of the Performance API (PAPI) is the addition of Software-Defined Events (SDE). PAPI has successfully served the role of the abstraction and unification layer for hardware performance counters for the past two decades. This talk presents our effort to extend this role to encompass performance critical information that does not originate in hardware, but rather in critical software layers, such as libraries and runtime systems. Our overall objective is to enable monitoring of both types of performance events, hardware- and software-related events, in a uniform way, through one consistent PAPI interface. Performance analysts will be able to form a complete picture of the entire application performance without learning new instrumentation primitives. In this talk, we outline PAPI{\textquoteright}s new SDE API and showcase the usefulness of SDE through its employment in software layers as diverse as the math library MAGMA, the dataflow runtime PaRSEC, and the state-of-the-art chemistry application NWChem. We outline the process of instrumenting these software packages and highlight the performance information that can be acquired with SDEs.}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @conference {1378, title = {Software-Defined Events through PAPI}, booktitle = {2019 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2019}, month = {2019-05}, publisher = {IEEE}, organization = {IEEE}, address = {Rio de Janeiro, Brazil}, abstract = {PAPI has been used for almost two decades as an abstraction and standardization layer for profiling hardware-specific performance metrics. However, application developers{\textemdash}and profiling software packages{\textemdash}are quite often interested in information beyond hardware counters, such as the behavior of libraries used by the software that is being profiled. So far, accessing this information has required interfacing directly with the libraries on a case-by-case basis, or low-level binary instrumentation. In this paper, we introduce the new Software-Defined Event (SDE) component of PAPI which aims to enable PAPI to serve as an abstraction and standardization layer for events that originate in software layers as well. Extending PAPI to include SDEs enables monitoring of both types of performance events{\textemdash}hardware- and software-related events{\textemdash}in a uniform way, through the same consistent PAPI interface. Furthermore, implementing SDE as a PAPI component means that the new API is aimed only at the library developers who wish to export events from within their libraries. The API for reading PAPI events{\textemdash}both hardware and software{\textemdash}remains the same, so all legacy codes and tools that use PAPI will not only continue to work, but they will automatically be able to read SDEs wherever those are available. The goal of this paper is threefold. First, we outline our design decisions regarding the functionality we offer through the new SDE interface, and offer simple examples of usage.
Second, we illustrate how those events can be utilized by different software packages, specifically, by showcasing their use in the task-based runtime PaRSEC, and the HPCG supercomputing benchmark. Third, we provide a thorough performance analysis of the overhead that results from monitoring different types of SDEs, and showcase the negligible overhead of using PAPI SDE even in cases of extremely heavy use.}, doi = {https://doi.org/10.1109/IPDPSW.2019.00069}, author = {Anthony Danalis and Heike Jagode and Thomas Herault and Piotr Luszczek and Jack Dongarra} } @article {1390, title = {Understanding Native Event Semantics}, year = {2019}, month = {2019-04}, publisher = {9th JLESC Workshop}, address = {Knoxville, TN}, author = {Anthony Danalis and Heike Jagode and Daniel Barry and Jack Dongarra} } @conference {1380, title = {What it Takes to keep PAPI Instrumental for the HPC Community}, booktitle = {1st Workshop on Sustainable Scientific Software (CW3S19)}, year = {2019}, month = {2019-07}, address = {Collegeville, Minnesota}, url = {https://collegeville.github.io/CW3S19/WorkshopResources/WhitePapers/JagodeHeike_CW3S19_papi.pdf}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @article {1388, title = {What it Takes to keep PAPI Instrumental for the HPC Community}, year = {2019}, month = {2019-07}, publisher = {The 2019 Collegeville Workshop on Sustainable Scientific Software (CW3S19)}, address = {Collegeville, MN}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @article {1389, title = {Is your scheduling good? How would you know?}, year = {2019}, month = {2019-06}, publisher = {14th Scheduling for Large Scale Systems Workshop}, address = {Bordeaux, France}, abstract = {Optimal scheduling is a goal that can rarely be achieved, even in purely theoretical contexts where the nuanced behavior of complex hardware and software systems can be abstracted away, and simplified assumptions can be made. In real runtime systems, task schedulers are usually designed based on intuitions about optimal design and heuristics such as minimizing idle time and load imbalance, as well as maximizing data locality and reuse. This harsh reality is due in part to the very crude tools designers of task scheduling systems have at their disposal for assessing the quality of their assumptions. Examining hardware behavior{\textemdash}such as cache reuse{\textemdash}through counters rarely leads to improvement in scheduler design, and quite often the runtime designers are left with total execution time as their only guiding mechanism. In this talk we will discuss new methods for illuminating the dark corners of task scheduling on real hardware. We will present our work on extending PAPI{\textemdash}which has long been the de facto standard for accessing hardware events{\textemdash}so that it can be used to access software events. 
We will focus specifically on the impact this work can have on runtime systems with dynamic schedulers, and discuss illustrative examples.}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @article {1212, title = {Accelerating NWChem Coupled Cluster through dataflow-based Execution}, journal = {The International Journal of High Performance Computing Applications}, volume = {32}, year = {2018}, month = {2018-07}, pages = {540--551}, type = {Journal Article}, chapter = {540}, abstract = {Numerical techniques used for describing many-body systems, such as the Coupled Cluster methods (CC) of the quantum chemistry package NWChem, are of extreme interest to the computational chemistry community in fields such as catalytic reactions, solar energy, and bio-mass conversion. In spite of their importance, many of these computationally intensive algorithms have traditionally been thought of in a fairly linear fashion, or are parallelized in coarse chunks. In this paper, we present our effort of converting the NWChem{\textquoteright}s CC code into a dataflow-based form that is capable of utilizing the task scheduling system PaRSEC (Parallel Runtime Scheduling and Execution Controller): a software package designed to enable high-performance computing at scale. We discuss the modularity of our approach and explain how the PaRSEC-enabled dataflow version of the subroutines seamlessly integrate into the NWChem codebase. Furthermore, we argue how the CC algorithms can be easily decomposed into finer-grained tasks (compared with the original version of NWChem); and how data distribution and load balancing are decoupled and can be tuned independently. We demonstrate performance acceleration by more than a factor of two in the execution of the entire CC component of NWChem, concluding that the utilization of dataflow-based execution for CC methods enables more efficient and scalable computation.}, keywords = {CCSD, dag, dataflow, NWChem, parsec, ptg, tasks}, doi = {10.1177/1094342016672543}, url = {http://journals.sagepub.com/doi/10.1177/1094342016672543}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @article {1201, title = {Evaluation of Dataflow Programming Models for Electronic Structure Theory}, journal = {Concurrency and Computation: Practice and Experience: Special Issue on Parallel and Distributed Algorithms}, volume = {2018}, year = {2018}, month = {2018-05}, pages = {1{\textendash}20}, abstract = {Dataflow programming models have been growing in popularity as a means to deliver a good balance between performance and portability in the post-petascale era. In this paper, we evaluate different dataflow programming models for electronic structure methods and compare them in terms of programmability, resource utilization, and scalability. In particular, we evaluate two programming paradigms for expressing scientific applications in a dataflow form: (1) explicit dataflow, where the dataflow is specified explicitly by the developer, and (2) implicit dataflow, where a task scheduling runtime derives the dataflow using per-task data-access information embedded in a serial program. We discuss our findings and present a thorough experimental analysis using methods from the NWChem quantum chemistry application as our case study, and OpenMP, StarPU, and PaRSEC as the task-based runtimes that enable the different forms of dataflow execution.
Furthermore, we derive an abstract model to explore the limits of the different dataflow programming paradigms.}, keywords = {CCSD, coupled cluster methods, dataflow, NWChem, OpenMP, parsec, StarPU, task-based runtime}, doi = {https://doi.org/10.1002/cpe.4490}, author = {Heike Jagode and Anthony Danalis and Reazul Hoque and Mathieu Faverge and Jack Dongarra} } @article {1199, title = {Investigating Power Capping toward Energy-Efficient Scientific Applications}, journal = {Concurrency Computation: Practice and Experience}, volume = {2018}, year = {2018}, month = {2018-04}, pages = {1-14}, abstract = {The emergence of power efficiency as a primary constraint in processor and system design poses new challenges concerning power and energy awareness for numerical libraries and scientific applications. Power consumption also plays a major role in the design of data centers, which may house petascale or exascale-level computing systems. At these extreme scales, understanding and improving the energy efficiency of numerical libraries and their related applications becomes a crucial part of the successful implementation and operation of the computing system. In this paper, we study and investigate the practice of controlling a compute system{\textquoteright}s power usage, and we explore how different power caps affect the performance of numerical algorithms with different computational intensities. Further, we determine the impact, in terms of performance and energy usage, that these caps have on a system running scientific applications. This analysis will enable us to characterize the types of algorithms that benefit most from these power management schemes. Our experiments are performed using a set of representative kernels and several popular scientific benchmarks. We quantify a number of power and performance measurements and draw observations and conclusions that can be viewed as a roadmap to achieving energy efficiency in the design and execution of scientific algorithms.}, keywords = {energy efficiency, High Performance Computing, Intel Xeon Phi, Knights landing, papi, performance analysis, Performance Counters, power efficiency}, doi = {https://doi.org/10.1002/cpe.4485}, url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/cpe.4485}, author = {Azzam Haidar and Heike Jagode and Phil Vaccaro and Asim YarKhan and Stanimire Tomov and Jack Dongarra} } @article {1393, title = {PAPI: Counting outside the Box}, year = {2018}, month = {2018-04}, publisher = {8th JLESC Meeting}, address = {Barcelona, Spain}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @article {1391, title = {PAPI{\textquoteright}s New Software-Defined Events for In-Depth Performance Analysis}, year = {2018}, month = {2018-09}, publisher = {CCDSC 2018: Workshop on Clusters, Clouds, and Data for Scientific Computing}, address = {Lyon, France}, abstract = {One of the most recent developments of the Performance API (PAPI) is the addition of Software-Defined Events (SDE). PAPI has successfully served the role of the abstraction and unification layer for hardware performance counters for over a decade. This talk presents our effort to extend this role to encompass performance critical information that does not originate in hardware, but rather in critical software layers, such as libraries and runtime systems. Our overall objective is to enable monitoring of both types of performance events, hardware- and software-related events, in a uniform way, through one consistent PAPI interface. 
Performance analysts will be able to form a complete picture of the entire application performance without learning new instrumentation primitives. In this talk, we outline PAPI{\textquoteright}s new SDE API and showcase the usefulness of SDE through its employment in software layers as diverse as the math library MAGMA, the dataflow runtime PaRSEC, and the state-of-the-art chemistry application NWChem. We outline the process of instrumenting these software packages and highlight the performance information that can be acquired with SDEs.}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @techreport {1275, title = {Software-Defined Events (SDEs) in MAGMA-Sparse}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-18-12}, year = {2018}, month = {2018-12}, publisher = {University of Tennessee}, author = {Heike Jagode and Anthony Danalis and Hartwig Anzt and Ichitaro Yamazaki and Mark Hoemmen and Erik Boman and Stanimire Tomov and Jack Dongarra} } @article {1392, title = {Software-Defined Events through PAPI for In-Depth Analysis of Application Performance}, year = {2018}, month = {2018-07}, publisher = {5th Platform for Advanced Scientific Computing Conference (PASC18)}, address = {Basel, Switzerland}, author = {Anthony Danalis and Heike Jagode and Jack Dongarra} } @article {999, title = {Accelerating NWChem Coupled Cluster through Dataflow-Based Execution}, journal = {The International Journal of High Performance Computing Applications}, year = {2017}, month = {2017-01}, pages = {1{\textendash}13}, abstract = {Numerical techniques used for describing many-body systems, such as the Coupled Cluster methods (CC) of the quantum chemistry package NWChem, are of extreme interest to the computational chemistry community in fields such as catalytic reactions, solar energy, and bio-mass conversion. In spite of their importance, many of these computationally intensive algorithms have traditionally been thought of in a fairly linear fashion, or are parallelized in coarse chunks. In this paper, we present our effort of converting the NWChem{\textquoteright}s CC code into a dataflow-based form that is capable of utilizing the task scheduling system PaRSEC (Parallel Runtime Scheduling and Execution Controller): a software package designed to enable high-performance computing at scale. We discuss the modularity of our approach and explain how the PaRSEC-enabled dataflow version of the subroutines seamlessly integrate into the NWChem codebase. Furthermore, we argue how the CC algorithms can be easily decomposed into finer-grained tasks (compared with the original version of NWChem); and how data distribution and load balancing are decoupled and can be tuned independently. 
We demonstrate performance acceleration by more than a factor of two in the execution of the entire CC component of NWChem, concluding that the utilization of dataflow-based execution for CC methods enables more efficient and scalable computation.}, keywords = {CCSD, dag, dataflow, NWChem, parsec, ptg, tasks}, doi = {10.1177/1094342016672543}, url = {http://journals.sagepub.com/doi/10.1177/1094342016672543}, author = {Heike Jagode and Anthony Danalis and Jack Dongarra} } @techreport {1105, title = {Dataflow Programming Paradigms for Computational Chemistry Methods}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-17-01}, year = {2017}, month = {2017-05}, publisher = {University of Tennessee}, type = {PhD Dissertation (Computer Science)}, address = {Knoxville, TN}, abstract = {The transition to multicore and heterogeneous architectures has shaped the High Performance Computing (HPC) landscape over the past decades. With the increase in scale, complexity, and heterogeneity of modern HPC platforms, one of the grim challenges for traditional programming models is to sustain the expected performance at scale. By contrast, dataflow programming models have been growing in popularity as a means to deliver a good balance between performance and portability in the post-petascale era. This work introduces dataflow programming models for computational chemistry methods, and compares different dataflow executions in terms of programmability, resource utilization, and scalability. This effort is driven by computational chemistry applications, considering that they comprise one of the driving forces of HPC. In particular, many-body methods, such as Coupled Cluster methods (CC), which are the {\textquotedblleft}gold standard{\textquotedblright} to compute energies in quantum chemistry, are of particular interest for the applied chemistry community. On that account, the latest development for CC methods is used as the primary vehicle for this research, but our effort is not limited to CC and can be applied across other application domains. Two programming paradigms for expressing CC methods in a dataflow form, in order to make them capable of utilizing task scheduling systems, are presented. Explicit dataflow, the programming model where the dataflow is explicitly specified by the developer, is contrasted with implicit dataflow, where a task scheduling runtime derives the dataflow. An abstract model is derived to explore the limits of the different dataflow programming paradigms.}, url = {http://trace.tennessee.edu/utk_graddiss/4469/}, author = {Heike Jagode} } @inbook {1384, title = {Performance Analysis and Debugging Tools at Scale}, booktitle = {Exascale Scientific Applications: Scalability and Performance Portability}, year = {2017}, month = {2017-11}, pages = {17-50}, publisher = {Chapman \& Hall / CRC Press}, organization = {Chapman \& Hall / CRC Press}, chapter = {2}, abstract = {This chapter explores present-day challenges and those likely to arise as new hardware and software technologies are introduced on the path to exascale. It covers some of the underlying hardware, software, and techniques that enable tools and debuggers. Performance tools and debuggers are critical components that enable computational scientists to fully exploit the computing power of high-performance computing systems. Instrumentation is the insertion of code to perform measurement in a program. It is a vital step in performance analysis, especially for parallel programs.
The essence of a debugging tool is enabling observation, exploration, and control of program state, such that a developer can, for example, verify that what is currently occurring correlates to what is intended. The increased complexity and volume of performance and debugging data likely to be seen on exascale systems risks overwhelming tool users. Tools and debuggers may need to develop advanced techniques such as automated filtering and analysis to reduce the complexity seen by the user.}, isbn = {9781315277400}, doi = {https://doi.org/10.1201/b21930}, author = {Scott Parker and John Mellor-Crummey and Dong H. Ahn and Heike Jagode and Holger Brunst and Sameer Shende and Allen D. Malony and David DelSignore and Ronny Tschuter and Ralph Castain and Kevin Harms and Philip Carns and Ray Loy and Kalyan Kumaran} } @conference {1134, title = {Power-aware Computing: Measurement, Control, and Performance Analysis for Intel Xeon Phi}, booktitle = {2017 IEEE High Performance Extreme Computing Conference (HPEC{\textquoteright}17), Best Paper Finalist}, year = {2017}, month = {2017-09}, publisher = {IEEE}, organization = {IEEE}, address = {Waltham, MA}, abstract = {The emergence of power efficiency as a primary constraint in processor and system designs poses new challenges concerning power and energy awareness for numerical libraries and scientific applications. Power consumption also plays a major role in the design of data centers, in particular for peta- and exascale systems. Understanding and improving the energy efficiency of numerical simulation becomes crucial. We present a detailed study and investigation toward controlling power usage and exploring how different power caps affect the performance of numerical algorithms with different computational intensities, and determine the impact and correlation with performance of scientific applications. Our analysis is performed using a set of representative kernels, as well as many highly used scientific benchmarks. We quantify a number of power and performance measurements, and draw observations and conclusions that can be viewed as a roadmap toward achieving energy efficiency in computing algorithms.}, doi = {https://doi.org/10.1109/HPEC.2017.8091085}, author = {Azzam Haidar and Heike Jagode and Asim YarKhan and Phil Vaccaro and Stanimire Tomov and Jack Dongarra} } @article {1338, title = {Power-Aware HPC on Intel Xeon Phi KNL Processors}, year = {2017}, month = {2017-06}, publisher = {ISC High Performance (ISC17), Intel Booth Presentation}, address = {Frankfurt, Germany}, author = {Azzam Haidar and Heike Jagode and Asim YarKhan and Phil Vaccaro and Stanimire Tomov and Jack Dongarra} } @inproceedings {980, title = {Power Management and Event Verification in PAPI}, journal = {Tools for High Performance Computing 2015: Proceedings of the 9th International Workshop on Parallel Tools for High Performance Computing, September 2015, Dresden, Germany}, year = {2016}, pages = {41-51}, publisher = {Springer International Publishing}, address = {Dresden, Germany}, abstract = {For more than a decade, the PAPI performance monitoring library has helped to implement the familiar maxim attributed to Lord Kelvin: {\textquotedblleft}If you cannot measure it, you cannot improve it.{\textquotedblright} Widely deployed and widely used, PAPI provides a generic, portable interface for the hardware performance counters available on all modern CPUs and some other components of interest that are scattered across the chip and system.
Recent and radical changes in processor and system design{\textemdash}systems that combine multicore CPUs and accelerators, shared and distributed memory, PCI-Express and other interconnects{\textemdash}as well as the emergence of power efficiency as a primary design constraint, and reduced data movement as a primary programming goal, pose new challenges and bring new opportunities to PAPI. We discuss new developments of PAPI that allow for multiple sources of performance data to be measured simultaneously via a common software interface. Specifically, a new PAPI component that controls power is discussed. We explore the challenges of shared hardware counters that include system-wide measurements in existing multicore architectures. We conclude with an exploration of future directions for the PAPI interface.}, isbn = {978-3-319-39589-0}, doi = {https://doi.org/10.1007/978-3-319-39589-0_4}, author = {Heike Jagode and Asim YarKhan and Anthony Danalis and Jack Dongarra} } @conference {921, title = {Accelerating NWChem Coupled Cluster through dataflow-based Execution}, booktitle = {11th International Conference on Parallel Processing and Applied Mathematics (PPAM 2015)}, year = {2015}, month = {2015-09}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Krakow, Poland}, abstract = {Numerical techniques used for describing many-body systems, such as the Coupled Cluster methods (CC) of the quantum chemistry package NWChem, are of extreme interest to the computational chemistry community in fields such as catalytic reactions, solar energy, and bio-mass conversion. In spite of their importance, many of these computationally intensive algorithms have traditionally been thought of in a fairly linear fashion, or are parallelised in coarse chunks. In this paper, we present our effort of converting the NWChem{\textquoteright}s CC code into a dataflow-based form that is capable of utilizing the task scheduling system PaRSEC (Parallel Runtime Scheduling and Execution Controller) {\textendash} a software package designed to enable high performance computing at scale. We discuss the modularity of our approach and explain how the PaRSEC-enabled dataflow version of the subroutines seamlessly integrate into the NWChem codebase. Furthermore, we argue how the CC algorithms can be easily decomposed into finer grained tasks (compared to the original version of NWChem); and how data distribution and load balancing are decoupled and can be tuned independently. We demonstrate performance acceleration by more than a factor of two in the execution of the entire CC component of NWChem, concluding that the utilization of dataflow-based execution for CC methods enables more efficient and scalable computation.}, keywords = {CCSD, dag, dataflow, NWChem, parsec, ptg, tasks}, author = {Heike Jagode and Anthony Danalis and George Bosilca and Jack Dongarra} } @conference {915, title = {PaRSEC in Practice: Optimizing a Legacy Chemistry Application through Distributed Task-Based Execution}, booktitle = {2015 IEEE International Conference on Cluster Computing}, year = {2015}, month = {2015-09}, publisher = {IEEE}, organization = {IEEE}, address = {Chicago, IL}, abstract = {Task-based execution has been growing in popularity as a means to deliver a good balance between performance and portability in the post-petascale era.
The Parallel Runtime Scheduling and Execution Control (PaRSEC) framework is a task-based runtime system that we designed to achieve high performance computing at scale. PaRSEC offers a programming paradigm that is different than what has been traditionally used to develop large scale parallel scientific applications. In this paper, we discuss the use of PaRSEC to convert a part of the Coupled Cluster (CC) component of the Quantum Chemistry package NWChem into a task-based form. We explain how we organized the computation of the CC methods in individual tasks with explicitly defined data dependencies between them and re-integrated the modified code into NWChem. We present a thorough performance evaluation and demonstrate that the modified code outperforms the original by more than a factor of two. We also compare the performance of different variants of the modified code and explain the different behaviors that lead to the differences in performance.}, keywords = {dag, parsec, ptg, tasks}, author = {Anthony Danalis and Heike Jagode and George Bosilca and Jack Dongarra} } @conference {icl:633, title = {Parallel Performance Measurement of Heterogeneous Parallel Systems with GPUs}, booktitle = {International Conference on Parallel Processing (ICPP{\textquoteright}11)}, year = {2011}, month = {2011-09}, publisher = {ACM}, organization = {ACM}, address = {Taipei, Taiwan}, abstract = {The power of GPUs is giving rise to heterogeneous parallel computing, with new demands on programming environments, runtime systems, and tools to deliver high-performing applications. This paper studies the problems associated with performance measurement of heterogeneous machines with GPUs. A heterogeneous computation model and alternative host-GPU measurement approaches are discussed to set the stage for reporting new capabilities for heterogeneous parallel performance measurement in three leading HPC tools: PAPI, Vampir, and the TAU Performance System. Our work leverages the new CUPTI tool support in NVIDIA{\textquoteright}s CUDA device library. Heterogeneous benchmarks from the SHOC suite are used to demonstrate the measurement methods and tool support.}, keywords = {magma, mumi, papi}, isbn = {978-0-7695-4510-3}, doi = {10.1109/ICPP.2011.71}, author = {Allen D. Malony and Scott Biersdorff and Sameer Shende and Heike Jagode and Stanimire Tomov and Guido Juckeland and Robert Dietrich and Duncan Poole and Christopher Lamb} } @article {1361, title = {Power-aware Computing on GPGPUs}, year = {2011}, month = {2011-09}, publisher = {Fall Creek Falls Conference, Poster}, address = {Gatlinburg, TN}, author = {Kiran Kasichayanula and Haihang You and Shirley Moore and Stanimire Tomov and Heike Jagode and Matt Johnson} } @article {icl:557, title = {Collecting Performance Data with PAPI-C}, journal = {Tools for High Performance Computing 2009}, year = {2010}, month = {2010-05}, pages = {157-173}, publisher = {Springer Berlin / Heidelberg}, address = {3rd Parallel Tools Workshop, Dresden, Germany}, abstract = {Modern high performance computer systems continue to increase in size and complexity. Tools to measure application performance in these increasingly complex environments must also increase the richness of their measurements to provide insights into the increasingly intricate ways in which software and hardware interact. PAPI (the Performance API) has provided consistent platform and operating system independent access to CPU hardware performance counters for nearly a decade.
Recent trends toward massively parallel multi-core systems with often heterogeneous architectures present new challenges for the measurement of hardware performance information, which is now available not only on the CPU core itself, but scattered across the chip and system. We discuss the evolution of PAPI into Component PAPI, or PAPI-C, in which multiple sources of performance data can be measured simultaneously via a common software interface. Several examples of components and component data measurements are discussed. We explore the challenges to hardware performance measurement in existing multi-core architectures. We conclude with an exploration of future directions for the PAPI interface.}, keywords = {mumi, papi}, doi = {https://doi.org/10.1007/978-3-642-11261-4_11}, author = {Dan Terpstra and Heike Jagode and Haihang You and Jack Dongarra} } @article {icl:576, title = {Trace-based Performance Analysis for the Petascale Simulation Code FLASH}, journal = {International Journal of High Performance Computing Applications (to appear)}, year = {2010}, month = {2010-00}, author = {Heike Jagode and Andreas Knuepfer and Jack Dongarra and Matthias Jurenz and Matthias S. Mueller and Wolfgang E. Nagel} } @inproceedings {icl:474, title = {A Holistic Approach for Performance Measurement and Analysis for Petascale Applications}, journal = {ICCS 2009 Joint Workshop: Tools for Program Development and Analysis in Computational Science and Software Engineering for Large-Scale Computing}, volume = {2009}, year = {2009}, month = {2009-05}, pages = {686-695}, publisher = {Springer-Verlag Berlin Heidelberg 2009}, address = {Baton Rouge, Louisiana}, keywords = {point, test}, author = {Heike Jagode and Jack Dongarra and Sadaf Alam and Jeffrey Vetter and W. Spear and Allen D. Malony}, editor = {Gabrielle Allen} } @article {icl:480, title = {Impact of Quad-core Cray XT4 System and Software Stack on Scientific Computation}, journal = {Euro-Par 2009, Lecture Notes in Computer Science}, volume = {5704/2009}, year = {2009}, month = {2009-08}, pages = {334-344}, publisher = {Springer Berlin / Heidelberg}, address = {Delft, The Netherlands}, keywords = {test}, author = {Sadaf Alam and Richard F. Barrett and Heike Jagode and J. A. Kuehn and Steve W. Poole and R. Sankaran} } @article {icl:497, title = {I/O Performance Analysis for the Petascale Simulation Code FLASH}, journal = {ISC{\textquoteright}09}, year = {2009}, month = {2009-06}, address = {Hamburg, Germany}, keywords = {test}, author = {Heike Jagode and Shirley Moore and Dan Terpstra and Jack Dongarra and Andreas Knuepfer and Matthias Jurenz and Matthias S. Mueller and Wolfgang E. Nagel} } @inproceedings {icl:602, title = {Modeling the Office of Science Ten Year Facilities Plan: The PERI Architecture Tiger Team}, journal = {SciDAC 2009, Journal of Physics: Conference Series}, volume = {180(2009)012039}, year = {2009}, month = {2009-07}, publisher = {IOP Publishing}, address = {San Diego, California}, keywords = {test}, author = {Bronis R. de Supinski and Sadaf Alam and David Bailey and Laura Carrington and Chris Daley and Anshu Dubey and Todd Gamblin and Dan Gunter and Paul D. Hovland and Heike Jagode and Karen Karavanic and Gabriel Marin and John Mellor-Crummey and Shirley Moore and Boyana Norris and Leonid Oliker and Catherine Olschanowsky and Philip C. 
Roth and Martin Schulz and Sameer Shende and Allan Snavely} } @techreport {icl:475, title = {Trace-based Performance Analysis for the Petascale Simulation Code FLASH}, journal = {Innovative Computing Laboratory Technical Report}, number = {ICL-UT-09-01}, year = {2009}, month = {2009-04}, keywords = {test}, author = {Heike Jagode and Andreas Knuepfer and Jack Dongarra and Matthias Jurenz and Matthias S. Mueller and Wolfgang E. Nagel} } @inproceedings {icl:463, title = {Custom assignment of MPI ranks for parallel multi-dimensional FFTs: Evaluation of BG/P versus BG/L}, journal = {Proceedings of the 2008 IEEE International Symposium on Parallel and Distributed Processing with Applications (ISPA-08)}, year = {2008}, month = {2008-01}, pages = {271-283}, publisher = {IEEE Computer Society}, address = {Sydney, Australia}, author = {Heike Jagode and Joachim Hein} } @techreport {icl:411, title = {Task placement of parallel multi-dimensional FFTs on a mesh communication network}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-CS-08-613}, year = {2008}, month = {2008-01}, author = {Heike Jagode and Joachim Hein and Arthur Trew} }
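Note: the following is a minimal, illustrative sketch of the counter-instrumentation workflow that recurs throughout the entries above, written against PAPI's standard C low-level API. It is not code taken from any of the cited papers; the preset events requested (PAPI_TOT_INS, PAPI_TOT_CYC) are assumptions about what a given CPU exposes, and error handling is abbreviated.

/* Minimal sketch of the PAPI counter workflow referenced throughout this
 * bibliography. Preset availability varies by CPU; error handling is
 * abbreviated. Link with -lpapi. */
#include <stdio.h>
#include <stdlib.h>
#include <papi.h>

int main(void)
{
    int eventset = PAPI_NULL;
    long long counts[2];

    /* Initialize the library and create an empty event set. */
    if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT)
        exit(EXIT_FAILURE);
    if (PAPI_create_eventset(&eventset) != PAPI_OK)
        exit(EXIT_FAILURE);

    /* Add events by name; hardware presets, component-native events, and
     * software-defined events are all requested through the same call. */
    PAPI_add_named_event(eventset, "PAPI_TOT_INS");
    PAPI_add_named_event(eventset, "PAPI_TOT_CYC");

    PAPI_start(eventset);
    /* ... region of interest, e.g., a numerical kernel ... */
    PAPI_stop(eventset, counts);

    printf("instructions: %lld  cycles: %lld\n", counts[0], counts[1]);
    return 0;
}

The software-defined-event (SDE) entries above keep this read-side interface unchanged: libraries and runtimes export their own counters, and tools request them by name exactly as shown.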