% Publication list. Every entry has Dan Terpstra among its authors.
% Entries are ordered newest first (2014 down to 2001).
% Conventions: one field per line; authors in "Last, First" form; month given
% as the standard three-letter macro; DOIs stored bare (no resolver prefix);
% acronyms and proper nouns in titles are brace-protected against recasing.

@inproceedings{811,
  author       = {Marin, Gabriel and Dongarra, Jack and Terpstra, Dan},
  title        = {{MIAMI}: A Framework for Application Performance Diagnosis},
  booktitle    = {ISPASS-2014},
  year         = {2014},
  month        = mar,
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Monterey, CA},
  isbn         = {978-1-4799-3604-5},
  doi          = {10.1109/ISPASS.2014.6844480},
  abstract     = {A typical application tuning cycle repeats the following three steps in a loop: performance measurement, analysis of results, and code refactoring. While performance measurement is well covered by existing tools, analysis of results to understand the main sources of inefficiency and to identify opportunities for optimization is generally left to the user. Today{\textquoteright}s state of the art performance analysis tools use instrumentation or hardware counter sampling to measure the performance of interactions between code and the target architecture during execution. Such measurements are useful to identify hotspots in applications, places where execution time is spent or where cache misses are incurred. However, explanatory understanding of tuning opportunities requires a more detailed, mechanistic modeling approach. This paper presents MIAMI (Machine Independent Application Models for performance Insight), a set of tools for automatic performance diagnosis. MIAMI uses application characterization and models of target architectures to reason about an application{\textquoteright}s performance. MIAMI uses a modeling approach based on first-order principles to identify performance bottlenecks, pinpoint optimization opportunities, and compute bounds on the potential for improvement.},
}

@inproceedings{1383,
  author       = {McCraw, Heike and Terpstra, Dan and Dongarra, Jack and Davis, Kris and Musselman, Roy},
  title        = {Beyond the {CPU}: Hardware Performance Counter Monitoring on {Blue Gene/Q}},
  booktitle    = {International Supercomputing Conference 2013 (ISC{\textquoteright}13)},
  year         = {2013},
  month        = jun,
  publisher    = {Springer},
  organization = {Springer},
  address      = {Leipzig, Germany},
}

@inproceedings{1381,
  author       = {Weaver, Vincent and Terpstra, Dan and Moore, Shirley},
  title        = {Non-Determinism and Overcount on Modern Hardware Performance Counter Implementations},
  booktitle    = {2013 IEEE International Symposium on Performance Analysis of Systems and Software},
  year         = {2013},
  month        = apr,
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Austin, TX},
}

@inproceedings{1382,
  author       = {Weaver, Vincent and Terpstra, Dan and McCraw, Heike and Johnson, Matt and Kasichayanula, Kiran and Ralph, James and Nelson, John and Mucci, Phil and Mohan, Tushar and Moore, Shirley},
  title        = {{PAPI} 5: Measuring Power, Energy, and the Cloud},
  booktitle    = {2013 IEEE International Symposium on Performance Analysis of Systems and Software},
  year         = {2013},
  month        = apr,
  address      = {Austin, TX},
}

@inproceedings{icl:689,
  author       = {Weaver, Vincent M and Johnson, Matt and Kasichayanula, Kiran and Ralph, James and Luszczek, Piotr and Terpstra, Dan and Moore, Shirley},
  title        = {Measuring Energy and Power with {PAPI}},
  booktitle    = {International Workshop on Power-Aware Systems and Architectures},
  year         = {2012},
  month        = sep,
  address      = {Pittsburgh, PA},
  doi          = {10.1109/ICPPW.2012.39},
  keywords     = {papi},
  abstract     = {Energy and power consumption are becoming critical metrics in the design and usage of high performance systems. We have extended the Performance API (PAPI) analysis library to measure and report energy and power values. These values are reported using the existing PAPI API, allowing code previously instrumented for performance counters to also measure power and energy. Higher level tools that build on PAPI will automatically gain support for power and energy readings when used with the newest version of PAPI. We describe in detail the types of energy and power readings available through PAPI. We support external power meters, as well as values provided internally by recent CPUs and GPUs. Measurements are provided directly to the instrumented process, allowing immediate code analysis in real time. We provide examples showing results that can be obtained with our infrastructure.},
}

@inproceedings{icl:688,
  author       = {Johnson, Matt and McCraw, Heike and Moore, Shirley and Mucci, Phil and Nelson, John and Terpstra, Dan and Weaver, Vincent M and Mohan, Tushar},
  title        = {{PAPI-V}: Performance Monitoring for Virtual Machines},
  booktitle    = {CloudTech-HPC 2012},
  year         = {2012},
  month        = sep,
  address      = {Pittsburgh, PA},
  doi          = {10.1109/ICPPW.2012.29},
  keywords     = {papi},
  abstract     = {This paper describes extensions to the PAPI hardware counter library for virtual environments, called PAPI-V. The extensions support timing routines, I/O measurements, and processor counters. The PAPI-V extensions will allow application and tool developers to use a familiar interface to obtain relevant hardware performance monitoring information in virtual environments.},
}

@inproceedings{icl:686,
  author       = {Kasichayanula, Kiran and Terpstra, Dan and Luszczek, Piotr and Tomov, Stanimire and Moore, Shirley and Peterson, Gregory D.},
  title        = {Power Aware Computing on {GPUs}},
  booktitle    = {SAAHPC {\textquoteright}12},
  year         = {2012},
  month        = jul,
  address      = {Argonne, IL},
  note         = {Best Paper Award},
  keywords     = {magma},
}

@inproceedings{icl:616,
  author       = {Luszczek, Piotr and Meek, Eric and Moore, Shirley and Terpstra, Dan and Weaver, Vincent M and Dongarra, Jack},
  title        = {Evaluation of the {HPC Challenge} Benchmarks in Virtualized Environments},
  booktitle    = {6th Workshop on Virtualization in High-Performance Cloud Computing},
  year         = {2011},
  month        = aug,
  address      = {Bordeaux, France},
  keywords     = {hpcc},
}

@incollection{icl:557,
  author       = {Terpstra, Dan and Jagode, Heike and You, Haihang and Dongarra, Jack},
  title        = {Collecting Performance Data with {PAPI-C}},
  booktitle    = {Tools for High Performance Computing 2009},
  year         = {2010},
  month        = may,
  pages        = {157--173},
  publisher    = {Springer Berlin / Heidelberg},
  note         = {3rd Parallel Tools Workshop, Dresden, Germany},
  doi          = {10.1007/978-3-642-11261-4_11},
  keywords     = {mumi, papi},
  abstract     = {Modern high performance computer systems continue to increase in size and complexity. Tools to measure application performance in these increasingly complex environments must also increase the richness of their measurements to provide insights into the increasingly intricate ways in which software and hardware interact. PAPI (the Performance API) has provided consistent platform and operating system independent access to CPU hardware performance counters for nearly a decade. Recent trends toward massively parallel multi-core systems with often heterogeneous architectures present new challenges for the measurement of hardware performance information, which is now available not only on the CPU core itself, but scattered across the chip and system. We discuss the evolution of PAPI into Component PAPI, or PAPI-C, in which multiple sources of performance data can be measured simultaneously via a common software interface. Several examples of components and component data measurements are discussed. We explore the challenges to hardware performance measurement in existing multi-core architectures. We conclude with an exploration of future directions for the PAPI interface.},
}

@inproceedings{icl:497,
  author       = {Jagode, Heike and Moore, Shirley and Terpstra, Dan and Dongarra, Jack and Knuepfer, Andreas and Jurenz, Matthias and Mueller, Matthias S. and Nagel, Wolfgang E.},
  title        = {{I/O} Performance Analysis for the Petascale Simulation Code {FLASH}},
  booktitle    = {ISC{\textquoteright}09},
  year         = {2009},
  month        = jun,
  address      = {Hamburg, Germany},
  keywords     = {test},
}

@inproceedings{icl:130,
  author       = {Dongarra, Jack and London, Kevin and Moore, Shirley and Mucci, Phil and Terpstra, Dan and You, Haihang and Zhou, Min},
  title        = {Experiences and Lessons Learned with a Portable Interface to Hardware Performance Counters},
  booktitle    = {PADTAD Workshop, IPDPS 2003},
  year         = {2003},
  month        = apr,
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Nice, France},
  isbn         = {0-7695-1926-1},
  keywords     = {lacsi, papi},
  abstract     = {The PAPI project has defined and implemented a cross-platform interface to the hardware counters available on most modern microprocessors. The interface has gained widespread use and acceptance from hardware vendors, users, and tool developers. This paper reports on experiences with the community-based open-source effort to define the PAPI specification and implement it on a variety of platforms. Collaborations with tool developers who have incorporated support for PAPI are described. Issues related to interpretation and accuracy of hardware counter data and to the overheads of collecting this data are discussed. The paper concludes with implications for the design of the next version of PAPI.},
}

@inproceedings{icl:11,
  author       = {Dongarra, Jack and London, Kevin and Moore, Shirley and Mucci, Phil and Terpstra, Dan},
  title        = {Using {PAPI} for Hardware Performance Monitoring on {Linux} Systems},
  booktitle    = {Conference on Linux Clusters: The HPC Revolution},
  year         = {2001},
  month        = jun,
  publisher    = {Linux Clusters Institute},
  organization = {Linux Clusters Institute},
  address      = {Urbana, Illinois},
  keywords     = {papi},
  abstract     = {PAPI is a specification of a cross-platform interface to hardware performance counters on modern microprocessors. These counters exist as a small set of registers that count events, which are occurrences of specific signals related to a processor{\textquoteright}s function. Monitoring these events has a variety of uses in application performance analysis and tuning. The PAPI specification consists of both a standard set of events deemed most relevant for application performance tuning, as well as both high-level and low-level sets of routines for accessing the counters. The high level interface simply provides the ability to start, stop, and read sets of events, and is intended for the acquisition of simple but accurate measurement by application engineers. The fully programmable low-level interface provides sophisticated options for controlling the counters, such as setting thresholds for interrupt on overflow, as well as access to all native counting modes and events, and is intended for third-party tool writers or users with more sophisticated needs. PAPI has been implemented on a number of platforms, including Linux/x86 and Linux/IA-64. The Linux/x86 implementation requires a kernel patch that provides a driver for the hardware counters. The driver memory maps the counter registers into user space and allows virtualizing the counters on a per-process or per-thread basis. The kernel patch is being proposed for inclusion in the main Linux tree. The PAPI library provides access on Linux platforms not only to the standard set of events mentioned above but also to all the Linux/x86 and Linux/IA-64 native events. PAPI has been installed and is in use, either directly or through incorporation into third-party end-user performance analysis tools, on a number of Linux clusters, including the New Mexico LosLobos cluster and Linux clusters at NCSA and the University of Tennessee being used for the GrADS (Grid Application Development Software) project.},
}