@comment{Cleaned up: conference papers retyped @inproceedings with booktitle
  (several were @article/@conference with the venue in journal/publisher),
  bare DOIs (no resolver prefix), month macros, page-range en-dashes,
  escaped underscores, and brace-protected acronyms in titles.
  Citation keys are unchanged.}

@inproceedings{1382,
  title     = {{PAPI} 5: Measuring Power, Energy, and the Cloud},
  booktitle = {2013 IEEE International Symposium on Performance Analysis of Systems and Software},
  year      = {2013},
  month     = apr,
  address   = {Austin, TX},
  author    = {Vincent Weaver and Dan Terpstra and Heike McCraw and Matt Johnson and Kiran Kasichayanula and James Ralph and John Nelson and Phil Mucci and Tushar Mohan and Shirley Moore}
}

@inproceedings{icl:688,
  title     = {{PAPI-V}: Performance Monitoring for Virtual Machines},
  booktitle = {CloudTech-HPC 2012},
  year      = {2012},
  month     = sep,
  address   = {Pittsburgh, PA},
  abstract  = {This paper describes extensions to the PAPI hardware counter library for virtual environments, called PAPI-V. The extensions support timing routines, I/O measurements, and processor counters. The PAPI-V extensions will allow application and tool developers to use a familiar interface to obtain relevant hardware performance monitoring information in virtual environments.},
  keywords  = {papi},
  doi       = {10.1109/ICPPW.2012.29},
  author    = {Matt Johnson and Heike McCraw and Shirley Moore and Phil Mucci and John Nelson and Dan Terpstra and Vincent M. Weaver and Tushar Mohan}
}

@inproceedings{icl:296,
  title     = {Analysis and Optimization of {Yee\_Bench} using Hardware Performance Counters},
  booktitle = {Proceedings of Parallel Computing 2005 (ParCo)},
  year      = {2005},
  month     = jan,
  address   = {Malaga, Spain},
  abstract  = {In this paper, we report on our analysis and optimization of a serial Fortran 90 benchmark called Yee bench. This benchmark has been run on a variety of architectures and its performance is reasonably well understood. However, on AMD Opteron based machines, we found unexpected dips in the delivered MFLOPS of the code for a seemingly random set of problem sizes. Through the use of the Opteron{\textquoteright}s on-chip hardware performance counters and PapiEx, a PAPI based tool, we discovered that these drops were directly related to high L1 cache miss rates for these problem sizes. The high miss rates could be attributed to the fact that in the two core regions of the code we have references to three dynamically allocated arrays which compete for the same set in the Opteron{\textquoteright}s 2-way set associative cache. We validated this conclusion by accurately predicting those problem sizes that exhibit this problem. We were able to alleviate these performance anomalies using variable intra-array padding to effectively accomplish inter-array padding. We conclude with some comments on the general applicability of this method as well how one might improving the implementation of the Fortran 90 ALLOCATE intrinsic to handle this case.},
  keywords  = {papi},
  author    = {Ulf Andersson and Phil Mucci}
}

@inproceedings{icl:297,
  title     = {{PerfMiner}: Cluster-Wide Collection, Storage and Presentation of Application Level Hardware Performance Data},
  booktitle = {European Conference on Parallel Processing (Euro-Par 2005)},
  year      = {2005},
  month     = sep,
  publisher = {Springer},
  address   = {Monte de Caparica, Portugal},
  abstract  = {We present PerfMiner, a system for the transparent collection, storage and presentation of thread-level hardware performance data across an entire cluster. Every sub-process/thread spawned by the user through the batch system is measured with near zero overhead and no dilation of run-time. Performance metrics are collected at the thread level using a tool built on top of the Performance Application Programming Interface (PAPI). As the hardware counters are virtualized by the OS, the resulting counts are largely unaffected by other kernel or user processes. PerfMiner correlates this performance data with metadata from the batch system and places it in a database. Through a command line and web interface, the user can make queries to the database to report information on everything from overall workload characterization and system utilization to the performance of a single thread in a specific application. This is in contrast to other monitoring systems that report aggregate system-wide metrics sampled over a period of time. In this paper, we describe our implementation of PerfMiner as well as present some results from the test deployment of PerfMiner across three different clusters at the Center for Parallel Computers at The Royal Institute of Technology in Stockholm, Sweden.},
  keywords  = {papi},
  doi       = {10.1007/11549468_1},
  author    = {Phil Mucci and Daniel Ahlin and Johan Danielsson and Per Ekman and Lars Malinowski}
}

@inproceedings{icl:197,
  title     = {Accurate Cache and {TLB} Characterization Using Hardware Counters},
  booktitle = {International Conference on Computational Science (ICCS 2004)},
  year      = {2004},
  month     = jun,
  publisher = {Springer},
  address   = {Krakow, Poland},
  abstract  = {We have developed a set of microbenchmarks for accurately determining the structural characteristics of data cache memories and TLBs. These characteristics include cache size, cache line size, cache associativity, memory page size, number of data TLB entries, and data TLB associativity. Unlike previous microbenchmarks that used time-based measurements, our microbenchmarks use hardware event counts to more accurately and quickly determine these characteristics while requiring fewer limiting assumptions.},
  keywords  = {gco, lacsi, papi},
  doi       = {10.1007/978-3-540-24688-6_57},
  author    = {Jack Dongarra and Shirley Moore and Phil Mucci and Keith Seymour and Haihang You}
}

@inproceedings{icl:239,
  title     = {Automating the Large-Scale Collection and Analysis of Performance},
  booktitle = {5th LCI International Conference on Linux Clusters: The HPC Revolution},
  year      = {2004},
  month     = may,
  address   = {Austin, Texas},
  keywords  = {kojak, papi},
  author    = {Phil Mucci and Jack Dongarra and Rick Kufrin and Shirley Moore and Fengguang Song and Felix Wolf}
}

@inproceedings{icl:246,
  title     = {Memory Bandwidth and the Performance of Scientific Applications: A Study of the {AMD Opteron} Processor},
  booktitle = {2005 IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)},
  year      = {2004},
  month     = jan,
  note      = {Submitted},
  author    = {Phil Mucci}
}

@inproceedings{icl:130,
  title     = {Experiences and Lessons Learned with a Portable Interface to Hardware Performance Counters},
  booktitle = {PADTAD Workshop, IPDPS 2003},
  year      = {2003},
  month     = apr,
  publisher = {IEEE},
  address   = {Nice, France},
  abstract  = {The PAPI project has defined and implemented a cross-platform interface to the hardware counters available on most modern microprocessors. The interface has gained widespread use and acceptance from hardware vendors, users, and tool developers. This paper reports on experiences with the community-based open-source effort to define the PAPI specification and implement it on a variety of platforms. Collaborations with tool developers who have incorporated support for PAPI are described. Issues related to interpretation and accuracy of hardware counter data and to the overheads of collecting this data are discussed. The paper concludes with implications for the design of the next version of PAPI.},
  keywords  = {lacsi, papi},
  isbn      = {0-7695-1926-1},
  author    = {Jack Dongarra and Kevin London and Shirley Moore and Phil Mucci and Dan Terpstra and Haihang You and Min Zhou}
}

@inproceedings{icl:159,
  title     = {Performance Instrumentation and Measurement for Terascale Systems},
  booktitle = {ICCS 2003 Terascale Workshop},
  year      = {2003},
  month     = jun,
  publisher = {Springer, Berlin, Heidelberg},
  address   = {Melbourne, Australia},
  abstract  = {As computer systems grow in size and complexity, tool support is needed to facilitate the efficient mapping of large-scale applications onto these systems. To help achieve this mapping, performance analysis tools must provide robust performance observation capabilities at all levels of the system, as well as map low-level behavior to high-level program constructs. Instrumentation and measurement strategies, developed over the last several years, must evolve together with performance analysis infrastructure to address the challenges of new scalable parallel systems.},
  keywords  = {papi},
  doi       = {10.1007/3-540-44864-0_6},
  author    = {Jack Dongarra and Allen D. Malony and Shirley Moore and Phil Mucci and Sameer Shende}
}

@inproceedings{icl:15,
  title     = {End-user Tools for Application Performance Analysis, Using Hardware Counters},
  booktitle = {International Conference on Parallel and Distributed Computing Systems},
  year      = {2001},
  month     = aug,
  address   = {Dallas, TX},
  abstract  = {One purpose of the end-user tools described in this paper is to give users a graphical representation of performance information that has been gathered by instrumenting an application with the PAPI library. PAPI is a project that specifies a standard API for accessing hardware performance counters available on most modern microprocessors. These counters exist as a small set of registers that count ``events'', which are occurrences of specific signals and states related to a processor{\textquoteright}s function. Monitoring these events facilitates correlation between the structure of source/object code and the efficiency of the mapping of that code to the underlying architecture. The perfometer tool developed by the PAPI project provides a graphical view of this information, allowing users to quickly see where performance bottlenecks are in their application. Only one function call has to be added by the user to their program to take advantage of perfometer. This makes it quick and simple to add and remove instrumentation from a program. Also, perfometer allows users to change the ``event'' they are monitoring. Add the ability to monitor parallel applications, set alarms and a Java front-end that can run anywhere, and this gives the user a powerful tool for quickly discovering where and why a bottleneck exists. A number of third-party tools for analyzing performance of message-passing and/or threaded programs have also incorporated support for PAPI so as to be able to display and analyze hardware counter data from their interfaces.},
  keywords  = {papi},
  author    = {Kevin London and Jack Dongarra and Shirley Moore and Phil Mucci and Keith Seymour and T. Spencer}
}

@inproceedings{icl:16,
  title     = {The {PAPI} Cross-Platform Interface to Hardware Performance Counters},
  booktitle = {Department of Defense Users{\textquoteright} Group Conference Proceedings},
  year      = {2001},
  month     = jun,
  address   = {Biloxi, Mississippi},
  abstract  = {The purpose of the PAPI project is to specify a standard API for accessing hardware performance counters available on most modern microprocessors. These counters exist as a small set of registers that count ``events,'' which are occurrences of specific signals and states related to the processor{\textquoteright}s function. Monitoring these events facilitates correlation between the structure of source/object code and the efficiency of the mapping of that code to the underlying architecture. This correlation has a variety of uses in performance analysis and tuning. The PAPI project has developed a standard set of hardware events and a standard cross-platform library interface to the underlying counter hardware. The PAPI library has been implemented for a number of Shared Resource Center platforms. The PAPI project is developing end-user tools for dynamically selecting and displaying hardware counter performance data. PAPI support is also being incorporated into a number of third-party tools.},
  keywords  = {papi},
  author    = {Kevin London and Shirley Moore and Phil Mucci and Keith Seymour and Richard Luczak}
}

@inproceedings{icl:11,
  title     = {Using {PAPI} for Hardware Performance Monitoring on {Linux} Systems},
  booktitle = {Conference on Linux Clusters: The HPC Revolution},
  year      = {2001},
  month     = jun,
  publisher = {Linux Clusters Institute},
  address   = {Urbana, Illinois},
  abstract  = {PAPI is a specification of a cross-platform interface to hardware performance counters on modern microprocessors. These counters exist as a small set of registers that count events, which are occurrences of specific signals related to a processor{\textquoteright}s function. Monitoring these events has a variety of uses in application performance analysis and tuning. The PAPI specification consists of both a standard set of events deemed most relevant for application performance tuning, as well as both high-level and low-level sets of routines for accessing the counters. The high level interface simply provides the ability to start, stop, and read sets of events, and is intended for the acquisition of simple but accurate measurement by application engineers. The fully programmable low-level interface provides sophisticated options for controlling the counters, such as setting thresholds for interrupt on overflow, as well as access to all native counting modes and events, and is intended for third-party tool writers or users with more sophisticated needs. PAPI has been implemented on a number of platforms, including Linux/x86 and Linux/IA-64. The Linux/x86 implementation requires a kernel patch that provides a driver for the hardware counters. The driver memory maps the counter registers into user space and allows virtualizing the counters on a per-process or per-thread basis. The kernel patch is being proposed for inclusion in the main Linux tree. The PAPI library provides access on Linux platforms not only to the standard set of events mentioned above but also to all the Linux/x86 and Linux/IA-64 native events. PAPI has been installed and is in use, either directly or through incorporation into third-party end-user performance analysis tools, on a number of Linux clusters, including the New Mexico LosLobos cluster and Linux clusters at NCSA and the University of Tennessee being used for the GrADS (Grid Application Development Software) project.},
  keywords  = {papi},
  author    = {Jack Dongarra and Kevin London and Shirley Moore and Phil Mucci and Dan Terpstra}
}

@article{icl:31,
  title     = {A Portable Programming Interface for Performance Evaluation on Modern Processors},
  journal   = {The International Journal of High Performance Computing Applications},
  volume    = {14},
  number    = {3},
  year      = {2000},
  month     = sep,
  pages     = {189--204},
  keywords  = {papi},
  doi       = {10.1177/109434200001400303},
  author    = {Shirley Browne and Jack Dongarra and Nathan Garner and George Ho and Phil Mucci}
}

@techreport{icl:226,
  title       = {A Portable Programming Interface for Performance Evaluation on Modern Processors},
  institution = {University of Tennessee},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-00-444},
  year        = {2000},
  month       = jul,
  author      = {Shirley Browne and Jack Dongarra and Nathan Garner and Kevin London and Phil Mucci}
}

@inproceedings{icl:32,
  title     = {A Scalable Cross-Platform Infrastructure for Application Performance Tuning Using Hardware Counters},
  booktitle = {Proceedings of SuperComputing 2000 (SC{\textquoteright}00)},
  year      = {2000},
  month     = nov,
  address   = {Dallas, TX},
  keywords  = {papi},
  author    = {Shirley Browne and Jack Dongarra and Nathan Garner and Kevin London and Phil Mucci}
}

@inproceedings{icl:59,
  title     = {{PAPI}: A Portable Interface to Hardware Performance Counters},
  booktitle = {Proceedings of Department of Defense HPCMP Users Group Conference},
  year      = {1999},
  month     = jun,
  keywords  = {papi},
  author    = {Shirley Browne and Christine Deane and George Ho and Phil Mucci}
}