@comment{
Publication records; entries in this part of the file run newest year first.
Conventions:
- one field per line; month uses the standard three-letter macros (jan ... dec)
  and is omitted where the upstream record carried no month (exported as YYYY-00);
- conference papers are typed inproceedings with the venue in booktitle;
  technical reports carry institution, type and number;
- page ranges use a double hyphen; doi holds the bare identifier, no resolver prefix;
- acronyms and proper nouns in titles are brace-protected against style recasing;
- status remarks (to appear, submitted, award mentions) live in note.
NOTE(review): institution for the ICL technical reports is inferred from the ICL-UT report numbers -- confirm.
NOTE(review): icl:108 and icl:88 look like the same article recorded under different years and author spellings -- confirm which record is authoritative.
NOTE(review): icl:586 is typed incollection; its publisher is not recorded upstream and should be added.
}

@inproceedings{1381,
  author = {Vincent Weaver and Dan Terpstra and Shirley Moore},
  title = {Non-Determinism and Overcount on Modern Hardware Performance Counter Implementations},
  booktitle = {2013 IEEE International Symposium on Performance Analysis of Systems and Software},
  year = {2013},
  month = apr,
  publisher = {IEEE},
  address = {Austin, TX},
}

@inproceedings{1382,
  author = {Vincent Weaver and Dan Terpstra and Heike McCraw and Matt Johnson and Kiran Kasichayanula and James Ralph and John Nelson and Phil Mucci and Tushar Mohan and Shirley Moore},
  title = {{PAPI} 5: Measuring Power, Energy, and the Cloud},
  booktitle = {2013 IEEE International Symposium on Performance Analysis of Systems and Software},
  year = {2013},
  month = apr,
  address = {Austin, TX},
}

@inproceedings{icl:689,
  author = {Vincent M Weaver and Matt Johnson and Kiran Kasichayanula and James Ralph and Piotr Luszczek and Dan Terpstra and Shirley Moore},
  title = {Measuring Energy and Power with {PAPI}},
  booktitle = {International Workshop on Power-Aware Systems and Architectures},
  year = {2012},
  month = sep,
  address = {Pittsburgh, PA},
  keywords = {papi},
  doi = {10.1109/ICPPW.2012.39},
  abstract = {Energy and power consumption are becoming critical metrics in the design and usage of high performance systems. We have extended the Performance API (PAPI) analysis library to measure and report energy and power values. These values are reported using the existing PAPI API, allowing code previously instrumented for performance counters to also measure power and energy. Higher level tools that build on PAPI will automatically gain support for power and energy readings when used with the newest version of PAPI. We describe in detail the types of energy and power readings available through PAPI. We support external power meters, as well as values provided internally by recent CPUs and GPUs. Measurements are provided directly to the instrumented process, allowing immediate code analysis in real time. We provide examples showing results that can be obtained with our infrastructure.},
}

@inproceedings{icl:688,
  author = {Matt Johnson and Heike McCraw and Shirley Moore and Phil Mucci and John Nelson and Dan Terpstra and Vincent M Weaver and Tushar Mohan},
  title = {{PAPI-V}: Performance Monitoring for Virtual Machines},
  booktitle = {CloudTech-HPC 2012},
  year = {2012},
  month = sep,
  address = {Pittsburgh, PA},
  keywords = {papi},
  doi = {10.1109/ICPPW.2012.29},
  abstract = {This paper describes extensions to the PAPI hardware counter library for virtual environments, called PAPI-V. The extensions support timing routines, I/O measurements, and processor counters. The PAPI-V extensions will allow application and tool developers to use a familiar interface to obtain relevant hardware performance monitoring information in virtual environments.},
}

@inproceedings{icl:686,
  author = {Kiran Kasichayanula and Dan Terpstra and Piotr Luszczek and Stanimire Tomov and Shirley Moore and Gregory D. Peterson},
  title = {Power Aware Computing on {GPUs}},
  booktitle = {SAAHPC {\textquoteright}12},
  year = {2012},
  month = jul,
  address = {Argonne, IL},
  note = {Best Paper Award},
  keywords = {magma},
}

@inproceedings{icl:617,
  author = {Haihang You and Bhanu Rekapalli and Qing Liu and Shirley Moore},
  title = {Autotuned Parallel {I/O} for Highly Scalable Biosequence Analysis},
  booktitle = {TeraGrid{\textquoteright}11},
  year = {2011},
  month = jul,
  address = {Salt Lake City, Utah},
}

@inproceedings{icl:615,
  author = {Haihang You and Qing Liu and Zhiqiang Li and Shirley Moore},
  title = {The Design of an Auto-tuning {I/O} Framework on {Cray XT5} System},
  booktitle = {Cray Users Group Conference (CUG{\textquoteright}11)},
  year = {2011},
  month = may,
  address = {Fairbanks, Alaska},
  note = {Best Paper Finalist},
  keywords = {gco},
}

@article{icl:623,
  author = {Charles Lively and Xingfu Wu and Valerie Taylor and Shirley Moore and Hung-Ching Chang and Kirk Cameron},
  title = {Energy and performance characteristics of different parallel implementations of scientific applications on multicore systems},
  journal = {International Journal of High Performance Computing Applications},
  volume = {25},
  number = {3},
  pages = {342--350},
  year = {2011},
  keywords = {mumi},
}

@inproceedings{icl:616,
  author = {Piotr Luszczek and Eric Meek and Shirley Moore and Dan Terpstra and Vincent M Weaver and Jack Dongarra},
  title = {Evaluation of the {HPC Challenge} Benchmarks in Virtualized Environments},
  booktitle = {6th Workshop on Virtualization in High-Performance Cloud Computing},
  year = {2011},
  month = aug,
  address = {Bordeaux, France},
  keywords = {hpcc},
}

@inproceedings{1361,
  author = {Kiran Kasichayanula and Haihang You and Shirley Moore and Stanimire Tomov and Heike Jagode and Matt Johnson},
  title = {Power-aware Computing on {GPGPUs}},
  booktitle = {Fall Creek Falls Conference},
  year = {2011},
  month = sep,
  address = {Gatlinburg, TN},
  note = {Poster},
}

@inproceedings{icl:619,
  author = {Charles Lively and Xingfu Wu and Valerie Taylor and Shirley Moore and Hung-Ching Chang and Chun-Yi Su and Kirk Cameron},
  title = {Power-Aware Prediction Models of Hybrid ({MPI/OpenMP}) Scientific Applications},
  booktitle = {International Conference on Energy-Aware High Performance Computing (EnA-HPC 2011)},
  year = {2011},
  month = sep,
  address = {Hamburg, Germany},
  keywords = {mumi},
}

@article{icl:618,
  author = {Shirley Moore and James Ralph},
  title = {User-Defined Events for Hardware Performance Monitoring},
  journal = {Procedia Computer Science},
  volume = {4},
  pages = {2096--2104},
  year = {2011},
  month = may,
  publisher = {Elsevier},
  keywords = {mumi, papi},
  doi = {10.1016/j.procs.2011.04.229},
  abstract = {PAPI is a widely used cross-platform interface to hardware performance counters. PAPI currently supports native events, which are those provided by a given platform, and preset events, which are pre-defined events thought to be common across platforms. Presets are currently mapped and defined at the time that PAPI is compiled and installed. The idea of user-defined events is to allow users to define their own metrics and to have those metrics mapped to events on a platform without the need to re-install PAPI. User-defined events can be defined in terms of native, preset, and previously defined user-defined events. The user can combine events and constants in an arbitrary expression to define a new metric and give a name to the new metric. This name can then be specified as a PAPI event in a PAPI library call the same way as native and preset events. End-user tools such as TAU and Scalasca that use PAPI can also use the user-defined metrics. Users can publish their metric definitions so that other users can use them as well. We present several examples of how user-defined events can be used for performance analysis and modeling.},
}

@incollection{icl:586,
  author = {Jack Dongarra and Shirley Moore},
  title = {Empirical Performance Tuning of Dense Linear Algebra Software},
  editor = {David Bailey and Robert Lucas and Sam Williams},
  booktitle = {Performance Tuning of Scientific Applications},
  year = {2010},
  note = {to appear},
}

@inproceedings{icl:584,
  author = {Stanimire Tomov and Wenchang Lu and Jerzy Bernholc and Shirley Moore and Jack Dongarra},
  title = {Performance Evaluation for Petascale Quantum Simulation Tools},
  booktitle = {Proceedings of the Cray Users{\textquoteright} Group Meeting},
  year = {2010},
  month = may,
  address = {Atlanta, GA},
}

@inproceedings{icl:554,
  author = {Jerzy Bernholc and Miroslav Hodak and Wenchang Lu and Shirley Moore and Stanimire Tomov},
  title = {Scalability Study of a Quantum Simulation Code},
  booktitle = {PARA 2010},
  year = {2010},
  month = jun,
  address = {Reykjavik, Iceland},
}

@inproceedings{icl:479,
  author = {Fengguang Song and Shirley Moore and Jack Dongarra},
  title = {Analytical Modeling and Optimization for Affinity Based Thread Scheduling on Multicore Systems},
  booktitle = {IEEE Cluster 2009},
  year = {2009},
  month = aug,
  address = {New Orleans},
  keywords = {gridpac, mumi},
}

@article{icl:596,
  author = {Karl F{\"u}rlinger and Shirley Moore},
  title = {Capturing and Analyzing the Execution Control Flow of {OpenMP} Applications},
  journal = {International Journal of Parallel Programming},
  volume = {37},
  number = {3},
  pages = {266--276},
  year = {2009},
}

@inproceedings{icl:497,
  author = {Heike Jagode and Shirley Moore and Dan Terpstra and Jack Dongarra and Andreas Knuepfer and Matthias Jurenz and Matthias S. Mueller and Wolfgang E. Nagel},
  title = {{I/O} Performance Analysis for the Petascale Simulation Code {FLASH}},
  booktitle = {ISC{\textquoteright}09},
  year = {2009},
  month = jun,
  address = {Hamburg, Germany},
  keywords = {test},
}

@inproceedings{icl:499,
  author = {Ricardo Portillo and Patricia J. Teller and David Cronk and Shirley Moore},
  title = {Making Performance Analysis and Tuning Part of the Software Development Cycle},
  booktitle = {Proceedings of DoD HPCMP UGC 2009},
  year = {2009},
  month = jun,
  publisher = {IEEE},
  address = {San Diego, CA},
}

@inproceedings{icl:602,
  author = {Bronis R. de Supinski and Sadaf Alam and David Bailey and Laura Carrington and Chris Daley and Anshu Dubey and Todd Gamblin and Dan Gunter and Paul D. Hovland and Heike Jagode and Karen Karavanic and Gabriel Marin and John Mellor-Crummey and Shirley Moore and Boyana Norris and Leonid Oliker and Catherine Olschanowsky and Philip C. Roth and Martin Schulz and Sameer Shende and Allan Snavely},
  title = {Modeling the {Office of Science} Ten Year Facilities Plan: The {PERI} Architecture Tiger Team},
  booktitle = {SciDAC 2009},
  series = {Journal of Physics: Conference Series},
  volume = {180},
  pages = {012039},
  year = {2009},
  month = jul,
  publisher = {IOP Publishing},
  address = {San Diego, California},
  keywords = {test},
}

@inproceedings{icl:478,
  author = {Stanimire Tomov and Wenchang Lu and Jerzy Bernholc and Shirley Moore and Jack Dongarra},
  title = {Performance evaluation for petascale quantum simulation tools},
  booktitle = {Proceedings of CUG09},
  year = {2009},
  month = may,
  address = {Atlanta, GA},
  keywords = {doe-nano},
}

@article{icl:595,
  author = {Karl F{\"u}rlinger and Shirley Moore},
  title = {Recording the Control Flow of Parallel Applications to Determine Iterative and Phase-Based Behavior},
  journal = {Future Generation Computing Systems},
  volume = {26},
  pages = {162--166},
  year = {2009},
}

@inproceedings{icl:501,
  author = {Fengguang Song and Shirley Moore and Jack Dongarra},
  title = {A Scalable Non-blocking Multicast Scheme for Distributed {DAG} Scheduling},
  booktitle = {The International Conference on Computational Science 2009 (ICCS 2009)},
  volume = {5544},
  pages = {195--204},
  year = {2009},
  month = may,
  address = {Baton Rouge, LA},
  keywords = {plasma},
}

@techreport{icl:432,
  author = {Fengguang Song and Shirley Moore and Jack Dongarra},
  title = {Analytical Modeling for Affinity-Based Thread Scheduling on Multicore Platforms},
  institution = {University of Tennessee},
  type = {Computer Science Technical Report},
  number = {UT-CS-08-626},
  year = {2008},
  month = jan,
}

@inproceedings{icl:429,
  author = {Karl F{\"u}rlinger and Shirley Moore},
  title = {Detection and Analysis of Iterative Behavior in Parallel Applications},
  booktitle = {Proceedings of the 2008 International Conference on Computational Science (ICCS 2008)},
  volume = {5103},
  pages = {261--267},
  year = {2008},
  month = jan,
  address = {Krakow, Poland},
  keywords = {point},
}

@inproceedings{icl:440,
  author = {Jack Dongarra and Shirley Moore and Gregory D. Peterson and Stanimire Tomov and Jeff Allred and Vincent Natoli and David Richie},
  title = {Exploring New Architectures in Accelerating {CFD} for {Air Force} Applications},
  booktitle = {Proceedings of the DoD HPCMP User Group Conference},
  year = {2008},
  month = jan,
  address = {Seattle, Washington},
  keywords = {magma},
}

@inproceedings{icl:459,
  author = {Karl F{\"u}rlinger and Shirley Moore},
  title = {{OpenMP}-centric Performance Analysis of Hybrid Applications},
  booktitle = {Proc. 2008 IEEE International Conference on Cluster Computing (CLUSTER 2008)},
  year = {2008},
  month = jan,
  address = {Tsukuba, Japan},
}

@article{icl:417,
  author = {Oscar Hernandez and Fengguang Song and Barbara Chapman and Jack Dongarra and Bernd Mohr and Shirley Moore and Felix Wolf},
  title = {Performance Instrumentation and Compiler Optimizations for {MPI/OpenMP} Applications},
  journal = {Lecture Notes in Computer Science, OpenMP Shared Memory Parallel Programming},
  volume = {4315},
  year = {2008},
  publisher = {Springer Berlin / Heidelberg},
}

@article{icl:462,
  author = {David Bailey and Jacqueline Chame and Chun Chen and Jack Dongarra and Mary Hall and Jeffrey K. Hollingsworth and Paul D. Hovland and Shirley Moore and Keith Seymour and Jaewook Shin and Ananta Tiwari and Sam Williams and Haihang You},
  title = {{PERI} Auto-tuning},
  journal = {Proc. SciDAC 2008},
  volume = {125},
  year = {2008},
  month = jan,
  publisher = {Journal of Physics},
  address = {Seattle, Washington},
  keywords = {gco},
}

@inproceedings{icl:412,
  author = {Felix Wolf and Brian Wylie and Erika Abraham and Wolfgang Frings and Karl F{\"u}rlinger and Markus Geimer and Marc-Andre Hermanns and Bernd Mohr and Shirley Moore and Matthias Pfeifer},
  title = {Usage of the {Scalasca} Toolset for Scalable Performance Analysis of Large-scale Parallel Applications},
  editor = {Michael Resch and Rainer Keller and Valentin Himmler and Bettina Krammer and A Schulz},
  booktitle = {Proceedings of the 2nd International Workshop on Tools for High Performance Computing},
  pages = {157--167},
  year = {2008},
  month = jan,
  publisher = {Springer},
  address = {Stuttgart, Germany},
  keywords = {point},
}

@inproceedings{icl:460,
  author = {Karl F{\"u}rlinger and Shirley Moore},
  title = {Visualizing the Program Execution Control Flow of {OpenMP} Applications},
  booktitle = {Proc. 4th International Workshop on OpenMP (IWOMP 2008)},
  series = {Lecture Notes in Computer Science},
  volume = {5004},
  pages = {181--190},
  year = {2008},
  month = jan,
  address = {West Lafayette, Indiana},
}

@techreport{icl:336,
  author = {Haihang You and Keith Seymour and Jack Dongarra and Shirley Moore},
  title = {Automated Empirical Tuning of a Multiresolution Analysis Kernel},
  institution = {University of Tennessee},
  type = {ICL Technical Report},
  number = {ICL-UT-07-01},
  pages = {10},
  year = {2007},
  month = jan,
  keywords = {gco},
}

@article{icl:400,
  author = {Felix Wolf and Bernd Mohr and Jack Dongarra and Shirley Moore},
  title = {Automatic Analysis of Inefficiency Patterns in Parallel Applications},
  journal = {Concurrency and Computation: Practice and Experience},
  volume = {19},
  number = {11},
  pages = {1481--1496},
  year = {2007},
  month = aug,
}

@inproceedings{icl:383,
  author = {Karl F{\"u}rlinger and Shirley Moore},
  title = {Continuous Runtime Profiling of {OpenMP} Applications},
  booktitle = {Proceedings of the 2007 Conference on Parallel Computing (PARCO 2007)},
  year = {2007},
  month = jan,
  address = {Juelich and Aachen, Germany},
  keywords = {kojak},
}

@techreport{icl:338,
  author = {Haihang You and Keith Seymour and Jack Dongarra and Shirley Moore},
  title = {Empirical Tuning of a Multiresolution Analysis Kernel using a Specialized Code Generator},
  institution = {University of Tennessee},
  type = {ICL Technical Report},
  number = {ICL-UT-07-02},
  year = {2007},
  month = jan,
  keywords = {gco},
}

@inproceedings{icl:367,
  author = {Fengguang Song and Shirley Moore and Jack Dongarra},
  title = {Feedback-Directed Thread Scheduling with Memory Considerations},
  booktitle = {IEEE International Symposium on High Performance Distributed Computing},
  year = {2007},
  month = jun,
  address = {Monterey Bay, CA},
}

@inproceedings{icl:386,
  author = {Fengguang Song and Shirley Moore and Jack Dongarra},
  title = {{L2} Cache Modeling for Scientific Applications on Chip Multi-Processors},
  booktitle = {Proceedings of the 2007 International Conference on Parallel Processing},
  year = {2007},
  month = jan,
  publisher = {IEEE Computer Society},
  address = {Xi{\textquoteright}an, China},
}

@inproceedings{icl:390,
  author = {Sameer Shende and Allen D. Malony and Shirley Moore and David Cronk},
  title = {Memory Leak Detection in {Fortran} Applications using {TAU}},
  booktitle = {Proc. DoD HPCMP Users Group Conference (HPCMP-UGC{\textquoteright}07)},
  year = {2007},
  month = jan,
  publisher = {IEEE Computer Society},
  address = {Pittsburgh, PA},
}

@inproceedings{icl:387,
  author = {Bronis R. de Supinski and Jeffrey K. Hollingsworth and Shirley Moore and Patrick H. Worley},
  title = {Results of the {PERI} survey of {SciDAC} applications},
  booktitle = {SciDAC 2007},
  series = {Journal of Physics: Conference Series},
  volume = {78},
  year = {2007},
  month = jan,
}

@inproceedings{icl:329,
  author = {Fengguang Song and Jack Dongarra and Shirley Moore},
  title = {Experiments with {Strassen}{\textquoteright}s Algorithm: From Sequential to Parallel},
  booktitle = {18th IASTED International Conference on Parallel and Distributed Computing and Systems PDCS 2006},
  year = {2006},
  month = jan,
  address = {Dallas, Texas},
  note = {submitted},
}

@inproceedings{icl:309,
  author = {Felix Wolf and Felix Freitag and Bernd Mohr and Shirley Moore and Brian Wylie},
  title = {Large Event Traces in Parallel Performance Analysis},
  booktitle = {8th Workshop `Parallel Systems and Algorithms' (PASA)},
  series = {Lecture Notes in Informatics},
  year = {2006},
  month = mar,
  publisher = {Gesellschaft f{\"u}r Informatik},
  address = {Frankfurt/Main, Germany},
  note = {Report number ICL-UT-06-08},
  keywords = {kojak},
}

@techreport{icl:334,
  author = {Fengguang Song and Shirley Moore and Jack Dongarra},
  title = {Modeling of {L2} Cache Behavior for Thread-Parallel Scientific Programs on Chip Multi-Processors},
  institution = {University of Tennessee},
  type = {Computer Science Technical Report},
  number = {UT-CS-06-583},
  year = {2006},
  month = jan,
}

@inproceedings{icl:319,
  author = {Oscar Hernandez and Fengguang Song and Barbara Chapman and Jack Dongarra and Bernd Mohr and Shirley Moore and Felix Wolf},
  title = {Performance Instrumentation and Compiler Optimizations for {MPI/OpenMP} Applications},
  booktitle = {Second International Workshop on OpenMP},
  year = {2006},
  month = jan,
  address = {Reims, France},
  keywords = {kojak},
}

@article{icl:271,
  author = {Felix Wolf and Bernd Mohr and Jack Dongarra and Shirley Moore},
  title = {Automatic analysis of inefficiency patterns in parallel applications},
  journal = {Concurrency and Computation: Practice and Experience},
  year = {2005},
  note = {Special issue ``Automatic Performance Analysis'' (submitted)},
  keywords = {kojak},
}

@inproceedings{icl:288,
  author = {Nikhil Bhatia and Fengguang Song and Felix Wolf and Jack Dongarra and Bernd Mohr and Shirley Moore},
  title = {Automatic Experimental Analysis of Communication Patterns in Virtual Topologies},
  booktitle = {Proceedings of the International Conference on Parallel Processing},
  year = {2005},
  month = jun,
  publisher = {IEEE Computer Society},
  address = {Oslo, Norway},
  keywords = {kojak},
}

@inproceedings{icl:248,
  author = {Shirley Moore and Felix Wolf and Jack Dongarra and Bernd Mohr},
  title = {Improving Time to Solution with Automated Performance Analysis},
  booktitle = {Second Workshop on Productivity and Performance in High-End Computing (P-PHEC) at 11th International Symposium on High Performance Computer Architecture (HPCA-2005)},
  year = {2005},
  month = feb,
  address = {San Francisco},
  keywords = {kojak},
}

@inproceedings{icl:274,
  author = {Nikhil Bhatia and Shirley Moore and Felix Wolf and Jack Dongarra and Bernd Mohr},
  title = {A Pattern-Based Approach to Automated Application Performance Analysis},
  booktitle = {Workshop on Patterns in High Performance Computing},
  year = {2005},
  month = may,
  address = {University of Illinois at Urbana-Champaign},
  keywords = {kojak},
}

@inproceedings{icl:287,
  author = {Patrick H. Worley and Jeff Candy and Laura Carrington and Kevin Huck and Timothy Kaiser and Kumar Mahinthakumar and Allen D. Malony and Shirley Moore and Dan Reed and Philip C. Roth and H. Shan and Sameer Shende and Allan Snavely and S. Sreepathi and Felix Wolf and Y. Zhang},
  title = {Performance Analysis of {GYRO}: A Tool Evaluation},
  booktitle = {Proceedings of the 2005 SciDAC Conference},
  year = {2005},
  month = jun,
  address = {San Francisco, CA},
  keywords = {kojak},
}

@inproceedings{icl:298,
  author = {Shirley Moore and David Cronk and Felix Wolf and Avi Purkayastha and Patricia J. Teller and Robert Araiza and Gabriela Aguilera and Jamie Nava},
  title = {Performance Profiling and Analysis of {DoD} Applications using {PAPI} and {TAU}},
  booktitle = {Proceedings of DoD HPCMP UGC 2005},
  year = {2005},
  month = jun,
  publisher = {IEEE},
  address = {Nashville, TN},
  keywords = {papi},
}

@inproceedings{icl:270,
  author = {Shirley Moore and Felix Wolf and Jack Dongarra and Sameer Shende and Allen D. Malony and Bernd Mohr},
  title = {A Scalable Approach to {MPI} Application Performance Analysis},
  booktitle = {Proc. of the 12th European Parallel Virtual Machine and Message Passing Interface Conference},
  series = {Lecture Notes in Computer Science},
  year = {2005},
  month = sep,
  publisher = {Springer},
  keywords = {kojak},
}

@inproceedings{icl:197,
  author = {Jack Dongarra and Shirley Moore and Phil Mucci and Keith Seymour and Haihang You},
  title = {Accurate Cache and {TLB} Characterization Using Hardware Counters},
  booktitle = {International Conference on Computational Science (ICCS 2004)},
  year = {2004},
  month = jun,
  publisher = {Springer},
  address = {Krakow, Poland},
  keywords = {gco, lacsi, papi},
  doi = {10.1007/978-3-540-24688-6_57},
  abstract = {We have developed a set of microbenchmarks for accurately determining the structural characteristics of data cache memories and TLBs. These characteristics include cache size, cache line size, cache associativity, memory page size, number of data TLB entries, and data TLB associativity. Unlike previous microbenchmarks that used time-based measurements, our microbenchmarks use hardware event counts to more accurately and quickly determine these characteristics while requiring fewer limiting assumptions.},
}

@inproceedings{icl:233,
  author = {Fengguang Song and Felix Wolf and Nikhil Bhatia and Jack Dongarra and Shirley Moore},
  title = {An Algebra for Cross-Experiment Performance Analysis},
  booktitle = {2004 International Conference on Parallel Processing (ICCP-04)},
  year = {2004},
  month = aug,
  address = {Montreal, Quebec, Canada},
  keywords = {kojak},
}

@inproceedings{icl:239,
  author = {Phil Mucci and Jack Dongarra and Rick Kufrin and Shirley Moore and Fengguang Song and Felix Wolf},
  title = {Automating the Large-Scale Collection and Analysis of Performance},
  booktitle = {5th LCI International Conference on Linux Clusters: The HPC Revolution},
  year = {2004},
  month = may,
  address = {Austin, Texas},
  keywords = {kojak, papi},
}

@inproceedings{icl:232,
  author = {Felix Wolf and Bernd Mohr and Jack Dongarra and Shirley Moore},
  title = {Efficient Pattern Search in Large Traces through Successive Refinement},
  booktitle = {Proceedings of Euro-Par 2004},
  year = {2004},
  month = aug,
  publisher = {Springer-Verlag},
  address = {Pisa, Italy},
  keywords = {kojak},
}

@techreport{icl:200,
  author = {Keith Moore and Jack Dongarra and Shirley Moore and Eric Grosse},
  title = {{NetBuild}: Automated Installation and Use of Network-Accessible Software Libraries},
  institution = {University of Tennessee},
  type = {ICL Technical Report},
  number = {ICL-UT-04-02},
  year = {2004},
  month = jan,
  keywords = {netbuild},
}

@inproceedings{icl:130,
  author = {Jack Dongarra and Kevin London and Shirley Moore and Phil Mucci and Dan Terpstra and Haihang You and Min Zhou},
  title = {Experiences and Lessons Learned with a Portable Interface to Hardware Performance Counters},
  booktitle = {PADTAD Workshop, IPDPS 2003},
  year = {2003},
  month = apr,
  publisher = {IEEE},
  address = {Nice, France},
  keywords = {lacsi, papi},
  isbn = {0-7695-1926-1},
  abstract = {The PAPI project has defined and implemented a cross-platform interface to the hardware counters available on most modern microprocessors. The interface has gained widespread use and acceptance from hardware vendors, users, and tool developers. This paper reports on experiences with the community-based open-source effort to define the PAPI specification and implement it on a variety of platforms. Collaborations with tool developers who have incorporated support for PAPI are described. Issues related to interpretation and accuracy of hardware counter data and to the overheads of collecting this data are discussed. The paper concludes with implications for the design of the next version of PAPI.},
}

@inproceedings{icl:159,
  author = {Jack Dongarra and Allen D. Malony and Shirley Moore and Phil Mucci and Sameer Shende},
  title = {Performance Instrumentation and Measurement for Terascale Systems},
  booktitle = {ICCS 2003 Terascale Workshop},
  year = {2003},
  month = jun,
  publisher = {Springer, Berlin, Heidelberg},
  address = {Melbourne, Australia},
  keywords = {papi},
  doi = {10.1007/3-540-44864-0_6},
  abstract = {As computer systems grow in size and complexity, tool support is needed to facilitate the efficient mapping of large-scale applications onto these systems. To help achieve this mapping, performance analysis tools must provide robust performance observation capabilities at all levels of the system, as well as map low-level behavior to high-level program constructs. Instrumentation and measurement strategies, developed over the last several years, must evolve together with performance analysis infrastructure to address the challenges of new scalable parallel systems.},
}

@article{icl:92,
  author = {Shirley Moore and A. J. Baker and Jack Dongarra and Christian Halloy and Chung Ng},
  title = {Active {Netlib}: An Active Mathematical Software Collection for Inquiry-based Computational Science and Engineering Education},
  journal = {Journal of Digital Information special issue on Interactivity in Digital Libraries},
  volume = {2},
  number = {4},
  year = {2002},
  keywords = {activenetlib, rib},
}

@inproceedings{icl:76,
  author = {Shirley Moore},
  title = {A Comparison of Counting and Sampling Modes of Using Performance Monitoring Hardware},
  booktitle = {International Conference on Computational Science (ICCS 2002)},
  year = {2002},
  month = apr,
  publisher = {Springer},
  address = {Amsterdam, Netherlands},
  keywords = {papi},
  doi = {10.1007/3-540-46080-2_95},
  abstract = {Performance monitoring hardware is available on most modern microprocessors in the form of hardware counters and other registers that record data about processor events. This hardware may be used in counting mode, in which aggregate events counts are accumulated, and/or in sampling mode, in which time-based or event-based sampling is used to collect profiling data. This paper discusses uses of these two modes and considers the issues of efficiency and accuracy raised by each. Implications for the PAPI cross-platform hardware counter interface are also discussed.},
}

@article{icl:108,
  author = {Shirley Browne and Jack Dongarra and Anne Trefethen},
  title = {Numerical Libraries and Tools for Scalable Parallel Cluster Computing},
  journal = {International Journal of High Performance Applications and Supercomputing},
  volume = {15},
  number = {2},
  pages = {175--180},
  year = {2002},
  month = oct,
}

@inproceedings{icl:15,
  author = {Kevin London and Jack Dongarra and Shirley Moore and Phil Mucci and Keith Seymour and T. Spencer},
  title = {End-user Tools for Application Performance Analysis, Using Hardware Counters},
  booktitle = {International Conference on Parallel and Distributed Computing Systems},
  year = {2001},
  month = aug,
  address = {Dallas, TX},
  keywords = {papi},
  abstract = {One purpose of the end-user tools described in this paper is to give users a graphical representation of performance information that has been gathered by instrumenting an application with the PAPI library. PAPI is a project that specifies a standard API for accessing hardware performance counters available on most modern microprocessors. These counters exist as a small set of registers that count ``events'', which are occurrences of specific signals and states related to a processor{\textquoteright}s function. Monitoring these events facilitates correlation between the structure of source/object code and the efficiency of the mapping of that code to the underlying architecture. The perfometer tool developed by the PAPI project provides a graphical view of this information, allowing users to quickly see where performance bottlenecks are in their application. Only one function call has to be added by the user to their program to take advantage of perfometer. This makes it quick and simple to add and remove instrumentation from a program. Also, perfometer allows users to change the ``event'' they are monitoring. Add the ability to monitor parallel applications, set alarms and a Java front-end that can run anywhere, and this gives the user a powerful tool for quickly discovering where and why a bottleneck exists. A number of third-party tools for analyzing performance of message-passing and/or threaded programs have also incorporated support for PAPI so as to be able to display and analyze hardware counter data from their interfaces.},
}

@inproceedings{icl:19,
  author = {Shirley Moore and Dorian Arnold and David Cronk},
  title = {Metacomputing Support for the {SARA3D} Structural Acoustics Application},
  booktitle = {Department of Defense Users{\textquoteright} Group Conference},
  year = {2001},
  month = jun,
  address = {Biloxi, Mississippi},
  note = {to appear},
  keywords = {netsolve},
}

@article{icl:88,
  author = {Jack Dongarra and Shirley Moore and Anne Trefethen},
  title = {Numerical Libraries and Tools for Scalable Parallel Cluster Computing},
  journal = {International Journal of High Performance Applications and Supercomputing},
  volume = {15},
  number = {2},
  pages = {175--180},
  year = {2001},
  month = jan,
}

@inproceedings{icl:16,
  author = {Kevin London and Shirley Moore and Phil Mucci and Keith Seymour and Richard Luczak},
  title = {The {PAPI} Cross-Platform Interface to Hardware Performance Counters},
  booktitle = {Department of Defense Users{\textquoteright} Group Conference Proceedings},
  year = {2001},
  month = jun,
  address = {Biloxi, Mississippi},
  keywords = {papi},
  abstract = {The purpose of the PAPI project is to specify a standard API for accessing hardware performance counters available on most modern microprocessors. These counters exist as a small set of registers that count ``events,'' which are occurrences of specific signals and states related to the processor{\textquoteright}s function. Monitoring these events facilitates correlation between the structure of source/object code and the efficiency of the mapping of that code to the underlying architecture. This correlation has a variety of uses in performance analysis and tuning. The PAPI project has developed a standard set of hardware events and a standard cross-platform library interface to the underlying counter hardware. The PAPI library has been implemented for a number of Shared Resource Center platforms. The PAPI project is developing end-user tools for dynamically selecting and displaying hardware counter performance data. PAPI support is also being incorporated into a number of third-party tools.},
}

@inproceedings{icl:8,
  author = {David Cronk and Graham Fagg and Shirley Moore},
  title = {Parallel {I/O} for {EQM} Applications},
  booktitle = {Department of Defense Users{\textquoteright} Group Conference Proceedings},
  year = {2001},
  month = jun,
  address = {Biloxi, Mississippi},
  note = {to appear},
  keywords = {ftmpi},
}

@techreport{icl:61,
  author = {Shirley Browne and Paul McMahan and Scott Wells},
  title = {Repository in a Box Toolkit for Software and Resource Sharing},
  institution = {University of Tennessee},
  type = {Computer Science Department Technical Report},
  number = {ICL-UT-05-05},
  year = {2001},
  keywords = {rib},
}

@article{icl:20,
  author = {Shirley Moore and David Cronk and Kevin London and Jack Dongarra},
  title = {Review of Performance Analysis Tools for {MPI} Parallel Programs},
  journal = {European Parallel Virtual Machine / Message Passing Interface Users{\textquoteright} Group Meeting, Lecture Notes in Computer Science 2131},
  pages = {241--248},
  year = {2001},
  month = sep,
  publisher = {Springer Verlag, Berlin},
  address = {Greece},
  keywords = {papi},
  doi = {10.1007/3-540-45417-9_34},
  abstract = {In order to produce MPI applications that perform well on today{\textquoteright}s parallel architectures, programmers need effective tools for collecting and analyzing performance data. A variety of such tools, both commercial and research, are becoming available. This paper reviews and evaluates the available cross-platform MPI performance analysis tools.},
}

@inproceedings{icl:11,
  author = {Jack Dongarra and Kevin London and Shirley Moore and Phil Mucci and Dan Terpstra},
  title = {Using {PAPI} for Hardware Performance Monitoring on {Linux} Systems},
  booktitle = {Conference on Linux Clusters: The HPC Revolution},
  year = {2001},
  month = jun,
  publisher = {Linux Clusters Institute},
  address = {Urbana, Illinois},
  keywords = {papi},
  abstract = {PAPI is a specification of a cross-platform interface to hardware performance counters on modern microprocessors. These counters exist as a small set of registers that count events, which are occurrences of specific signals related to a processor{\textquoteright}s function. Monitoring these events has a variety of uses in application performance analysis and tuning. The PAPI specification consists of both a standard set of events deemed most relevant for application performance tuning, as well as both high-level and low-level sets of routines for accessing the counters. The high level interface simply provides the ability to start, stop, and read sets of events, and is intended for the acquisition of simple but accurate measurement by application engineers. The fully programmable low-level interface provides sophisticated options for controlling the counters, such as setting thresholds for interrupt on overflow, as well as access to all native counting modes and events, and is intended for third-party tool writers or users with more sophisticated needs. PAPI has been implemented on a number of platforms, including Linux/x86 and Linux/IA-64. The Linux/x86 implementation requires a kernel patch that provides a driver for the hardware counters. The driver memory maps the counter registers into user space and allows virtualizing the counters on a per-process or per-thread basis. The kernel patch is being proposed for inclusion in the main Linux tree. The PAPI library provides access on Linux platforms not only to the standard set of events mentioned above but also to all the Linux/x86 and Linux/IA-64 native events. PAPI has been installed and is in use, either directly or through incorporation into third-party end-user performance analysis tools, on a number of Linux clusters, including the New Mexico LosLobos cluster and Linux clusters at NCSA and the University of Tennessee being used for the GrADS (Grid Application Development Software) project.},
}

@article{icl:31,
  author = {Shirley Browne and Jack Dongarra and Nathan Garner and George Ho and Phil Mucci},
  title = {A Portable Programming Interface for Performance Evaluation on Modern Processors},
  journal = {The International Journal of High Performance Computing Applications},
  volume = {14},
  number = {3},
  pages = {189--204},
  year = {2000},
  month = sep,
  keywords = {papi},
  doi = {10.1177/109434200001400303},
}

@techreport{icl:226,
  author = {Shirley Browne and Jack Dongarra and Nathan Garner and Kevin London and Phil Mucci},
  title = {A Portable Programming Interface for Performance Evaluation on Modern Processors},
  institution = {University of Tennessee},
  type = {Computer Science Technical Report},
  number = {UT-CS-00-444},
  year = {2000},
  month = jul,
}

@inproceedings{icl:32,
  author = {Shirley Browne and Jack Dongarra and Nathan Garner and Kevin London and Phil Mucci},
  title = {A Scalable Cross-Platform Infrastructure for Application Performance Tuning Using Hardware Counters},
  booktitle = {Proceedings of SuperComputing 2000 (SC{\textquoteright}00)},
  year = {2000},
  month = nov,
  address = {Dallas, TX},
  keywords = {papi},
}

@techreport {icl:228, title = {Secure Remote Access to Numerical Software and Computation Hardware}, journal = {University of Tennessee Computer Science
Technical Report, UT-CS-00-446}, year = {2000}, month = {2000-07}, author = {Dorian Arnold and Shirley Browne and Jack Dongarra and Graham Fagg and Keith Moore} } @inproceedings {icl:26, title = {Secure Remote Access to Numerical Software and Computational Hardware}, journal = {Proceedings of the DoD HPC Users Group Conference (HPCUG) 2000}, year = {2000}, month = {2000-06}, address = {Albuquerque, NM}, keywords = {netsolve}, author = {Dorian Arnold and Shirley Browne and Jack Dongarra and Graham Fagg and Keith Moore} } @article {icl:60, title = {Numerical Libraries and Tools for Scalable Parallel Cluster Computing}, journal = {IEEE Cluster Computing BOF at SC99}, year = {1999}, month = {1999-01}, address = {Portland, Oregon}, author = {Shirley Browne and Jack Dongarra and Anne Trefethen} } @inproceedings {icl:59, title = {PAPI: A Portable Interface to Hardware Performance Counters}, journal = {Proceedings of Department of Defense HPCMP Users Group Conference}, year = {1999}, month = {1999-06}, keywords = {papi}, author = {Shirley Browne and Christine Deane and George Ho and Phil Mucci} } @article {icl:256, title = {National HPCC Software Exchange (NHSE): Uniting the High Performance Computing and Communications Community}, journal = {D-Lib Magazine}, year = {1998}, month = {1998-01}, keywords = {rib}, author = {Shirley Browne and Jack Dongarra and Jeff Horner and Paul McMahan and Scott Wells} }