@conference {764, title = {Power Monitoring with PAPI for Extreme Scale Architectures and Dataflow-based Programming Models}, booktitle = {2014 IEEE International Conference on Cluster Computing}, number = {ICL-UT-14-04}, year = {2014}, month = {2014-09}, publisher = {IEEE}, organization = {IEEE}, address = {Madrid, Spain}, abstract = {For more than a decade, the PAPI performance-monitoring library has provided a clear, portable interface to the hardware performance counters available on all modern CPUs and other components of interest (e.g., GPUs, network, and I/O systems). Most major end-user tools that application developers use to analyze the performance of their applications rely on PAPI to gain access to these performance counters. Energy efficiency constraints have been widely identified as one of the critical roadblocks on the way to larger, more complex high-performance systems. With modern extreme-scale machines having hundreds of thousands of cores, the ability to reduce power consumption for each CPU at the software level becomes critically important, both for economic and environmental reasons. For PAPI to continue playing its well-established role in HPC, it is pressing that it provide valuable performance data that not only originates from within the processing cores but also delivers insight into the power consumption of the system as a whole. An extensive effort has been made to extend the Performance API to support power monitoring capabilities for various platforms. This paper provides detailed information about three components that allow power monitoring on the Intel Xeon Phi and Blue Gene/Q. Furthermore, we discuss the integration of PAPI in PARSEC {\textendash} a task-based dataflow-driven execution engine {\textendash} enabling hardware performance counter and power monitoring at true task granularity.}, doi = {10.1109/CLUSTER.2014.6968672}, author = {Heike McCraw and James Ralph and Anthony Danalis and Jack Dongarra} } @conference {768, title = {Utilizing Dataflow-based Execution for Coupled Cluster Methods}, booktitle = {2014 IEEE International Conference on Cluster Computing}, number = {ICL-UT-14-02}, year = {2014}, month = {2014-09}, publisher = {IEEE}, organization = {IEEE}, address = {Madrid, Spain}, abstract = {Computational chemistry is one of the driving forces of High Performance Computing. In particular, many-body methods, such as the Coupled Cluster (CC) methods of the quantum chemistry package NWCHEM, are of special interest to the applied chemistry community. Harnessing large fractions of the processing power of modern large-scale computing platforms has become increasingly difficult. With the increase in scale, complexity, and heterogeneity of modern platforms, traditional programming models fail to deliver the expected performance scalability. On the way to exascale, and with these extremely hybrid platforms, dataflow-based programming models may be the only viable way to achieve and maintain computation at scale. In this paper, we discuss a dataflow-based programming model and its applicability to NWCHEM{\textquoteright}s CC methods. Our dataflow version of the CC kernels breaks down the algorithm into fine-grained tasks with explicitly defined data dependencies. As a result, many of the traditional synchronization points can be eliminated, allowing for a dynamic reshaping of the execution based on the ongoing availability of computational resources.
We build this experiment using PARSEC {\textendash} a task-based dataflow-driven execution engine {\textendash} which enables efficient task scheduling on distributed systems and provides a desirable portability layer for application developers.}, author = {Heike McCraw and Anthony Danalis and George Bosilca and Jack Dongarra and Karol Kowalski and Theresa Windus} } @conference {1383, title = {Beyond the CPU: Hardware Performance Counter Monitoring on Blue Gene/Q}, booktitle = {International Supercomputing Conference 2013 (ISC{\textquoteright}13)}, year = {2013}, month = {2013-06}, publisher = {Springer}, organization = {Springer}, address = {Leipzig, Germany}, author = {Heike McCraw and Dan Terpstra and Jack Dongarra and Kris Davis and Roy Musselman} } @article {1382, title = {PAPI 5: Measuring Power, Energy, and the Cloud}, year = {2013}, month = {2013-04}, publisher = {2013 IEEE International Symposium on Performance Analysis of Systems and Software}, address = {Austin, TX}, author = {Vincent Weaver and Dan Terpstra and Heike McCraw and Matt Johnson and Kiran Kasichayanula and James Ralph and John Nelson and Phil Mucci and Tushar Mohan and Shirley Moore} } @article {icl:688, title = {PAPI-V: Performance Monitoring for Virtual Machines}, journal = {CloudTech-HPC 2012}, year = {2012}, month = {2012-09}, address = {Pittsburgh, PA}, abstract = {This paper describes extensions to the PAPI hardware counter library for virtual environments, called PAPI-V. The extensions support timing routines, I/O measurements, and processor counters. The PAPI-V extensions will allow application and tool developers to use a familiar interface to obtain relevant hardware performance monitoring information in virtual environments.}, keywords = {papi}, doi = {10.1109/ICPPW.2012.29}, author = {Matt Johnson and Heike McCraw and Shirley Moore and Phil Mucci and John Nelson and Dan Terpstra and Vincent M Weaver and Tushar Mohan} } @techreport {icl:682, title = {Performance Counter Monitoring for the Blue Gene/Q Architecture}, journal = {University of Tennessee Computer Science Technical Report}, number = {ICL-UT-12-01}, year = {2012}, month = {2012-00}, keywords = {papi}, author = {Heike McCraw} }