@inbook {1384, title = {Performance Analysis and Debugging Tools at Scale}, booktitle = {Exascale Scientific Applications: Scalability and Performance Portability}, year = {2017}, month = {2017-11}, pages = {17-50}, publisher = {Chapman \& Hall / CRC Press}, organization = {Chapman \& Hall / CRC Press}, chapter = {2}, abstract = {This chapter explores present-day challenges and those likely to arise as new hardware and software technologies are introduced on the path to exascale. It covers some of the underlying hardware, software, and techniques that enable tools and debuggers. Performance tools and debuggers are critical components that enable computational scientists to fully exploit the computing power of high-performance computing systems. Instrumentation is the insertion of code to perform measurement in a program. It is a vital step in performance analysis, especially for parallel programs. The essence of a debugging tool is enabling observation, exploration, and control of program state, such that a developer can, for example, verify that what is currently occurring correlates to what is intended. The increased complexity and volume of performance and debugging data likely to be seen on exascale systems risk overwhelming tool users. Tools and debuggers may need to employ advanced techniques such as automated filtering and analysis to reduce the complexity seen by the user.}, isbn = {9781315277400}, doi = {10.1201/b21930}, author = {Scott Parker and John Mellor-Crummey and Dong H. Ahn and Heike Jagode and Holger Brunst and Sameer Shende and Allen D. Malony and David DelSignore and Ronny Tschüter and Ralph Castain and Kevin Harms and Philip Carns and Ray Loy and Kalyan Kumaran} } @conference {687, title = {Diagnosis and Optimization of Application Prefetching Performance}, booktitle = {Proceedings of the 27th ACM International Conference on Supercomputing (ICS {\textquoteright}13)}, year = {2013}, month = {2013-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Eugene, Oregon, USA}, abstract = {Hardware prefetchers are effective at recognizing streaming memory access patterns and at moving data closer to the processing units to hide memory latency. However, hardware prefetchers can track only a limited number of data streams due to finite hardware resources. In this paper, we introduce the term streaming concurrency to characterize the number of parallel, logical data streams in an application. We present a simulation algorithm for understanding the streaming concurrency at any point in an application, and we show that this metric is a good predictor of the number of memory requests initiated by streaming prefetchers. Next, we try to understand the causes behind poor prefetching performance. We identify four prefetch-unfriendly conditions and show how to classify an application{\textquoteright}s memory references based on these conditions. We evaluated our analysis using the SPEC CPU2006 benchmark suite. We selected two benchmarks with unfavorable access patterns and transformed them to improve their prefetching effectiveness. Results show that making applications more prefetcher-friendly can yield meaningful performance gains.}, isbn = {9781450321303}, doi = {10.1145/2464996.2465014}, url = {http://dl.acm.org/citation.cfm?doid=2464996.2465014}, author = {Gabriel Marin and Colin McCurdy and Jeffrey Vetter}, editor = {Allen D.
Malony and Mario Nemirovsky and Sam Midkiff} } @conference {686, title = {Toward a scalable multi-GPU eigensolver via compute-intensive kernels and efficient communication}, booktitle = {Proceedings of the 27th ACM International Conference on Supercomputing (ICS {\textquoteright}13)}, year = {2013}, month = {2013-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Eugene, Oregon, USA}, abstract = {The enormous gap between the high-performance capabilities of GPUs and the slow interconnect between them has made the development of numerical software that is scalable across multiple GPUs extremely challenging. We describe a successful methodology for addressing these challenges---spanning algorithm design, kernel optimization and tuning, and the programming model---in the development of a scalable high-performance tridiagonal reduction algorithm for the symmetric eigenvalue problem. This is a fundamental linear algebra problem with many engineering and physics applications. We use a combination of a task-based approach to parallelism and a new algorithmic design to achieve high performance. The goal of the new design is to increase the computational intensity of the major compute kernels and to reduce synchronization and data transfers between GPUs. This may increase the number of flops, but the increase is offset by the more efficient execution and reduced data transfers. Our performance results are the best available, providing an enormous performance boost compared to current state-of-the-art solutions. In particular, our software scales up to 1070 Gflop/s using 16 Intel E5-2670 cores and eight M2090 GPUs, compared to 45 Gflop/s achieved by the optimized Intel Math Kernel Library (MKL) using only the 16 CPU cores.}, keywords = {eigenvalue, gpu communication, gpu computation, heterogeneous programming model, performance, reduction to tridiagonal, singular value decomposition, task parallelism}, isbn = {9781450321303}, doi = {10.1145/2464996.2465438}, url = {http://dl.acm.org/citation.cfm?doid=2464996.2465438}, author = {Azzam Haidar and Mark Gates and Stanimire Tomov and Jack Dongarra}, editor = {Allen D. Malony and Mario Nemirovsky and Sam Midkiff} } @conference {icl:633, title = {Parallel Performance Measurement of Heterogeneous Parallel Systems with GPUs}, booktitle = {International Conference on Parallel Processing (ICPP{\textquoteright}11)}, year = {2011}, month = {2011-09}, publisher = {ACM}, organization = {ACM}, address = {Taipei, Taiwan}, abstract = {The power of GPUs is giving rise to heterogeneous parallel computing, with new demands on programming environments, runtime systems, and tools to deliver high-performing applications. This paper studies the problems associated with performance measurement of heterogeneous machines with GPUs. A heterogeneous computation model and alternative host-GPU measurement approaches are discussed to set the stage for reporting new capabilities for heterogeneous parallel performance measurement in three leading HPC tools: PAPI, Vampir, and the TAU Performance System. Our work leverages the new CUPTI tool support in NVIDIA{\textquoteright}s CUDA device library. Heterogeneous benchmarks from the SHOC suite are used to demonstrate the measurement methods and tool support.}, keywords = {magma, mumi, papi}, isbn = {978-0-7695-4510-3}, doi = {10.1109/ICPP.2011.71}, author = {Allen D.
Malony and Scott Biersdorff and Sameer Shende and Heike Jagode and Stanimire Tomov and Guido Juckeland and Robert Dietrich and Duncan Poole and Christopher Lamb} } @inproceedings {icl:474, title = {A Holistic Approach for Performance Measurement and Analysis for Petascale Applications}, journal = {ICCS 2009 Joint Workshop: Tools for Program Development and Analysis in Computational Science and Software Engineering for Large-Scale Computing}, volume = {2009}, year = {2009}, month = {2009-05}, pages = {686-695}, publisher = {Springer-Verlag Berlin Heidelberg}, address = {Baton Rouge, Louisiana}, author = {Heike Jagode and Jack Dongarra and Sadaf Alam and Jeffrey Vetter and W. Spear and Allen D. Malony}, editor = {Gabrielle Allen} } @inproceedings {icl:390, title = {Memory Leak Detection in Fortran Applications using TAU}, journal = {Proc. DoD HPCMP Users Group Conference (HPCMP-UGC{\textquoteright}07)}, year = {2007}, month = {2007-01}, publisher = {IEEE Computer Society}, address = {Pittsburgh, PA}, author = {Sameer Shende and Allen D. Malony and Shirley Moore and David Cronk} } @inproceedings {icl:287, title = {Performance Analysis of GYRO: A Tool Evaluation}, journal = {Proceedings of the 2005 SciDAC Conference}, year = {2005}, month = {2005-06}, address = {San Francisco, CA}, keywords = {kojak}, author = {Patrick H. Worley and Jeff Candy and Laura Carrington and Kevin Huck and Timothy Kaiser and Kumar Mahinthakumar and Allen D. Malony and Shirley Moore and Dan Reed and Philip C. Roth and H. Shan and Sameer Shende and Allan Snavely and S. Sreepathi and Felix Wolf and Y. Zhang} } @inproceedings {icl:269, title = {Performance Profiling Overhead Compensation for MPI Programs}, journal = {Proc. of the 12th European Parallel Virtual Machine and Message Passing Interface Conference}, year = {2005}, month = {2005-09}, publisher = {Springer LNCS}, keywords = {kojak}, author = {Sameer Shende and Allen D. Malony and Alan Morris and Felix Wolf} } @inproceedings {icl:270, title = {A Scalable Approach to MPI Application Performance Analysis}, journal = {Proc. of the 12th European Parallel Virtual Machine and Message Passing Interface Conference}, year = {2005}, month = {2005-09}, publisher = {Springer LNCS}, keywords = {kojak}, author = {Shirley Moore and Felix Wolf and Jack Dongarra and Sameer Shende and Allen D. Malony and Bernd Mohr} } @inproceedings {icl:268, title = {Trace-Based Parallel Performance Overhead Compensation}, journal = {Proc. of the International Conference on High Performance Computing and Communications (HPCC)}, year = {2005}, month = {2005-09}, address = {Sorrento (Naples), Italy}, keywords = {kojak}, author = {Felix Wolf and Allen D. Malony and Sameer Shende and Alan Morris} } @conference {icl:159, title = {Performance Instrumentation and Measurement for Terascale Systems}, booktitle = {ICCS 2003 Terascale Workshop}, year = {2003}, month = {2003-06}, publisher = {Springer, Berlin, Heidelberg}, organization = {Springer, Berlin, Heidelberg}, address = {Melbourne, Australia}, abstract = {As computer systems grow in size and complexity, tool support is needed to facilitate the efficient mapping of large-scale applications onto these systems. To help achieve this mapping, performance analysis tools must provide robust performance observation capabilities at all levels of the system, as well as map low-level behavior to high-level program constructs.
Instrumentation and measurement strategies, developed over the last several years, must evolve together with performance analysis infrastructure to address the challenges of new scalable parallel systems.}, keywords = {papi}, doi = {10.1007/3-540-44864-0_6}, author = {Jack Dongarra and Allen D. Malony and Shirley Moore and Phil Mucci and Sameer Shende} }