@article {1377, title = {PAPI Software-Defined Events for in-Depth Performance Analysis}, journal = {The International Journal of High Performance Computing Applications}, volume = {33}, year = {2019}, month = {2019-11}, pages = {1113-1127}, abstract = {The methodology and standardization layer provided by the Performance Application Programming Interface (PAPI) has played a vital role in application profiling for almost two decades. It has enabled sophisticated performance analysis tool designers and performance-conscious scientists to gain insights into their applications by simply instrumenting their code using a handful of PAPI functions that {\textquotedblleft}just work{\textquotedblright} across different hardware components. In the past, PAPI development had focused primarily on hardware-specific performance metrics. However, the rapidly increasing complexity of software infrastructure poses new measurement and analysis challenges for the developers of large-scale applications. In particular, acquiring information regarding the behavior of libraries and runtimes{\textemdash}used by scientific applications{\textemdash}requires low-level binary instrumentation, or APIs specific to each library and runtime. No uniform API for monitoring events that originate from inside the software stack has emerged. In this article, we present our efforts to extend PAPI{\textquoteright}s role so that it becomes the de facto standard for exposing performance-critical events, which we refer to as software-defined events (SDEs), from different software layers. Upgrading PAPI with SDEs enables monitoring of both types of performance events{\textemdash}hardware- and software-related events{\textemdash}in a uniform way, through the same consistent PAPI. The goal of this article is threefold. First, we motivate the need for SDEs and describe our design decisions regarding the functionality we offer through PAPI{\textquoteright}s new SDE interface. Second, we illustrate how SDEs can be utilized by different software packages, specifically, by showcasing their use in the numerical linear algebra library MAGMA-Sparse, the tensor algebra library TAMM that is part of the NWChem suite, and the compiler-based performance analysis tool Byfl. Third, we provide a performance analysis of the overhead that results from monitoring SDEs and discuss the trade-offs between overhead and functionality.}, url = {https://doi.org/10.1177/1094342019846287}, author = {Heike Jagode and Anthony Danalis and Hartwig Anzt and Jack Dongarra} }