% Entry icl:192 -- Wolf and Mohr, hardware-counter based performance analysis with KOJAK.
% NOTE(review): the export carried year 2003 together with month "2004-01"; the entry now uses
% the 2004-01 date, which agrees with the "(04)" in the DOI -- confirm against the publisher record.
% NOTE(review): address looks like a conference venue rather than the publisher's city -- confirm.
@article{icl:192,
  author    = {Wolf, Felix and Mohr, Bernd},
  title     = {Hardware-Counter Based Automatic Performance Analysis of Parallel Programs},
  journal   = {Advances in Parallel Computing},
  volume    = {13},
  pages     = {753--760},
  year      = {2004},
  month     = jan,
  publisher = {Elsevier},
  address   = {Dresden, Germany},
  doi       = {10.1016/S0927-5452(04)80092-3},
  keywords  = {kojak, papi},
  abstract  = {The KOJAK performance-analysis environment identifies a large number of performance problems on parallel computers with SMP nodes. The current version concentrates on parallelism-related performance problems that arise from an inefficient usage of the parallel programming interfaces MPI and OpenMP, while ignoring individual CPU performance. This chapter describes an extended design of KOJAK capable of diagnosing low individual-CPU performance based on hardware-counter information and of integrating the results with those of the parallelism-centered analysis. The performance of parallel applications is determined by a variety of different factors. Performance of single components frequently influences the overall behavior in unexpected ways. Application programmers on current parallel machines have to deal with numerous performance-critical aspects: different modes of parallel execution, such as message passing, multi-threading or even a combination of the two, and performance on individual CPU that is determined by the interaction of different functional units. The KOJAK analysis process is composed of two parts: a semi-automatic instrumentation of the user application followed by an automatic analysis of the generated performance data. KOJAK{\textquoteright}s instrumentation software runs on most major UNIX platforms and works on multiple levels, including source-code, compiler, and linker.},
}