@article {1269, title = {Evaluation of Directive-Based Performance Portable Programming Models}, journal = {International Journal of High Performance Computing and Networking}, volume = {14}, year = {2019}, month = {2019{\textendash}07}, pages = {165-182}, abstract = {We present an extended exploration of the performance portability of directives provided by OpenMP 4 and OpenACC to program various types of node architecture with attached accelerators, both self-hosted multicore and offload multicore/GPU. Our goal is to examine how successful OpenACC and the newer offload features of OpenMP 4.5 are for moving codes between architectures, and we document how much tuning might be required and what lessons we can learn from these experiences. To do this, we use examples of algorithms with varying computational intensities for our evaluation, as both compute and data access efficiency are important considerations for overall application performance. To better understand fundamental compute vs. bandwidth bound characteristics, we add the compute-bound Level 3 BLAS GEMM kernel to our linear algebra evaluation. We implement the kernels of interest using various methods provided by newer OpenACC and OpenMP implementations, and we evaluate their performance on various platforms including both x86_64 and Power8 with attached NVIDIA GPUs, x86_64 multicores, self-hosted Intel Xeon Phi KNL, as well as an x86_64 host system with Intel Xeon Phi coprocessors. We update these evaluations with the newest version of the NVIDIA Pascal architecture (P100), Intel KNL 7230, Power8+, and the newest supporting compiler implementations. Furthermore, we present in detail what factors affected the performance portability, including how to pick the right programming model, its programming style, its availability on different platforms, and how well compilers can optimise and target multiple platforms.}, keywords = {OpenACC, OpenMP 4, performance portability, Programming models}, doi = {10.1504/IJHPCN.2017.10009064}, author = {M. Graham Lopez and Wayne Joubert and Ver{\'o}nica Larrea and Oscar Hernandez and Azzam Haidar and Stanimire Tomov and Jack Dongarra} }

@conference {977, title = {Towards Achieving Performance Portability Using Directives for Accelerators}, booktitle = {The International Conference for High Performance Computing, Networking, Storage and Analysis (SC{\textquoteright}16), Third Workshop on Accelerator Programming Using Directives (WACCPD)}, year = {2016}, month = {2016-11}, publisher = {Innovative Computing Laboratory, University of Tennessee}, organization = {Innovative Computing Laboratory, University of Tennessee}, address = {Salt Lake City, Utah}, abstract = {In this paper we explore the performance portability of directives provided by OpenMP 4 and OpenACC to program various types of node architectures with attached accelerators, both self-hosted multicore and offload multicore/GPU. Our goal is to examine how successful OpenACC and the newer offload features of OpenMP 4.5 are for moving codes between architectures, how much tuning might be required and what lessons we can learn from this experience. To do this, we use examples of algorithms with varying computational intensities for our evaluation, as both compute and data access efficiency are important considerations for overall application performance.
We implement these kernels using various methods provided by newer OpenACC and OpenMP implementations, and we evaluate their performance on various platforms including both x86_64 with attached NVIDIA GPUs, self-hosted Intel Xeon Phi KNL, as well as an x86_64 host system with Intel Xeon Phi coprocessors. In this paper, we explain what factors affected the performance portability, such as how to pick the right programming model, its programming style, its availability on different platforms, and how well compilers can optimize and target multiple platforms.}, author = {M. Graham Lopez and Ver{\'o}nica Larrea and Wayne Joubert and Oscar Hernandez and Azzam Haidar and Stanimire Tomov and Jack Dongarra} }

@inproceedings {1310, title = {From MPI to OpenSHMEM: Porting LAMMPS}, booktitle = {OpenSHMEM and Related Technologies. Experiences, Implementations, and Technologies}, year = {2015}, pages = {121{\textendash}137}, publisher = {Springer International Publishing}, address = {Annapolis, MD, USA}, abstract = {This work details the opportunities and challenges of porting a petascale, MPI-based application, LAMMPS, to OpenSHMEM. We investigate the major programming challenges stemming from the differences in communication semantics, address space organization, and synchronization operations between the two programming models. This work provides several approaches to solve those challenges for representative communication patterns in LAMMPS, e.g., by considering group synchronization, peer{\textquoteright}s buffer status tracking, and unpacked direct transfer of scattered data. The performance of LAMMPS is evaluated on the Titan HPC system at ORNL. The OpenSHMEM implementations are compared with MPI versions in terms of both strong and weak scaling. The results show that OpenSHMEM provides sufficiently rich semantics to implement scalable scientific applications. In addition, the experiments demonstrate that OpenSHMEM can compete with, and often improve on, the optimized MPI implementation.}, isbn = {978-3-319-26428-8}, doi = {10.1007/978-3-319-26428-8_8}, author = {Tang, Chunyan and Aurelien Bouteiller and Thomas Herault and Manjunath Gorentla Venkata and George Bosilca}, editor = {Manjunath Gorentla Venkata and Shamis, Pavel and Imam, Neena and M. Graham Lopez} }

@inproceedings {1309, title = {UCX: An Open Source Framework for HPC Network APIs and Beyond}, booktitle = {2015 IEEE 23rd Annual Symposium on High-Performance Interconnects}, year = {2015}, month = {Aug}, pages = {40-43}, publisher = {IEEE}, address = {Santa Clara, CA, USA}, abstract = {This paper presents Unified Communication X (UCX), a set of network APIs and their implementations for high throughput computing. UCX comes from the combined effort of national laboratories, industry, and academia to design and implement a high-performing and highly scalable network stack for next-generation applications and systems. The UCX design provides the ability to tailor its APIs and network functionality to suit a wide variety of application domains and hardware. We envision that these APIs will satisfy the networking needs of many programming models such as Message Passing Interface (MPI), OpenSHMEM, Partitioned Global Address Space (PGAS) languages, task-based paradigms, and I/O-bound applications. To evaluate the design, we implement the APIs and protocols, and measure the performance of overhead-critical network primitives fundamental for implementing many parallel programming models and system libraries.
Our results show that the latency, bandwidth, and message rate achieved by the portable UCX prototype are very close to those of the underlying driver. With UCX, we achieved a message exchange latency of 0.89 us, a bandwidth of 6138.5 MB/s, and a message rate of 14 million messages per second. To the best of our knowledge, this is the highest publicly reported bandwidth and message rate achieved by any network stack on this hardware.}, keywords = {application program interfaces, Bandwidth, Electronics packaging, Hardware, high throughput computing, highly-scalable network stack, HPC, HPC network APIs, I/O bound applications, Infiniband, input-output programs, Libraries, Memory management, message passing, message passing interface, Middleware, MPI, open source framework, OpenSHMEM, parallel programming, parallel programming models, partitioned global address space languages, PGAS, PGAS languages, Programming, protocols, public domain software, RDMA, system libraries, task-based paradigms, UCX, Unified Communication X}, isbn = {978-1-4673-9160-3}, doi = {10.1109/HOTI.2015.13}, author = {P. Shamis and Manjunath Gorentla Venkata and M. Graham Lopez and M. B. Baker and O. Hernandez and Y. Itigin and M. Dubman and G. Shainer and R. L. Graham and L. Liss and Y. Shahar and S. Potluri and D. Rossetti and D. Becker and D. Poole and C. Lamb and S. Kumar and C. Stunkel and George Bosilca and Aurelien Bouteiller} }