% Entry 1385: extended abstract listed under the EuroMPI'19 poster track,
% carrying the ICL report number icl-ut-19-06.
% NOTE(review): the venue is stored in the journal field and the report id in
% number, as exported; confirm whether inproceedings/techreport would suit
% the target bibliography style better before retyping.
@article{1385,
  author    = {Shaiek, Hejer and Tomov, Stanimire and Ayala, Alan and Haidar, Azzam and Dongarra, Jack},
  title     = {{GPUDirect} {MPI} Communications and Optimizations to Accelerate {FFTs} on Exascale Systems},
  journal   = {EuroMPI{\textquoteright}19 Posters, Zurich, Switzerland},
  number    = {icl-ut-19-06},
  year      = {2019},
  month     = sep,
  publisher = {ICL},
  type      = {Extended Abstract},
  abstract  = {Fast Fourier transforms (FFTs) are used in applications ranging from molecular dynamics and spectrum estimation to machine learning, fast convolution and correlation, signal modulation, wireless multimedia applications, and others. However, FFTs are memory bound, and therefore, to accelerate them, it is crucial to avoid and optimize the FFTs{\textquoteright} communications. To this end, we present a 3-D FFT design for distributed graphics processing unit (GPU) systems that: (1) efficiently uses GPUs{\textquoteright} high bandwidth, (2) reduces global communications algorithmically, when possible, and (3) employs GPUDirect technologies as well as MPI optimizations in the development of high-performance FFTs for large-scale GPU-accelerated systems. We show that these developments and optimizations lead to very good strong scalability and a performance that is close to 90\% of the theoretical peak.},
  keywords  = {CUDA-Aware MPI, ECP, FFT, FFT-ECP, gpu, GPUDirect},
}