% Conference paper: "Using Advanced Vector Extensions AVX-512 for MPI Reduction",
% presented at EuroMPI/USA '20 (27th European MPI Users' Group Meeting).
% Notes for maintainers:
%   - The exported record had no citation key; "zhong2020avx512" follows the
%     lastnameYYYYkeyword scheme. Rename if the file uses a different convention.
%   - doi holds the bare identifier (no resolver prefix); styles add the URL form.
%   - address is kept as exported; presumably it is the meeting location rather
%     than the publisher's city -- confirm before relying on it.
%   - The abstract is reproduced verbatim apart from repairing the broken
%     hyphenation of "time-to-solution".
@inproceedings{zhong2020avx512,
  author    = {Zhong, Dong and Cao, Qinglei and Bosilca, George and Dongarra, Jack},
  title     = {Using Advanced Vector Extensions {AVX-512} for {MPI} Reduction},
  booktitle = {EuroMPI/USA '20: 27th European MPI Users' Group Meeting},
  year      = {2020},
  month     = sep,
  address   = {Austin, TX},
  doi       = {10.1145/3416315.3416316},
  keywords  = {Instruction level parallelism, Intel AVX2/AVX-512, Long vector extension, MPI reduction operation, Single instruction multiple data, Vector operation},
  abstract  = {As the scale of high-performance computing (HPC) systems continues to grow, researchers are devoted themselves to explore increasing levels of parallelism to achieve optimal performance. The modern CPU's design, including its features of hierarchical memory and SIMD/vectorization capability, governs algorithms' efficiency. The recent introduction of wide vector instruction set extensions (AVX and SVE) motivated vectorization to become of critical importance to increase efficiency and close the gap to peak performance. In this paper, we propose an implementation of predefined MPI reduction operations utilizing AVX, AVX2 and AVX-512 intrinsics to provide vector-based reduction operation and to improve the time-to-solution of these predefined MPI reduction operations. With these optimizations, we achieve higher efficiency for local computations, which directly benefit the overall cost of collective reductions. The evaluation of the resulting software stack under different scenarios demonstrates that the solution is at the same time generic and efficient. Experiments are conducted on an Intel Xeon Gold cluster, which shows our AVX-512 optimized reduction operations achieve 10X performance benefits than Open MPI default for MPI local reduction.},
}