@conference {, title = {FFT-Based Gradient Sparsification for the Distributed Training of Deep Neural Networks}, booktitle = {29th International Symposium on High-Performance Parallel and Distributed Computing (HPDC 20)}, year = {2020}, month = {2020-06}, publisher = {ACM}, organization = {ACM}, address = {Stockholm, Sweden}, abstract = {The performance and efficiency of distributed training of Deep Neural Networks (DNN) highly depend on the performance of gradient averaging among participating processes, a step bound by communication costs. There are two major approaches to reduce communication overhead: overlap communications with computations (lossless), or reduce communications (lossy). The lossless solution works well for linear neural architectures, e.g., VGG and AlexNet, but more recent networks such as ResNet and Inception limit the opportunity for such overlapping. Therefore, approaches that reduce the amount of data (lossy) become more suitable. In this paper, we present a novel, explainable lossy method that sparsifies gradients in the frequency domain, in addition to a new range-based floating-point representation to quantize and further compress gradients. These dynamic techniques strike a balance between compression ratio, accuracy, and computational overhead, and are optimized to maximize performance in heterogeneous environments. Unlike existing works that strive for a higher compression ratio, we stress the robustness of our methods and provide guidance to recover accuracy from failures. To achieve this, we prove how the FFT sparsification affects convergence and accuracy, and show that our method is guaranteed to converge using a diminishing θ in training. Reducing θ can also be used to recover accuracy from failures. Compared to state-of-the-art lossy methods, e.g., QSGD, TernGrad, and Top-k sparsification, our approach incurs less approximation error and therefore performs better in both wall-clock time and accuracy. On an 8-GPU, InfiniBand-interconnected cluster, our techniques effectively accelerate AlexNet training by up to 2.26x over the no-compression baseline, 1.31x over QSGD, 1.25x over TernGrad, and 1.47x over Top-k sparsification.}, keywords = {FFT, Gradient Compression, Lossy Gradients, Machine Learning, Neural Networks}, doi = {10.1145/3369583.3392681}, author = {Linnan Wang and Wei Wu and Junyu Zhang and Hang Liu and George Bosilca and Maurice Herlihy and Rodrigo Fonseca} }
@conference {1205, title = {ADAPT: An Event-Based Adaptive Collective Communication Framework}, booktitle = {The 27th International Symposium on High-Performance Parallel and Distributed Computing (HPDC {\textquoteright}18)}, year = {2018}, month = {2018-06}, publisher = {ACM Press}, organization = {ACM Press}, address = {Tempe, Arizona}, abstract = {The increase in scale and heterogeneity of high-performance computing (HPC) systems makes the performance of Message Passing Interface (MPI) collective communications susceptible to noise and requires them to adapt to a complex mix of hardware capabilities. The designs of state-of-the-art MPI collectives rely heavily on synchronizations; these designs magnify noise across the participating processes, resulting in significant performance slowdowns. Therefore, such a design philosophy must be reconsidered to run efficiently and robustly on large-scale heterogeneous platforms. In this paper, we present ADAPT, a new collective communication framework in Open MPI, using event-driven techniques to morph collective algorithms to heterogeneous environments. The core concept of ADAPT is to relax synchronizations while maintaining the minimal data dependencies of MPI collectives. To fully exploit the different bandwidths of data movement lanes in heterogeneous systems, we extend the ADAPT collective framework with a topology-aware communication tree. This removes the boundaries between different hardware topologies while maximizing the speed of data movements. We evaluate our framework with two popular collective operations, broadcast and reduce, on both CPU and GPU clusters. Our results demonstrate drastic performance improvements and strong resistance against noise compared to other state-of-the-art MPI libraries. In particular, we demonstrate at least 1.3X and 1.5X speedups for CPU data and 2X and 10X speedups for GPU data using ADAPT event-based broadcast and reduce operations.}, isbn = {9781450357852}, doi = {10.1145/3208040.3208054}, author = {Xi Luo and Wei Wu and George Bosilca and Thananon Patinyasakdikul and Linnan Wang and Jack Dongarra} }