@article {, title = {Accelerating Geostatistical Modeling and Prediction With Mixed-Precision Computations: A High-Productivity Approach With PaRSEC}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {33}, year = {2022}, month = {2022-04}, pages = {964-976}, abstract = {Geostatistical modeling, one of the prime motivating applications for exascale computing, is a technique for predicting desired quantities from geographically distributed data, based on statistical models and optimization of parameters. Spatial data are assumed to possess properties of stationarity or non-stationarity via a kernel fitted to a covariance matrix. A primary workhorse of stationary spatial statistics is Gaussian maximum log-likelihood estimation (MLE), whose central data structure is a dense, symmetric positive definite covariance matrix of the dimension of the number of correlated observations. Two essential operations in MLE are the application of the inverse and evaluation of the determinant of the covariance matrix. These can be rendered through the Cholesky decomposition and triangular solution. In this contribution, we reduce the precision of weakly correlated locations to single- or half-precision based on distance. We thus exploit mathematical structure to migrate MLE to a three-precision approximation that takes advantage of contemporary architectures offering BLAS3-like operations in a single instruction that are extremely fast for reduced precision. We illustrate application-expected accuracy worthy of double-precision from a majority half-precision computation, in a context where uniform single-precision is by itself insufficient. In tackling the complexity and imbalance caused by the mixing of three precisions, we deploy the PaRSEC runtime system. PaRSEC delivers on-demand casting of precisions while orchestrating tasks and data movement in a multi-GPU distributed-memory environment within a tile-based Cholesky factorization. Application-expected accuracy is maintained while achieving up to 1.59X speedup by mixing FP64/FP32 operations on 1536 nodes of HAWK or 4096 nodes of Shaheen II, and up to 2.64X by mixing FP64/FP32/FP16 operations on 128 nodes of Summit, relative to FP64-only operations. This translates into up to 4.5, 4.7, ...}, keywords = {Computational modeling, Covariance matrices, Data models, Maximum likelihood estimation, Predictive models, runtime, Task analysis}, issn = {1045-9219}, doi = {10.1109/TPDS.2021.3084071}, url = {https://ieeexplore.ieee.org/document/9442267/}, author = {Sameh Abdulah and Qinglei Cao and Yu Pei and George Bosilca and Jack Dongarra and Marc G. Genton and David E. Keyes
and Hatem Ltaief and Ying Sun} } @article {, title = {Evaluating Data Redistribution in PaRSEC}, journal = {IEEE Transactions on Parallel and Distributed Systems}, volume = {33}, number = {8}, year = {2022}, month = {2022-08}, pages = {1856-1872}, doi = {10.1109/TPDS.2021.3131657}, author = {Qinglei Cao and George Bosilca and Nuria Losada and Wei Wu and Dong Zhong and Jack Dongarra} } @conference {, title = {A Framework to Exploit Data Sparsity in Tile Low-Rank Cholesky Factorization}, booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2022}, month = {2022-07}, doi = {10.1109/IPDPS53621.2022.00047}, url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=9820680\&isnumber=9820610}, author = {Qinglei Cao and Rabab Alomairy and Yu Pei and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} } @conference {, title = {Leveraging PaRSEC Runtime Support to Tackle Challenging 3D Data-Sparse Matrix Problems}, booktitle = {35th IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2021)}, year = {2021}, month = {2021-05}, publisher = {IEEE}, organization = {IEEE}, address = {Portland, OR}, abstract = {The task-based programming model associated with dynamic runtime systems has gained popularity for problems that are challenging because of workload imbalance, heterogeneous resources, or extreme concurrency. During the last decade, low-rank matrix approximations, where the main idea consists of exploiting data sparsity, typically by compressing off-diagonal tiles up to an application-specific accuracy threshold, have been adopted to address the curse of dimensionality at extreme scale. In this paper, we create a bridge between the runtime and the linear algebra by communicating knowledge of the data sparsity to the runtime. We design and implement this synergistic approach with high user productivity in mind, in the context of the PaRSEC runtime system and the HiCMA numerical library. This requires extending PaRSEC with new features to integrate rank information into the dataflow so that proper decisions can be taken at runtime. We focus on the tile low-rank (TLR) Cholesky factorization for solving 3D data-sparse covariance matrix problems arising in environmental applications. In particular, we employ the 3D exponential model of the Mat{\'e}rn matrix kernel, which exhibits challenging nonuniform high ranks in off-diagonal tiles. We first provide dynamic data structure management driven by a performance model to reduce extra floating-point operations. Next, we optimize the memory footprint of the application by relying on a dynamic memory allocator, supported by a rank-aware data distribution to cope with the workload imbalance. Finally, we expose further parallelism using kernel recursive formulations to shorten the critical path. Our resulting high-performance implementation outperforms existing data-sparse TLR Cholesky factorization by up to 7-fold on a large-scale distributed-memory system, while minimizing the memory footprint by up to a 44-fold factor.
This multidisciplinary work highlights the need to empower runtime systems beyond their original duty of task scheduling for servicing next-generation low-rank matrix algebra libraries.}, keywords = {asynchronous executions and load balancing, dynamic runtime system, environmental applications, High-performance computing, low-rank matrix computations, task-based programming model, user productivity}, author = {Qinglei Cao and Yu Pei and Kadir Akbudak and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} } @conference {1478, title = {Communication Avoiding 2D Stencil Implementations over PaRSEC Task-Based Runtime}, booktitle = {2020 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2020}, month = {2020-05}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, abstract = {Stencil computations and general sparse matrix-vector products (SpMV) are key components of many algorithms, such as geometric multigrid and Krylov solvers, but their low arithmetic intensity means that memory bandwidth and network latency will be the performance-limiting factors. The current architectural trend favors computations over bandwidth, worsening the already unfavorable imbalance. Previous work approached stencil kernel optimization either by improving memory bandwidth usage or by providing a Communication Avoiding (CA) scheme to minimize network latency in repeated sparse matrix-vector multiplication by replicating remote work in order to delay communications on the critical path. Focusing on minimizing the communication bottleneck in distributed stencil computation, in this study we combine a CA scheme with the computation and communication overlapping that is inherent in a dataflow task-based runtime system such as PaRSEC to demonstrate their combined benefits. We implemented the 2D five-point stencil (Jacobi iteration) in PETSc, and over PaRSEC in two flavors, full communication (base-PaRSEC) and CA-PaRSEC, both operating directly on a 2D compute grid. Our results on two clusters, NaCL and Stampede2, indicate that we can achieve a 2{\texttimes} speedup over the standard SpMV solution implemented in PETSc, and in certain cases, when kernel execution does not dominate the execution time, the CA-PaRSEC version achieved up to 57\% and 33\% speedup over the base-PaRSEC implementation on NaCL and Stampede2, respectively.}, doi = {https://doi.org/10.1109/IPDPSW50202.2020.00127}, author = {Yu Pei and Qinglei Cao and George Bosilca and Piotr Luszczek and Victor Eijkhout and Jack Dongarra} } @conference {, title = {Extreme-Scale Task-Based Cholesky Factorization Toward Climate and Weather Prediction Applications}, booktitle = {Platform for Advanced Scientific Computing Conference (PASC20)}, year = {2020}, month = {2020-06}, publisher = {ACM}, organization = {ACM}, address = {Geneva, Switzerland}, abstract = {Climate and weather can be predicted statistically via geospatial Maximum Likelihood Estimates (MLE), as an alternative to running large ensembles of forward models. The MLE-based iterative optimization procedure requires solving large-scale linear systems by performing a Cholesky factorization on a symmetric positive-definite covariance matrix---a demanding dense factorization in terms of memory footprint and computation.
We propose a novel solution to this problem: at the mathematical level, we reduce the computational requirement by exploiting the data sparsity structure of the matrix off-diagonal tiles by means of low-rank approximations; and, at the programming-paradigm level, we integrate PaRSEC, a dynamic task-based runtime, to reach unparalleled levels of efficiency for solving extreme-scale linear algebra matrix operations. The resulting solution leverages fine-grained computations to facilitate asynchronous execution while providing a flexible data distribution to mitigate load imbalance. Performance results are reported using 3D synthetic datasets with up to 42M geospatial locations on 130,000 cores, which represents a cornerstone toward fast and accurate predictions for environmental applications.}, doi = {https://doi.org/10.1145/3394277.3401846}, author = {Qinglei Cao and Yu Pei and Kadir Akbudak and Aleksandr Mikhalev and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} } @conference {, title = {Flexible Data Redistribution in a Task-Based Runtime System}, booktitle = {IEEE International Conference on Cluster Computing (Cluster 2020)}, year = {2020}, month = {2020-09}, publisher = {IEEE}, organization = {IEEE}, address = {Kobe, Japan}, abstract = {Data redistribution aims to reshuffle data to optimize some objective for an algorithm. The objective can be multi-dimensional, such as improving computational load balance or decreasing communication volume or cost, with the ultimate goal of increasing the efficiency and therefore decreasing the time-to-solution of the algorithm. The classical redistribution problem focuses on optimally scheduling communications when reshuffling data between two regular, usually block-cyclic, data distributions. Recently, task-based runtime systems have gained popularity as a potential candidate to address the programming complexity on the way to exascale. In addition to increased portability across complex hardware and software systems, task-based runtime systems have the potential to more easily cope with less-regular data distributions, providing a more balanced computational load during the lifetime of the execution. In this scenario, it becomes paramount to develop a general redistribution algorithm for task-based runtime systems that supports all types of regular and irregular data distributions. In this paper, we detail a flexible redistribution algorithm, capable of dealing with redistribution problems without constraints on data distribution or data size, and implement it in a task-based runtime system, PaRSEC. Performance results show strong capability compared to ScaLAPACK, and applications highlight increased efficiency with little overhead across data distributions and data sizes.}, doi = {https://doi.org/10.1109/CLUSTER49012.2020.00032}, author = {Qinglei Cao and George Bosilca and Wei Wu and Dong Zhong and Aurelien Bouteiller and Jack Dongarra} } @conference {, title = {HAN: A Hierarchical AutotuNed Collective Communication Framework}, booktitle = {IEEE Cluster Conference}, year = {2020}, month = {2020-09}, publisher = {Best Paper Award, IEEE Computer Society Press}, organization = {Best Paper Award, IEEE Computer Society Press}, address = {Kobe, Japan}, abstract = {High-performance computing (HPC) systems keep growing in scale and heterogeneity to satisfy increasing computational needs, and this brings new challenges to the design of MPI libraries, especially with regard to collective operations.
To address these challenges, we present "HAN," a new hierarchical autotuned collective communication framework in Open MPI, which selects suitable homogeneous collective communication modules as submodules for each hardware level, uses collective operations from the submodules as tasks, and organizes these tasks to perform efficient hierarchical collective operations. With a task-based design, HAN can easily swap out submodules, while keeping tasks intact, to adapt to new hardware. This makes HAN suitable for current platforms and provides strong and flexible support for future HPC systems. To provide a fast and accurate autotuning mechanism, we present a novel cost model based on benchmarking the tasks instead of a whole collective operation. This method drastically reduces tuning time, as the cost of tasks can be reused across different message sizes, and is more accurate than existing cost models. Our cost analysis suggests the autotuning component can find the optimal configuration in most cases. The evaluation of the HAN framework suggests our design significantly improves the default Open MPI and achieves decent speedups against state-of-the-art MPI implementations on tested applications.}, author = {Xi Luo and Wei Wu and George Bosilca and Yu Pei and Qinglei Cao and Thananon Patinyasakdikul and Dong Zhong and Jack Dongarra} } @conference {, title = {Task Bench: A Parameterized Benchmark for Evaluating Parallel Runtime Performance}, booktitle = {International Conference for High Performance Computing, Networking, Storage, and Analysis (SC20)}, year = {2020}, month = {2020-11}, publisher = {ACM}, organization = {ACM}, abstract = {We present Task Bench, a parameterized benchmark designed to explore the performance of distributed programming systems under a variety of application scenarios. Task Bench dramatically lowers the barrier to benchmarking and comparing multiple programming systems by making the implementation for a given system orthogonal to the benchmarks themselves: every benchmark constructed with Task Bench runs on every Task Bench implementation. Furthermore, Task Bench{\textquoteright}s parameterization enables a wide variety of benchmark scenarios that distill the key characteristics of larger applications. To assess the effectiveness and overheads of the tested systems, we introduce a novel metric, minimum effective task granularity (METG). We conduct a comprehensive study with 15 programming systems on up to 256 Haswell nodes of the Cori supercomputer. Running at scale, 100μs-long tasks are the finest granularity that any system runs efficiently with current technologies. We also study each system{\textquoteright}s scalability and its ability to hide communication and mitigate load imbalance.}, url = {https://dl.acm.org/doi/10.5555/3433701.3433783}, author = {Elliott Slaughter and Wei Wu and Yuankun Fu and Legend Brandenburg and Nicolai Garcia and Wilhem Kautz and Emily Marx and Kaleb S. Morris and Qinglei Cao and George Bosilca and Seema Mirchandaney and Wonchan Lee and Sean Treichler and Patrick McCormick and Alex Aiken} } @conference {, title = {Using Advanced Vector Extensions AVX-512 for MPI Reduction}, booktitle = {EuroMPI/USA {\textquoteright}20: 27th European MPI Users{\textquoteright} Group Meeting}, year = {2020}, month = {2020-09}, address = {Austin, TX}, abstract = {As the scale of high-performance computing (HPC) systems continues to grow, researchers have devoted themselves to exploring increasing levels of parallelism to achieve optimal performance.
The modern CPU{\textquoteright}s design, including its features of hierarchical memory and SIMD/vectorization capability, governs the efficiency of algorithms. The recent introduction of wide vector instruction set extensions (AVX and SVE) has made vectorization critically important for increasing efficiency and closing the gap to peak performance. In this paper, we propose an implementation of predefined MPI reduction operations utilizing AVX, AVX2, and AVX-512 intrinsics to provide vector-based reduction operations and to improve the time-to-solution of these predefined MPI reduction operations. With these optimizations, we achieve higher efficiency for local computations, which directly benefits the overall cost of collective reductions. The evaluation of the resulting software stack under different scenarios demonstrates that the solution is at the same time generic and efficient. Experiments conducted on an Intel Xeon Gold cluster show that our AVX-512-optimized reduction operations achieve a 10X performance benefit over the Open MPI default for MPI local reduction.}, keywords = {Instruction level parallelism, Intel AVX2/AVX-512, Long vector extension, MPI reduction operation, Single instruction multiple data, Vector operation}, doi = {https://doi.org/10.1145/3416315.3416316}, author = {Dong Zhong and Qinglei Cao and George Bosilca and Jack Dongarra} } @article {, title = {Using Advanced Vector Extensions AVX-512 for MPI Reduction (Poster)}, year = {2020}, month = {2020-09}, publisher = {EuroMPI/USA {\textquoteright}20: 27th European MPI Users{\textquoteright} Group Meeting}, address = {Austin, TX}, author = {Dong Zhong and George Bosilca and Qinglei Cao and Jack Dongarra} } @conference {1484, title = {Using Arm Scalable Vector Extension to Optimize Open MPI}, booktitle = {20th IEEE/ACM International Symposium on Cluster, Cloud and Internet Computing (CCGRID 2020)}, year = {2020}, month = {2020-05}, publisher = {IEEE/ACM}, organization = {IEEE/ACM}, address = {Melbourne, Australia}, abstract = {As the scale of high-performance computing (HPC) systems continues to grow, increasing levels of parallelism must be explored to achieve optimal performance. As recent processors support wide vector extensions, vectorization becomes much more important for exploiting the potential peak performance of the target architecture. Novel processor architectures, such as the Armv8-A architecture, introduce the Scalable Vector Extension (SVE), an optional separate architectural extension with a new set of A64 instruction encodings, which enables even greater parallelism. In this paper, we analyze the usage and performance of the SVE instructions in the Arm SVE vector Instruction Set Architecture (ISA) and utilize those instructions to improve memcpy and various local reduction operations. Furthermore, we propose new strategies to improve the performance of MPI operations, including datatype packing/unpacking and MPI reduction. With these optimizations, we not only provide higher parallelism on a single node, but also achieve a more efficient communication scheme for message exchange. The resulting efforts have been implemented in the context of Open MPI, providing efficient and scalable capabilities of SVE usage and extending the possible implementations of SVE to a more extensive range of programming and execution paradigms.
The evaluation of the resulting software stack under different scenarios, with both a simulator and Fujitsu{\textquoteright}s A64FX processor, demonstrates that the solution is at the same time generic and efficient.}, keywords = {ARMIE, datatype pack and unpack, local reduction, non-contiguous accesses, SVE, Vector Length Agnostic}, doi = {https://doi.org/10.1109/CCGrid49817.2020.00-71}, author = {Dong Zhong and Pavel Shamis and Qinglei Cao and George Bosilca and Jack Dongarra} } @conference {1452, title = {Performance Analysis of Tile Low-Rank Cholesky Factorization Using PaRSEC Instrumentation Tools}, booktitle = {Workshop on Programming and Performance Visualization Tools (ProTools 19) at SC19}, year = {2019}, month = {2019-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, author = {Qinglei Cao and Yu Pei and Thomas Herault and Kadir Akbudak and Aleksandr Mikhalev and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} }