@conference {, title = {A Framework to Exploit Data Sparsity in Tile Low-Rank Cholesky Factorization}, booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, year = {2022}, month = {2022-07}, doi = {10.1109/IPDPS53621.2022.00047}, url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=9820680\&isnumber=9820610}, author = {Qinglei Cao and Rabab Alomairy and Yu Pei and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} }

@conference {, title = {Leveraging PaRSEC Runtime Support to Tackle Challenging 3D Data-Sparse Matrix Problems}, booktitle = {35th IEEE International Parallel \& Distributed Processing Symposium (IPDPS 2021)}, year = {2021}, month = {2021-05}, publisher = {IEEE}, organization = {IEEE}, address = {Portland, OR}, abstract = {The task-based programming model associated with dynamic runtime systems has gained popularity for problems made challenging by workload imbalance, heterogeneous resources, or extreme concurrency. During the last decade, low-rank matrix approximations, whose main idea consists of exploiting data sparsity, typically by compressing off-diagonal tiles up to an application-specific accuracy threshold, have been adopted to address the curse of dimensionality at extreme scale. In this paper, we create a bridge between the runtime and the linear algebra by communicating knowledge of the data sparsity to the runtime. We design and implement this synergistic approach with high user productivity in mind, in the context of the PaRSEC runtime system and the HiCMA numerical library. This requires extending PaRSEC with new features to integrate rank information into the dataflow so that proper decisions can be made at runtime. We focus on the tile low-rank (TLR) Cholesky factorization for solving 3D data-sparse covariance matrix problems arising in environmental applications. In particular, we employ the 3D exponential model of the Mat{\'e}rn matrix kernel, which exhibits challenging nonuniform high ranks in off-diagonal tiles. We first provide dynamic data structure management driven by a performance model to reduce extra floating-point operations. Next, we optimize the memory footprint of the application by relying on a dynamic memory allocator, supported by a rank-aware data distribution to cope with the workload imbalance. Finally, we expose further parallelism using kernel recursive formulations to shorten the critical path. Our resulting high-performance implementation outperforms existing data-sparse TLR Cholesky factorizations by up to 7-fold on a large-scale distributed-memory system, while reducing the memory footprint by up to a factor of 44.
This multidisciplinary work highlights the need to empower runtime systems beyond their original duty of task scheduling in order to service next-generation low-rank matrix algebra libraries.}, keywords = {asynchronous executions and load balancing, dynamic runtime system, environmental applications, High-performance computing, low-rank matrix computations, task-based programming model, user productivity}, author = {Qinglei Cao and Yu Pei and Kadir Akbudak and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} }

@conference {1478, title = {Communication Avoiding 2D Stencil Implementations over PaRSEC Task-Based Runtime}, booktitle = {2020 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2020}, month = {2020-05}, publisher = {IEEE}, organization = {IEEE}, address = {New Orleans, LA}, abstract = {Stencil computations and general sparse matrix-vector products (SpMV) are key components of many algorithms, such as geometric multigrid and Krylov solvers, but their low arithmetic intensity means that memory bandwidth and network latency are the performance-limiting factors. The current architectural trend favors computation over bandwidth, worsening this already unfavorable imbalance. Previous work approached stencil kernel optimization either by improving memory bandwidth usage or by providing a Communication Avoiding (CA) scheme that minimizes network latency in repeated sparse matrix-vector multiplications by replicating remote work in order to delay communications on the critical path. Focusing on minimizing the communication bottleneck in distributed stencil computation, in this study we combine a CA scheme with the computation-communication overlap that is inherent in a dataflow task-based runtime system such as PaRSEC, and demonstrate their combined benefits. We implemented the 2D five-point stencil (Jacobi iteration) in PETSc and, over PaRSEC, in two flavors: full communications (base-PaRSEC) and CA-PaRSEC, both operating directly on a 2D compute grid. Our results on two clusters, NaCL and Stampede2, indicate that we can achieve a 2{\texttimes} speedup over the standard SpMV solution implemented in PETSc; in certain cases, when kernel execution does not dominate the execution time, the CA-PaRSEC version achieves up to 57\% and 33\% speedup over the base-PaRSEC implementation on NaCL and Stampede2, respectively.}, doi = {10.1109/IPDPSW50202.2020.00127}, author = {Yu Pei and Qinglei Cao and George Bosilca and Piotr Luszczek and Victor Eijkhout and Jack Dongarra} }

@conference {, title = {Extreme-Scale Task-Based Cholesky Factorization Toward Climate and Weather Prediction Applications}, booktitle = {Platform for Advanced Scientific Computing Conference (PASC20)}, year = {2020}, month = {2020-06}, publisher = {ACM}, organization = {ACM}, address = {Geneva, Switzerland}, abstract = {Climate and weather can be predicted statistically via geospatial Maximum Likelihood Estimates (MLE), as an alternative to running large ensembles of forward models. The MLE-based iterative optimization procedure requires solving large-scale linear systems by performing a Cholesky factorization on a symmetric positive-definite covariance matrix---a demanding dense factorization in terms of memory footprint and computation.
We propose a novel solution to this problem: at the mathematical level, we reduce the computational requirement by exploiting the data sparsity structure of the matrix off-diagonal tiles by means of low-rank approximations; and, at the programming-paradigm level, we integrate PaRSEC, a dynamic task-based runtime, to reach unparalleled levels of efficiency for solving extreme-scale linear algebra matrix operations. The resulting solution leverages fine-grained computations to facilitate asynchronous execution while providing a flexible data distribution to mitigate load imbalance. Performance results are reported using 3D synthetic datasets with up to 42M geospatial locations on 130,000 cores, which represents a cornerstone toward fast and accurate predictions for environmental applications.}, doi = {10.1145/3394277.3401846}, author = {Qinglei Cao and Yu Pei and Kadir Akbudak and Aleksandr Mikhalev and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} }

@conference {, title = {HAN: A Hierarchical AutotuNed Collective Communication Framework}, booktitle = {IEEE Cluster Conference}, year = {2020}, month = {2020-09}, publisher = {IEEE Computer Society Press}, organization = {IEEE Computer Society Press}, note = {Best Paper Award}, address = {Kobe, Japan}, abstract = {High-performance computing (HPC) systems keep growing in scale and heterogeneity to satisfy increasing computational needs, and this brings new challenges to the design of MPI libraries, especially with regard to collective operations. To address these challenges, we present "HAN," a new hierarchical autotuned collective communication framework in Open MPI, which selects suitable homogeneous collective communication modules as submodules for each hardware level, uses collective operations from the submodules as tasks, and organizes these tasks to perform efficient hierarchical collective operations. With its task-based design, HAN can easily swap out submodules, while keeping tasks intact, to adapt to new hardware. This makes HAN well suited to current platforms and provides strong and flexible support for future HPC systems. To provide a fast and accurate autotuning mechanism, we present a novel cost model based on benchmarking the tasks instead of a whole collective operation. This method drastically reduces tuning time, as the cost of tasks can be reused across different message sizes, and is more accurate than existing cost models. Our cost analysis suggests that the autotuning component can find the optimal configuration in most cases.
The evaluation of the HAN framework suggests that our design significantly improves on the default Open MPI and achieves decent speedups against state-of-the-art MPI implementations on the tested applications.}, author = {Xi Luo and Wei Wu and George Bosilca and Yu Pei and Qinglei Cao and Thananon Patinyasakdikul and Dong Zhong and Jack Dongarra} }

@conference {1451, title = {Evaluation of Programming Models to Address Load Imbalance on Distributed Multi-Core CPUs: A Case Study with Block Low-Rank Factorization}, booktitle = {PAW-ATM Workshop at SC19}, year = {2019}, month = {2019-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, author = {Yu Pei and George Bosilca and Ichitaro Yamazaki and Akihiro Ida and Jack Dongarra} }

@conference {1452, title = {Performance Analysis of Tile Low-Rank Cholesky Factorization Using PaRSEC Instrumentation Tools}, booktitle = {Workshop on Programming and Performance Visualization Tools (ProTools 19) at SC19}, year = {2019}, month = {2019-11}, publisher = {ACM}, organization = {ACM}, address = {Denver, CO}, author = {Qinglei Cao and Yu Pei and Thomas Herault and Kadir Akbudak and Aleksandr Mikhalev and George Bosilca and Hatem Ltaief and David Keyes and Jack Dongarra} }

@conference {1169, title = {Autotuning Batch Cholesky Factorization in CUDA with Interleaved Layout of Matrices}, booktitle = {Parallel and Distributed Processing Symposium Workshops (IPDPSW)}, year = {2017}, month = {2017-06}, publisher = {IEEE}, organization = {IEEE}, address = {Orlando, FL}, abstract = {Batch matrix operations address the case of solving the same linear algebra problem for a very large number of very small matrices. In this paper, we focus on implementing the batch Cholesky factorization in CUDA, in single-precision arithmetic, for NVIDIA GPUs. Specifically, we look into the benefits of using noncanonical data layouts, where consecutive memory locations store elements with the same row and column index in a set of consecutive matrices. We discuss a number of different implementation options and tuning parameters. We demonstrate superior performance to traditional implementations for the case of very small matrices.}, keywords = {batch computation, Cholesky Factorization, data layout, GPU computing, numerical linear algebra}, doi = {10.1109/IPDPSW.2017.18}, author = {Mark Gates and Jakub Kurzak and Piotr Luszczek and Yu Pei and Jack Dongarra} }