% Conference paper (IPDPSW 2020) on communication-avoiding 2D stencils over PaRSEC.
% The DOI is stored bare (no resolver prefix); month uses the standard BibTeX macro.
@inproceedings{1478,
  author       = {Pei, Yu and Cao, Qinglei and Bosilca, George and Luszczek, Piotr and Eijkhout, Victor and Dongarra, Jack},
  title        = {Communication Avoiding {2D} Stencil Implementations over {PaRSEC} Task-Based Runtime},
  booktitle    = {2020 {IEEE} International Parallel and Distributed Processing Symposium Workshops ({IPDPSW})},
  year         = {2020},
  month        = may,
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {New Orleans, LA},
  doi          = {10.1109/IPDPSW50202.2020.00127},
  abstract     = {Stencil computation or general sparse matrix-vector product (SpMV) are key components in many algorithms like geometric multigrid or Krylov solvers. But their low arithmetic intensity means that memory bandwidth and network latency will be the performance limiting factors. The current architectural trend favors computations over bandwidth, worsening the already unfavorable imbalance. Previous work approached stencil kernel optimization either by improving memory bandwidth usage or by providing a Communication Avoiding (CA) scheme to minimize network latency in repeated sparse vector multiplication by replicating remote work in order to delay communications on the critical path. Focusing on minimizing communication bottleneck in distributed stencil computation, in this study we combine a CA scheme with the computation and communication overlapping that is inherent in a dataflow task-based runtime system such as PaRSEC to demonstrate their combined benefits. We implemented the 2D five point stencil (Jacobi iteration) in PETSc, and over PaRSEC in two flavors, full communications (base-PaRSEC) and CA-PaRSEC which operate directly on a 2D compute grid.
Our results running on two clusters, NaCL and Stampede2 indicate that we can achieve 2{\texttimes} speedup over the standard SpMV solution implemented in PETSc, and in certain cases when kernel execution is not dominating the execution time, the CA-PaRSEC version achieved up to 57\% and 33\% speedup over base-PaRSEC implementation on NaCL and Stampede2 respectively.},
}