% Entry 996: paper in an edited Springer volume.
% The venue title is stored in booktitle, the field the inproceedings type
% requires; a journal field is not used by this entry type.
% NOTE(review): volume 113 presumably numbers a book series; consider adding
% a matching series field once the series name is confirmed.
@inproceedings{996,
  author    = {Anzt, Hartwig and Chow, Edmond and Szyld, Daniel and Dongarra, Jack},
  title     = {Domain Overlap for Iterative Sparse Triangular Solves on {GPUs}},
  booktitle = {Software for Exascale Computing - {SPPEXA}},
  editor    = {Bungartz, Hans-Joachim and Neumann, Philipp and Nagel, Wolfgang E.},
  volume    = {113},
  pages     = {527--545},
  publisher = {Springer International Publishing},
  year      = {2016},
  month     = sep,
  doi       = {10.1007/978-3-319-40528-5_24},
  abstract  = {Iterative methods for solving sparse triangular systems are an attractive alternative to exact forward and backward substitution if an approximation of the solution is acceptable. On modern hardware, performance benefits are available as iterative methods allow for better parallelization. In this paper, we investigate how block-iterative triangular solves can benefit from using overlap. Because the matrices are triangular, we use {\textquotedblleft}directed{\textquotedblright} overlap, depending on whether the matrix is upper or lower triangular. We enhance a GPU implementation of the block-asynchronous Jacobi method with directed overlap. For GPUs and other cases where the problem must be overdecomposed, i.e., more subdomains and threads than cores, there is a preference in processing or scheduling the subdomains in a specific order, following the dependencies specified by the sparse triangular matrix. For sparse triangular factors from incomplete factorizations, we demonstrate that moderate directed overlap with subdomain scheduling can improve convergence and time-to-solution.},
}

% Entry 893: conference talk (the abstract says "In this talk").
% Uses inproceedings, the canonical name for the conference alias.
% NOTE(review): address holds the city given in the original record, which
% looks like the meeting location rather than the publisher's city; confirm
% before relying on it in styles that print publisher and address together.
@inproceedings{893,
  author       = {Anzt, Hartwig and Chow, Edmond and Szyld, Daniel and Dongarra, Jack},
  title        = {Random-Order Alternating {Schwarz} for Sparse Triangular Solves},
  booktitle    = {2015 {SIAM} Conference on Applied Linear Algebra ({SIAM} {LA})},
  publisher    = {SIAM},
  organization = {SIAM},
  address      = {Atlanta, GA},
  year         = {2015},
  month        = oct,
  abstract     = {Block-asynchronous Jacobi is an iteration method where a locally synchronous iteration is embedded in an asynchronous global iteration. The unknowns are partitioned into small subsets, and while the components within the same subset are iterated in Jacobi fashion, no update order in-between the subsets is enforced. The values of the non-local entries remain constant during the local iterations, which can result in slow inter-subset information propagation and slow convergence. Interpreting of the subsets as subdomains allows to transfer the concept of domain overlap typically enhancing the information propagation to block-asynchronous solvers. In this talk we explore the impact of overlapping domains to convergence and performance of block-asynchronous Jacobi iterations, and present results obtained by running this solver class on state-of-the-art HPC systems.},
}