@inproceedings{1011,
  title     = {Improving Performance of GMRES by Reducing Communication and Pipelining Global Collectives},
  booktitle = {Proceedings of the 18th IEEE International Workshop on Parallel and Distributed Scientific and Engineering Computing (PDSEC 2017)},
  note      = {Best Paper Award},
  year      = {2017},
  month     = jun,
  address   = {Orlando, FL},
  abstract  = {We compare the performance of pipelined and s-step GMRES, referred to as l-GMRES and s-GMRES, respectively, on distributed multicore CPUs. Compared to standard GMRES, s-GMRES requires fewer all-reduces, while l-GMRES overlaps the all-reduces with computation. To combine the best features of the two algorithms, we propose another variant, (l, t)-GMRES, that not only performs fewer global all-reduces than standard GMRES but also overlaps those all-reduces with other work. We implemented the thread parallelism and communication overlap in two different ways: the first uses nonblocking MPI collectives with thread-parallel computational kernels, and the second relies on a shared-memory task scheduler. In our experiments, (l, t)-GMRES performed up to 1.67{\texttimes} better than l-GMRES. In addition, although we used only 50 nodes, when the latency cost became significant, our variant performed up to 1.22{\texttimes} better than s-GMRES by hiding all-reduces.},
  doi       = {10.1109/IPDPSW.2017.65},
  author    = {Ichitaro Yamazaki and Mark Hoemmen and Piotr Luszczek and Jack Dongarra}
}