% Conference paper presented at ParCo2019. The month uses the standard
% three-letter BibTeX macro so each style can format it; acronyms in the
% title are brace-protected against sentence-casing styles.
% NOTE(review): address holds the conference venue as exported, not a publisher city.
@inproceedings{1449,
	author = {Davis, Joshua and Gao, Tao and Chandrasekaran, Sunita and Jagode, Heike and Danalis, Anthony and Balaji, Pavan and Dongarra, Jack and Taufer, Michela},
	title = {Characterization of Power Usage and Performance in Data-Intensive Applications using {MapReduce} over {MPI}},
	booktitle = {2019 International Conference on Parallel Computing (ParCo2019)},
	address = {Prague, Czech Republic},
	year = {2019},
	month = sep
}
% Journal article in IEEE Transactions on Parallel and Distributed Systems.
% The month uses the standard BibTeX macro instead of an ISO-style string.
% NOTE(review): author given name corrected from "Mar Snir" to "Marc Snir";
% confirm against the publisher record. Volume, issue and pages are not
% recorded in this entry and should be added once verified.
@article{1176,
	author = {Seo, Sangmin and Amer, Abdelhalim and Balaji, Pavan and Bordage, Cyril and Bosilca, George and Brooks, Alex and Carns, Philip and Castello, Adrian and Genet, Damien and Herault, Thomas and Iwasaki, Shintaro and Jindal, Prateek and Kale, Sanjay and Krishnamoorthy, Sriram and Lifflander, Jonathan and Lu, Huiwei and Meneses, Esteban and Snir, Marc and Sun, Yanhua and Taura, Kenjiro and Beckman, Pete},
	title = {{Argobots}: A Lightweight Low-Level Threading and Tasking Framework},
	journal = {IEEE Transactions on Parallel and Distributed Systems},
	year = {2017},
	month = oct,
	abstract = {In the past few decades, a number of user-level threading and tasking models have been proposed in the literature to address the shortcomings of OS-level threads, primarily with respect to cost and flexibility. Current state-of-the-art user-level threading and tasking models, however, are either too specific to applications or architectures or are not as powerful or flexible. In this paper, we present Argobots, a lightweight, low-level threading and tasking framework that is designed as a portable and performant substrate for high-level programming models or runtime systems. Argobots offers a carefully designed execution model that balances generality of functionality with providing a rich set of controls to allow specialization by the user or high-level programming model. We describe the design, implementation, and optimization of Argobots and present integrations with three example high-level models: OpenMP, MPI, and co-located I/O service. Evaluations show that (1) Argobots outperforms existing generic threading runtimes; (2) our OpenMP runtime offers more efficient interoperability capabilities than production OpenMP runtimes do; (3) when MPI interoperates with Argobots instead of Pthreads, it enjoys reduced synchronization costs and better latency hiding capabilities; and (4) I/O service with Argobots reduces interference with co-located applications, achieving performance competitive with that of the Pthreads version.},
	keywords = {Argobots, context switch, I/O, interoperability, lightweight, MPI, OpenMP, stackable scheduler, tasklet, user-level thread},
	doi = {10.1109/TPDS.2017.2766062},
	url = {http://ieeexplore.ieee.org/document/8082139/}
}
% Paper with its own authors inside an edited proceedings volume, so it is
% typed as incollection (classic inbook ignores booktitle and rejects
% author plus editor together). Page range uses the plain double hyphen.
% NOTE(review): 9697 was exported as "number"; it is recorded here as the
% volume of the Lecture Notes in Computer Science series. Confirm the series
% name against the publisher record. The redundant organization field
% (a copy of publisher) was dropped.
@incollection{997,
	author = {Abdelfattah, Ahmad and Haidar, Azzam and Tomov, Stanimire and Dongarra, Jack},
	editor = {Kunkel, Julian M. and Balaji, Pavan and Dongarra, Jack},
	title = {Performance, Design, and Autotuning of Batched {GEMM} for {GPUs}},
	booktitle = {High Performance Computing: 31st International Conference, ISC High Performance 2016, Frankfurt, Germany, June 19--23, 2016, Proceedings},
	series = {Lecture Notes in Computer Science},
	volume = {9697},
	year = {2016},
	pages = {21--38},
	publisher = {Springer International Publishing},
	abstract = {The general matrix-matrix multiplication (GEMM) is the most important numerical kernel in dense linear algebra, and is the key component for obtaining high performance in most LAPACK routines. As batched computations on relatively small problems continue to gain interest in many scientific applications, a need arises for a high performance GEMM kernel for batches of small matrices. Such a kernel should be well designed and tuned to handle small sizes, and to maintain high performance for realistic test cases found in the higher level LAPACK routines, and scientific computing applications in general.

This paper presents a high performance batched GEMM kernel on Graphics Processing Units (GPUs). We address batched problems with both fixed and variable sizes, and show that specialized GEMM designs and a comprehensive autotuning process are needed to handle problems of small sizes. For most performance tests reported in this paper, the proposed kernels outperform state-of-the-art approaches using a K40c GPU.},
	isbn = {978-3-319-41321-1},
	doi = {10.1007/978-3-319-41321-1_2},
	url = {http://dx.doi.org/10.1007/978-3-319-41321-1_2}
}