% Euro-Par 2016 conference paper on batched matrix-matrix multiplication
% (GEMM) of very small matrices on CPU and GPU architectures.
% The address field holds the conference venue as given in the original record.
@inproceedings{964,
  author    = {Masliah, Ian and Abdelfattah, Ahmad and Haidar, Azzam and Tomov, Stanimire and Falcou, Jo{\"e}l and Dongarra, Jack},
  title     = {High-performance Matrix-matrix Multiplications of Very Small Matrices},
  booktitle = {22nd International European Conference on Parallel and Distributed Computing (Euro-Par{\textquoteright}16)},
  publisher = {Springer International Publishing},
  address   = {Grenoble, France},
  year      = {2016},
  month     = aug,
  abstract  = {The use of the general dense matrix-matrix multiplication (GEMM) is fundamental for obtaining high performance in many scientific computing applications. GEMMs for small matrices (of sizes less than 32) however, are not sufficiently optimized in existing libraries. In this paper we consider the case of many small GEMMs on either CPU or GPU architectures. This is a case that often occurs in applications like big data analytics, machine learning, high-order FEM, and others. The GEMMs are grouped together in a single batched routine. We present specialized for these cases algorithms and optimization techniques to obtain performance that is within 90\% of the optimal. We show that these results outperform currently available state-of-the-art implementations and vendor-tuned math libraries.},
}