@article {libCEED2021, title = {libCEED: Fast algebra for high-order element-based discretizations}, journal = {Journal of Open Source Software}, volume = {6}, number = {63}, year = {2021}, pages = {2945}, abstract = {Finite element methods are widely used to solve partial differential equations (PDE) in science and engineering, but their standard implementation (Arndt et al., 2020; Kirk et al., 2006; Logg et al., 2012) relies on assembling sparse matrices. Sparse matrix multiplication and triangular operations perform a scalar multiply and add for each nonzero entry, just 2 floating point operations (flops) per scalar that must be loaded from memory (Williams et al., 2009). Modern hardware is capable of nearly 100 flops per scalar streamed from memory (Rupp, 2020), so sparse matrix operations cannot achieve more than about 2\% utilization of arithmetic units. Matrix assembly becomes even more problematic when the polynomial degree $p$ of the basis functions is increased, resulting in $O(p^d)$ storage and $O(p^{2d})$ compute per degree of freedom (DoF) in $d$ dimensions. Methods pioneered by the spectral element community (Deville et al., 2002; Orszag, 1980) exploit problem structure to reduce costs to $O(1)$ storage and $O(p)$ compute per DoF, with very high utilization of modern CPUs and GPUs. Unfortunately, high-quality implementations have been relegated to applications and intrusive frameworks that are often difficult to extend to new problems or incorporate into legacy applications, especially when strong preconditioners are required. libCEED, the Code for Efficient Extensible Discretization (Abdelfattah et al., 2021), is a lightweight library that provides a purely algebraic interface for linear and nonlinear operators and preconditioners with element-based discretizations. libCEED provides portable performance via run-time selection of implementations optimized for CPUs and GPUs, including support for just-in-time (JIT) compilation. It is designed for convenient use in new and legacy software, and offers interfaces in C99 (International Standards Organisation, 1999), Fortran77 (ANSI, 1978), Python (Python, 2021), Julia (Bezanson et al., 2017), and Rust (Rust, 2021). Users and library developers can integrate libCEED at a low level into existing applications in place of existing matrix-vector products without significant refactoring of their own discretization infrastructure. Alternatively, users can utilize integrated libCEED support in MFEM (Anderson et al., 2020; MFEM, 2021). 
In addition to supporting applications and discretization libraries, libCEED provides a platform for performance engineering and co-design, as well as an algebraic interface for solvers research like adaptive p-multigrid, much like how sparse matrix libraries enable development and deployment of algebraic multigrid solvers.}, keywords = {finite elements, high-order methods, High-performance computing, matrix-free, spectral elements}, doi = {10.21105/joss.02945}, url = {https://doi.org/10.21105/joss.02945}, author = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jean-Sylvain Camier and Veselin Dobrev and Yohann Dudouit and Leila Ghaffari and Tzanio Kolev and David Medina and Will Pazner and Thilina Ratnayaka and Jeremy Thompson and Stanimire Tomov} } @techreport {1433, title = {CEED ECP Milestone Report: Performance Tuning of CEED Software and 1st and 2nd Wave Apps}, year = {2019}, month = {2019-10}, publisher = {Zenodo}, doi = {10.5281/zenodo.3477618}, url = {https://doi.org/10.5281/zenodo.3477618}, author = {Stanimire Tomov and Ahmad Abdelfattah and Valeria Barra and Natalie Beams and Jed Brown and Jean-Sylvain Camier and Veselin Dobrev and Jack Dongarra and Yohann Dudouit and Paul Fischer and Ali Karakus and Stefan Kerkemeier and Tzanio Kolev and YuHsiang Lan and Elia Merzari and Misun Min and Aleks Obabko and Scott Parker and Thilina Ratnayaka and Jeremy Thompson and Ananias Tomboulides and Vladimir Tomov and Tim Warburton} } @techreport {1434, title = {CEED ECP Milestone Report: Public release of CEED 2.0}, year = {2019}, month = {2019-04}, publisher = {Zenodo}, doi = {10.5281/zenodo.2641316}, url = {https://doi.org/10.5281/zenodo.2641316}, author = {Jed Brown and Ahmad Abdelfattah and Valeria Barra and Veselin Dobrev and Yohann Dudouit and Paul Fischer and Tzanio Kolev and David Medina and Misun Min and Thilina Ratnayaka and Cameron Smith and Jeremy Thompson and Stanimire Tomov and Vladimir Tomov and Tim Warburton} } @article {1341, title = {Accelerating Tensor Contractions in High-Order FEM with MAGMA Batched}, year = {2017}, month = {2017-03}, publisher = {SIAM Conference on Computational Science and Engineering (SIAM CSE17), Presentation}, address = {Atlanta, GA}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @techreport {1082, title = {Small Tensor Operations on Advanced Architectures for High-Order Applications}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-17-749}, year = {2017}, month = {2017-04}, publisher = {Innovative Computing Laboratory, University of Tennessee}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @article {1342, title = {Accelerating Tensor Contractions for High-Order FEM on CPUs, GPUs, and KNLs}, year = {2016}, month = {2016-09}, publisher = {Smoky Mountains Computational Sciences and Engineering Conference (SMC16), Poster}, address = {Gatlinburg, TN}, author = {Azzam Haidar and Ahmad Abdelfattah and Veselin Dobrev and Ian Karlin and Tzanio Kolev and Stanimire Tomov and Jack Dongarra} } @conference {942, title = {High-Performance Tensor Contractions for GPUs}, booktitle = {International Conference on Computational Science (ICCS{\textquoteright}16)}, year = {2016}, month = {2016-06}, address = {San Diego, CA}, abstract = {We present a 
computational framework for high-performance tensor contractions on GPUs. High performance is difficult to obtain using existing libraries, especially for many independent contractions where each contraction is very small, e.g., sub-vector/warp in size. However, using our framework to batch contractions plus application-specifics, we demonstrate close to peak performance results. In particular, to accelerate large scale tensor-formulated high-order finite element method (FEM) simulations, which is the main focus and motivation for this work, we represent contractions as tensor index reordering plus matrix-matrix multiplications (GEMMs). This is a key factor to achieve algorithmically many-fold acceleration (vs. not using it) due to possible reuse of data loaded in fast memory. In addition to using this context knowledge, we design tensor data-structures, tensor algebra interfaces, and new tensor contraction algorithms and implementations to achieve 90+\% of a theoretically derived peak on GPUs. On a K40c GPU for contractions resulting in GEMMs on square matrices of size 8 for example, we are 2.8{\texttimes} faster than CUBLAS, and 8.5{\texttimes} faster than MKL on 16 cores of Intel Xeon E5-2670 (Sandy Bridge) 2.60GHz CPUs. Finally, we apply autotuning and code generation techniques to simplify tuning and provide an architecture-aware, user-friendly interface.}, keywords = {Applications, Batched linear algebra, FEM, gpu, Tensor contractions, Tensor HPC}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @techreport {929, title = {High-Performance Tensor Contractions for GPUs}, journal = {University of Tennessee Computer Science Technical Report}, number = {UT-EECS-16-738}, year = {2016}, month = {2016-01}, publisher = {University of Tennessee}, abstract = {We present a computational framework for high-performance tensor contractions on GPUs. High performance is difficult to obtain using existing libraries, especially for many independent contractions where each contraction is very small, e.g., sub-vector/warp in size. However, using our framework to batch contractions plus application-specifics, we demonstrate close to peak performance results. In particular, to accelerate large scale tensor-formulated high-order finite element method (FEM) simulations, which is the main focus and motivation for this work, we represent contractions as tensor index reordering plus matrix-matrix multiplications (GEMMs). This is a key factor to achieve algorithmically many-fold acceleration (vs. not using it) due to possible reuse of data loaded in fast memory. In addition to using this context knowledge, we design tensor data-structures, tensor algebra interfaces, and new tensor contraction algorithms and implementations to achieve 90+\% of a theoretically derived peak on GPUs. On a K40c GPU for contractions resulting in GEMMs on square matrices of size 8 for example, we are 2.8{\texttimes} faster than CUBLAS, and 8.5{\texttimes} faster than MKL on 16 cores of Intel Xeon E5-2670 (Sandy Bridge) 2.60GHz CPUs. 
Finally, we apply autotuning and code generation techniques to simplify tuning and provide an architecture-aware, user-friendly interface.}, author = {Ahmad Abdelfattah and Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @article {1346, title = {Towards a High-Performance Tensor Algebra Package for Accelerators}, year = {2015}, month = {2015-09}, publisher = {Smoky Mountains Computational Sciences and Engineering Conference (SMC15)}, address = {Gatlinburg, TN}, author = {Marc Baboulin and Veselin Dobrev and Jack Dongarra and Christopher Earl and Jo{\"e}l Falcou and Azzam Haidar and Ian Karlin and Tzanio Kolev and Ian Masliah and Stanimire Tomov} } @conference {767, title = {A Step towards Energy Efficient Computing: Redesigning A Hydrodynamic Application on CPU-GPU}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {Power and energy consumption are becoming an increasing concern in high performance computing. Compared to multi-core CPUs, GPUs have a much better performance per watt. In this paper we discuss efforts to redesign the most computation intensive parts of BLAST, an application that solves the equations for compressible hydrodynamics with high order finite elements, using GPUs [10, 1]. In order to exploit the hardware parallelism of GPUs and achieve high performance, we implemented custom linear algebra kernels. We intensively optimized our CUDA kernels by exploiting the memory hierarchy; the optimized kernels substantially exceed the vendor{\textquoteright}s library routines in performance. We proposed an autotuning technique to adapt our CUDA kernels to the orders of the finite element method. Compared to a previous base implementation, our redesign and optimization lowered the energy consumption of the GPU in two aspects: 60\% less time to solution and 10\% less power required. Compared to the CPU-only solution, our GPU accelerated BLAST obtained a 2.5x overall speedup and 1.42x energy efficiency (greenup) using 4th order (Q4) finite elements, and a 1.9x speedup and 1.27x greenup using 2nd order (Q2) finite elements.}, keywords = {Computer science, CUDA, FEM, Finite element method, linear algebra, nVidia, Tesla K20}, author = {Tingxing Dong and Veselin Dobrev and Tzanio Kolev and Robert Rieben and Stanimire Tomov and Jack Dongarra} } @techreport {690, title = {Hydrodynamic Computation with Hybrid Programming on CPU-GPU Clusters}, journal = {University of Tennessee Computer Science Technical Report}, number = {ut-cs-13-714}, year = {2013}, month = {2013-07}, abstract = {The explosion of parallelism and heterogeneity in today{\textquoteright}s computer architectures has created opportunities as well as challenges for redesigning legacy numerical software to harness the power of new hardware. In this paper we address the main challenges in redesigning BLAST, a numerical library that solves the equations of compressible hydrodynamics using high order finite element methods (FEM) in a moving Lagrangian frame, to support CPU-GPU clusters. We use a hybrid MPI + OpenMP + CUDA programming model that includes two layers: domain decomposed MPI parallelization and OpenMP + CUDA acceleration in a given domain. To optimize the code, we implemented custom linear algebra kernels and introduced an auto-tuning technique to deal with heterogeneity and load balancing at runtime. 
Our tests show that 12 Intel Xeon cores and two M2050 GPUs deliver a 24x speedup compared to a single core, and a 2.5x speedup compared to 12 MPI tasks in one node. Further, we achieve perfect weak scaling, demonstrated on a cluster with up to 64 GPUs in 32 nodes. Our choice of programming model and proposed solutions, as related to parallelism and load balancing, specifically targets high order FEM discretizations, and can be used equally successfully for applications beyond hydrodynamics. A major accomplishment is that we further establish the appeal of high order FEMs, which, despite their better approximation properties, are often avoided due to their high computational cost. GPUs, as we show, have the potential to make them the method of choice, as the increased computational cost is also localized, e.g., cast as Level 3 BLAS, and thus can be done very efficiently (close to ``free'' relative to the usual overheads inherent in sparse computations).}, author = {Tingxing Dong and Veselin Dobrev and Tzanio Kolev and Robert Rieben and Stanimire Tomov and Jack Dongarra} } @article {icl:731, title = {Acceleration of the BLAST Hydro Code on GPU}, journal = {Supercomputing {\textquoteright}12 (poster)}, year = {2012}, month = {2012-11}, publisher = {SC12}, address = {Salt Lake City, Utah}, author = {Tingxing Dong and Tzanio Kolev and Robert Rieben and Veselin Dobrev and Stanimire Tomov and Jack Dongarra} }
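
Several of the abstracts above share one core technique: the tensor-contraction papers (ICCS'16, UT-EECS-16-738, and the MAGMA Batched presentations) represent high-order FEM element operations as tensor index reordering plus small GEMMs, and the libCEED abstract's $O(p)$-compute-per-DoF claim rests on the same sum-factorized structure. The sketch below is a minimal illustration of that idea in C for a single 2D element, not code from any of the cited packages; the sizes P and Q and all names (gemm, Bq, U, T, V) are illustrative assumptions.

/* Sum-factorized application of a 2D tensor-product basis to one element:
 * V = Bq * U * Bq^T, evaluated as two small dense contractions (GEMMs)
 * instead of one multiplication by the assembled (Q*Q) x (P*P) matrix. */
#include <stdio.h>

#define P 4 /* 1D basis functions per dimension (illustrative) */
#define Q 5 /* 1D quadrature points per dimension (illustrative) */

/* Naive GEMM: C(m x n) = A(m x k) * B(k x n), row-major. */
static void gemm(int m, int n, int k, const double *A, const double *B, double *C) {
    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++) {
            double s = 0.0;
            for (int l = 0; l < k; l++) s += A[i*k + l] * B[l*n + j];
            C[i*n + j] = s;
        }
}

int main(void) {
    double Bq[Q*P];          /* 1D basis evaluated at quadrature points */
    double U[P*P];           /* DoFs of one 2D element */
    double T[Q*P], V[Q*Q];   /* intermediate and quadrature-point values */
    /* placeholder data standing in for real basis and solution values */
    for (int i = 0; i < Q*P; i++) Bq[i] = 1.0 / (i + 1);
    for (int i = 0; i < P*P; i++) U[i] = (double)i;

    /* Contract one tensor index at a time: first T = Bq * U ... */
    gemm(Q, P, P, Bq, U, T);
    /* ... then V = T * Bq^T, i.e., V(i,j) = sum_l T(i,l) * Bq(j,l). */
    for (int i = 0; i < Q; i++)
        for (int j = 0; j < Q; j++) {
            double s = 0.0;
            for (int l = 0; l < P; l++) s += T[i*P + l] * Bq[j*P + l];
            V[i*Q + j] = s;
        }
    printf("V[0][0] = %g\n", V[0]);
    return 0;
}

The two 1D contractions cost O(P*P*Q + P*Q*Q) operations per element, versus O(P*P*Q*Q) for applying the assembled 2D element matrix, and the small matrices stay resident in fast memory between contractions; with Q proportional to P this is the $O(p)$ compute per DoF and data-reuse behavior the abstracts describe.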