% BALLISTIC prospectus (LAPACK Working Note 297).
% The export carried no citation key, a slash-form month, and no institution
% (required for a technical report); all three are supplied here.
@techreport{demmel2020ballistic,
  title = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries: {Basic ALgebra LIbraries for Sustainable Technology with Interdisciplinary Collaboration} ({BALLISTIC})},
  type = {{LAPACK} Working Note},
  number = {297, ICL-UT-20-07},
  institution = {University of Tennessee},
  year = {2020},
  month = jul,
  abstract = {The convergence of several unprecedented changes, including formidable new system design constraints and revolutionary levels of heterogeneity, has made it clear that much of the essential software infrastructure of computational science and engineering is, or will soon be, obsolete. Math libraries have historically been in the vanguard of software that must be adapted first to such changes, both because these low-level workhorses are so critical to the accuracy and performance of so many different types of applications, and because they have proved to be outstanding vehicles for finding and implementing solutions to the problems that novel architectures pose. Under the Basic ALgebra LIbraries for Sustainable Technology with Interdisciplinary Collaboration (BALLISTIC) project, the principal designers of the Linear Algebra PACKage (LAPACK) and the Scalable Linear Algebra PACKage (ScaLAPACK), the combination of which is abbreviated Sca/LAPACK, aim to enhance and update these libraries for the ongoing revolution in processor architecture, system design, and application requirements by incorporating them into a layered package of software components{\textemdash}the BALLISTIC ecosystem{\textemdash}that provides users seamless access to state-of-the-art solver implementations through familiar and improved Sca/LAPACK interfaces.},
  author = {James Demmel and Jack Dongarra and Julie Langou and Julien Langou and Piotr Luszczek and Michael Mahoney}
}

% IPDPS 2017 conference paper; the remaining fields follow on the next line of the file.
@inproceedings{1093,
  title = {Bidiagonalization and {R}-Bidiagonalization: Parallel Tiled Algorithms, Critical Paths and Distributed-Memory Implementation},
  booktitle = {IEEE International Parallel and 
Distributed Processing Symposium (IPDPS)},
  year = {2017},
  month = may,
  publisher = {IEEE},
  organization = {IEEE},
  address = {Orlando, FL},
  abstract = {We study tiled algorithms for going from a ``full'' matrix to a condensed ``band bidiagonal'' form using orthogonal transformations: (i) the tiled bidiagonalization algorithm BIDIAG, which is a tiled version of the standard scalar bidiagonalization algorithm; and (ii) the R-bidiagonalization algorithm R-BIDIAG, which is a tiled version of the algorithm which consists in first performing the QR factorization of the initial matrix, then performing the band-bidiagonalization of the R-factor. For both BIDIAG and R-BIDIAG, we use four main types of reduction trees, namely FLATTS, FLATTT, GREEDY, and a newly introduced auto-adaptive tree, AUTO. We provide a study of critical path lengths for these tiled algorithms, which shows that (i) R-BIDIAG has a shorter critical path length than BIDIAG for tall and skinny matrices, and (ii) GREEDY based schemes are much better than earlier proposed algorithms with unbounded resources. We provide experiments on a single multicore node, and on a few multicore nodes of a parallel distributed shared-memory system, to show the superiority of the new algorithms on a variety of matrix sizes, matrix shapes and core counts.},
  keywords = {Algorithm design and analysis, Approximation algorithms, Kernel, Multicore processing, Shape, Software algorithms, Transforms},
  doi = {10.1109/IPDPS.2017.46},
  author = {Mathieu Faverge and Julien Langou and Yves Robert and Jack Dongarra}
}

% 2016 user survey report. The report series moves from the (ignored) journal
% field into type, and the publisher becomes the required institution.
@techreport{1174,
  title = {2016 Dense Linear Algebra Software Packages Survey},
  type = {Computer Science Technical Report},
  number = {UT-EECS-16-744 / LAWN 290},
  institution = {University of Tennessee},
  year = {2016},
  month = sep,
  abstract = {The 2016 Dense Linear Algebra Software Packages Survey was administered from January 1st 2016 to April 12 2016. 
234 respondents answered the survey. The survey was advertised directly to the Linear Algebra community via our LAPACK/ScaLAPACK forum, NA Digest and we also directly contacted vendors and linear algebra experts. The breakdown of respondents was: 74\% researchers or scientists, 25\% were Principal Investigators and 25\% Software maintainers or System administrators. The goal of the survey was to get the Linear Algebra community opinion and provide input on dense linear algebra software packages, in particular LAPACK, ScaLAPACK, PLASMA and MAGMA. The ultimate purpose of the survey was to improve these libraries to benefit our user community. The survey would allow the team to prioritize the many possible improvements that could be done. We also asked input from users accessing these libraries via 3rd party interfaces, for example MATLAB, Intel{\textquoteright}s MKL, Python{\textquoteright}s NumPy, AMD{\textquoteright}s ACML, and many others.},
  author = {Jack Dongarra and James Demmel and Julien Langou and Julie Langou}
}

% Journal version of the hybrid LU-QR solver work; the abstract continues on
% the next line of the file.
@article{917,
  title = {Mixing {LU}-{QR} Factorization Algorithms to Design High-Performance Dense Linear Algebra Solvers},
  journal = {Journal of Parallel and Distributed Computing},
  volume = {85},
  year = {2015},
  month = nov,
  pages = {32--46},
  abstract = {This paper introduces hybrid LU{\textendash}QR algorithms for solving dense linear systems of the form Ax=b. Throughout a matrix factorization, these algorithms dynamically alternate LU with local pivoting and QR elimination steps based upon some robustness criterion. LU elimination steps can be very efficiently parallelized, and are twice as cheap in terms of floating-point operations, as QR steps. However, LU steps are not necessarily stable, while QR steps are always stable. The hybrid algorithms execute a QR step when a robustness criterion detects some risk for instability, and they execute an LU step otherwise. 
The choice between LU and QR steps must have a small computational overhead and must provide a satisfactory level of stability with as few QR steps as possible. In this paper, we introduce several robustness criteria and we establish upper bounds on the growth factor of the norm of the updated matrix incurred by each of these criteria. In addition, we describe the implementation of the hybrid algorithms through an extension of the PaRSEC software to allow for dynamic choices during execution. Finally, we analyze both stability and performance results compared to state-of-the-art linear solvers on parallel distributed multicore platforms. A comprehensive set of experiments shows that hybrid LU{\textendash}QR algorithms provide a continuous range of trade-offs between stability and performances.},
  keywords = {lu factorization, Numerical algorithms, QR factorization, Stability, Performance},
  doi = {10.1016/j.jpdc.2015.06.007},
  author = {Mathieu Faverge and Julien Herrmann and Julien Langou and Bradley Lowery and Yves Robert and Jack Dongarra}
}

% IPDPS 2014 conference version of the hybrid LU-QR solver work; the abstract
% continues on the next line of the file.
@inproceedings{813,
  title = {Designing {LU}-{QR} Hybrid Solvers for Performance and Stability},
  booktitle = {IPDPS 2014},
  year = {2014},
  month = may,
  publisher = {IEEE},
  organization = {IEEE},
  address = {Phoenix, AZ},
  abstract = {This paper introduces hybrid LU-QR algorithms for solving dense linear systems of the form Ax = b. Throughout a matrix factorization, these algorithms dynamically alternate LU with local pivoting and QR elimination steps, based upon some robustness criterion. LU elimination steps can be very efficiently parallelized, and are twice as cheap in terms of operations, as QR steps. However, LU steps are not necessarily stable, while QR steps are always stable. The hybrid algorithms execute a QR step when a robustness criterion detects some risk for instability, and they execute an LU step otherwise. 
Ideally, the choice between LU and QR steps must have a small computational overhead and must provide a satisfactory level of stability with as few QR steps as possible. In this paper, we introduce several robustness criteria and we establish upper bounds on the growth factor of the norm of the updated matrix incurred by each of these criteria. In addition, we describe the implementation of the hybrid algorithms through an extension of the Parsec software to allow for dynamic choices during execution. Finally, we analyze both stability and performance results compared to state-of-the-art linear solvers on parallel distributed multicore platforms.},
  keywords = {plasma},
  isbn = {978-1-4799-3800-1},
  doi = {10.1109/IPDPS.2014.108},
  author = {Mathieu Faverge and Julien Herrmann and Julien Langou and Bradley Lowery and Yves Robert and Jack Dongarra}
}

% Technical-report version of the hybrid LU-QR solver work (also LAWN 282).
% The report series is split out of the journal field into type and note.
@techreport{703,
  title = {Designing {LU}-{QR} hybrid solvers for performance and stability},
  type = {Computer Science Technical Report},
  number = {ut-eecs-13-719},
  institution = {University of Tennessee},
  year = {2013},
  month = oct,
  note = {Also LAWN 282},
  author = {Mathieu Faverge and Julien Herrmann and Julien Langou and Bradley Lowery and Yves Robert and Jack Dongarra}
}

% Journal version of the hierarchical QR work; the abstract continues on the
% next lines of the file.
@article{752,
  title = {Hierarchical {QR} Factorization Algorithms for Multi-core Cluster Systems},
  journal = {Parallel Computing},
  volume = {39},
  year = {2013},
  month = may,
  pages = {212--232},
  abstract = {This paper describes a new QR factorization algorithm which is especially designed for massively parallel platforms combining parallel distributed nodes, where a node is a multi-core processor. These platforms represent the present and the foreseeable future of high-performance computing. 
Our new QR factorization algorithm falls in the category of the tile algorithms which naturally enables good data locality for the sequential kernels executed by the cores (high sequential performance), low number of messages in a parallel distributed setting (small latency term), and fine granularity (high parallelism). Each tile algorithm is uniquely characterized by its sequence of reduction trees. In the context of a cluster of nodes, in order to minimize the number of inter-processor communications (aka, {\textquoteleft}{\textquoteleft}communication-avoiding{\textquoteright}{\textquoteright}), it is natural to consider hierarchical trees composed of an {\textquoteleft}{\textquoteleft}inter-node{\textquoteright}{\textquoteright} tree which acts on top of {\textquoteleft}{\textquoteleft}intra-node{\textquoteright}{\textquoteright} trees. At the intra-node level, we propose a hierarchical tree made of three levels: (0) {\textquoteleft}{\textquoteleft}TS level{\textquoteright}{\textquoteright} for cache-friendliness, (1) {\textquoteleft}{\textquoteleft}low-level{\textquoteright}{\textquoteright} for decoupled highly parallel inter-node reductions, (2) {\textquoteleft}{\textquoteleft}domino level{\textquoteright}{\textquoteright} to efficiently resolve interactions between local reductions and global reductions. Our hierarchical algorithm and its implementation are flexible and modular, and can accommodate several kernel types, different distribution layouts, and a variety of reduction trees at all levels, both inter-node and intra-node. Numerical experiments on a cluster of multi-core nodes (i) confirm that each of the four levels of our hierarchical tree contributes to build up performance and (ii) build insights on how these levels influence performance and interact within each other. 
Our implementation of the new algorithm with the DAGUE scheduling tool significantly outperforms currently available QR factorization software for all matrix shapes, thereby bringing a new advance in numerical linear algebra for petascale and exascale platforms.},
  keywords = {Cluster, Distributed memory, Hierarchical architecture, multi-core, numerical linear algebra, QR factorization},
  author = {Jack Dongarra and Mathieu Faverge and Thomas Herault and Mathias Jacquelin and Julien Langou and Yves Robert}
}

% Chapter in the Handbook of Linear Algebra. A titled chapter with a booktitle
% is an incollection; classic inbook would demand chapter or pages instead.
% The abstract describes the handbook as a whole.
@incollection{747,
  title = {{LAPACK}},
  booktitle = {Handbook of Linear Algebra},
  year = {2013},
  publisher = {CRC Press},
  organization = {CRC Press},
  edition = {Second},
  address = {Boca Raton, FL},
  abstract = {With a substantial amount of new material, the Handbook of Linear Algebra, Second Edition provides comprehensive coverage of linear algebra concepts, applications, and computational software packages in an easy-to-use format. It guides you from the very elementary aspects of the subject to the frontiers of current research. Along with revisions and updates throughout, the second edition of this bestseller includes 20 new chapters.},
  isbn = {9781466507289},
  author = {Zhaojun Bai and James Demmel and Jack Dongarra and Julien Langou and Jenny Wang}
}

% ACM TOMS article on level-3 Cholesky kernels; the abstract continues on the
% next line of the file.
@article{756,
  title = {Level-3 {Cholesky} Factorization Routines Improve Performance of Many {Cholesky} Algorithms},
  journal = {ACM Transactions on Mathematical Software (TOMS)},
  volume = {39},
  year = {2013},
  month = feb,
  abstract = {Four routines called DPOTF3i, i = a,b,c,d, are presented. DPOTF3i are a novel type of level-3 BLAS for use by BPF (Blocked Packed Format) Cholesky factorization and LAPACK routine DPOTRF. Performance of routines DPOTF3i are still increasing when the performance of Level-2 routine DPOTF2 of LAPACK starts decreasing. This is our main result and it implies, due to the use of larger block size nb, that DGEMM, DSYRK, and DTRSM performance also increases! 
The four DPOTF3i routines use simple register blocking. Different platforms have different numbers of registers. Thus, our four routines have different register blocking sizes. BPF is introduced. LAPACK routines for POTRF and PPTRF using BPF instead of full and packed format are shown to be trivial modifications of LAPACK POTRF source codes. We call these codes BPTRF. There are two variants of BPF: lower and upper. Upper BPF is {\textquotedblleft}identical{\textquotedblright} to Square Block Packed Format (SBPF). {\textquotedblleft}LAPACK{\textquotedblright} implementations on multicore processors use SBPF. Lower BPF is less efficient than upper BPF. Vector inplace transposition converts lower BPF to upper BPF very efficiently. Corroborating performance results for DPOTF3i versus DPOTF2 on a variety of common platforms are given for $n \approx nb$ as well as results for large n comparing DBPTRF versus DPOTRF.},
  doi = {10.1145/2427023.2427026},
  author = {Fred G. Gustavson and Jerzy Wasniewski and Jack Dongarra and Jos{\'e} Herrero and Julien Langou}
}

% Book chapter (it has editors and a publisher), so the book title belongs in
% booktitle. The export's month of 2013-00 named no month and is dropped.
@incollection{icl:704,
  title = {Multithreading in the {PLASMA} Library},
  booktitle = {Multi and Many-Core Processing: Architecture, Programming, Algorithms, \& Applications},
  year = {2013},
  publisher = {Taylor \& Francis},
  keywords = {plasma},
  author = {Jakub Kurzak and Piotr Luszczek and Asim YarKhan and Mathieu Faverge and Julien Langou and Henricus Bouwmeester and Jack Dongarra},
  editor = {Mohamed Ahmed and Reda Ammar and Sanguthevar Rajasekaran}
}

% IPDPS 2012 conference version of the hierarchical QR work; the proceedings
% name moves from journal to the required booktitle.
@inproceedings{icl:687,
  title = {Hierarchical {QR} Factorization Algorithms for Multi-Core Cluster Systems},
  booktitle = {IPDPS 2012, the 26th IEEE International Parallel and Distributed Processing Symposium},
  year = {2012},
  month = may,
  publisher = {IEEE Computer Society Press},
  address = {Shanghai, China},
  author = {Jack Dongarra and Mathieu Faverge and Thomas Herault and Julien Langou and Yves Robert}
}

% Technical report; the title and remaining fields follow on the next line of
% the file.
@techreport{icl:717,
  title = {How {LAPACK} library enables 
{Microsoft Visual Studio} support with {CMake} and {LAPACKE}},
  type = {Computer Science Technical Report},
  number = {UT-CS-12-698},
  institution = {University of Tennessee},
  year = {2012},
  month = jul,
  note = {Also LAWN 270},
  author = {Julien Langou and Bill Hoffman and Brad King}
}

% IPDPS 2011 workshop paper. Proceedings name moved to booktitle; author
% spellings Lemarinier and Ltaief corrected (NOTE(review): confirm against
% the published paper).
@inproceedings{icl:676,
  title = {Flexible Development of Dense Linear Algebra Algorithms on Massively Parallel Architectures with {DPLASMA}},
  booktitle = {Proceedings of the Workshops of the 25th IEEE International Symposium on Parallel and Distributed Processing (IPDPS 2011 Workshops)},
  year = {2011},
  month = may,
  pages = {1432--1441},
  publisher = {IEEE},
  address = {Anchorage, Alaska, USA},
  keywords = {dague, dplasma, parsec},
  author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemarinier and Hatem Ltaief and Piotr Luszczek and Asim YarKhan and Jack Dongarra}
}

% Technical-report version of the hierarchical QR work (also LAWN 257).
@techreport{icl:645,
  title = {Hierarchical {QR} Factorization Algorithms for Multi-Core Cluster Systems},
  type = {Computer Science Technical Report},
  number = {UT-CS-11-684},
  institution = {University of Tennessee},
  year = {2011},
  month = oct,
  note = {Also LAWN 257},
  keywords = {magma, plasma},
  author = {Jack Dongarra and Mathieu Faverge and Thomas Herault and Julien Langou and Yves Robert}
}

% Conference paper (the venue is a conference and the entry carries a venue
% address), so it is an inproceedings with the venue in booktitle.
@inproceedings{icl:599,
  title = {{LU} Factorization for Accelerator-Based Systems},
  booktitle = {IEEE/ACS AICCSA 2011},
  year = {2011},
  month = dec,
  address = {Sharm-El-Sheikh, Egypt},
  keywords = {magma, morse},
  author = {Emmanuel Agullo and Cedric Augonnet and Jack Dongarra and Mathieu Faverge and Julien Langou and Hatem Ltaief and Stanimire Tomov}
}

% Same volume, issue and author list as entry icl:574 later in this file.
% The exported page range 435-369 ran backwards; 357--369 is the range that
% icl:574 records for this article. The trailing period in the title is
% dropped because styles add their own punctuation.
@article{icl:677,
  title = {{QCG-OMPI}: {MPI} Applications on Grids},
  journal = {Future Generation Computer Systems},
  volume = {27},
  number = {4},
  year = {2011},
  month = jan,
  pages = {357--369},
  author = {Emmanuel Agullo and Camille Coti and Thomas Herault and Julien Langou and Sylvain Peyronnet and A. 
Rezmerita and Franck Cappello and Jack Dongarra}
}

% Pre-publication record of the Parallel Computing article that entry icl:509
% later in this file lists with volume and pages. The publication status moves
% from the journal name into note; the 2010-00 month named no month.
@article{icl:572,
  title = {A Class of Parallel Tiled Linear Algebra Algorithms for Multicore Architectures},
  journal = {Parallel Computing},
  year = {2010},
  note = {to appear},
  author = {Alfredo Buttari and Julien Langou and Jakub Kurzak and Jack Dongarra}
}

% Technical report; the report number was embedded in the journal field.
% Author spellings Lemarinier and Ltaief corrected (NOTE(review): confirm).
@techreport{icl:563,
  title = {Distributed Dense Numerical Linear Algebra Algorithms on Massively Parallel Architectures: {DPLASMA}},
  type = {Computer Science Technical Report},
  number = {UT-CS-10-660},
  institution = {University of Tennessee},
  year = {2010},
  month = sep,
  keywords = {dague, dplasma, parsec, plasma},
  author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemarinier and Hatem Ltaief and Piotr Luszczek and Asim YarKhan and Jack Dongarra}
}

% ICL technical report. NOTE(review): the institution is written as the lab
% within the University of Tennessee, inferred from the ICL-UT report number.
@techreport{icl:529,
  title = {Distributed-Memory Task Execution and Dependence Tracking within {DAGuE} and the {DPLASMA} Project},
  number = {ICL-UT-10-02},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  year = {2010},
  keywords = {dague, plasma},
  author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and Mathieu Faverge and Azzam Haidar and Thomas Herault and Jakub Kurzak and Julien Langou and Pierre Lemarinier and Hatem Ltaief and Piotr Luszczek and Asim YarKhan and Jack Dongarra}
}

% Same article as entry icl:677 earlier in this file, recorded under 2010.
@article{icl:574,
  title = {{QCG-OMPI}: {MPI} Applications on Grids},
  journal = {Future Generation Computer Systems},
  volume = {27},
  number = {4},
  year = {2010},
  month = mar,
  pages = {357--369},
  author = {Emmanuel Agullo and Camille Coti and Thomas Herault and Julien Langou and Sylvain Peyronnet and A. 
Rezmerita and Franck Cappello and Jack Dongarra}
}

% IPDPS 2010 paper (also LAWN 224). The proceedings name moves to booktitle
% and the LAWN cross-reference to note.
@inproceedings{icl:532,
  title = {{QR} Factorization of Tall and Skinny Matrices in a Grid Computing Environment},
  booktitle = {24th IEEE International Parallel and Distributed Processing Symposium},
  year = {2010},
  month = apr,
  address = {Atlanta, GA},
  note = {Also LAWN 224},
  author = {Emmanuel Agullo and Camille Coti and Jack Dongarra and Thomas Herault and Julien Langou}
}

% ACM TOMS article. The export carried the address Atlanta, GA, identical to
% the conference venue of the entry before it and meaningless for a journal
% article, so it is removed. Entry icl:570 below records the same article.
@article{icl:551,
  title = {Rectangular Full Packed Format for {Cholesky}{\textquoteright}s Algorithm: Factorization, Solution, and Inversion},
  journal = {ACM Transactions on Mathematical Software (TOMS)},
  volume = {37},
  number = {2},
  year = {2010},
  month = apr,
  author = {Fred G. Gustavson and Jerzy Wasniewski and Jack Dongarra and Julien Langou}
}

% Second record of the same ACM TOMS article as icl:551 above.
@article{icl:570,
  title = {Rectangular Full Packed Format for {Cholesky}{\textquoteright}s Algorithm: Factorization, Solution and Inversion},
  journal = {ACM Transactions on Mathematical Software (TOMS)},
  volume = {37},
  number = {2},
  year = {2010},
  month = apr,
  author = {Fred G. Gustavson and Jerzy Wasniewski and Jack Dongarra and Julien Langou}
}

% Computer Physics Communications article. The export had no citation key, so
% one is assigned; the abstract continues on the next line of the file.
@article{baboulin2009mixedprecision,
  title = {Accelerating Scientific Computations with Mixed Precision Algorithms},
  journal = {Computer Physics Communications},
  volume = {180},
  year = {2009},
  month = dec,
  pages = {2526--2533},
  abstract = {On modern architectures, the performance of 32-bit operations is often at least twice as fast as the performance of 64-bit operations. By using a combination of 32-bit and 64-bit floating point arithmetic, the performance of many dense and sparse linear algebra algorithms can be significantly enhanced while maintaining the 64-bit accuracy of the resulting solution. The approach presented here can apply not only to conventional processors but also to other technologies such as Field Programmable Gate Arrays (FPGA), Graphical Processing Units (GPU), and the STI Cell BE processor. 
Results on modern processor architectures and the STI Cell BE are presented.},
  doi = {10.1016/j.cpc.2008.11.005},
  author = {Marc Baboulin and Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julie Langou and Julien Langou and Piotr Luszczek and Stanimire Tomov}
}

% JPDC article. The export's month of 2009-00 named no month and is dropped,
% here and in the two entries that follow.
@article{icl:490,
  title = {Algorithmic Based Fault Tolerance Applied to High Performance Computing},
  journal = {Journal of Parallel and Distributed Computing},
  volume = {69},
  year = {2009},
  pages = {410--416},
  author = {Jack Dongarra and George Bosilca and Remi Delmas and Julien Langou}
}

% Published record of the tiled-algorithms article; entry icl:572 earlier in
% this file is its pre-publication record.
@article{icl:509,
  title = {A Class of Parallel Tiled Linear Algebra Algorithms for Multicore Architectures},
  journal = {Parallel Computing},
  volume = {35},
  year = {2009},
  pages = {38--53},
  keywords = {plasma},
  author = {Alfredo Buttari and Julien Langou and Jakub Kurzak and Jack Dongarra}
}

@article{icl:482,
  title = {Computing the Conditioning of the Components of a Linear Least-squares Solution},
  journal = {Numerical Linear Algebra with Applications},
  volume = {16},
  number = {7},
  year = {2009},
  pages = {517--533},
  author = {Marc Baboulin and Jack Dongarra and Serge Gratton and Julien Langou}
}

% SC09 contribution. It was exported as an article with no journal and the
% conference name in publisher; it is recorded as an inproceedings with the
% conference in booktitle. Author spelling Ltaief corrected (NOTE(review):
% confirm).
@inproceedings{1352,
  title = {Numerical Linear Algebra on Emerging Architectures: The {PLASMA} and {MAGMA} Projects},
  booktitle = {The International Conference for High Performance Computing, Networking, Storage, and Analysis (SC09)},
  year = {2009},
  month = nov,
  address = {Portland, OR},
  author = {Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaief and Piotr Luszczek and Rajib Nath and Stanimire Tomov and Asim YarKhan and Vasily Volkov}
}

% Published in a journal with a volume number, so it is typed as an article
% (as an inproceedings it lacked the required booktitle). The author list
% follows on the next line of the file.
@article{icl:486,
  title = {Numerical Linear Algebra on Emerging Architectures: The {PLASMA} and {MAGMA} Projects},
  journal = {Journal of Physics: Conference Series},
  volume = {180},
  year = {2009},
  keywords = {magma, plasma},
  author = 
{Emmanuel Agullo and James Demmel and Jack Dongarra and Bilel Hadri and Jakub Kurzak and Julien Langou and Hatem Ltaief and Piotr Luszczek and Stanimire Tomov}
}

% Book chapter (it has an editor and a publisher). The book title moves to
% booktitle without the leading "in", which styles supply themselves; the
% 2009-00 month named no month and is dropped.
@incollection{icl:505,
  title = {Parallel Dense Linear Algebra Software in the Multicore Era},
  booktitle = {Cyberinfrastructure Technologies and Applications},
  year = {2009},
  pages = {9--24},
  publisher = {Nova Science Publishers, Inc.},
  keywords = {plasma},
  author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou},
  editor = {Junwei Cao}
}

@article{icl:491,
  title = {The Problem with the {Linpack} Benchmark Matrix Generator},
  journal = {International Journal of High Performance Computing Applications},
  volume = {23},
  number = {1},
  year = {2009},
  pages = {5--14},
  keywords = {hpl},
  author = {Julien Langou and Jack Dongarra}
}

% Pre-publication record of the ACM TOMS article listed as icl:551 and icl:570
% earlier in this file. The journal name is spelled as in those entries and
% the publication status moves to note. The author list continues on the next
% line of the file.
@article{icl:511,
  title = {Rectangular Full Packed Format for {Cholesky}{\textquoteright}s Algorithm: Factorization, Solution and Inversion},
  journal = {ACM Transactions on Mathematical Software (TOMS)},
  year = {2009},
  note = {to appear},
  author = {Fred G. 
Gustavson and Jerzy Wasniewski and Jack Dongarra and Julien Langou}
}

% Technical report; report number and LAPACK Working Note cross-reference were
% embedded in the journal field and are split into number and note.
@techreport{icl:426,
  title = {Algorithmic Based Fault Tolerance Applied to High Performance Computing},
  type = {Computer Science Technical Report},
  number = {UT-CS-08-620},
  institution = {University of Tennessee},
  year = {2008},
  month = jan,
  note = {Also LAPACK Working Note 205},
  author = {George Bosilca and Remi Delmas and Jack Dongarra and Julien Langou}
}

% Conference paper (venue name plus venue address), so the venue goes in
% booktitle of an inproceedings.
@inproceedings{icl:457,
  title = {Computing the Conditioning of the Components of a Linear Least Squares Solution},
  booktitle = {VECPAR {\textquoteright}08, High Performance Computing for Computational Science},
  year = {2008},
  month = jan,
  address = {Toulouse, France},
  author = {Marc Baboulin and Jack Dongarra and Serge Gratton and Julien Langou}
}

% Book chapter. The export listed Julien Langou twice; entry icl:392 later in
% this file, the pre-publication record of the same chapter, shows the second
% of the pair is Julie Langou.
@incollection{icl:449,
  title = {Exploiting Mixed Precision Floating Point Hardware in Scientific Computations},
  booktitle = {High Performance Computing and Grids in Action},
  year = {2008},
  month = jan,
  publisher = {IOS Press},
  address = {Amsterdam},
  author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Julie Langou and Piotr Luszczek and Stanimire Tomov},
  editor = {Lucio Grandinetti}
}

@inproceedings{icl:416,
  title = {Interior State Computation of Nano Structures},
  booktitle = {PARA 2008, 9th International Workshop on State-of-the-Art in Scientific and Parallel Computing},
  year = {2008},
  month = may,
  address = {Trondheim, Norway},
  author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang}
}

@article{icl:446,
  title = {Parallel Tiled {QR} Factorization for Multicore Architectures},
  journal = {Concurrency and Computation: Practice and Experience},
  volume = {20},
  year = {2008},
  month = jan,
  pages = {1573--1590},
  author = {Alfredo Buttari and Julien Langou and Jakub Kurzak and Jack Dongarra}
}

% Technical-report version of the Linpack generator paper; the remaining
% fields follow on the next line of the file.
@techreport{icl:423,
  title = {The Problem with the {Linpack} Benchmark Matrix Generator},
  journal = {University of 
Tennessee Computer Science Technical Report, UT-CS-08-621 (also LAPACK Working Note 206)},
  number = {UT-CS-08-621},
  institution = {University of Tennessee},
  year = {2008},
  month = jun,
  note = {Also LAPACK Working Note 206},
  author = {Jack Dongarra and Julien Langou}
}

% Technical-report version of the tiled-algorithms paper. The LAPACK Working
% Note cross-reference is split out of the number field into note.
@techreport{icl:375,
  title = {A Class of Parallel Tiled Linear Algebra Algorithms for Multicore Architectures},
  type = {Computer Science Technical Report},
  number = {UT-CS-07-600},
  institution = {University of Tennessee},
  year = {2007},
  month = jan,
  note = {Also LAPACK Working Note 191},
  keywords = {plasma},
  author = {Alfredo Buttari and Julien Langou and Jakub Kurzak and Jack Dongarra}
}

@techreport{icl:391,
  title = {Computing the Conditioning of the Components of a Linear Least Squares Solution},
  type = {Computer Science Technical Report},
  number = {UT-CS-07-604},
  institution = {University of Tennessee},
  year = {2007},
  month = jan,
  note = {Also LAPACK Working Note 193},
  author = {Marc Baboulin and Jack Dongarra and Serge Gratton and Julien Langou}
}

% Book chapter; the book title moves to booktitle and the publication status
% to note. The 2007-00 month named no month and is dropped, here and in the
% next entry.
@incollection{icl:366,
  title = {Disaster Survival Guide in Petascale Computing: An Algorithmic Approach},
  booktitle = {Petascale Computing: Algorithms and Applications},
  year = {2007},
  publisher = {Chapman \& Hall - CRC Press},
  note = {to appear},
  author = {Jack Dongarra and Zizhong Chen and George Bosilca and Julien Langou}
}

% Pre-publication record of the chapter listed as icl:449 earlier in this file.
@incollection{icl:392,
  title = {Exploiting Mixed Precision Floating Point Hardware in Scientific Computations},
  booktitle = {High Performance Computing and Grids in Action},
  year = {2007},
  publisher = {IOS Press},
  address = {Amsterdam},
  note = {to appear},
  author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Julie Langou and Piotr Luszczek and Stanimire Tomov},
  editor = {Lucio Grandinetti}
}

% Journal name spelled as in entry icl:491 of this file (Computing, not
% Computer); the author list continues on the next line of the file.
@article{icl:395,
  title = {Mixed Precision Iterative Refinement Techniques for the Solution of Dense Linear Systems},
  journal = {International Journal of High Performance Computing Applications},
  year = {2007},
  month = aug,
  note = {to appear},
  author = {Alfredo Buttari and Jack Dongarra and Julien Langou and Julie Langou and 
Piotr Luszczek and Jakub Kurzak}
}

% Technical-report version of the tiled QR paper; report number and LAPACK
% Working Note cross-reference were embedded in the journal field. The 2007-00
% month named no month and is dropped.
@techreport{icl:363,
  title = {Parallel Tiled {QR} Factorization for Multicore Architectures},
  type = {Computer Science Dept. Technical Report},
  number = {UT-CS-07-598},
  institution = {University of Tennessee},
  year = {2007},
  note = {Also LAPACK Working Note 190},
  keywords = {plasma},
  author = {Alfredo Buttari and Julien Langou and Jakub Kurzak and Jack Dongarra}
}

% NOTE(review): SIAM SISC is expanded to the full journal name; confirm.
@article{icl:397,
  title = {Recovery Patterns for Iterative Methods in a Parallel Unstable Environment},
  journal = {SIAM Journal on Scientific Computing},
  year = {2007},
  month = may,
  note = {to appear},
  author = {Julien Langou and Zizhong Chen and George Bosilca and Jack Dongarra}
}

% Published record; entry icl:292 later in this file is the pre-publication
% record of the same article. The 2006-00 month named no month and is dropped.
@article{icl:402,
  title = {Conjugate-Gradient Eigenvalue Solvers in Computing Electronic Properties of Nanostructure Architectures},
  journal = {International Journal of Computational Science and Engineering},
  volume = {2},
  number = {3/4},
  year = {2006},
  pages = {205--212},
  author = {Stanimire Tomov and Julien Langou and Jack Dongarra and Andrew Canning and Lin-Wang Wang}
}

% A technical report that was exported as an article. The export listed Julien
% Langou twice at the head of the author list; the first is taken to be Julie
% Langou, who is co-author of the other mixed-precision entries in this file
% (NOTE(review): confirm the order against the report).
@techreport{icl:317,
  title = {Exploiting the Performance of 32 bit Floating Point Arithmetic in Obtaining 64 bit Accuracy},
  type = {Computer Science Technical Report},
  number = {UT-CS-06-574},
  institution = {University of Tennessee},
  year = {2006},
  month = apr,
  note = {Also LAPACK Working Note 175},
  keywords = {iter-ref},
  author = {Julie Langou and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Jack Dongarra}
}

% Conference paper (venue name plus venue address).
@inproceedings{icl:369,
  title = {The Impact of Multicore on Math Software},
  booktitle = {PARA 2006},
  year = {2006},
  month = jun,
  address = {Umea, Sweden},
  keywords = {plasma},
  author = {Alfredo Buttari and Jack Dongarra and Jakub Kurzak and Julien Langou and Piotr Luszczek and Stanimire Tomov}
}

% Proceedings name moves to booktitle and the publication status to note; the
% author list continues on the next line of the file.
@inproceedings{icl:325,
  title = {Performance evaluation of eigensolvers in nano-structure computations},
  booktitle = {IEEE/ACM Proceedings of HPCNano SC06},
  year = {2006},
  month = jan,
  note = {to appear},
  keywords = {doe-nano},
  author = {Andrew 
Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang}
}

% The export stored the volume number inside the journal name and a mangled
% DOI in the volume field. They are separated here: volume 46, and the DOI
% rebuilt from the exported digits with the dot after the prefix 10 restored.
@article{icl:327,
  title = {Predicting the electronic properties of {3D}, million-atom semiconductor nanostructure architectures},
  journal = {Journal of Physics: Conference Series},
  volume = {46},
  year = {2006},
  month = jan,
  pages = {292--298},
  keywords = {DOE_NANO},
  doi = {10.1088/1742-6596/46/1/040},
  author = {Alex Zunger and Alberto Franceschetti and Gabriel Bester and Wesley B. Jones and Kwiseon Kim and Peter A. Graf and Lin-Wang Wang and Andrew Canning and Osni Marques and Christof Voemel and Jack Dongarra and Julien Langou and Stanimire Tomov}
}

% Conference paper (venue name plus venue address). The export listed Julien
% Langou twice; the second occurrence is taken to be Julie Langou
% (NOTE(review): confirm against the paper).
@inproceedings{icl:370,
  title = {Prospectus for the Next {LAPACK} and {ScaLAPACK} Libraries},
  booktitle = {PARA 2006},
  year = {2006},
  month = jun,
  address = {Umea, Sweden},
  author = {James Demmel and Jack Dongarra and B. Parlett and William Kahan and Ming Gu and David Bindel and Yozo Hida and Xiaoye Li and Osni Marques and Jason E. Riedy and Christof Voemel and Julien Langou and Piotr Luszczek and Jakub Kurzak and Alfredo Buttari and Julie Langou and Stanimire Tomov}
}

% The double-barrelled surname is written with a plain hyphen; the export used
% an en-dash macro, which is not a name hyphen to BibTeX.
@article{icl:332,
  title = {Self Adapting Numerical Software {SANS} Effort},
  journal = {IBM Journal of Research and Development},
  volume = {50},
  number = {2/3},
  year = {2006},
  month = jan,
  pages = {223--238},
  keywords = {gco},
  author = {George Bosilca and Zizhong Chen and Jack Dongarra and Victor Eijkhout and Graham Fagg and Erika Fuentes and Julien Langou and Piotr Luszczek and Jelena Pjesivac-Grbovic and Keith Seymour and Haihang You and Sathish Vadhiyar}
}

@inproceedings{icl:324,
  title = {Towards bulk based preconditioning for quantum dot computations},
  booktitle = {IEEE/ACM Proceedings of HPCNano SC06},
  year = {2006},
  month = jan,
  note = {to appear},
  keywords = {doe-nano},
  author = {Andrew Canning and Jack Dongarra and Julien Langou and Osni Marques and Stanimire Tomov and Christof Voemel and Lin-Wang Wang}
}

% ICCS 2005 paper; the title and remaining fields follow on the next line of
% the file.
@inproceedings{icl:284,
  title = 
{Comparison of Nonlinear Conjugate-Gradient methods for computing the Electronic Properties of Nanostructure Architectures}, journal = {Proceedings of 5th International Conference on Computational Science (ICCS)}, year = {2005}, month = {2005-01}, pages = {317-325}, publisher = {Springer{\textquoteright}s Lecture Notes in Computer Science}, address = {Atlanta, GA, USA}, keywords = {doe-nano}, author = {Stanimire Tomov and Julien Langou and Andrew Canning and Lin-Wang Wang and Jack Dongarra}, editor = {V. S. Sunderman and Geert Dick van Albada and Peter M. Sloot and Jack Dongarra} } @article {icl:292, title = {Conjugate-Gradient Eigenvalue Solvers in Computing Electronic Properties of Nanostructure Architectures}, journal = {International Journal of Computational Science and Engineering (to appear)}, year = {2005}, month = {2005-01}, author = {Stanimire Tomov and Julien Langou and Andrew Canning and Lin-Wang Wang and Jack Dongarra} } @inproceedings {icl:265, title = {Fault Tolerant High Performance Computing by a Coding Approach}, journal = {Proceedings of ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming (to appear)}, year = {2005}, month = {2005-01}, address = {Chicago, Illinois}, keywords = {ftmpi, grads, lacsi, sans}, author = {Zizhong Chen and Graham Fagg and Edgar Gabriel and Julien Langou and Thara Angskun and George Bosilca and Jack Dongarra} } @inproceedings {icl:280, title = {Hash Functions for Datatype Signatures in MPI}, journal = {Proceedings of 12th European Parallel Virtual Machine and Message Passing Interface Conference - Euro PVM/MPI}, volume = {3666}, year = {2005}, month = {2005-09}, pages = {76-83}, publisher = {Springer-Verlag Berlin}, address = {Sorrento (Naples), Italy}, keywords = {ftmpi}, author = {George Bosilca and Jack Dongarra and Graham Fagg and Julien Langou}, editor = {Beniamino Di Martino} } @article {, title = {NanoPSE: A Nanoscience Problem Solving Environment for Atomistic Electronic Structure of 
Semiconductor Nanostructures}, journal = {Journal of Physics: Conference Series}, year = {2005}, month = {2005-06}, pages = {277-282}, abstract = {Researchers at the National Renewable Energy Laboratory and their collaborators have developed over the past ~10 years a set of algorithms for an atomistic description of the electronic structure of nanostructures, based on plane-wave pseudopotentials and configuration interaction. The present contribution describes the first step in assembling these various codes into a single, portable, integrated set of software packages. This package is part of an ongoing research project in the development stage. Components of NanoPSE include codes for atomistic nanostructure generation and passivation, valence force field model for atomic relaxation, code for potential field generation, empirical pseudopotential method solver, strained linear combination of bulk bands method solver, configuration interaction solver for excited states, selection of linear algebra methods, and several inverse band structure solvers. Although not available for general distribution at this time as it is being developed and tested, the design goal of the NanoPSE software is to provide a software context for collaboration. The software package is enabled by fcdev, an integrated collection of best practice GNU software for open source development and distribution augmented to better support FORTRAN.}, doi = {https://doi.org/10.1088/1742-6596/16/1/038}, url = {https://iopscience.iop.org/article/10.1088/1742-6596/16/1/038/meta}, author = {Wesley B. Jones and Gabriel Bester and Andrew Canning and Alberto Franceschetti and Peter A. 
Graf and Kwiseon Kim and Julien Langou and Lin-Wang Wang and Jack Dongarra and Alex Zunger} } @article {icl:286, title = {On the Parallel Solution of Large Industrial Wave Propagation Problems}, journal = {Journal of Computational Acoustics (to appear)}, year = {2005}, month = {2005-01}, author = {Luc Giraud and Julien Langou and G. Sylvand} } @techreport {icl:301, title = {Recovery Patterns for Iterative Methods in a Parallel Unstable Environment}, journal = {University of Tennessee Computer Science Department Technical Report, UT-CS-04-538}, year = {2005}, month = {2005-00}, keywords = {ft-la}, author = {George Bosilca and Zizhong Chen and Jack Dongarra and Julien Langou} } @article {icl:285, title = {Rounding Error Analysis of the Classical Gram-Schmidt Orthogonalization Process}, journal = {Numerische Mathematik}, volume = {101}, number = {1}, year = {2005}, month = {2005-01}, pages = {87-100}, author = {Luc Giraud and Julien Langou and Miroslav Rozlo{\v z}n{\'\i}k and Jasper van den Eshof} } @techreport {icl:252, title = {Performance Optimization and Modeling of Blocked Sparse Kernels}, journal = {ICL Technical Report}, number = {ICL-UT-04-05}, year = {2004}, month = {2004-00}, keywords = {sans}, author = {Alfredo Buttari and Victor Eijkhout and Julien Langou and Salvatore Filippone} } @techreport {icl:251, title = {Recovery Patterns for Iterative Methods in a Parallel Unstable Environment}, journal = {ICL Technical Report}, number = {ICL-UT-04-04}, year = {2004}, month = {2004-01}, author = {George Bosilca and Zizhong Chen and Jack Dongarra and Julien Langou} }