@article {1193, title = {Computing the Expected Makespan of Task Graphs in the Presence of Silent Errors}, journal = {Parallel Computing}, volume = {75}, year = {2018}, month = {2018-07}, pages = {41{\textendash}60}, abstract = {Applications structured as Directed Acyclic Graphs (DAGs) of tasks occur in many domains, including popular scientific workflows. DAG scheduling has thus received an enormous amount of attention. Many of the popular DAG scheduling heuristics make scheduling deci- sions based on path lengths. At large scale compute platforms are subject to various types of failures with non-negligible probabilities of occurrence. Failures that have recently re- ceived increased attention are {\textquotedblleft}silent errors,{\textquotedblright} which cause data corruption. Tolerating silent errors is done by checking the validity of computed results and possibly re-executing a task from scratch. The execution time of a task then becomes a random variable, and so do path lengths in a DAG. Unfortunately, computing the expected makespan of a DAG (and equivalently computing expected path lengths in a DAG) is a computationally dif- ficult problem. Consequently, designing effective scheduling heuristics in this context is challenging. In this work, we propose an algorithm that computes a first order approxi- mation of the expected makespan of a DAG when tasks are subject to silent errors. We find that our proposed approximation outperforms previously proposed approaches both in terms of approximation error and of speed.}, keywords = {Expected makespan, fault-tolerance, scheduling, Scientific workflows, silent errors, Task graphs}, doi = {https://doi.org/10.1016/j.parco.2018.03.004}, author = {Henri Casanova and Julien Herrmann and Yves Robert} } @article {924, title = {Assessing the Cost of Redistribution followed by a Computational Kernel: Complexity and Performance Results}, journal = {Parallel Computing}, volume = {52}, year = {2016}, month = {2016-02}, pages = {22-41}, abstract = {The classical redistribution problem aims at optimally scheduling communications when reshuffling from an initial data distribution to a target data distribution. This target data distribution is usually chosen to optimize some objective for the algorithmic kernel under study (good computational balance or low communication volume or cost), and therefore to provide high efficiency for that kernel. However, the choice of a distribution minimizing the target objective is not unique. This leads to generalizing the redistribution problem as follows: find a re-mapping of data items onto processors such that the data redistribution cost is minimal, and the operation remains as efficient. This paper studies the complexity of this generalized problem. We compute optimal solutions and evaluate, through simulations, their gain over classical redistribution. We also show the NP-hardness of the problem to find the optimal data partition and processor permutation (defined by new subsets) that minimize the cost of redistribution followed by a simple computational kernel. Finally, experimental validation of the new redistribution algorithms are conducted on a multicore cluster, for both a 1D-stencil kernel and a more compute-intensive dense linear algebra routine.}, keywords = {Data partition, linear algebra, parsec, QR factorization, Redistribution, Stencil}, doi = {doi:10.1016/j.parco.2015.09.005}, author = {Julien Herrmann and George Bosilca and Thomas Herault and Loris Marchal and Yves Robert and Jack Dongarra} } @article {917, title = {Mixing LU-QR Factorization Algorithms to Design High-Performance Dense Linear Algebra Solvers}, journal = {Journal of Parallel and Distributed Computing}, volume = {85}, year = {2015}, month = {2015-11}, pages = {32-46}, abstract = {This paper introduces hybrid LU{\textendash}QR algorithms for solving dense linear systems of the form Ax=b. Throughout a matrix factorization, these algorithms dynamically alternate LU with local pivoting and QR elimination steps based upon some robustness criterion. LU elimination steps can be very efficiently parallelized, and are twice as cheap in terms of floating-point operations, as QR steps. However, LU steps are not necessarily stable, while QR steps are always stable. The hybrid algorithms execute a QR step when a robustness criterion detects some risk for instability, and they execute an LU step otherwise. The choice between LU and QR steps must have a small computational overhead and must provide a satisfactory level of stability with as few QR steps as possible. In this paper, we introduce several robustness criteria and we establish upper bounds on the growth factor of the norm of the updated matrix incurred by each of these criteria. In addition, we describe the implementation of the hybrid algorithms through an extension of the PaRSEC software to allow for dynamic choices during execution. Finally, we analyze both stability and performance results compared to state-of-the-art linear solvers on parallel distributed multicore platforms. A comprehensive set of experiments shows that hybrid LU{\textendash}QR algorithms provide a continuous range of trade-offs between stability and performances.}, keywords = {lu factorization, Numerical algorithms, QR factorization, Stability; Performance}, doi = {doi:10.1016/j.jpdc.2015.06.007}, author = {Mathieu Faverge and Julien Herrmann and Julien Langou and Bradley Lowery and Yves Robert and Jack Dongarra} } @conference {813, title = {Designing LU-QR Hybrid Solvers for Performance and Stability}, booktitle = {IPDPS 2014}, year = {2014}, month = {2014-05}, publisher = {IEEE}, organization = {IEEE}, address = {Phoenix, AZ}, abstract = {This paper introduces hybrid LU-QR algorithms for solving dense linear systems of the form Ax = b. Throughout a matrix factorization, these algorithms dynamically alternate LU with local pivoting and QR elimination steps, based upon some robustness criterion. LU elimination steps can be very efficiently parallelized, and are twice as cheap in terms of operations, as QR steps. However, LU steps are not necessarily stable, while QR steps are always stable. The hybrid algorithms execute a QR step when a robustness criterion detects some risk for instability, and they execute an LU step otherwise. Ideally, the choice between LU and QR steps must have a small computational overhead and must provide a satisfactory level of stability with as few QR steps as possible. In this paper, we introduce several robustness criteria and we establish upper bounds on the growth factor of the norm of the updated matrix incurred by each of these criteria. In addition, we describe the implementation of the hybrid algorithms through an extension of the Parsec software to allow for dynamic choices during execution. Finally, we analyze both stability and performance results compared to state-of-the-art linear solvers on parallel distributed multicore platforms.}, keywords = {plasma}, isbn = {978-1-4799-3800-1}, doi = {10.1109/IPDPS.2014.108}, author = {Mathieu Faverge and Julien Herrmann and Julien Langou and Bradley Lowery and Yves Robert and Jack Dongarra} } @article {icl:721, title = {Accelerating Linear System Solutions Using Randomization Techniques}, journal = {ACM Transactions on Mathematical Software (also LAWN 246)}, volume = {39}, year = {2013}, month = {2013-02}, abstract = {We illustrate how linear algebra calculations can be enhanced by statistical techniques in the case of a square linear system Ax = b. We study a random transformation of A that enables us to avoid pivoting and then to reduce the amount of communication. Numerical experiments show that this randomization can be performed at a very affordable computational price while providing us with a satisfying accuracy when compared to partial pivoting. This random transformation called Partial Random Butterfly Transformation (PRBT) is optimized in terms of data storage and flops count. We propose a solver where PRBT and the LU factorization with no pivoting take advantage of the current hybrid multicore/GPU machines and we compare its Gflop/s performance with a solver implemented in a current parallel library.}, keywords = {algorithms, dense linear algebra, experimentation, graphics processing units, linear systems, lu factorization, multiplicative preconditioning, numerical linear algebra, performance, plasma, randomization}, doi = {10.1145/2427023.2427025}, url = {http://dl.acm.org/citation.cfm?id=2427025}, author = {Marc Baboulin and Jack Dongarra and Julien Herrmann and Stanimire Tomov} } @techreport {703, title = {Designing LU-QR hybrid solvers for performance and stability}, journal = {University of Tennessee Computer Science Technical Report (also LAWN 282)}, number = {ut-eecs-13-719}, year = {2013}, month = {2013-10}, publisher = {University of Tennessee}, author = {Mathieu Faverge and Julien Herrmann and Julien Langou and Bradley Lowery and Yves Robert and Jack Dongarra} } @article {icl:637, title = {Accelerating Linear System Solutions Using Randomization Techniques}, journal = {INRIA RR-7616 / LAWN $\#$246 (presented at International AMMCS{\textquoteright}11)}, year = {2011}, month = {2011-07}, address = {Waterloo, Ontario, Canada}, keywords = {magma}, author = {Marc Baboulin and Jack Dongarra and Julien Herrmann and Stanimire Tomov} }