@article {1187, title = {Checkpointing Workflows for Fail-Stop Errors}, journal = {IEEE Transactions on Computers}, volume = {67}, year = {2018}, month = {2018-08}, pages = {1105{\textendash}1120}, abstract = {We consider the problem of orchestrating the execution of workflow applications structured as Directed Acyclic Graphs (DAGs) on parallel computing platforms that are subject to fail-stop failures. The objective is to minimize expected overall execution time, or makespan. A solution to this problem consists of a schedule of the workflow tasks on the available processors and of a decision of which application data to checkpoint to stable storage, so as to mitigate the impact of processor failures. To address this challenge, we consider a restricted class of graphs, Minimal Series-Parallel Graphs (M-SPGS), which is relevant to many real-world workflow applications. For this class of graphs, we propose a recursive list-scheduling algorithm that exploits the M-SPG structure to assign sub-graphs to individual processors, and uses dynamic programming to decide how to checkpoint these sub-graphs. We assess the performance of our algorithm for production workflow configurations, comparing it to an approach in which all application data is checkpointed and an approach in which no application data is checkpointed. Results demonstrate that our algorithm outperforms both the former approach, because of lower checkpointing overhead, and the latter approach, because of better resilience to failures.}, keywords = {checkpoint, fail-stop error, resilience, workflow}, url = {http://ieeexplore.ieee.org/document/8279499/}, author = {Li Han and Louis-Claude Canon and Henri Casanova and Yves Robert and Frederic Vivien} } @article {1193, title = {Computing the Expected Makespan of Task Graphs in the Presence of Silent Errors}, journal = {Parallel Computing}, volume = {75}, year = {2018}, month = {2018-07}, pages = {41{\textendash}60}, abstract = {Applications structured as Directed Acyclic Graphs (DAGs) of tasks occur in many domains, including popular scientific workflows. DAG scheduling has thus received an enormous amount of attention. Many of the popular DAG scheduling heuristics make scheduling deci- sions based on path lengths. At large scale compute platforms are subject to various types of failures with non-negligible probabilities of occurrence. Failures that have recently re- ceived increased attention are {\textquotedblleft}silent errors,{\textquotedblright} which cause data corruption. Tolerating silent errors is done by checking the validity of computed results and possibly re-executing a task from scratch. The execution time of a task then becomes a random variable, and so do path lengths in a DAG. Unfortunately, computing the expected makespan of a DAG (and equivalently computing expected path lengths in a DAG) is a computationally dif- ficult problem. Consequently, designing effective scheduling heuristics in this context is challenging. In this work, we propose an algorithm that computes a first order approxi- mation of the expected makespan of a DAG when tasks are subject to silent errors. We find that our proposed approximation outperforms previously proposed approaches both in terms of approximation error and of speed.}, keywords = {Expected makespan, fault-tolerance, scheduling, Scientific workflows, silent errors, Task graphs}, doi = {https://doi.org/10.1016/j.parco.2018.03.004}, author = {Henri Casanova and Julien Herrmann and Yves Robert} } @conference {1098, title = {Checkpointing Workflows for Fail-Stop Errors}, booktitle = {IEEE Cluster}, year = {2017}, month = {2017-09}, publisher = {IEEE}, organization = {IEEE}, address = {Honolulu, Hawaii}, abstract = {We consider the problem of orchestrating the execution of workflow applications structured as Directed Acyclic Graphs (DAGs) on parallel computing platforms that are subject to fail-stop failures. The objective is to minimize expected overall execution time, or makespan. A solution to this problem consists of a schedule of the workflow tasks on the available processors and of a decision of which application data to checkpoint to stable storage, so as to mitigate the impact of processor failures. For general DAGs this problem is hopelessly intractable. In fact, given a solution, computing its expected makespan is still a difficult problem. To address this challenge, we consider a restricted class of graphs, Minimal Series-Parallel Graphs (M-SPGS). It turns out that many real-world workflow applications are naturally structured as M-SPGS. For this class of graphs, we propose a recursive list-scheduling algorithm that exploits the M-SPG structure to assign sub-graphs to individual processors, and uses dynamic programming to decide which tasks in these sub-graphs should be checkpointed. Furthermore, it is possible to efficiently compute the expected makespan for the solution produced by this algorithm, using a first-order approximation of task weights and existing evaluation algorithms for 2-state probabilistic DAGs. We assess the performance of our algorithm for production workflow configurations, comparing it to (i) an approach in which all application data is checkpointed, which corresponds to the standard way in which most production workflows are executed today; and (ii) an approach in which no application data is checkpointed. Our results demonstrate that our algorithm strikes a good compromise between these two approaches, leading to lower checkpointing overhead than the former and to better resilience to failure than the latter.}, author = {Li Han and Louis-Claude Canon and Henri Casanova and Yves Robert and Frederic Vivien} } @article {932, title = {Scheduling Computational Workflows on Failure-prone Platforms}, journal = {International Journal of Networking and Computing}, volume = {6}, number = {1}, year = {2016}, pages = {2-26}, abstract = {We study the scheduling of computational workflows on compute resources that experience exponentially distributed failures. When a failure occurs, rollback and recovery is used to resume the execution from the last checkpointed state. The scheduling problem is to minimize the expected execution time by deciding in which order to execute the tasks in the workflow and deciding for each task whether to checkpoint it or not after it completes. We give a polynomialtime optimal algorithm for fork DAGs (Directed Acyclic Graphs) and show that the problem is NP-complete with join DAGs. We also investigate the complexity of the simple case in which no task is checkpointed. Our main result is a polynomial-time algorithm to compute the expected execution time of a workflow, with a given task execution order and specified to-be-checkpointed tasks. Using this algorithm as a basis, we propose several heuristics for solving the scheduling problem. We evaluate these heuristics for representative workflow configurations.}, keywords = {checkpointing, fault-tolerance, reliability, scheduling, workflow}, issn = { 2185-2847}, author = {Guillaume Aupy and Anne Benoit and Henri Casanova and Yves Robert} } @article {icl:278, title = {New Grid Scheduling and Rescheduling Methods in the GrADS Project}, journal = {International Journal of Parallel Programming}, volume = {33}, number = {2}, year = {2005}, month = {2005-06}, pages = {209-229}, publisher = {Springer}, keywords = {grads}, author = {Francine Berman and Henri Casanova and Andrew Chien and Keith Cooper and Holly Dail and Anshuman Dasgupta and Wei Deng and Jack Dongarra and Lennart Johnsson and Ken Kennedy and Charles Koelbel and Bo Liu and Xu Liu and Anirban Mandal and Gabriel Marin and Mark Mazina and John Mellor-Crummey and Celso Mendes and A. Olugbile and Jignesh M. Patel and Dan Reed and Zhiao Shi and Otto Sievert and H. Xia and Asim YarKhan} } @article {icl:202, title = {The Virtual Instrument: Support for Grid-enabled Scientific Simulations}, journal = {International Journal of High Performance Computing Applications}, volume = {18}, number = {1}, year = {2004}, month = {2004-01}, pages = {3-17}, author = {Henri Casanova and Thomas Bartol and Francine Berman and Adam Birnbaum and Jack Dongarra and Mark Ellisman and Marcio Faerman and Erhan Gockay and Michelle Miller and Graziano Obertelli and Stuart Pomerantz and Terry Sejnowski and Joel Stiles and Rich Wolski} } @inproceedings {icl:171, title = {Optimizing Performance and Reliability in Distributed Computing Systems Through Wide Spectrum Storage}, journal = {Proceedings of the IPDPS 2003, NGS Workshop}, year = {2003}, month = {2003-01}, pages = {209}, address = {Nice, France}, author = {James Plank and Micah Beck and Jack Dongarra and Rich Wolski and Henri Casanova} } @article {icl:138, title = {Scheduling in the Grid Application Development Software Project}, journal = {Resource Management in the Grid}, year = {2003}, month = {2003-03}, publisher = {Kluwer Publishers}, keywords = {grads}, author = {Holly Dail and Otto Sievert and Francine Berman and Henri Casanova and Asim YarKhan and Sathish Vadhiyar and Jack Dongarra and Chuang Liu and Lingyun Yang and Dave Angulo and Ian Foster} } @article {icl:119, title = {Adaptive Scheduling for Task Farming with Grid Middleware}, journal = {International Journal of Supercomputer Applications and High-Performance Computing}, volume = {13}, number = {3}, year = {2002}, month = {2002-10}, pages = {231-240}, author = {Henri Casanova and Myung Ho Kim and James Plank and Jack Dongarra} } @techreport {icl:97, title = {GridRPC: A Remote Procedure Call API for Grid Computing}, journal = {ICL Technical Report}, number = {ICL-UT-02-06}, year = {2002}, month = {2002-11}, author = {Keith Seymour and Hidemoto Nakada and Satoshi Matsuoka and Jack Dongarra and Craig Lee and Henri Casanova} } @article {icl:207, title = {Innovations of the NetSolve Grid Computing System}, journal = {Concurrency: Practice and Experience}, volume = {14}, number = {13-15}, year = {2002}, month = {2002-01}, pages = {1457-1479}, keywords = {netsolve}, author = {Dorian Arnold and Henri Casanova and Jack Dongarra} } @article {icl:101, title = {Middleware for the Use of Storage in Communication}, journal = {Parallel Computing}, volume = {28}, number = {12}, year = {2002}, month = {2002-08}, pages = {1773-1788}, keywords = {netsolve}, author = {Micah Beck and Dorian Arnold and Alessandro Bassi and Francine Berman and Henri Casanova and Jack Dongarra and Terry Moore and Graziano Obertelli and James Plank and Martin Swany and Sathish Vadhiyar and Rich Wolski} } @inproceedings {icl:187, title = {Overview of GridRPC: A Remote Procedure Call API for Grid Computing}, journal = {Proceedings of the Third International Workshop on Grid Computing}, year = {2002}, month = {2002-01}, pages = {274-278}, author = {Keith Seymour and Hidemoto Nakada and Satoshi Matsuoka and Jack Dongarra and Craig Lee and Henri Casanova}, editor = {Manish Parashar} } @article {icl:120, title = {Stochastic Performance Prediction for Iterative Algorithms in Distributed Environments}, journal = {Journal of Parallel and Distributed Computing}, volume = {98}, number = {1}, year = {2002}, month = {2002-10}, pages = {68-91}, author = {Henri Casanova and Michael G. Thomason and Jack Dongarra} } @article {icl:95, title = {The Virtual Instrument: Support for Grid-enabled Scientific Simulations}, journal = {Journal of Parallel and Distributed Computing (submitted)}, year = {2002}, month = {2002-10}, author = {Henri Casanova and Thomas Bartol and Francine Berman and Adam Birnbaum and Jack Dongarra and Mark Ellisman and Marcio Faerman and Erhan Gockay and Michelle Miller and Graziano Obertelli and Stuart Pomerantz and Terry Sejnowski and Joel Stiles and Rich Wolski} } @article {icl:4, title = {Logistical Computing and Internetworking: Middleware for the Use of Storage in Communication}, journal = {submitted to SC2001}, year = {2001}, month = {2001-11}, address = {Denver, Colorado}, keywords = {netsolve}, author = {Micah Beck and Dorian Arnold and Alessandro Bassi and Francine Berman and Henri Casanova and Jack Dongarra and Terry Moore and Graziano Obertelli and James Plank and Martin Swany and Sathish Vadhiyar and Rich Wolski} } @inproceedings {icl:7, title = {Network-Enabled Server Systems: Deploying Scientific Simulations on the Grid}, journal = {2001 High Performance Computing Symposium (HPC{\textquoteright}01), part of the Advance Simulation Technologies Conference}, year = {2001}, month = {2001-04}, address = {Seattle, Washington}, author = {Henri Casanova and Satoshi Matsuoka and Jack Dongarra} } @article {icl:33, title = {Deploying Fault-tolerance and Task Migration with NetSolve}, journal = {Future Generation Computer Systems}, volume = {15}, number = {5-6}, year = {1999}, month = {1999-10}, pages = {745-755}, publisher = {Elsevier}, keywords = {netsolve}, author = {Henri Casanova and James Plank and Micah Beck and Jack Dongarra} } @article {icl:53, title = {Logistical Quality of Service in NetSolve}, journal = {Computer Communications}, volume = {22}, number = {11}, year = {1999}, month = {1999-01}, pages = {1034-1044}, keywords = {netsolve}, author = {Micah Beck and Henri Casanova and Jack Dongarra and Terry Moore and James Plank and Francine Berman and Rich Wolski} } @article {icl:229, title = {A Numerical Linear Algebra Problem Solving Environment Designer{\textquoteright}s Perspective (LAPACK Working Note 139)}, journal = {SIAM Annual Meeting}, year = {1999}, month = {1999-05}, address = {Atlanta, GA}, author = {Antoine Petitet and Henri Casanova and Clint Whaley and Jack Dongarra and Yves Robert} } @article {icl:75, title = {Parallel and Distributed Scientific Computing: A Numerical Linear Algebra Problem Solving Environment Designer{\textquoteright}s Perspective}, journal = {Handbook on Parallel and Distributed Processing}, year = {1999}, month = {1999-01}, author = {Antoine Petitet and Henri Casanova and Jack Dongarra and Yves Robert and Clint Whaley} } @article {icl:63, title = {Stochastic Performance Prediction for Iterative Algorithms in Distributed Environments}, journal = {Journal of Parallel and Distributed Computing}, volume = {98}, number = {1}, year = {1999}, month = {1999-01}, pages = {68-91}, author = {Henri Casanova and Myung Ho Kim and James Plank and Jack Dongarra} }