@article {1193,
title = {Computing the Expected Makespan of Task Graphs in the Presence of Silent Errors},
journal = {Parallel Computing},
volume = {75},
year = {2018},
month = {2018-07},
pages = {41{\textendash}60},
abstract = {Applications structured as Directed Acyclic Graphs (DAGs) of tasks occur in many domains, including popular scientific workflows. DAG scheduling has thus received an enormous amount of attention. Many of the popular DAG scheduling heuristics make scheduling deci- sions based on path lengths. At large scale compute platforms are subject to various types of failures with non-negligible probabilities of occurrence. Failures that have recently re- ceived increased attention are {\textquotedblleft}silent errors,{\textquotedblright} which cause data corruption. Tolerating silent errors is done by checking the validity of computed results and possibly re-executing a task from scratch. The execution time of a task then becomes a random variable, and so do path lengths in a DAG. Unfortunately, computing the expected makespan of a DAG (and equivalently computing expected path lengths in a DAG) is a computationally dif- ficult problem. Consequently, designing effective scheduling heuristics in this context is challenging. In this work, we propose an algorithm that computes a first order approxi- mation of the expected makespan of a DAG when tasks are subject to silent errors. We find that our proposed approximation outperforms previously proposed approaches both in terms of approximation error and of speed.},
keywords = {Expected makespan, fault-tolerance, scheduling, Scientific workflows, silent errors, Task graphs},
doi = {https://doi.org/10.1016/j.parco.2018.03.004},
author = {Henri Casanova and Julien Herrmann and Yves Robert}
}
@article {932,
title = {Scheduling Computational Workflows on Failure-prone Platforms},
journal = {International Journal of Networking and Computing},
volume = {6},
number = {1},
year = {2016},
pages = {2-26},
abstract = {We study the scheduling of computational workflows on compute resources that experience exponentially distributed failures. When a failure occurs, rollback and recovery is used to resume the execution from the last checkpointed state. The scheduling problem is to minimize the expected execution time by deciding in which order to execute the tasks in the workflow and deciding for each task whether to checkpoint it or not after it completes. We give a polynomialtime optimal algorithm for fork DAGs (Directed Acyclic Graphs) and show that the problem is NP-complete with join DAGs. We also investigate the complexity of the simple case in which no task is checkpointed. Our main result is a polynomial-time algorithm to compute the expected execution time of a workflow, with a given task execution order and specified to-be-checkpointed tasks. Using this algorithm as a basis, we propose several heuristics for solving the scheduling problem. We evaluate these heuristics for representative workflow configurations.},
keywords = {checkpointing, fault-tolerance, reliability, scheduling, workflow},
issn = { 2185-2847},
author = {Guillaume Aupy and Anne Benoit and Henri Casanova and Yves Robert}
}