% Publications co-authored by Dorian Arnold (checkpointing and NetSolve work).
%
% Conventions used in this file:
%   * Citation keys are kept exactly as exported, so existing citations
%     continue to resolve.
%   * One field per line; author names are stored as "Last, First".
%   * Months use the standard three-letter macros (jan ... dec), unbraced.
%   * Page and issue ranges use a double hyphen.
%   * Conference papers store the venue in booktitle; publication status
%     ("to appear", "submitted") lives in the note field.
%   * Words in titles that must keep their capitalisation are brace-protected.

@article{1305,
  author   = {Herault, Thomas and Robert, Yves and Bouteiller, Aurelien and Arnold, Dorian and Ferreira, Kurt and Bosilca, George and Dongarra, Jack},
  title    = {Checkpointing Strategies for Shared High-Performance Computing Platforms},
  journal  = {International Journal of Networking and Computing},
  volume   = {9},
  number   = {1},
  pages    = {28--52},
  year     = {2019},
  issn     = {2185-2847},
  url      = {http://www.ijnc.org/index.php/ijnc/article/view/195},
  abstract = {Input/output (I/O) from various sources often contend for scarcely available bandwidth. For example, checkpoint/restart (CR) protocols can help to ensure application progress in failure-prone environments. However, CR I/O alongside an application's normal, requisite I/O can increase I/O contention and might negatively impact performance. In this work, we consider different aspects (system-level scheduling policies and hardware) that optimize the overall performance of concurrently executing CR-based applications that share I/O resources. We provide a theoretical model and derive a set of necessary constraints to minimize the global waste on a given platform. Our results demonstrate that Young/Daly's optimal checkpoint interval, despite providing a sensible metric for a single, undisturbed application, is not sufficient to optimally address resource contention at scale. We show that by combining optimal checkpointing periods with contention-aware system-level I/O scheduling strategies, we can significantly improve overall application performance and maximize the platform throughput. Finally, we evaluate how specialized hardware, namely burst buffers, may help to mitigate the I/O contention problem.
    Overall, these results provide critical analysis and direct guidance on how to design efficient, CR ready, large-scale platforms without a large investment in the I/O subsystem.},
}

% The award is recorded in note rather than inside the proceedings title.
@inproceedings{1196,
  author       = {Herault, Thomas and Robert, Yves and Bouteiller, Aurelien and Arnold, Dorian and Ferreira, Kurt and Bosilca, George and Dongarra, Jack},
  title        = {Optimal Cooperative Checkpointing for Shared High-Performance Computing Platforms},
  booktitle    = {2018 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)},
  year         = {2018},
  month        = may,
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Vancouver, BC, Canada},
  doi          = {10.1109/IPDPSW.2018.00127},
  note         = {Best Paper Award},
  abstract     = {In high-performance computing environments, input/output (I/O) from various sources often contend for scarce available bandwidth. Adding to the I/O operations inherent to the failure-free execution of an application, I/O from checkpoint/restart (CR) operations (used to ensure progress in the presence of failures) place an additional burden as it increase I/O contention, leading to degraded performance. In this work, we consider a cooperative scheduling policy that optimizes the overall performance of concurrently executing CR-based applications which share valuable I/O resources. First, we provide a theoretical model and then derive a set of necessary constraints needed to minimize the global waste on the platform. Our results demonstrate that the optimal checkpoint interval, as defined by Young/Daly, despite providing a sensible metric for a single application, is not sufficient to optimally address resource contention at the platform scale. We therefore show that combining optimal checkpointing periods with I/O scheduling strategies can provide a significant improvement on the overall application performance, thereby maximizing platform throughput.
    Overall, these results provide critical analysis and direct guidance on checkpointing large-scale workloads in the presence of competing I/O while minimizing the impact on application performance.},
}

@article{icl:207,
  author   = {Arnold, Dorian and Casanova, Henri and Dongarra, Jack},
  title    = {Innovations of the {NetSolve} Grid Computing System},
  journal  = {Concurrency: Practice and Experience},
  volume   = {14},
  number   = {13--15},
  pages    = {1457--1479},
  year     = {2002},
  month    = jan,
  keywords = {netsolve},
}

@article{icl:101,
  author   = {Beck, Micah and Arnold, Dorian and Bassi, Alessandro and Berman, Francine and Casanova, Henri and Dongarra, Jack and Moore, Terry and Obertelli, Graziano and Plank, James and Swany, Martin and Vadhiyar, Sathish and Wolski, Rich},
  title    = {Middleware for the Use of Storage in Communication},
  journal  = {Parallel Computing},
  volume   = {28},
  number   = {12},
  pages    = {1773--1788},
  year     = {2002},
  month    = aug,
  keywords = {netsolve},
}

% The report series goes in type and the issuing body in institution.
% NOTE(review): institution is expanded from the "ICL-UT" report prefix; confirm.
@techreport{icl:96,
  author      = {Agrawal, Sudesh and Arnold, Dorian and Blackford, Susan and Dongarra, Jack and Miller, Michelle and Sagi, Kiran and Shi, Zhiao and Seymour, Keith and Vadhiyar, Sathish},
  title       = {Users' Guide to {NetSolve} v1.4.1},
  type        = {ICL Technical Report},
  number      = {ICL-UT-02-05},
  institution = {Innovative Computing Laboratory, University of Tennessee},
  year        = {2002},
  month       = jun,
  keywords    = {netsolve},
}

@article{icl:87,
  author   = {Arnold, Dorian and Vadhiyar, Sathish and Dongarra, Jack},
  title    = {On the Convergence of Computational and Data Grids},
  journal  = {Parallel Processing Letters},
  volume   = {11},
  number   = {2--3},
  pages    = {187--202},
  year     = {2001},
  month    = jan,
  keywords = {netsolve},
}

% A conference submission, not a journal article: stored as unpublished,
% with the submission target and venue in note.
@unpublished{icl:4,
  author   = {Beck, Micah and Arnold, Dorian and Bassi, Alessandro and Berman, Francine and Casanova, Henri and Dongarra, Jack and Moore, Terry and Obertelli, Graziano and Plank, James and Swany, Martin and Vadhiyar, Sathish and Wolski, Rich},
  title    = {Logistical Computing and Internetworking: Middleware for the Use of Storage in Communication},
  note     = {Submitted to SC2001, Denver, Colorado},
  year     = {2001},
  month    = nov,
  keywords = {netsolve},
}

@inproceedings{icl:19,
  author    = {Moore, Shirley and Arnold, Dorian and Cronk, David},
  title     = {Metacomputing Support for the {SARA3D} Structural Acoustics Application},
  booktitle = {Department of Defense Users' Group Conference},
  year      = {2001},
  month     = jun,
  address   = {Biloxi, Mississippi},
  note      = {To appear},
  keywords  = {netsolve},
}

@inproceedings{icl:27,
  author    = {Arnold, Dorian and Dongarra, Jack},
  title     = {Developing an Architecture to Support the Implementation and Development of Scientific Computing Applications},
  booktitle = {Proceedings of Working Conference 8: Software Architecture for Scientific Computing Applications},
  year      = {2000},
  month     = oct,
  address   = {Ottawa, Canada},
  note      = {To appear},
  keywords  = {netsolve},
}

@inproceedings{icl:28,
  author    = {Arnold, Dorian and Dongarra, Jack},
  title     = {The {NetSolve} Environment: Progressing Towards the Seamless Grid},
  booktitle = {2000 International Conference on Parallel Processing (ICPP-2000)},
  year      = {2000},
  month     = aug,
  address   = {Toronto, Canada},
  keywords  = {netsolve},
}

% The venue is a conference, so this is a proceedings paper rather than a
% journal article.
@inproceedings{icl:227,
  author    = {Arnold, Dorian and Lee, Wonsuck and Dongarra, Jack and Wheeler, Mary},
  title     = {Providing Infrastructure and Interface to High Performance Applications in a Distributed Setting},
  booktitle = {ASTC-HPC 2000},
  year      = {2000},
  month     = apr,
  address   = {Washington, DC},
}

% The export fused the series volume into pages ("V1900,1213-1222") and the
% publisher into address; they are split into their own fields here.
@inproceedings{icl:24,
  author    = {Arnold, Dorian and Bachmann, Dieter and Dongarra, Jack},
  title     = {Request Sequencing: Optimizing Communication for the Grid},
  booktitle = {Proceedings of 6th International Euro-Par Conference 2000, Parallel Processing},
  series    = {Lecture Notes in Computer Science},
  volume    = {1900},
  pages     = {1213--1222},
  publisher = {Springer Verlag},
  address   = {Germany},
  year      = {2000},
  month     = jan,
  keywords  = {netsolve},
}

@inproceedings{icl:25,
  author    = {Arnold, Dorian and Blackford, Susan and Dongarra, Jack and Eijkhout, Victor and Xu, Tinghua},
  title     = {Seamless Access to Adaptive Solver Algorithms},
  booktitle = {Proceedings of 16th IMACS World Congress 2000 on Scientific Computing, Applications Mathematics and Simulation},
  year      = {2000},
  month     = aug,
  address   = {Lausanne, Switzerland},
  keywords  = {netsolve},
}

@techreport{icl:228,
  author      = {Arnold, Dorian and Browne, Shirley and Dongarra, Jack and Fagg, Graham and Moore, Keith},
  title       = {Secure Remote Access to Numerical Software and Computation Hardware},
  type        = {Computer Science Technical Report},
  number      = {UT-CS-00-446},
  institution = {University of Tennessee},
  year        = {2000},
  month       = jul,
}

@inproceedings{icl:26,
  author    = {Arnold, Dorian and Browne, Shirley and Dongarra, Jack and Fagg, Graham and Moore, Keith},
  title     = {Secure Remote Access to Numerical Software and Computational Hardware},
  booktitle = {Proceedings of the DoD HPC Users Group Conference (HPCUG) 2000},
  year      = {2000},
  month     = jun,
  address   = {Albuquerque, NM},
  keywords  = {netsolve},
}