% Conference paper in the OpenSHMEM workshop proceedings (Springer, 2016).
% The proceedings title lives in `booktitle` (the field @inproceedings reads);
% names are given in unambiguous "Last, First" form.
% NOTE(review): `address` appears to hold the conference venue rather than the
% publisher's city -- confirm which one the target style expects.
@inproceedings{1308,
  author    = {Bouteiller, Aurelien and Bosilca, George and Gorentla Venkata, Manjunath},
  title     = {Surviving Errors with {OpenSHMEM}},
  booktitle = {{OpenSHMEM} and Related Technologies. Enhancing {OpenSHMEM} for Hybrid Environments},
  editor    = {Gorentla Venkata, Manjunath and Imam, Neena and Pophale, Swaroop and Mintz, Tiffany M.},
  year      = {2016},
  pages     = {66--81},
  publisher = {Springer International Publishing},
  address   = {Baltimore, MD, USA},
  isbn      = {978-3-319-50995-2},
  abstract  = {Unexpected error conditions stem from a variety of underlying causes, including resource exhaustion, network failures, hardware failures, or program errors. As the scale of HPC systems continues to grow, so does the probability of encountering a condition that causes a failure; meanwhile, error recovery and run-through failure management are becoming mature, and interoperable HPC programming paradigms are beginning to feature advanced error management. As a result from these developments, it becomes increasingly desirable to gracefully handle error conditions in OpenSHMEM. In this paper, we present the design and rationale behind an extension of the OpenSHMEM API that can (1) notify user code of unexpected erroneous conditions, (2) permit customized user response to errors without incurring overhead on an error-free execution path, (3) propagate the occurrence of an error condition to all Processing Elements, and (4) consistently close the erroneous epoch in order to resume the application.},
}