Home | Technologies | Projects | Publications [BibTeX] | Opportunities

BibTeX Citations

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Journal Publications
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@article{he07unified,
  author        = "Xubin (Ben) He
                   and Li Ou
                   and Martha J. Kosa
                   and Stephen L. Scott
                   and Christian Engelmann",
  title         = "A Unified Multiple-Level Cache for High Performance Cluster
                   Storage Systems",
  journal       = "\href{http://www.inderscience.com/browse/index.php?journalcode=ijhpcn}
                   {International Journal of High Performance Computing and
                   Networking (IJHPCN)}",
  volume        = "5",
  number        = "1-2",
  pages         = "97--109",
  year          = "2007",
  publisher     = "\href{http://www.inderscience.com}{Inderscience
                   Publishers, Geneve, Switzerland}",
  issn          = "1740-0562",
  doi           = "10.1504/IJHPCN.2007.015768",
  url           = "http://www.csm.ornl.gov/~engelman/publications/he07unified.pdf",
  abstract      = "Highly available data storage for high-performance computing
                   is becoming increasingly more critical as high-end computing
                   systems scale up in size and storage systems are developed
                   around network-centered architectures. A promising solution
                   is to harness the collective storage potential of individual
                   workstations much as we harness idle CPU cycles due to the
                   excellent price/performance ratio and low storage usage of
                   most commodity workstations. For such a storage system,
                   metadata consistency is a key issue assuring storage system
                   availability as well as data reliability. In this paper, we
                   present a decentralized metadata management scheme that
                   improves storage availability without sacrificing
                   performance."
}

@article{engelmann06symmetric,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Symmetric Active/Active High Availability for
                   High-Performance Computing System Services",
  journal       = "\href{http://www.academypublisher.com/jcp}{Journal of
                   Computers (JCP)}",
  volume        = "1",
  number        = "8",
  pages         = "43--54",
  year          = "2006",
  publisher     = "\href{http://www.academypublisher.com}{Academy
                   Publisher, Oulu, Finland}",
  issn          = "1796-203X",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann06symmetric.pdf",
  url2          = "http://www.academypublisher.com/jcp/vol01/no08/jcp01084354.html",
  abstract      = "This work aims to pave the way for high availability in
                   high-performance computing (HPC) by focusing on efficient
                   redundancy strategies for head and service nodes. These nodes
                   represent single points of failure and control for an entire
                   HPC system as they render it inaccessible and unmanageable in
                   case of a failure until repair. The presented approach
                   introduces two distinct replication methods, internal and
                   external, for providing symmetric active/active high
                   availability for multiple redundant head and service nodes
                   running in virtual synchrony utilizing an existing process
                   group communication system for service group membership
                   management and reliable, totally ordered message delivery.
                   Presented results of a prototype implementation that offers
                   symmetric active/active replication for HPC job and resource
                   management using external replication show that the highest
                   level of availability can be provided with an acceptable
                   performance trade-off."
}

@article{engelmann06molar,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and David E. Bernholdt
                   and Narasimha R. Gottumukkala
                   and Chokchai (Box) Leangsuksun
                   and Jyothish Varma
                   and Chao Wang
                   and Frank Mueller
                   and Aniruddha G. Shet
                   and Ponnuswamy (Saday) Sadayappan",
  title         = "{MOLAR}: {A}daptive Runtime Support for High-End Computing
                   Operating and Runtime Systems",
  journal       = "\href{http://www.sigops.org/osr.html}{ACM SIGOPS Operating
                   Systems Review (OSR)}",
  volume        = "40",
  number        = "2",
  pages         = "63--72",
  year          = "2006",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY,
                   USA}",
  issn          = "0163-5980",
  doi           = "10.1145/1131322.1131337",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann06molar.pdf",
  abstract      = "MOLAR is a multi-institutional research effort that
                   concentrates on adaptive, reliable, and efficient operating
                   and runtime system (OS/R) solutions for ultra-scale,
                   high-end scientific computing on the next generation of
                   supercomputers. This research addresses the challenges
                   outlined in FAST-OS (forum to address scalable technology for
                   runtime and operating systems) and HECRTF (high-end computing
                   revitalization task force) activities by exploring the use of
                   advanced monitoring and adaptation to improve application
                   performance and predictability of system interruptions, and
                   by advancing computer reliability, availability and
                   serviceability (RAS) management systems to work cooperatively
                   with the OS/R to identify and preemptively resolve system
                   issues. This paper describes recent research of the MOLAR
                   team in advancing RAS for high-end computing OS/Rs."
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Conference Publications
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@inproceedings{engelmann09evaluating,
  author        = "Christian Engelmann
                   and Hong H. Ong
                   and Stephen L. Scott",
  title         = "Evaluating the Shared Root File System Approach for Diskless
                   High-Performance Computing Systems",
  booktitle     = "Proceedings of the
                   \href{http://www.linuxclustersinstitute.org/conferences}
                   {$10^{th}$ LCI International Conference on High-Performance
                   Clustered Computing (LCI) 2009}",
  month         = mar # "~9-12, ",
  year          = "2009",
  address       = "Boulder, CO, USA",
  note          = "To appear",
  abstract      = "Diskless high-performance computing (HPC) systems utilizing
                   networked storage have become popular in the last several
                   years. Removing disk drives significantly increases compute
                   node reliability as they are known to be a major source of
                   failures. Furthermore, networked storage solutions utilizing
                   parallel I/O and replication are able to provide increased
                   scalability and availability. Reducing a compute node to
                   processor(s), memory and network interface(s) greatly reduces
                   its physical size, which in turn allows for large-scale dense
                   HPC solutions. However, one major obstacle is the requirement
                   by certain operating systems (OSs), such as Linux, for a root
                   file system. While one solution is to remove this requirement
                   from the OS, another is to share the root file system over
                   the networked storage. This paper evaluates three networked
                   file system solutions, NFSv4, Lustre and PVFS2, with respect
                   to their performance, scalability, and availability features
                   for servicing a common root file system in a diskless HPC
                   configuration. Our findings indicate that Lustre is a viable
                   solution as it meets both, scaling and performance
                   requirements. However, certain availability issues regarding
                   single points of failure and control need to be considered."
}

@inproceedings{engelmann09proactive,
  author        = "Christian Engelmann
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Stephen L. Scott",
  title         = "Proactive Fault Tolerance Using Preemptive Migration",
  booktitle     = "Proceedings of the \href{http://www.pdp2009.org}{$17^{th}$
                   Euromicro International Conference on Parallel, Distributed,
                   and network-based Processing (PDP) 2009}",
  month         = feb # "~18-20, ",
  year          = "2009",
  address       = "Weimar, Germany",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  note          = "To appear",
  abstract      = "Proactive fault tolerance (FT) in high-performance computing
                   is a concept that prevents compute node failures from
                   impacting running parallel applications by preemptively
                   migrating application parts away from nodes that are about
                   to fail. This paper provides a foundation for proactive FT by
                   defining its architecture and classifying implementation
                   options. This paper further relates prior work to the
                   presented architecture and classification, and discusses the
                   challenges ahead for needed supporting technologies."
}

@inproceedings{valentini09high,
  author        = "Alessandro Valentini
                   and Christian Di Biagio
                   and Fabrizio Batino
                   and Guido Pennella
                   and Fabrizio Palma
                   and Christian Engelmann",
  title         = "High Performance Computing with {Harness} over {InfiniBand}",
  booktitle     = "Proceedings of the \href{http://www.pdp2009.org}{$17^{th}$
                   Euromicro International Conference on Parallel, Distributed,
                   and network-based Processing (PDP) 2009}",
  month         = feb # "~18-20, ",
  year          = "2009",
  address       = "Weimar, Germany",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  note          = "To appear",
  abstract      = "Harness is an adaptable and plug-in-based middleware
                   framework able to support distributed parallel computing. By
                   now, it is based on the Ethernet protocol which cannot
                   guarantee high performance throughput and Real Time
                   (determinism) performance. During last years, both the
                   research and industry environments have developed both new
                   network architectures (InfiniBand, Myrinet, iWARP, etc.) to
                   avoid those limits. This paper concerns the integration
                   between Harness and InfiniBand focusing on two solutions: IP
                   over InfiniBand (IPoIB) and Socket Direct Protocol (SDP)
                   technology. Those allow Harness middleware to take advantage
                   of the enhanced features provided by InfiniBand."
}

@inproceedings{engelmann09case,
  author        = "Christian Engelmann
                   and Hong H. Ong
                   and Stephen L. Scott",
  title         = "The Case for Modular Redundancy in Large-Scale High
                   Performance Computing Systems",
  booktitle     = "Proceedings of the
                   \href{http://www.iasted.org/conferences/home-641.html}
                   {$27^{th}$ IASTED International Conference on Parallel and
                   Distributed Computing and Networks (PDCN) 2009}",
  month         = feb # "~16-18, ",
  year          = "2009",
  address       = "Innsbruck, Austria",
  publisher     = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB,
                   Canada}",
  note          = "To appear",
  abstract      = "Recent investigations into resilience of large-scale
                   high-performance computing (HPC) systems showed a continuous
                   trend of decreasing reliability and availability. Newly
                   installed systems have a lower mean-time to failure (MTTF)
                   and a higher mean-time to recover (MTTR) than their
                   predecessors. Modular redundancy is being used in many
                   mission critical systems today to provide for resilience,
                   such as for aerospace and command \& control systems. The
                   primary argument against modular redundancy for resilience
                   in HPC has always been that the capability of a HPC system,
                   and respective return on investment, would be significantly
                   reduced. We argue that modular redundancy can significantly
                   increase compute node availability as it removes the impact
                   of scale from single compute node MTTR. We further argue that
                   single compute nodes can be much less reliable, and therefore
                   less expensive, and still be highly available, if their
                   MTTR/MTTF ratio is maintained."
}

@inproceedings{wang08proactive,
  author        = "Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Proactive Process-Level Live Migration in {HPC}
                   Environments",
  booktitle     = "Proceedings of the \href{http://sc08.supercomputing.org}
                   {IEEE/ACM International Conference on High Performance
                   Computing, Networking, Storage and Analysis (SC) 2008}",
  month         = nov # "~15-21, ",
  year          = "2008",
  address       = "Austin, TX, USA",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-4244-2835-9",
  doi           = "10.1145/1413370.1413414",
  url           = "http://www.csm.ornl.gov/~engelman/publications/wang08proactive.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/wang08proactive.ppt.pdf",
  abstract      = "As the number of nodes in high-performance computing
                   environments keeps increasing, faults are becoming common
                   place. Reactive fault tolerance (FT) often does not scale due
                   to massive I/O requirements and relies on manual job
                   resubmission. This work complements reactive with proactive
                   FT at the process level. Through health monitoring, a subset
                   of node failures can be anticipated when one's health
                   deteriorates. A novel process-level live migration mechanism
                   supports continued execution of applications during much of
                   processes migration. This scheme is integrated into an MPI
                   execution environment to transparently sustain
                   health-inflicted node failures, which eradicates the need to
                   restart and requeue MPI jobs. Experiments indicate that 1-6.5
                   seconds of prior warning are required to successfully trigger
                   live process migration while similar operating system
                   virtualization mechanisms require 13-24 seconds. This
                   self-healing approach complements reactive FT by nearly
                   cutting the number of checkpoints in half when 70\% of the
                   faults are handled proactively."
}

@inproceedings{vallee08virtual,
  author        = "Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Hong H. Ong
                   and Anand Tikotekar
                   and Christian Engelmann
                   and Wesley Bland
                   and Ferrol Aderholt
                   and Stephen L. Scott",
  title         = "Virtual System Environments",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.dmtf.org/svm08}{$2^{nd}$ DMTF Academic
                   Alliance Workshop on Systems and Virtualization Management:
                   Standards and New Technologies (SVM) 2008}",
  month         = oct # "~21-22, ",
  year          = "2008",
  address       = "Munich, Germany",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/vallee08virtual.pdf",
  note          = "To appear",
  abstract      = "Distributed and parallel systems are typically managed with
                   static settings: the operating system (OS) and the runtime
                   environment (RTE) are specified at a given time and cannot be
                   changed to fit an application's needs. This means that every
                   time application developers want to use their application on
                   a new execution platform, the application has to be ported to
                   this new environment, which may be expensive in terms of
                   application modifications and developer time. However, the
                   science resides in the applications and not in the OS or the
                   RTE. Therefore, it should be beneficial to adapt the OS and
                   the RTE to the application instead of adapting the
                   applications to the OS and the RTE. This document presents
                   the concept of Virtual System Environments (VSE), which
                   enables application developers to specify and create a
                   virtual environment that properly fits their application's
                   needs. For that four challenges have to be addressed: (i)
                   definition of the VSE itself by the application developers,
                   (ii) deployment of the VSE, (iii) system administration for
                   the platform, and (iv) protection of the platform from the
                   running VSE. We therefore present an integrated tool for the
                   definition and deployment of VSEs on top of traditional and
                   virtual (i.e., using system-level virtualization) execution
                   platforms. This tool provides the capability to choose the
                   degree of delegation for system administration tasks and the
                   degree of protection from the application (e.g., using
                   virtual machines). To summarize, the VSE concept enables the
                   customization of the OS/RTE used for the execution of
                   application by users without compromising local system
                   administration rules and execution platform protection
                   constraints."
}

@inproceedings{tikotekar08analysis,
  author        = "Anand Tikotekar
                   and Geoffroy Vall\'ee
                   and Thomas Naughton
                   and Hong H. Ong
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "An Analysis of {HPC} Benchmark Applications in Virtual
                   Machine Environments",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://europar2008.caos.uab.es}{$14^{th}$ European
                   Conference on Parallel and Distributed Computing (Euro-Par)
                   2008}: \href{http://scilytics.com/vhpc}{$3^{rd}$ Workshop on
                   Virtualization in High-Performance Cluster and Grid Computing
                   (VHPC) 2008}",
  month         = aug # "~26-29, ",
  year          = "2008",
  address       = "Las Palmas de Gran Canaria, Spain",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/tikotekar08analysis.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/tikotekar08analysis.ppt.pdf",
  note          = "To appear",
  abstract      = "Virtualization technology has been gaining acceptance in the
                   scientific community due to its overall flexibility in
                   running HPC applications. It has been reported that a
                   specific class of applications is better suited to a
                   particular type of virtualization scheme or implementation.
                   For example, Xen has been shown to perform with little
                   overhead for compute-bound applications. Such a study,
                   although useful, does not allow us to generalize conclusions
                   beyond the performance analysis of that application which is
                   explicitly executed. An explanation of why the generalization
                   described above is difficult, may be due to the versatility
                   in applications, which leads to different overheads in
                   virtual environments. For example, two similar applications
                   may spend disproportionate amount of time in their respective
                   library code when run in virtual environments. In this paper,
                   we aim to study such potential causes by investigating the
                   behavior and identifying patterns of various overheads for
                   HPC benchmark applications. Based on the investigation of the
                   overhead profiles for different benchmarks, we aim to address
                   questions such as: Are the overhead profiles for a particular
                   type of benchmarks (such as compute-bound) similar or are
                   there grounds to conclude otherwise?"
}

@inproceedings{engelmann08symmetric2,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Symmetric Active/Active High Availability for
                   High-Performance Computing System Services: Accomplishments
                   and Limitations",
  booktitle     = "Proceedings of the
                   \href{http://www.ens-lyon.fr/LIP/RESO/ccgrid2008}{$8^{th}$
                   IEEE International Symposium on Cluster Computing and the
                   Grid (CCGrid) 2008}:
                   \href{http://xcr.cenit.latech.edu/resilience2008}{Workshop on
                   Resiliency in High Performance Computing (Resilience) 2008}",
  pages         = "813--818",
  month         = may # "~19-22, ",
  year          = "2008",
  address       = "Lyon, France",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3156-4",
  doi           = "10.1109/CCGRID.2008.78",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric2.pdf",
  abstract      = "This paper summarizes our efforts over the last 3-4 years in
                   providing symmetric active/active high availability for
                   high-performance computing (HPC) system services. This work
                   paves the way for high-level reliability, availability and
                   serviceability in extreme-scale HPC systems by focusing on
                   the most critical components, head and service nodes, and by
                   reinforcing them with appropriate high availability
                   solutions. This paper presents our accomplishments in the
                   form of concepts and respective prototypes, discusses
                   existing limitations, outlines possible future work, and
                   describes the relevance of this research to other, planned
                   efforts."
}

@inproceedings{chen08online,
  author        = "Xin Chen
                   and Benjamin Eckart
                   and Xubin (Ben) He
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "An Online Controller Towards Self-Adaptive File System
                   Availability and Performance",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2008}{$5^{th}$ High
                   Availability and Performance Workshop (HAPCW) 2008}, in
                   conjunction with the \href{http://www.hpcsw.org}{$1^{st}$
                   High-Performance Computer Science Week (HPCSW) 2008}",
  month         = apr # "~3-4, ",
  year          = "2008",
  address       = "Denver, CO, USA",
  url           = "http://www.csm.ornl.gov/~engelman/publications/chen08online.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/chen08online.ppt.pdf",
  abstract      = "At the present time, it can be a significant challenge to
                   build a large-scale distributed file system that
                   simultaneously maintains both high availability and high
                   performance. Although many fault tolerance technologies have
                   been proposed and used in both commercial and academic
                   distributed file systems to achieve high availability, most
                   of them typically sacrifice performance for higher system
                   availability. Additionally, recent studies show that system
                   availability and performance are related to the system
                   workload. In this paper, we analyze the correlations among
                   availability, performance, and workloads based on a
                   replication strategy, and we discuss the trade off between
                   availability and performance with different workloads. Our
                   analysis leads to the design of an online controller that can
                   dynamically achieve optimal performance and availability by
                   tuning the system replication policy."
}

@inproceedings{tikotekar08effects,
  author        = "Anand Tikotekar
                   and Geoffroy Vall\'ee
                   and Thomas Naughton
                   and Hong H. Ong
                   and Christian Engelmann
                   and Stephen L. Scott
                   and Anthony M. Filippi",
  title         = "Effects of Virtualization on a Scientific Application --
                   {R}unning a Hyperspectral Radiative Transfer Code on Virtual
                   Machines",
  booktitle     = "Proceedings of the
                   \href{http://www.csm.ornl.gov/srt/hpcvirt08}{$2^{nd}$
                   Workshop on System-level Virtualization for High Performance
                   Computing (HPCVirt) 2008}, in conjunction with the
                   \href{http://www.eurosys.org/2008}{$3^{rd}$ ACM SIGOPS
                   European Conference on Computer Systems (EuroSys) 2008}",
  month         = mar # "~31, ",
  year          = "2008",
  address       = "Glasgow, UK",
  url           = "http://www.csm.ornl.gov/~engelman/publications/tikotekar08effects.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/tikotekar08effects.ppt.pdf",
  abstract      = "The topic of system-level virtualization has recently begun
                   to receive interest for high performance computing (HPC).
                   This is in part due to the isolation and encapsulation
                   offered by the virtual machine. These traits enable
                   applications to customize their environments and maintain
                   consistent software configurations in their virtual domains.
                   Additionally, there are mechanisms that can be used for fault
                   tolerance like live virtual machine migration. Given these
                   attractive benefits to virtualization, a fundamental question
                   arises, how does this effect my scientific application? We
                   use this as the premise for our paper and observe a
                   real-world scientific code running on a Xen virtual machine.
                   We studied the effects of running a radiative transfer
                   simulation, Hydrolight, on a virtual machine. We discuss our
                   methodology and report observations regarding the usage of
                   virtualization with this application."
}

@inproceedings{engelmann08symmetric,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Symmetric Active/Active Replication for Dependent Services",
  booktitle     = "Proceedings of the
                   \href{http://www.ares-conference.eu/ares2008}{$3^{rd}$
                   International Conference on Availability, Reliability and
                   Security (ARES) 2008}",
  pages         = "260--267",
  month         = mar # "~4-7, ",
  year          = "2008",
  address       = "Barcelona, Spain",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3102-1",
  doi           = "10.1109/ARES.2008.64",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric.ppt.pdf",
  abstract      = "During the last several years, we have established the
                   symmetric active/active replication model for service-level
                   high availability and implemented several proof-of-concept
                   prototypes. One major deficiency of our model is its
                   inability to deal with dependent services, since its original
                   architecture is based on the client-service model. This paper
                   extends our model to dependent services using its already
                   existing mechanisms and features. The presented concept is
                   based on the idea that a service may also be a client of
                   another service, and multiple services may be clients of each
                   other. A high-level abstraction is used to illustrate
                   dependencies between clients and services, and to decompose
                   dependencies between services into respective client-service
                   dependencies. This abstraction may be used for providing
                   high availability in distributed computing systems with
                   complex service-oriented architectures."
}

@conference{vallee08framework,
  author        = "Geoffroy R. Vall\'ee
                   and Kulathep Charoenpornwattana
                   and Christian Engelmann
                   and Anand Tikotekar
                   and Chokchai (Box) Leangsuksun
                   and Thomas Naughton
                   and Stephen L. Scott",
  title         = "A Framework For Proactive Fault Tolerance",
  booktitle     = "Proceedings of the
                   \href{http://www.ares-conference.eu/ares2008}{$3^{rd}$
                   International Conference on Availability, Reliability and
                   Security (ARES) 2008}",
  pages         = "659--664",
  month         = mar # "~4-7, ",
  year          = "2008",
  address       = "Barcelona, Spain",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3102-1",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ARES.2008.171",
  url           = "http://www.csm.ornl.gov/~engelman/publications/vallee08framework.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/vallee08framework.ppt.pdf",
  abstract      = "Fault tolerance is a major concern to guarantee availability
                   of critical services as well as application execution.
                   Traditional approaches for fault tolerance include
                   checkpoint/restart or duplication. However it is also
                   possible to anticipate failures and proactively take action
                   before failures occur in order to minimize failure impact on
                   the system and application execution. This document presents
                   a proactive fault tolerance framework. This framework can use
                   different proactive fault tolerance mechanisms, i.e.
                   migration and pause/unpause. The framework also allows the
                   implementation of new proactive fault tolerance policies
                   thanks to a modular architecture. A first proactive fault
                   tolerance policy has been implemented and preliminary
                   experimentations have been done based on system-level
                   virtualization and compared with results obtained by
                   simulation."
}

@conference{koenning08virtualized,
  author        = "Bj{\"o}rn K{\"o}nning
                   and Christian Engelmann
                   and Stephen L. Scott
                   and George A. (Al) Geist",
  title         = "Virtualized Environments for the {Harness} High Performance
                   Computing Workbench",
  booktitle     = "Proceedings of the \href{http://www.pdp2008.org}{$16^{th}$
                   Euromicro International Conference on Parallel, Distributed,
                   and Network-Based Processing (PDP) 2008}",
  pages         = "133--140",
  month         = feb # "~13-15, ",
  year          = "2008",
  address       = "Toulouse, France",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3089-5",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/PDP.2008.14",
  url           = "http://www.csm.ornl.gov/~engelman/publications/koenning08virtualized.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/koenning08virtualized.ppt.pdf",
  abstract      = "This paper describes recent accomplishments in providing a
                   virtualized environment concept and prototype for scientific
                   application development and deployment as part of the Harness
                   High Performance Computing (HPC) Workbench research effort.
                   The presented work focuses on tools and mechanisms that
                   simplify scientific application development and deployment
                   tasks, such that only minimal adaptation is needed when
                   moving from one HPC system to another or after HPC system
                   upgrades. The overall technical approach focuses on the
                   concept of adapting the HPC system environment to the actual
                   needs of individual scientific applications instead of the
                   traditional scheme of adapting scientific applications to
                   individual HPC system environment properties. The presented
                   prototype implementation is based on the mature and
                   lightweight chroot virtualization approach for Unix-type
                   systems with a focus on virtualized file system structure
                   and virtualized shell environment variables utilizing
                   virtualized environment configuration descriptions in
                   Extensible Markup Language (XML) format. The presented work
                   can be easily extended to other virtualization technologies,
                   such as system-level virtualization solutions using
                   hypervisors."
}

@conference{vallee08system,
  author        = "Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Christian Engelmann
                   and Hong H. Ong
                   and Stephen L. Scott",
  title         = "System-level Virtualization for High Performance Computing",
  booktitle     = "Proceedings of the \href{http://www.pdp2008.org}{$16^{th}$
                   Euromicro International Conference on Parallel, Distributed,
                   and Network-Based Processing (PDP) 2008}",
  pages         = "636--643",
  month         = feb # "~13-15, ",
  year          = "2008",
  address       = "Toulouse, France",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-0-7695-3089-5",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/PDP.2008.85",
  url           = "http://www.csm.ornl.gov/~engelman/publications/vallee08system.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/vallee08system.ppt.pdf",
  abstract      = "System-level virtualization has been a research topic since
                   the 70's but regained popularity during the past few years
                   because of the availability of efficient solutions such as Xen
                   and the implementation of hardware support in commodity
                   processors (e.g. Intel-VT, AMD-V). However, a majority of
                   system-level virtualization projects is guided by the server
                   consolidation market. As a result, current virtualization
                   solutions appear to not be suitable for high performance
                   computing (HPC) which is typically based on large-scale
                   systems. On another hand there is significant interest in
                   exploiting virtual machines (VMs) within HPC for a number of
                   other reasons. By virtualizing the machine, one is able to
                   run a variety of operating systems and environments as needed
                   by the applications. Virtualization allows users to isolate
                   workloads, improving security and reliability. It is also
                   possible to support non-native environments and/or legacy
                   operating environments through virtualization. In addition,
                   it is possible to balance work loads, use migration
                   techniques to relocate applications from failing machines,
                   and isolate fault systems for repair. This document presents
                   the challenges for the implementation of a system-level
                   virtualization solution for HPC. It also presents a brief
                   survey of the different approaches and techniques to address
                   these challenges."
}

@conference{ou07symmetric,
  author        = "Li Ou
                   and Christian Engelmann
                   and Xubin (Ben) He
                   and Xin Chen
                   and Stephen L. Scott",
  title         = "Symmetric Active/Active Metadata Service for Highly Available
                   Cluster Storage Systems",
  booktitle     = "Proceedings of the
                   \href{http://www.iasted.org/conferences/home-590.html}
                   {$19^{th}$ IASTED International Conference on Parallel and
                   Distributed Computing and Systems (PDCS) 2007}",
  month         = nov # "~19-21, ",
  year          = "2007",
  address       = "Cambridge, MA, USA",
  publisher     = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB,
                   Canada}",
  isbn          = "978-0-88986-703-1",
  url           = "http://www.csm.ornl.gov/~engelman/publications/ou07symmetric.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/ou07symmetric.ppt.pdf",
  abstract      = "In a typical distributed storage system, metadata is stored
                   and managed by dedicated metadata servers. One way to improve
                   the availability of distributed storage systems is to deploy
                   multiple metadata servers. Past research focused on the
                   active/standby model, where each active server has at least
                   one redundant idle backup. However, interruption of service
                   and loss of service state may occur during a fail-over
                   depending on the used replication technique. The research in
                   this paper targets the symmetric active/active replication
                   model using multiple redundant service nodes running in
                   virtual synchrony. In this model, service node failures do
                   not cause a fail-over to a backup and there is no disruption
                   of service or loss of service state. We propose a fast
                   delivery protocol to reduce the latency of total order
                   broadcast. Our prototype implementation shows that high
                   availability of metadata servers can be achieved with an
                   acceptable performance trade-off using the active/active
                   metadata server solution."
}

@conference{disaverio07distributed,
  author        = "Emanuele Di Saverio
                   and Marco Cesati
                   and Christian Di Biagio
                   and Guido Pennella
                   and Christian Engelmann",
  title         = "Distributed Real-Time Computing with {Harness}",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://pvmmpi07.lri.fr}{$14^{th}$ European PVM/MPI
                   Users' Group Meeting (EuroPVM/MPI) 2007}",
  pages         = "281--288",
  volume        = "4757",
  month         = sep # "~30 - " # oct # "~3, ",
  year          = "2007",
  address       = "Paris, France",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-540-75415-2, ISSN 0302-9743",
  doi           = "http://dx.doi.org/10.1007/978-3-540-75416-9_39",
  url           = "http://www.csm.ornl.gov/~engelman/publications/disaverio07distributed.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/disaverio07distributed.ppt.pdf",
  abstract      = "Modern parallel and distributed computing solutions are often
                   built onto a middleware software layer providing a higher
                   and common level of service between computational nodes.
                   Harness is an adaptable, plugin-based middleware framework
                   for parallel and distributed computing. This paper reports
                   recent research and development results of using Harness for
                   real-time distributed computing applications in the context
                   of an industrial environment with the needs to perform
                   several safety critical tasks. The presented work exploits
                   the modular architecture of Harness in conjunction with a
                   lightweight threaded implementation to resolve several
                   real-time issues by adding three new Harness plug-ins to
                   provide a prioritized lightweight execution environment, low
                   latency communication facilities, and local timestamped event
                   logging."
}

@conference{ou07fast,
  author        = "Li Ou
                   and Xubin (Ben) He
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "A Fast Delivery Protocol for Total Order Broadcasting",
  booktitle     = "Proceedings of the \href{http://www.icccn.org/icccn07}
                   {$16^{th}$ IEEE International Conference on Computer
                   Communications and Networks (ICCCN) 2007}",
  pages         = "730--734",
  month         = aug # "~13-16, ",
  year          = "2007",
  address       = "Honolulu, HI, USA",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "978-1-42441-251-8, ISSN 1095-2055",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ICCCN.2007.4317904",
  url           = "http://www.csm.ornl.gov/~engelman/publications/ou07fast.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/ou07fast.ppt.pdf",
  abstract      = "Sequencer, privilege-based, and communication history
                   algorithms are popular approaches to implement total
                   ordering, where communication history algorithms are most
                   suitable for parallel computing systems, because they provide
                   best performance under heavy work load. Unfortunately,
                   post-transmission delay of communication history algorithms
                   is most apparent when a system is idle. In this paper, we
                   propose a fast delivery protocol to reduce the latency of
                   message ordering. The protocol optimizes the total ordering
                   process by waiting for messages only from a subset of the
                   machines in the group, and by fast acknowledging messages on
                   behalf of other machines. Our test results indicate that the
                   fast delivery protocol is suitable for both idle and heavy
                   load systems, while reducing the latency of message
                   ordering."
}

@conference{nagarajan07proactive,
  author        = "Arun B. Nagarajan
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Proactive Fault Tolerance for {HPC} with {Xen}
                   Virtualization",
  booktitle     = "Proceedings of the \href{http://ics07.ac.upc.edu}{$21^{st}$
                   ACM International Conference on Supercomputing (ICS) 2007}",
  pages         = "23--32",
  month         = jun # "~16-20, ",
  year          = "2007",
  address       = "Seattle, WA, USA",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-59593-768-1",
  doi           = "http://doi.acm.org/10.1145/1274971.1274978",
  url           = "http://www.csm.ornl.gov/~engelman/publications/nagarajan07proactive.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/nagarajan07proactive.ppt.pdf",
  abstract      = "Large-scale parallel computing is relying increasingly on
                   clusters with thousands of processors. At such large counts
                   of compute nodes, faults are becoming common place. Current
                   techniques to tolerate faults focus on reactive schemes to
                   recover from faults and generally rely on a
                   checkpoint/restart mechanism. Yet, in today's systems, node
                   failures can often be anticipated by detecting a
                   deteriorating health status. Instead of a reactive scheme for
                   fault tolerance (FT), we are promoting a proactive one where
                   processes automatically migrate from unhealthy nodes to
                   healthy ones. Our approach relies on operating system
                   virtualization techniques exemplified by but not limited to
                   Xen. This paper contributes an automatic and transparent
                   mechanism for proactive FT for arbitrary MPI applications.
                   It leverages virtualization techniques combined with health
                   monitoring and load-based migration. We exploit Xen's live
                   migration mechanism for a guest operating system (OS) to
                   migrate an MPI task from a health-deteriorating node to a
                   healthy one without stopping the MPI task during most of the
                   migration. Our proactive FT daemon orchestrates the tasks of
                   health monitoring, load determination and initiation of guest
                   OS migration. Experimental results demonstrate that live
                   migration hides migration costs and limits the overhead to
                   only a few seconds making it an attractive approach to
                   realize FT in HPC systems. Overall, our enhancements make
                   proactive FT a valuable asset for long-running MPI
                   application that is complementary to reactive FT using full
                   checkpoint/restart schemes since checkpoint frequencies can
                   be reduced as fewer unanticipated failures are encountered.
                   In the context of OS virtualization, we believe that this is
                   the first comprehensive study of proactive fault tolerance
                   where live migration is actually triggered by health
                   monitoring."
}

@conference{engelmann07middleware,
  author        = "Christian Engelmann
                   and Hong H. Ong
                   and Stephen L. Scott",
  title         = "Middleware in Modern High Performance Computing System
                   Architectures",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.iccs-meeting.org/iccs2007}{$7^{th}$
                   International Conference on Computational Science (ICCS)
                   2007}, Part II: \href{http://www.gup.uni-linz.ac.at/cce2007}
                   {$4^{th}$ Special Session on Collaborative and Cooperative
                   Environments (CCE) 2007}",
  volume        = "4488",
  pages         = "784--791",
  month         = may # "~27-30, ",
  year          = "2007",
  address       = "Beijing, China",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "3-5407-2585-5, ISSN 0302-9743",
  doi           = "http://dx.doi.org/10.1007/978-3-540-72586-2_111",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann07middleware.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann07middleware.ppt.pdf",
  abstract      = "A recent trend in modern high performance computing (HPC)
                   system architectures employs lean compute nodes running a
                   lightweight operating system (OS). Certain parts of the OS as
                   well as other system software services are moved to service
                   nodes in order to increase performance and scalability. This
                   paper examines the impact of this HPC system architecture
                   trend on HPC middleware software solutions, which
                   traditionally equip HPC systems with advanced features, such
                   as parallel and distributed programming models, appropriate
                   system resource management mechanisms, remote application
                   steering and user interaction techniques. Since the approach
                   of keeping the compute node software stack small and simple
                   is orthogonal to the middleware concept of adding missing OS
                   features between OS and application, the role and
                   architecture of middleware in modern HPC systems needs to be
                   revisited. The result is a paradigm shift in HPC middleware
                   design, where single middleware services are moved to service
                   nodes, while runtime environments (RTEs) continue to reside
                   on compute nodes."
}

@conference{engelmann07transparent,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Transparent Symmetric Active/Active Replication for
                   Service-Level High Availability",
  booktitle     = "Proceedings of the \href{http://ccgrid07.lncc.br}{$7^{th}$
                   IEEE International Symposium on Cluster Computing and the
                   Grid (CCGrid) 2007}: \href{http://www.lri.fr/~fedak/gp2pc-07}
                   {$7^{th}$ International Workshop on Global and Peer-to-Peer
                   Computing (GP2PC) 2007}",
  pages         = "755--760",
  month         = may # "~14-17, ",
  year          = "2007",
  address       = "Rio de Janeiro, Brazil",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-2833-3",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/CCGRID.2007.116",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann07transparent.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann07transparent.ppt.pdf",
  abstract      = "As service-oriented architectures become more important in
                   parallel and distributed computing systems, individual
                   service instance reliability as well as appropriate service
                   redundancy becomes an essential necessity in order to
                   increase overall system availability. This paper focuses on
                   providing redundancy strategies using service-level
                   replication techniques. Based on previous research using
                   symmetric active/active replication, this paper proposes a
                   transparent symmetric active/active replication approach that
                   allows for more reuse of code between individual
                   service-level replication implementations by using a virtual
                   communication layer. Service- and client-side interceptors
                   are utilized in order to provide total transparency. Clients
                   and servers are unaware of the replication infrastructure as
                   it provides all necessary mechanisms internally."
}

@conference{engelmann07programming,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "On Programming Models for Service-Level High Availability",
  booktitle     = "Proceedings of the
                   \href{http://www.ares-conference.eu/ares2007}{$2^{nd}$
                   International Conference on Availability, Reliability and
                   Security (ARES) 2007}",
  pages         = "999--1006",
  month         = apr # "~10-13, ",
  year          = "2007",
  address       = "Vienna, Austria",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-2775-2",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ARES.2007.109",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann07programming.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann07programming.ppt.pdf",
  abstract      = "This paper provides an overview of existing programming
                   models for service-level high availability and investigates
                   their differences, similarities, advantages, and
                   disadvantages. Its goal is to help to improve reuse of code
                   and to allow adaptation to quality of service requirements by
                   using a uniform programming model description. It further
                   aims at encouraging a discussion about these programming
                   models and their provided quality of service, such as
                   availability, performance, serviceability, usability, and
                   applicability. Within this context, the presented research
                   focuses on providing high availability for services running
                   on head and service nodes of high-performance computing
                   systems."
}

@conference{wang07job,
  author        = "Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "A Job Pause Service under {LAM/MPI+BLCR} for Transparent
                   Fault Tolerance",
  booktitle     = "Proceedings of the \href{http://www.ipdps.org/ipdps2007}
                   {$21^{st}$ IEEE International Parallel and Distributed
                   Processing Symposium (IPDPS) 2007}",
  pages         = "1--10",
  month         = mar # "~26-30, ",
  year          = "2007",
  address       = "Long Beach, CA, USA",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  isbn          = "978-1-59593-768-1",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/IPDPS.2007.370307",
  url           = "http://www.csm.ornl.gov/~engelman/publications/wang07job.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/wang07job.ppt.pdf",
  abstract      = "Checkpoint/restart (C/R) has become a requirement for
                   long-running jobs in large-scale clusters due to a
                   mean-time-to-failure (MTTF) in the order of hours. After a
                   failure, C/R mechanisms generally require a complete restart
                   of an MPI job from the last checkpoint. A complete restart,
                   however, is unnecessary since all but one node are typically
                   still alive. Furthermore, a restart may result in lengthy job
                   requeuing even though the original job had not exceeded its
                   time quantum. In this paper, we overcome these shortcomings.
                   Instead of job restart, we have developed a transparent
                   mechanism for job pause within LAM/MPI+BLCR. This mechanism
                   allows live nodes to remain active and roll back to the last
                   checkpoint while failed nodes are dynamically replaced by
                   spares before resuming from the last checkpoint. Our
                   methodology includes LAM/MPI enhancements in support of
                   scalable group communication with fluctuating number of
                   nodes, reuse of network connections, transparent coordinated
                   checkpoint scheduling and a BLCR enhancement for job pause.
                   Experiments in a cluster with the NAS Parallel Benchmark
                   suite show that our overhead for job pause is comparable to
                   that of a complete job restart. A minimal overhead of 5.6\%
                   is only incurred in case migration takes place while the
                   regular checkpoint overhead remains unchanged. Yet, our
                   approach alleviates the need to reboot the LAM run-time
                   environment, which accounts for considerable overhead
                   resulting in net savings of our scheme in the experiments.
                   Our solution further provides full transparency and
                   automation with the additional benefit of reusing existing
                   resources. Execution continues after failures within the
                   scheduled job, \textit{i.e.}, the application staging
                   overhead is not incurred again in contrast to a restart.
                   Our scheme offers additional potential for savings through
                   incremental checkpointing and proactive diskless live
                   migration, which we are currently working on."
}

@conference{engelmann07configurable,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Hong H. Ong
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton",
  title         = "Configurable Virtualized System Environments for High
                   Performance Computing",
  booktitle     = "Proceedings of the
                   \href{http://www.csm.ornl.gov/srt/hpcvirt07}{$1^{st}$
                   Workshop on System-level Virtualization for High Performance
                   Computing (HPCVirt) 2007}, in conjunction with the
                   \href{http://www.eurosys.org/2008}{$2^{nd}$ ACM SIGOPS
                   European Conference on Computer Systems (EuroSys) 2007}",
  month         = mar # "~20, ",
  year          = "2007",
  address       = "Lisbon, Portugal",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann07configurable.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann07configurable.ppt.pdf",
  abstract      = "Existing challenges for current terascale high performance
                   computing (HPC) systems are increasingly hampering the
                   development and deployment efforts of system software and
                   scientific applications for next-generation petascale
                   systems. The expected rapid system upgrade interval toward
                   petascale scientific computing demands an incremental
                   strategy for the development and deployment of legacy and new
                   large-scale scientific applications that avoids excessive
                   porting. Furthermore, system software developers as well as
                   scientific application developers require access to
                   large-scale testbed environments in order to test individual
                   solutions at scale. This paper proposes to address these
                   issues at the system software level through the development
                   of a virtualized system environment (VSE) for scientific
                   computing. The proposed VSE approach enables
                   plug-and-play supercomputing through
                   desktop-to-cluster-to-petaflop computer system-level
                   virtualization based on recent advances in hypervisor
                   virtualization technologies. This paper describes the VSE
                   system architecture in detail, discusses needed tools for
                   VSE system management and configuration, and presents
                   respective VSE use case scenarios."
}

@conference{engelmann06towards,
  author        = {Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He},
  title         = {Towards High Availability for High-Performance Computing
                   System Services: {A}ccomplishments and Limitations},
  booktitle     = {Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2006}{$4^{th}$ High
                   Availability and Performance Workshop (HAPCW) 2006}, in
                   conjunction with the \href{http://lacsi.krellinst.org}
                   {$7^{th}$ Los Alamos Computer Science Institute (LACSI)
                   Symposium 2006}},
  month         = oct # {~17, },
  year          = {2006},
  address       = {Santa Fe, NM, USA},
  url           = {http://www.csm.ornl.gov/~engelman/publications/engelmann06towards.pdf},
  url2          = {http://www.csm.ornl.gov/~engelman/publications/engelmann06towards.ppt.pdf},
  abstract      = {During the last several years, our teams at Oak Ridge
                   National Laboratory, Louisiana Tech University, and Tennessee
                   Technological University focused on efficient redundancy
                   strategies for head and service nodes of high-performance
                   computing (HPC) systems in order to pave the way for high
                   availability (HA) in HPC. These nodes typically run critical
                   HPC system services, like job and resource management, and
                   represent single points of failure and control for an entire
                   HPC system. The overarching goal of our research is to
                   provide high-level reliability, availability, and
                   serviceability (RAS) for HPC systems by combining HA and HPC
                   technology. This paper summarizes our accomplishments, such
                   as developed concepts and implemented proof-of-concept
                   prototypes, and describes existing limitations, such as
                   performance issues, which need to be dealt with for
                   production-type deployment.},
}

% NOTE: fixed title typo ("Effciency" -> "Efficiency") and double space in abstract.
@conference{ou06achieving,
  author        = "Li Ou
                   and Xin Chen
                   and Xubin (Ben) He
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Achieving Computational {I/O} Efficiency in a High Performance
                   Cluster Using Multicore Processors",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2006}{$4^{th}$ High
                   Availability and Performance Workshop (HAPCW) 2006}, in
                   conjunction with the \href{http://lacsi.krellinst.org}
                   {$7^{th}$ Los Alamos Computer Science Institute (LACSI)
                   Symposium 2006}",
  month         = oct # "~17, ",
  year          = "2006",
  address       = "Santa Fe, NM, USA",
  url           = "http://www.csm.ornl.gov/~engelman/publications/ou06achieving.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/ou06achieving.ppt.pdf",
  abstract      = "Cluster computing has become one of the most popular
                   platforms for high-performance computing today. The recent
                   popularity of multicore processors provides a flexible way to
                   increase the computational capability of clusters. Although
                   the system performance may improve with multicore processors
                   in a cluster, I/O requests initiated by multiple cores may
                   saturate the I/O bus, and furthermore increase the latency by
                   issuing multiple non-contiguous disk accesses. In this
                   paper, we propose an asymmetric collective I/O for multicore
                   processors to improve multiple non-contiguous accesses. In
                   our configuration, one core in each multicore processor is
                   designated as the coordinator, and others serve as computing
                   cores. The coordinator is responsible for aggregating I/O
                   operations from computing cores and submitting a contiguous
                   request. The coordinator allocates contiguous memory buffers
                   on behalf of other cores to avoid redundant data copies."
}

% NOTE: fixed page range hyphen (1--10) and apostrophe encoding (backtick -> ') in abstract.
@conference{uhlemann06joshua,
  author        = "Kai Uhlemann
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "{JOSHUA}: {S}ymmetric Active/Active Replication for Highly
                   Available {HPC} Job and Resource Management",
  booktitle     = "Proceedings of the \href{http://cluster2006.org}{$8^{th}$
                   IEEE International Conference on Cluster Computing (Cluster)
                   2006}",
  pages         = "1--10",
  month         = sep # "~25-28, ",
  year          = "2006",
  address       = "Barcelona, Spain",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "1-4244-0328-6, ISSN 1552-5244",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/CLUSTR.2006.311855",
  url           = "http://www.csm.ornl.gov/~engelman/publications/uhlemann06joshua.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/uhlemann06joshua.ppt.pdf",
  abstract      = "Most of today's HPC systems employ a single head node for
                   control, which represents a single point of failure as it
                   interrupts an entire HPC system upon failure. Furthermore, it
                   is also a single point of control as it disables an entire
                   HPC system until repair. One of the most important HPC system
                   service running on the head node is the job and resource
                   management. If it goes down, all currently running jobs loose
                   the service they report back to. They have to be restarted
                   once the head node is up and running again. With this paper,
                   we present a generic approach for providing symmetric
                   active/active replication for highly available HPC job and
                   resource management. The JOSHUA solution provides a virtually
                   synchronous environment for continuous availability without
                   any interruption of service and without any loss of state.
                   Replication is performed externally via the PBS service
                   interface without the need to modify any service code. Test
                   results as well as availability analysis of our
                   proof-of-concept prototype implementation show that
                   continuous availability can be provided by JOSHUA with an
                   acceptable performance trade-off."
}

@conference{baumann06parallel,
  author        = {Ronald Baumann
                   and Christian Engelmann
                   and George A. (Al) Geist},
  title         = {A Parallel Plug-in Programming Paradigm},
  booktitle     = {Lecture Notes in Computer Science: Proceedings of the
                   \href{http://hpcc06.lrr.in.tum.de}{$7^{th}$ International
                   Conference on High Performance Computing and Communications
                   (HPCC) 2006}},
  volume        = {4208},
  pages         = {823--832},
  month         = sep # {~13-15, },
  year          = {2006},
  address       = {Munich, Germany},
  publisher     = {\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}},
  isbn          = {978-3-540-39368-9, ISSN 0302-9743},
  doi           = {http://dx.doi.org/10.1007/11847366_85},
  url           = {http://www.csm.ornl.gov/~engelman/publications/baumann06parallel.pdf},
  url2          = {http://www.csm.ornl.gov/~engelman/publications/baumann06parallel.ppt.pdf},
  abstract      = {Software component architectures allow assembly of
                   applications from individual software modules based on
                   clearly defined programming interfaces, thus improving the
                   reuse of existing solutions and simplifying application
                   development. Furthermore, the plug-in programming paradigm
                   additionally enables runtime reconfigurability, making it
                   possible to adapt to changing application needs, such as
                   different application phases, and system properties, like
                   resource availability, by loading/unloading appropriate
                   software modules. Similar to parallel programs, parallel
                   plug-ins are an abstraction for a set of cooperating
                   individual plug-ins within a parallel application utilizing
                   a software component architecture. Parallel programming
                   paradigms apply to parallel plug-ins in the same way they
                   apply to parallel programs. The research presented in this
                   paper targets the clear definition of parallel plug-ins and
                   the development of a parallel plug-in programming paradigm.},
}

% NOTE: fixed apostrophe encoding (backtick -> ') and trailing whitespace in abstract.
@conference{varma06scalable,
  author        = "Jyothish Varma
                   and Chao Wang
                   and Frank Mueller
                   and Christian Engelmann
                   and Stephen L. Scott",
  title         = "Scalable, Fault-Tolerant Membership for {MPI} Tasks on {HPC}
                   Systems",
  booktitle     = "Proceedings of the \href{http://www.ics-conference.org/2006}
                   {$20^{th}$ ACM International Conference on Supercomputing
                   (ICS) 2006}",
  pages         = "219--228",
  month         = jun # "~28-30, ",
  year          = "2006",
  address       = "Cairns, Australia",
  publisher     = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
  doi           = "http://doi.acm.org/10.1145/1183401.1183433",
  isbn          = "1-59593-282-8",
  url           = "http://www.csm.ornl.gov/~engelman/publications/varma06scalable.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/varma06scalable.ppt.pdf",
  abstract      = "Reliability is increasingly becoming a challenge for
                   high-performance computing (HPC) systems with thousands of
                   nodes, such as IBM's Blue Gene/L. A shorter
                   mean-time-to-failure can be addressed by adding fault
                   tolerance to reconfigure working nodes to ensure that
                   communication and computation can progress. However, existing
                   approaches fall short in providing scalability and small
                   reconfiguration overhead within the fault-tolerant layer.
                   This paper contributes a scalable approach to reconfigure the
                   communication infrastructure after node failures. We propose
                   a decentralized (peer-to-peer) protocol that maintains a
                   consistent view of active nodes in the presence of faults.
                   Our protocol shows response times in the order of hundreds of
                   microseconds and single-digit milliseconds for
                   reconfiguration using MPI over Blue Gene/L and TCP over
                   Gigabit, respectively. The protocol can be adapted to match
                   the network topology to further increase performance. We also
                   verify experimental results against a performance model,
                   which demonstrates the scalability of the approach. Hence,
                   the membership service is suitable for deployment in the
                   communication layer of MPI runtime systems, and we have
                   integrated an early version into LAM/MPI."
}

@conference{okunbor06exploring,
  author        = {Daniel I. Okunbor
                   and Christian Engelmann
                   and Stephen L. Scott},
  title         = {Exploring Process Groups for Reliability, Availability and
                   Serviceability of Terascale Computing Systems},
  booktitle     = {Proceedings of the
                   \href{http://www.atiner.gr/docs/2006AAAPROGRAM_COMP.htm}
                   {$2^{nd}$ International Conference on Computer Science and
                   Information Systems 2006}},
  month         = jun # {~19-21, },
  year          = {2006},
  address       = {Athens, Greece},
  url           = {http://www.csm.ornl.gov/~engelman/publications/okunbor06exploring.pdf},
  abstract      = {This paper presents various aspects of reliability,
                   availability and serviceability (RAS) systems as they relate
                   to group communication service, including reliable and total
                   order multicast/broadcast, virtual synchrony, and failure
                   detection. While the issue of availability, particularly
                   high availability using replication-based architectures has
                   recently received upsurge research interests, much still have
                   to be done in understanding the basic underlying concepts for
                   achieving RAS systems, especially in high-end and high
                   performance computing (HPC) communities. Various attributes
                   of group communication service and the prototype of symmetric
                   active replication following ideas utilized in the Newtop
                   protocol will be discussed. We explore the application of
                   group communication service for RAS HPC, laying the
                   groundwork for its integrated model.},
}

@conference{engelmann06rmix,
  author        = {Christian Engelmann
                   and George A. (Al) Geist},
  title         = {{RMIX}: {A} Dynamic, Heterogeneous, Reconfigurable
                   Communication Framework},
  booktitle     = {Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.iccs-meeting.org/iccs2006}{$6^{th}$
                   International Conference on Computational Science (ICCS)
                   2006}, Part II: \href{http://www.gup.uni-linz.ac.at/cce2006}
                   {$3^{rd}$ Special Session on Collaborative and Cooperative
                   Environments (CCE) 2006}},
  volume        = {3992},
  pages         = {573--580},
  month         = may # {~28-31, },
  year          = {2006},
  address       = {Reading, UK},
  publisher     = {\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}},
  isbn          = {3-540-34381-4, ISSN 0302-9743},
  doi           = {http://dx.doi.org/10.1007/11758525_77},
  url           = {http://www.csm.ornl.gov/~engelman/publications/engelmann06rmix.pdf},
  url2          = {http://www.csm.ornl.gov/~engelman/publications/engelmann06rmix.ppt.pdf},
  abstract      = {RMIX is a dynamic, heterogeneous, reconfigurable
                   communication framework that allows software components to
                   communicate using various RMI/RPC protocols, such as ONC RPC,
                   Java RMI and SOAP, by facilitating dynamically loadable
                   provider plug-ins to supply different protocol stacks. With
                   this paper, we present a native (C-based), flexible,
                   adaptable, multi-protocol RMI/RPC communication framework
                   that complements the Java-based RMIX variant previously
                   developed by our partner team at Emory University. Our
                   approach offers the same multi-protocol RMI/RPC services
                   and advanced invocation semantics via a C-based interface
                   that does not require an object-oriented programming
                   language. This paper provides a detailed description of our
                   RMIX framework architecture and some of its features. It
                   describes the general use case of the RMIX framework and its
                   integration into the Harness metacomputing environment in the
                   form of a plug-in.},
}

% NOTE: fixed page range hyphen (639--645) and apostrophe encoding (backtick -> ') in abstract.
@conference{engelmann06active,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and Chokchai (Box) Leangsuksun
                   and Xubin (Ben) He",
  title         = "Active/Active Replication for Highly Available {HPC} System
                   Services",
  booktitle     = "Proceedings of the
                   \href{http://www.ares-conference.eu/ares2006}{$1^{st}$
                   International Conference on Availability, Reliability and
                   Security (ARES) 2006}: $1^{st}$ International Workshop on
                   Frontiers in Availability, Reliability and Security (FARES)
                   2006",
  pages         = "639--645",
  month         = apr # "~20-22, ",
  year          = "2006",
  address       = "Vienna, Austria",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-2567-9",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/ARES.2006.23",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann06active.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann06active.ppt.pdf",
  abstract      = "Today's high performance computing systems have several
                   reliability deficiencies resulting in availability and
                   serviceability issues. Head and service nodes represent a
                   single point of failure and control for an entire system as
                   they render it inaccessible and unmanageable in case of a
                   failure until repair, causing a significant downtime. This
                   paper introduces two distinct replication methods (internal
                   and external) for providing symmetric active/active high
                   availability for multiple head and service nodes running in
                   virtual synchrony. It presents a comparison of both methods
                   in terms of expected correctness, ease-of-use and performance
                   based on early results from ongoing work in providing
                   symmetric active/active high availability for two HPC system
                   services (TORQUE and PVFS metadata server). It continues with
                   a short description of a distributed mutual exclusion
                   algorithm and a brief statement regarding the handling of
                   Byzantine failures. This paper concludes with an overview of
                   past and ongoing work, and a short summary of the presented
                   research."
}

@conference{engelmann05concepts,
  author        = {Christian Engelmann
                   and Stephen L. Scott},
  title         = {Concepts for High Availability in Scientific High-End
                   Computing},
  booktitle     = {Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2005}{$3^{rd}$ High
                   Availability and Performance Workshop (HAPCW) 2005}, in
                   conjunction with the
                   \href{http://lacsi.rice.edu/symposium/agenda_2005}{$6^{th}$
                   Los Alamos Computer Science Institute (LACSI) Symposium
                   2005}},
  month         = oct # {~11, },
  year          = {2005},
  address       = {Santa Fe, NM, USA},
  url           = {http://www.csm.ornl.gov/~engelman/publications/engelmann05concepts.pdf},
  url2          = {http://www.csm.ornl.gov/~engelman/publications/engelmann05concepts.ppt.pdf},
  abstract      = {Scientific high-end computing (HEC) has become an important
                   tool for scientists world-wide to understand problems, such
                   as in nuclear fusion, human genomics and nanotechnology.
                   Every year, new HEC systems emerge on the market with better
                   performance and higher scale. With only very few exceptions,
                   the overall availability of recently installed systems has
                   been lower in comparison to the same deployment phase of
                   their predecessors. In contrast to the experienced loss of
                   availability, the demand for continuous availability has
                   risen dramatically due to the recent trend towards capability
                   computing. In this paper, we analyze the existing
                   deficiencies of current HEC systems and present several high
                   availability concepts to counter the experienced loss of
                   availability and to alleviate the expected impact on
                   next-generation systems. We explain the application of these
                   concepts to current and future HEC systems and list past and
                   ongoing related research. This paper closes with a short
                   summary of the presented work and a brief discussion of
                   future efforts.},
}

% NOTE: fixed apostrophe encoding (backtick -> ') and double space in abstract.
@conference{limaye05jobsite,
  author        = "Kshitij Limaye
                   and Chokchai (Box) Leangsuksun
                   and Zeno Greenwood
                   and Stephen L. Scott
                   and Christian Engelmann
                   and Richard M. Libby
                   and Kasidit Chanchio",
  title         = "Job-Site Level Fault Tolerance for Cluster and {Grid}
                   Environments",
  booktitle     = "Proceedings of the \href{http://cluster2005.org}{$7^{th}$
                   IEEE International Conference on Cluster Computing (Cluster)
                   2005}",
  pages         = "1--9",
  month         = sep # "~26-30, ",
  year          = "2005",
  address       = "Boston, MA, USA",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7803-9486-0, ISSN 1552-5244",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/CLUSTR.2005.347043",
  url           = "http://www.csm.ornl.gov/~engelman/publications/limaye05job-site.pdf",
  abstract      = "In order to adopt high performance clusters and Grid
                   computing for mission critical applications, fault tolerance
                   is a necessity. Common fault tolerance techniques in
                   distributed systems are normally achieved with
                   checkpoint-recovery and job replication on alternative
                   resources, in cases of a system outage. The first approach
                   depends on the system's MTTR while the latter approach
                   depends on the availability of alternative sites to run
                   replicas. There is a need for complementing these approaches
                   by proactively handling failures at a job-site level,
                   ensuring the system high availability with no loss of user
                   submitted jobs. This paper discusses a novel fault tolerance
                   technique that enables the job-site recovery in Beowulf
                   cluster-based grid environments, whereas existing techniques
                   give up a failed system by seeking alternative resources.
                   Our results suggest sizable aggregate performance improvement
                   during an implementation of our method in Globus-enabled
                   HA-OSCAR. The technique called Smart Failover provides a
                   transparent and graceful recovery mechanism that saves job
                   states in a local job-manager queue and transfers those
                   states to the backup server periodically, and in critical
                   system events. Thus whenever a failover occurs, the backup
                   server is able to restart the jobs from their last saved
                   state."
}

@conference{song05umlbased,
  author        = {Hertong Song
                   and Chokchai (Box) Leangsuksun
                   and Raja Nassar
                   and Yudan Liu
                   and Christian Engelmann
                   and Stephen L. Scott},
  title         = {{UML-based} {Beowulf} Cluster Availability Modeling},
  booktitle     = {\href{http://www.world-academy-of-science.org/IMCSE2005/ws/SERP}
                   {International Conference on Software Engineering Research
                   and Practice (SERP) 2005}},
  pages         = {161--167},
  month         = jun # {~27-30, },
  year          = {2005},
  address       = {Las Vegas, NV, USA},
  publisher     = {CSREA Press},
  isbn          = {1-932415-49-1},
}

% NOTE: fixed apostrophe encoding (backtick -> ') in abstract.
@conference{engelmann05high,
  author        = "Christian Engelmann
                   and Stephen L. Scott",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  booktitle     = "Proceedings of the \href{http://coset.irisa.fr}{$2^{nd}$
                   International Workshop on Operating Systems, Programming
                   Environments and Management Tools for High-Performance
                   Computing on Clusters (COSET-2) 2005}, in conjunction with
                   the \href{http://ics05.csail.mit.edu}{$19^{th}$ ACM
                   International Conference on Supercomputing (ICS) 2005}",
  month         = jun # "~19, ",
  year          = "2005",
  address       = "Cambridge, MA, USA",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high.ppt.pdf",
  abstract      = "Ultra-scale architectures for scientific high-end computing
                   with tens to hundreds of thousands of processors, such as the
                   IBM Blue Gene/L and the Cray X1, suffer from availability
                   deficiencies, which impact the efficiency of running
                   computational jobs by forcing frequent checkpointing of
                   applications. Most systems are unable to handle runtime
                   system configuration changes caused by failures and require
                   a complete restart of essential system services, such as the
                   job scheduler or MPI, or even of the entire machine. In this
                   paper, we present a flexible, pluggable and component-based
                   high availability framework that expands today's effort in
                   high availability computing of keeping a single server alive
                   to include all machines cooperating in a high-end scientific
                   computing environment, while allowing adaptation to system
                   properties and application needs."
}

% NOTE: fixed apostrophe encoding (backtick -> ') in abstract.
@conference{leangsuksun05asymmetric,
  author        = "Chokchai (Box) Leangsuksun
                   and Venkata K. Munganuru
                   and Tong Liu
                   and Stephen L. Scott
                   and Christian Engelmann",
  title         = "Asymmetric Active-Active High Availability for High-end
                   Computing",
  booktitle     = "Proceedings of the \href{http://coset.irisa.fr}{$2^{nd}$
                   International Workshop on Operating Systems, Programming
                   Environments and Management Tools for High-Performance
                   Computing on Clusters (COSET-2) 2005}, in conjunction with
                   the \href{http://ics05.csail.mit.edu}{$19^{th}$ ACM
                   International Conference on Supercomputing (ICS) 2005}",
  month         = jun # "~19, ",
  year          = "2005",
  address       = "Cambridge, MA, USA",
  url           = "http://www.csm.ornl.gov/~engelman/publications/leangsuksun05asymmetric.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/leangsuksun05asymmetric.ppt.pdf",
  abstract      = "Linux clusters have become very popular for scientific
                   computing at research institutions world-wide, because they
                   can be easily deployed at a fairly low cost. However, the
                   most pressing issues of today's cluster solutions are
                   availability and serviceability. The conventional Beowulf
                   cluster architecture has a single head node connected to a
                   group of compute nodes. This head node is a typical single
                   point of failure and control, which severely limits
                   availability and serviceability by effectively cutting off
                   healthy compute nodes from the outside world upon overload
                   or failure. In this paper, we describe a paradigm that
                   addresses this issue using asymmetric active-active high
                   availability. Our framework comprises of n + 1 head nodes,
                   where n head nodes are active in the sense that they provide
                   services to simultaneously incoming user requests. One
                   standby server monitors all active servers and performs a
                   fail-over in case of a detected outage. We present a
                   prototype implementation based on a 2 + 1 solution and
                   discuss initial results."
}

% NOTE: fixed apostrophe encoding (backtick -> ') in abstract.
@conference{engelmann05superscalable,
  author        = "Christian Engelmann
                   and George A. (Al) Geist",
  title         = "Super-Scalable Algorithms for Computing on 100,000
                   Processors",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.iccs-meeting.org/iccs2005}{$5^{th}$
                   International Conference on Computational Science (ICCS)
                   2005}, Part I",
  volume        = "3514",
  pages         = "313--320",
  month         = may # "~22-25, ",
  year          = "2005",
  address       = "Atlanta, GA, USA",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "978-3-540-26032-5, ISSN 0302-9743",
  doi           = "http://dx.doi.org/10.1007/11428831_39",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann05superscalable.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann05superscalable.ppt.pdf",
  abstract      = "In the next five years, the number of processors in high-end
                   systems for scientific computing is expected to rise to tens
                   and even hundreds of thousands. For example, the IBM Blue
                   Gene/L can have up to 128,000 processors and the delivery of
                   the first system is scheduled for 2005. Existing deficiencies
                   in scalability and fault-tolerance of scientific applications
                   need to be addressed soon. If the number of processors grows
                   by a magnitude and efficiency drops by a magnitude, the
                   overall effective computing performance stays the same.
                   Furthermore, the mean time to interrupt of high-end computer
                   systems decreases with scale and complexity. In a
                   100,000-processor system, failures may occur every couple of
                   minutes and traditional checkpointing may no longer be
                   feasible. With this paper, we summarize our recent research
                   in super-scalable algorithms for computing on 100,000
                   processors. We introduce the algorithm properties of scale
                   invariance and natural fault tolerance, and discuss how they
                   can be applied to two different classes of algorithms. We
                   also describe a super-scalable diskless checkpointing
                   algorithm for problems that can't be transformed into a
                   super-scalable variant, or where other solutions are more
                   efficient. Finally, a 100,000-processor simulator is
                   presented as a platform for testing and experimentation."
}

% Workshop paper: Harness metacomputing kernel, HCW 2005 (held with IPDPS 2005).
% NOTE(review): the "isbn" field also carries the ISSN, matching this file's
% convention of combining both identifiers in one field (cf. engelmann02distributed).
@conference{engelmann05lightweight,
  author        = "Christian Engelmann
                   and George A. (Al) Geist",
  title         = "A Lightweight Kernel for the Harness Metacomputing
                   Framework",
  booktitle     = "Proceedings of the
                   \href{http://www.ipdps.org/ipdps2005}{$19^{th}$ IEEE
                   International Parallel and Distributed Processing Symposium
                   (IPDPS) 2005}: \href{http://www.cs.umass.edu/~rsnbrg/hcw2005}
                   {$14^{th}$ Heterogeneous Computing Workshop (HCW) 2005}",
  month         = apr # "~4, ",
  year          = "2005",
  address       = "Denver, CO, USA",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-2312-9, ISSN 1530-2075",
  doi           = "http://doi.ieeecomputersociety.org/10.1109/IPDPS.2005.34",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann05lightweight.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann05lightweight.ppt.pdf",
  abstract      = "Harness is a pluggable heterogeneous Distributed Virtual
                   Machine (DVM) environment for parallel and distributed
                   scientific computing. This paper describes recent
                   improvements in the Harness kernel design. By using a
                   lightweight approach and moving previously integrated system
                   services into software modules, the software becomes more
                   versatile and adaptable. This paper outlines these changes
                   and explains the major Harness kernel components in more
                   detail. A short overview is given of ongoing efforts in
                   integrating RMIX, a dynamic heterogeneous reconfigurable
                   communication framework, into the Harness environment as a
                   new plug-in software module. We describe the overall impact
                   of these changes and how they relate to other ongoing work."
}

% Workshop paper: high availability via distributed control, HAPCW 2004
% (held with the 5th LACSI Symposium). No publisher/ISBN/DOI recorded --
% presumably an informal workshop without archival proceedings; verify if needed.
@conference{engelmann04high,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and George A. (Al) Geist",
  title         = "High Availability through Distributed Control",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2004}{$2^{nd}$ High
                   Availability and Performance Workshop (HAPCW) 2004}, in
                   conjunction with the
                   \href{http://lacsi.rice.edu/symposium/agenda_2004}{$5^{th}$
                   Los Alamos Computer Science Institute (LACSI) Symposium
                   2004}",
  month         = oct # "~12, ",
  year          = "2004",
  address       = "Santa Fe, NM, USA",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann04high.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann04high.ppt.pdf",
  abstract      = "Cost-effective, flexible and efficient scientific simulations
                   in cutting-edge research areas utilize huge high-end
                   computing resources with thousands of processors. In the next
                   five to ten years the number of processors in such computer
                   systems will rise to tens of thousands, while scientific
                   application running times are expected to increase further
                   beyond the Mean-Time-To-Interrupt (MTTI) of hardware and
                   system software components. This paper describes the ongoing
                   research in heterogeneous adaptable reconfigurable networked
                   systems (Harness) and its recent achievements in the area of
                   high availability distributed virtual machine environments
                   for parallel and distributed scientific computing. It shows
                   how a distributed control algorithm is able to steer a
                   distributed virtual machine process in virtual synchrony
                   while maintaining consistent replication for high
                   availability. It briefly illustrates ongoing work in
                   heterogeneous reconfigurable communication frameworks and
                   security mechanisms. The paper continues with a short
                   overview of similar research in reliable group communication
                   frameworks, fault-tolerant process groups and highly
                   available distributed virtual processes. It closes with a
                   brief discussion of possible future research directions."
}

% Workshop paper: highly available cluster storage via scavenging, HAPCW 2004
% (same venue entry text as engelmann04high; kept inline per file style --
% a shared @string would be an option if the file's toolchain supports it).
@conference{he04highly,
  author        = "Xubin (Ben) He
                   and Li Ou
                   and Stephen L. Scott
                   and Christian Engelmann",
  title         = "A Highly Available Cluster Storage System using Scavenging",
  booktitle     = "Proceedings of the
                   \href{http://xcr.cenit.latech.edu/hapcw2004}{$2^{nd}$ High
                   Availability and Performance Workshop (HAPCW) 2004}, in
                   conjunction with the
                   \href{http://lacsi.rice.edu/symposium/agenda_2004}{$5^{th}$
                   Los Alamos Computer Science Institute (LACSI) Symposium
                   2004}",
  month         = oct # "~12, ",
  year          = "2004",
  address       = "Santa Fe, NM, USA",
  url           = "http://www.csm.ornl.gov/~engelman/publications/he04highly.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/he04highly.ppt.pdf",
  abstract      = "Highly available data storage for high-performance computing
                   is becoming increasingly more critical as high-end computing
                   systems scale up in size and storage systems are developed
                   around network-centered architectures. A promising solution
                   is to harness the collective storage potential of individual
                   workstations much as we harness idle CPU cycles due to the
                   excellent price/performance ratio and low storage usage of
                   most commodity workstations. For such a storage system,
                   metadata consistency is a key issue assuring storage system
                   availability as well as data reliability. In this paper, we
                   present a decentralized metadata management scheme that
                   improves storage availability without sacrificing
                   performance."
}

% Workshop paper: diskless checkpointing for super-scale systems, CLADE 2003
% (held with HPDC-12).
% NOTE(review): the "doi" field holds an IEEE Xplore abstract URL, not a DOI
% resolver link as used elsewhere in this file -- TODO: replace with the
% paper's actual DOI once confirmed.
@conference{engelmann03diskless,
  author        = "Christian Engelmann
                   and George A. (Al) Geist",
  title         = "A Diskless Checkpointing Algorithm for Super-scale
                   Architectures Applied to the Fast Fourier Transform",
  booktitle     = "Proceedings of the
                   \href{http://www.cs.msstate.edu/~clade2003}{Challenges of
                   Large Applications in Distributed Environments Workshop
                   (CLADE) 2003}, in conjunction with the
                   \href{http://csag.ucsd.edu/HPDC-12}{$12^{th}$ IEEE
                   International Symposium on High Performance Distributed
                   Computing (HPDC) 2003}",
  pages         = "47",
  month         = jun # "~21, ",
  year          = "2003",
  address       = "Seattle, WA, USA",
  publisher     = "\href{http://www.computer.org}{IEEE Computer Society, Los
                   Alamitos, CA, USA}",
  isbn          = "0-7695-1984-9",
  doi           = "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4159902",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann03diskless.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann03diskless.ppt.pdf",
  abstract      = "This paper discusses the issue of fault-tolerance in
                   distributed computer systems with tens or hundreds of
                   thousands of diskless processor units. Such systems, like the
                   IBM Blue Gene/L, are predicted to be deployed in the next
                   five to ten years. Since a 100,000-processor system is going
                   to be less reliable, scientific applications need to be able
                   to recover from occurring failures more efficiently. In this
                   paper, we adapt the present technique of diskless
                   checkpointing to such huge distributed systems in order to
                   equip existing scientific algorithms with super-scalable
                   fault-tolerance. First, we discuss the method of diskless
                   checkpointing, then we adapt this technique to super-scale
                   architectures and finally we present results from an
                   implementation of the Fast Fourier Transform that uses the
                   adapted technique to achieve super-scale fault-tolerance."
}

% Conference paper: distributed peer-to-peer control in Harness, ICCS 2002
% (LNCS vol. 2330).
% NOTE(review): the "doi" field holds a SpringerLink content URL rather than a
% dx.doi.org resolver link as used elsewhere in this file -- TODO: confirm and
% substitute the actual DOI link.
@conference{engelmann02distributed,
  author        = "Christian Engelmann
                   and Stephen L. Scott
                   and George A. (Al) Geist",
  title         = "Distributed Peer-to-Peer Control in {Harness}",
  booktitle     = "Lecture Notes in Computer Science: Proceedings of the
                   \href{http://www.science.uva.nl/events/ICCS2002}{$2^{nd}$
                   International Conference on Computational Science (ICCS)
                   2002}, Part II: Workshop on Global and Collaborative
                   Computing",
  volume        = "2330",
  pages         = "720--727",
  month         = apr # "~21-24, ",
  year          = "2002",
  address       = "Amsterdam, The Netherlands",
  publisher     = "\href{http://www.springer.com}{Springer Verlag, Berlin,
                   Germany}",
  isbn          = "3-540-43593-X, ISSN 0302-9743",
  doi           = "http://www.springerlink.com/content/l537ujfwt8yta2dp",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann02distributed.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann02distributed.ppt.pdf",
  abstract      = "Harness is an adaptable fault-tolerant virtual machine
                   environment for next-generation heterogeneous distributed
                   computing developed as a follow on to PVM. It additionally
                   enables the assembly of applications from plug-ins and
                   provides fault-tolerance. This work describes the distributed
                   control, which manages global state replication to ensure a
                   high-availability of service. Group communication services
                   achieve an agreement on an initial global state and a linear
                   history of global state changes at all members of the
                   distributed virtual machine. This global state is replicated
                   to all members to easily recover from single, multiple and
                   cascaded faults. A peer-to-peer ring network architecture and
                   tunable multi-point failure conditions provide heterogeneity
                   and scalability. Finally, the integration of the distributed
                   control into the multi-threaded kernel architecture of
                   Harness offers a fault-tolerant global state database service
                   for plug-ins and applications."
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Conference Poster Presentations
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Poster abstract: tunable holistic resiliency for HPC, PPoPP 2009.
% NOTE(review): note = "To appear" was current when written (Feb 2009) --
% verify publication status and drop the note if now published.
@misc{scott09tunable,
  author        = "Stephen L. Scott
                   and Christian Engelmann
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Anand Tikotekar
                   and George Ostrouchov
                   and Chokchai (Box) Leangsuksun
                   and Nichamon Naksinehaboon
                   and Raja Nassar
                   and Mihaela Paun
                   and Frank Mueller
                   and Chao Wang
                   and Arun B. Nagarajan
                   and Jyothish Varma",
  title         = "A Tunable Holistic Resiliency Approach for High-Performance
                   Computing Systems",
  month         = feb # "~14-18, ",
  year          = "2009",
  howpublished  = "{Poster at the \href{http://ppopp09.rice.edu}{$14^{th}$ ACM
                   SIGPLAN Symposium on Principles and Practice of Parallel
                   Programming (PPoPP) 2009}, Raleigh, NC, USA}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/scott09tunable.pdf",
  note          = "To appear",
  abstract      = "In order to address anticipated high failure rates,
                   resiliency characteristics have become an urgent priority for
                   next-generation extreme-scale high-performance computing
                   (HPC) systems. This poster describes our past and ongoing
                   efforts in novel fault resilience technologies for HPC.
                   Presented work includes proactive fault resilience
                   techniques, system and application reliability models and
                   analyses, failure prediction, transparent process- and
                   virtual-machine-level migration, and trade-off models for
                   combining preemptive migration with checkpoint/restart. This
                   poster summarizes our work and puts all individual
                   technologies into context with a proposed holistic fault
                   resilience framework."
}

% Poster abstract: Harness Workbench, HPCSW 2008. Polish surnames use BibTeX
% special-character escapes (\l{}, \'n) per classic-BibTeX ASCII convention.
@misc{geist08harness,
  author        = "George A. (Al) Geist
                   and Christian Engelmann
                   and Jack J. Dongarra
                   and George Bosilca
                   and Magdalena M. S\l{}awi\'nska
                   and Jaros\l{}aw K. S\l{}awi\'nski",
  title         = "The {Harness} Workbench: {U}nified and Adaptive Access to
                   Diverse High-Performance Computing Platforms",
  month         = mar # "~30 - " # apr # "~5, ",
  year          = "2008",
  howpublished  = "{Poster at the \href{http://www.hpcsw.org}{$1^{st}$
                   High-Performance Computer Science Week (HPCSW) 2008}, Denver,
                   CO, USA}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/geist08harness.pdf",
  abstract      = "This poster summarizes our past and ongoing research and
                   development efforts in novel software solutions for providing
                   unified and adaptive access to diverse high-performance
                   computing (HPC) platforms. The poster showcases developed
                   proof-of-concept implementations of tools and mechanisms that
                   simplify scientific application development and deployment
                   tasks, such that only minimal adaptation is needed when
                   moving from one HPC system to another or after HPC system
                   upgrades."
}

% Poster abstract: HPC resiliency overview, HPCSW 2008.
@misc{scott08resiliency,
  author        = "Stephen L. Scott
                   and Christian Engelmann
                   and Hong H. Ong
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Anand Tikotekar
                   and George Ostrouchov
                   and Chokchai (Box) Leangsuksun
                   and Nichamon Naksinehaboon
                   and Raja Nassar
                   and Mihaela Paun
                   and Frank Mueller
                   and Chao Wang
                   and Arun B. Nagarajan
                   and Jyothish Varma
                   and Xubin (Ben) He
                   and Li Ou
                   and Xin Chen",
  title         = "Resiliency for High-Performance Computing Systems",
  month         = mar # "~30 - " # apr # "~5, ",
  year          = "2008",
  howpublished  = "{Poster at the \href{http://www.hpcsw.org}{$1^{st}$
                   High-Performance Computer Science Week (HPCSW) 2008}, Denver,
                   CO, USA}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/scott08resiliency.pdf",
  abstract      = "This poster summarizes our past and ongoing research and
                   development efforts in novel system software solutions for
                   providing high-level reliability, availability and
                   serviceability (RAS) for next-generation extreme-scale
                   high-performance computing (HPC) systems and beyond. The
                   poster showcases results of developed proof-of-concept
                   implementations and performed theoretical analyses, outlines
                   planned research and development activities, and presents
                   respective initial results."
}

% Poster abstract: virtual system environment (VSE) research, HPCSW 2008.
% Fix: removed the duplicated word in the title ("...Virtualization for for
% High-Performance Computing" -> "...Virtualization for High-Performance
% Computing"); all other fields unchanged.
@misc{scott08systemlevel,
  author        = "Stephen L. Scott
                   and Geoffroy R. Vall\'ee
                   and Thomas Naughton
                   and Anand Tikotekar
                   and Christian Engelmann
                   and Hong H. Ong",
  title         = "System-level Virtualization for High-Performance Computing",
  month         = mar # "~30 - " # apr # "~5, ",
  year          = "2008",
  howpublished  = "{Poster at the \href{http://www.hpcsw.org}{$1^{st}$
                   High-Performance Computer Science Week (HPCSW) 2008}, Denver,
                   CO, USA}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/scott08systemlevel.pdf",
  abstract      = "This poster summarizes our past and ongoing research and
                   development efforts in novel system software solutions for
                   providing a virtual system environment (VSE) for
                   next-generation extreme-scale high-performance computing
                   (HPC) systems and beyond. The poster showcases results of
                   developed proof-of-concept implementations and performed
                   theoretical analyses, outlines planned research and
                   development activities, and presents respective initial
                   results."
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Invited Talks and Lectures
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Invited talk: ORNL HPC research overview, Reading, UK, Dec 2008.
% Fixes in the abstract text (classic BibTeX is 8-bit/ASCII and fields are
% emitted into LaTeX, where raw ^, # and curly Unicode quotes break or garble
% output): 10^15 -> $10^{15}$ (math mode), #2 -> \#2 (escaped macro char),
% U+2019 right single quotes -> ASCII apostrophes, "U.S" -> "U.S.".
@misc{engelmann08high,
  author        = "Christian Engelmann",
  title         = "High-Performance Computing Research at Oak Ridge National
                   Laboratory",
  month         = dec # "~8, ",
  year          = "2008",
  howpublished  = "{Invited talk at the Reading Annual Computational Science
                    Workshop, Reading, United Kingdom}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann08high.pdf",
  abstract      = "Oak Ridge National Laboratory (ORNL) is the largest energy
                   laboratory in the United States. Its National Center for
                   Computational Sciences (NCCS) provides the most powerful
                   computing resources in the world for open scientific
                   research. Jaguar, a Cray XT5 system at NCCS, is the second
                   HPC system to exceed 1 PFlop/s ($10^{15}$ Floating Point
                   Operations Per Second), and the fastest open science
                   supercomputer in the world. It recently ranked \#2 in the Top
                   500 List of Supercomputer Sites with a maximal LINPACK
                   benchmark performance of 1.059 PFlop/s and a theoretical peak
                   performance of 1.3814 PFlop/s. Annually, 80 percent of
                   Jaguar's resources are allocated through the U.S. Department
                   of Energy's Innovative and Novel Computational Impact on
                   Theory and Experiment (INCITE) program, a competitively
                   selected, peer reviewed process open to researchers from
                   universities, industry, government and non-profit
                   organizations. These allocations address some of the most
                   challenging scientific problems in areas such as climate
                   modeling, renewable energy, materials science, fusion and
                   combustion. In conjunction with NCCS, the Computer Science
                   and Mathematics Division at ORNL performs basic and applied
                   research in HPC, mathematics, and intelligent systems. This
                   talk gives a summary of the HPC research performed at ORNL.
                   It provides details about the Jaguar peta-scale computing
                   resource, an overview of the computational science research
                   carried out using ORNL's computing resources, and a
                   description of various computer science efforts targeting
                   solutions for next-generation HPC systems."
}

% Invited talk: modular redundancy in HPC, 1st HPC Resiliency Summit at
% LACSS 2008. Only the ppt-derived PDF is linked (no separate paper PDF).
@misc{engelmann08modular,
  author        = "Christian Engelmann",
  title         = "Modular Redundancy in HPC Systems: Why, Where, When and How?",
  month         = oct # "~15, ",
  year          = "2008",
  howpublished  = "{Invited talk at the $1^{st}$ HPC Resiliency Summit: Workshop
                   on Resiliency for Petascale HPC 2008, in conjunction with the
                   \href{http://www.lanl.gov/conferences/lacss/2008}{$1^{st}$
                   Los Alamos Computer Science Symposium (LACSS) 2008}, Santa
                   Fe, NM}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann08modular.ppt.pdf",
  abstract      = "The continuing growth in high-performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). With only very few
                   exceptions, the availability of recently installed systems
                   has been lower in comparison to the same deployment phase of
                   their predecessors. As a result, sites lower allowable job
                   run times in order to force applications to store
                   intermediate results (checkpoints) as insurance against lost
                   computation time. However, checkpoints themselves waste
                   valuable computation time and resources. In contrast to the
                   experienced loss of availability, the demand for continuous
                   availability has risen dramatically with the trend towards
                   capability computing, which drives the race for scientific
                   discovery by running applications on the fastest machines
                   available while desiring significant amounts of time (weeks
                   and months) without interruption. These machines must be able
                   to run in the event of frequent interrupts in such a manner
                   that the capability is not severely degraded. Thus, research
                   and development of scalable RAS technologies is paramount to
                   the success of future extreme-scale systems. This talk
                   summarizes our past accomplishments, ongoing work, and future
                   plans in the area of high-level RAS for HPC."
}

% Invited talk: HPC resiliency via computational redundancy, CGCTW 2008,
% Cancun, Mexico.
@misc{engelmann08resiliency,
  author        = "Christian Engelmann",
  title         = "Resiliency for High-Performance Computing",
  month         = apr # "~10-12, ",
  year          = "2008",
  howpublished  = "{Invited talk at the
                   \href{http://acet.rdg.ac.uk/events/details/cancun.php}
                   {$2^{nd}$ Collaborative and Grid Computing Technologies
                   Workshop (CGCTW) 2008}, Cancun, Mexico}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann08resiliency.ppt.pdf",
  abstract      = "In order to address anticipated high failure rates,
                   resiliency characteristics have become an urgent priority for
                   next-generation high-performance computing (HPC) systems. One
                   major source of concern are non-recoverable soft errors,
                   i.e., bit flips in memory, cache, registers, and logic. The
                   probability of such errors not only grows with system size,
                   but also with increasing architectural vulnerability caused
                   by employing accelerators and by shrinking nanometer
                   technology. Reactive fault tolerance technologies, such as
                   checkpoint/restart, are unable to handle high failure rates
                   due to associated overheads, while proactive resiliency
                   technologies, such as preemptive migration, simply fail as
                   random soft errors can't be predicted. This talk proposes a
                   new, bold direction in resiliency for HPC as it targets
                   resiliency for next-generation extreme-scale HPC systems at
                   the system software level through computational redundancy
                   strategies, i.e., dual- and triple-modular redundancy."
}

% Seminar: advanced fault tolerance for HPC, LAAS-CNRS, Toulouse, Feb 2008.
% French institution names use BibTeX accent escapes (\`e) per classic-BibTeX
% ASCII convention.
@misc{engelmann08advanced,
  author        = "Christian Engelmann",
  title         = "Advanced Fault Tolerance Solutions for High Performance
                   Computing",
  month         = feb # "~11, ",
  year          = "2008",
  howpublished  = "{Seminar at the \href{http://www.laas.fr}{Laboratoire
                   d'Analyse et d'Architecture des Syst\`emes},
                   \href{http://www.cnrs.fr}{Centre National de la Recherche
                   Scientifique}, Toulouse, France}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann08advanced.ppt.pdf",
  abstract      = "The continuing growth in high performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). With only very few
                   exceptions, the availability of recently installed systems
                   has been lower in comparison to the same deployment phase of
                   their predecessors. As a result, sites lower allowable job
                   run times in order to force applications to store
                   intermediate results (checkpoints) as insurance against lost
                   computation time. However, checkpoints themselves waste
                   valuable computation time and resources. In contrast to the
                   experienced loss of availability, the demand for continuous
                   availability has risen dramatically with the trend towards
                   capability computing, which drives the race for scientific
                   discovery by running applications on the fastest machines
                   available while desiring significant amounts of time (weeks
                   and months) without interruption. These machines must be able
                   to run in the event of frequent interrupts in such a manner
                   that the capability is not severely degraded. Thus, research
                   and development of scalable RAS technologies is paramount to
                   the success of future extreme-scale systems. This talk
                   summarizes our accomplishments in the area of high-level RAS
                   for HPC, such as developed concepts and implemented
                   proof-of-concept prototypes, and describes existing
                   limitations, such as performance issues, which need to be
                   dealt with for production-type deployment."
}

% Seminar: service-level high availability, University of Reading, Oct 2007.
@misc{engelmann07service,
  author        = "Christian Engelmann",
  title         = "Service-Level High Availability in Parallel and Distributed
                   Systems",
  month         = oct # "~10, ",
  year          = "2007",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann07service.pdf",
  abstract      = "As service-oriented architectures become more important in
                   parallel and distributed computing systems, individual
                   service instance reliability as well as appropriate service
                   redundancy are essential to increase overall system
                   availability. This talk focuses on redundancy strategies
                   using service-level replication techniques. An overview of
                   existing programming models for service-level high
                   availability is presented and their differences,
                   similarities, advantages, and disadvantages are discussed.
                   Recent advances in providing service-level symmetric
                   active/active high availability are discussed. While the
                   primary target of the presented research is high availability
                   for service nodes in tightly-coupled extreme-scale
                   high-performance computing (HPC) systems, it is also
                   applicable to loosely-coupled distributed computing
                   scenarios."
}

% Invited talk: advanced fault tolerance for HPC, WTTC 2007 (Jun 8 talk; a
% near-identical entry engelmann07advanced covers the Jun 4-5 dates at the
% same workshop -- intentional, two separate presentations).
% Fix: corrected the city name "Khon Kean" -> "Khon Kaen" (Thailand); all
% other fields unchanged.
@misc{engelmann07advanced2,
  author        = "Christian Engelmann",
  title         = "Advanced Fault Tolerance Solutions for High Performance
                   Computing",
  month         = jun # "~8, ",
  year          = "2007",
  howpublished  = "{Invited talk at the
                   \href{http://www.thaigrid.or.th/wttc2007}{Workshop on Trends,
                   Technologies and Collaborative Opportunities in High
                   Performance and Grid Computing (WTTC) 2007}, Khon Kaen,
                   Thailand}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann07advanced2.ppt.pdf",
  abstract      = "The continuing growth in high performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). With only very few
                   exceptions, the availability of recently installed systems
                   has been lower in comparison to the same deployment phase of
                   their predecessors. As a result, sites lower allowable job
                   run times in order to force applications to store
                   intermediate results (checkpoints) as insurance against lost
                   computation time. However, checkpoints themselves waste
                   valuable computation time and resources. In contrast to the
                   experienced loss of availability, the demand for continuous
                   availability has risen dramatically with the trend towards
                   capability computing, which drives the race for scientific
                   discovery by running applications on the fastest machines
                   available while desiring significant amounts of time (weeks
                   and months) without interruption. These machines must be able
                   to run in the event of frequent interrupts in such a manner
                   that the capability is not severely degraded. Thus, research
                   and development of scalable RAS technologies is paramount to
                   the success of future extreme-scale systems. This talk
                   summarizes our accomplishments in the area of high-level RAS
                   for HPC, such as developed concepts and implemented
                   proof-of-concept prototypes, and describes existing
                   limitations, such as performance issues, which need to be
                   dealt with for production-type deployment."
}

% Invited talk at WTTC 2007, Khon Kaen, Thailand.
% Fix: venue city corrected from "Khon Kean" to "Khon Kaen".
@misc{engelmann07advanced,
  author        = "Christian Engelmann",
  title         = "Advanced Fault Tolerance Solutions for High Performance
                   Computing",
  month         = jun # "~4-5, ",
  year          = "2007",
  howpublished  = "{Invited talk at the
                   \href{http://www.thaigrid.or.th/wttc2007}{Workshop on Trends,
                   Technologies and Collaborative Opportunities in High
                   Performance and Grid Computing (WTTC) 2007}, Khon Kaen,
                   Thailand}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann07advanced.ppt.pdf",
  abstract      = "The continuing growth in high performance computing (HPC)
                   system scale poses a challenge for system software and
                   scientific applications with respect to reliability,
                   availability and serviceability (RAS). With only very few
                   exceptions, the availability of recently installed systems
                   has been lower in comparison to the same deployment phase of
                   their predecessors. As a result, sites lower allowable job
                   run times in order to force applications to store
                   intermediate results (checkpoints) as insurance against lost
                   computation time. However, checkpoints themselves waste
                   valuable computation time and resources. In contrast to the
                   experienced loss of availability, the demand for continuous
                   availability has risen dramatically with the trend towards
                   capability computing, which drives the race for scientific
                   discovery by running applications on the fastest machines
                   available while desiring significant amounts of time (weeks
                   and months) without interruption. These machines must be
                   able to run in the event of frequent interrupts in such a
                   manner that the capability is not severely degraded. Thus,
                   research and development of scalable RAS technologies is
                   paramount to the success of future extreme-scale systems.
                   This talk summarizes our accomplishments in the area of
                   high-level RAS for HPC, such as developed concepts and
                   implemented proof-of-concept prototypes, and describes
                   existing limitations, such as performance issues, which
                   need to be dealt with for production-type deployment."
}

% Seminar talk at Johannes Kepler University, Linz, Austria, April 2007.
@misc{engelmann07operating,
  author        = "Christian Engelmann",
  title         = "Operating System Research at {ORNL}: {S}ystem-level
                   Virtualization",
  month         = apr # "~10, ",
  year          = "2007",
  howpublished  = "{Seminar at the \href{http://www.gup.uni-linz.ac.at}
                   {Institute of Graphics and Parallel Processing},
                   \href{http://www.uni-linz.ac.at}{Johannes Kepler University},
                   Linz, Austria}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann07operating.ppt.pdf",
  abstract      = "The emergence of virtualization enabled hardware, such as the
                   latest generation AMD and Intel processors, has raised
                   significant interest in High Performance Computing (HPC)
                   community. In particular, system-level virtualization
                   provides an opportunity to advance the design and development
                   of operating systems, programming environments,
                   administration practices, and resource management tools. This
                   leads to some potential research topics for HPC, such as
                   failure tolerance, system management, and solutions for
                   application porting to new HPC platforms. This talk will
                   present an overview of the research in System-level
                   Virtualization taking place by the Systems Research Team in
                   the Computer Science Research Group at Oak Ridge National
                   Laboratory."
}

% Seminar talk at the University of Reading, UK, March 2007.
@misc{engelmann07towards,
  author        = "Christian Engelmann",
  title         = "Towards High Availability for High-Performance Computing
                   System Services: {A}ccomplishments and Limitations",
  month         = mar # "~14, ",
  year          = "2007",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann07towards.pdf",
  abstract      = "During the last several years, our teams at Oak Ridge
                   National Laboratory, Louisiana Tech University, and Tennessee
                   Technological University focused on efficient redundancy
                   strategies for head and service nodes of high-performance
                   computing (HPC) systems in order to pave the way for high
                   availability (HA) in HPC. These nodes typically run critical
                   HPC system services, like job and resource management, and
                   represent single points of failure and control for an entire
                   HPC system. The overarching goal of our research is to
                   provide high-level reliability, availability, and
                   serviceability (RAS) for HPC systems by combining HA and HPC
                   technology. This talk summarizes our accomplishments, such as
                   developed concepts and implemented proof-of-concept
                   prototypes, and describes existing limitations, such as
                   performance issues, which need to be dealt with for
                   production-type deployment."
}

% Seminar talk at the University of Reading, UK, June 2006.
% Fix: stray backtick in "today`s" replaced with an ASCII apostrophe.
@misc{engelmann06high,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = jun # "~9, ",
  year          = "2006",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann06high.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors, such as the IBM Blue Gene/L and the
                   Cray X1, is the potential inability to identify problems and
                   take preemptive action before a failure impacts a running
                   job. In fact, in systems of this scale, predictions estimate
                   the mean time to interrupt in terms of hours. Current
                   solutions for fault-tolerance in HEC focus on dealing with
                   the result of a failure. However, most are unable to handle
                   runtime system configuration changes caused by failures and
                   require a complete restart of essential system services
                   (e.g. MPI) or even of the entire machine. High availability
                   (HA) computing strives to avoid the problems of unexpected
                   failures through preemptive measures. There are various
                   techniques to implement high availability. In contrast to
                   active/hot-standby high availability with its fail-over
                   model, active/active high availability with its virtual
                   synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of our research is to expand today's effort in HA for HEC,
                   so that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   This talk will present an overview of recent research at Oak
                   Ridge National Laboratory in high availability solutions for
                   ultra-scale scientific high-end computing."
}

% Seminar talk at Johannes Kepler University, Linz, Austria, April 2006.
% Fix: non-ASCII curly apostrophes replaced with ASCII ' (classic BibTeX
% is 8-bit only; Unicode punctuation can garble sorting and output).
@misc{scott06advancing,
  author        = "Stephen L. Scott
                   and Christian Engelmann",
  title         = "Advancing Reliability, Availability and Serviceability for
                   High-Performance Computing",
  month         = apr # "~19, ",
  year          = "2006",
  howpublished  = "{Seminar at the \href{http://www.gup.uni-linz.ac.at}
                   {Institute of Graphics and Parallel Processing},
                   \href{http://www.uni-linz.ac.at}{Johannes Kepler University},
                   Linz, Austria}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/scott06advancing.ppt.pdf",
  abstract      = "Today's high performance computing systems have several
                   reliability deficiencies resulting in noticeable availability
                   and serviceability issues. For example, head and service
                   nodes represent a single point of failure and control for an
                   entire system as they render it inaccessible and unmanageable
                   in case of a failure until repair, causing a significant
                   downtime. Furthermore, current solutions for fault-tolerance
                   focus on dealing with the result of a failure. However, most
                   are unable to transparently mask runtime system configuration
                   changes caused by failures and require a complete restart of
                   essential system services, such as MPI, in case of a failure.
                   High availability computing strives to avoid the problems of
                   unexpected failures through preemptive measures. The overall
                   goal of our research is to expand today's effort in high
                   availability for high-performance computing, so that systems
                   can be kept alive by an OS runtime environment that
                   understands the concepts of dynamic system configuration and
                   degraded operation mode. This talk will present an overview
                   of recent research performed at Oak Ridge National Laboratory
                   in collaboration with Louisiana Tech University, North
                   Carolina State University and the University of Reading in
                   developing core technologies and proof-of-concept prototypes
                   that improve the overall reliability, availability and
                   serviceability of high-performance computing systems."
}

% Seminar talk at the University of Reading, UK, October 2005.
% Fix: stray backtick in "today`s" replaced with an ASCII apostrophe.
@misc{engelmann05high4,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = oct # "~18, ",
  year          = "2005",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high4.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors, such as the IBM Blue Gene/L and the
                   Cray X1, is the potential inability to identify problems and
                   take preemptive action before a failure impacts a running
                   job. In fact, in systems of this scale, predictions estimate
                   the mean time to interrupt in terms of hours. Current
                   solutions for fault-tolerance in HEC focus on dealing with
                   the result of a failure. However, most are unable to handle
                   runtime system configuration changes caused by failures and
                   require a complete restart of essential system services (e.g.
                   MPI) or even of the entire machine. High availability (HA)
                   computing strives to avoid the problems of unexpected
                   failures through preemptive measures. There are various
                   techniques to implement high availability. In contrast to
                   active/hot-standby high availability with its fail-over
                   model, active/active high availability with its virtual
                   synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of our research is to expand today's effort in HA for HEC, so
                   that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   This talk will present an overview of recent research at Oak
                   Ridge National Laboratory in high availability solutions for
                   ultra-scale scientific high-end computing."
}

% Seminar talk at Fayetteville State University, NC, September 2005.
% Fix: non-ASCII curly apostrophe in "today's" replaced with ASCII '
% (classic BibTeX is 8-bit only).
@misc{engelmann05high3,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = sep # "~26, ",
  year          = "2005",
  howpublished  = "{Seminar at the \href{http://www.uncfsu.edu/macsc}{Department
                   of Mathematics and Computer Science},
                   \href{http://www.uncfsu.edu}{Fayetteville State University},
                   Fayetteville, NC, USA}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high3.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors, such as the IBM Blue Gene/L and the
                   Cray X1, is the potential inability to identify problems and
                   take preemptive action before a failure impacts a running
                   job. In fact, in systems of this scale, predictions estimate
                   the mean time to interrupt in terms of hours. Current
                   solutions for fault-tolerance in HEC focus on dealing with
                   the result of a failure. However, most are unable to handle
                   runtime system configuration changes caused by failures and
                   require a complete restart of essential system services (e.g.
                   MPI) or even of the entire machine. High availability (HA)
                   computing strives to avoid the problems of unexpected
                   failures through preemptive measures. There are various
                   techniques to implement high availability. In contrast to
                   active/hot-standby high availability with its fail-over
                   model, active/active high availability with its virtual
                   synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of our research is to expand today's effort in HA for HEC, so
                   that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   This talk will present an overview of recent research at Oak
                   Ridge National Laboratory in fault tolerance and high
                   availability solutions for ultra-scale scientific high-end
                   computing."
}

% Seminar talk at the University of Reading, UK, May 2005.
% Fix: non-ASCII curly apostrophe in "today's" replaced with ASCII '
% (classic BibTeX is 8-bit only).
@misc{engelmann05high2,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = may # "~13, ",
  year          = "2005",
  howpublished  = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
                   of Computer Science}, \href{http://www.reading.ac.uk}
                   {University of Reading}, Reading, United Kingdom}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high2.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors, such as the IBM Blue Gene/L and the
                   Cray X1, is the potential inability to identify problems and
                   take preemptive action before a failure impacts a running
                   job. In fact, in systems of this scale, predictions estimate
                   the mean time to interrupt in terms of hours. Current
                   solutions for fault-tolerance in HEC focus on dealing with
                   the result of a failure. However, most are unable to handle
                   runtime system configuration changes caused by failures and
                   require a complete restart of essential system services (e.g.
                   MPI) or even of the entire machine. High availability (HA)
                   computing strives to avoid the problems of unexpected
                   failures through preemptive measures. There are various
                   techniques to implement high availability. In contrast to
                   active/hot-standby high availability with its fail-over
                   model, active/active high availability with its virtual
                   synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of our research is to expand today's effort in HA for HEC,
                   so that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   This talk will present an overview of recent research at Oak
                   Ridge National Laboratory in fault-tolerant heterogeneous
                   metacomputing, advanced super-scalable algorithms and high
                   availability system software for ultra-scale scientific
                   high-end computing."
}

% Seminar talk at Louisiana Tech University, LA, April 2005.
% Fix: non-ASCII curly apostrophe in "today's" replaced with ASCII '
% (classic BibTeX is 8-bit only).
@misc{engelmann05high1,
  author        = "Christian Engelmann",
  title         = "High Availability for Ultra-Scale High-End Scientific
                   Computing",
  month         = apr # "~15, ",
  year          = "2005",
  howpublished  = "{Seminar at the \href{http://cenit.latech.edu}{Center for
                   Entrepreneurship and Information Technology},
                   \href{http://www.latech.edu}{Louisiana Tech University},
                   Ruston, LA, USA}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high1.ppt.pdf",
  abstract      = "A major concern in exploiting ultra-scale architectures for
                   scientific high-end computing (HEC) with tens to hundreds of
                   thousands of processors is the potential inability to
                   identify problems and take preemptive action before a failure
                   impacts a running job. In fact, in systems of this scale,
                   predictions estimate the mean time to interrupt in terms of
                   hours. Current solutions for fault-tolerance in HEC focus on
                   dealing with the result of a failure. However, most are
                   unable to handle runtime system configuration changes caused
                   by failures and require a complete restart of essential
                   system services (e.g. MPI) or even of the entire machine.
                   High availability (HA) computing strives to avoid the
                   problems of unexpected failures through preemptive measures.
                   There are various techniques to implement high availability.
                   In contrast to active/hot-standby high availability with its
                   fail-over model, active/active high availability with its
                   virtual synchrony model is superior in many areas including
                   scalability, throughput, availability and responsiveness.
                   However, it is significantly more complex. The overall goal
                   of this research is to expand today's effort in HA for HEC,
                   so that systems that have the ability to hot-swap hardware
                   components can be kept alive by an OS runtime environment
                   that understands the concept of dynamic system configuration.
                   With the aim of addressing the future challenges of high
                   availability in ultra-scale HEC, this project intends to
                   develop a proof-of-concept implementation of an active/active
                   high availability system software framework."
}

% Invited talk at SIAM PP 2004, San Francisco, CA, USA.
@misc{engelmann04diskless,
  author        = "Christian Engelmann",
  title         = "Diskless Checkpointing on Super-scale Architectures --
                   {A}pplied to the Fast Fourier Transform",
  month         = feb # "~25, ",
  year          = "2004",
  howpublished  = "{Invited talk at the \href{http://www.siam.org/meetings/pp04}
                   {$11^{th}$ SIAM Conference on Parallel Processing for
                   Scientific Computing (SIAM PP) 2004}, San Francisco, CA,
                   USA}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann04diskless.ppt.pdf",
  abstract      = "This talk discusses the issue of fault-tolerance in
                   distributed computer systems with tens or hundreds of
                   thousands of diskless processor units. Such systems, like the
                   IBM Blue Gene/L, are predicted to be deployed in the next
                   five to ten years. Since a 100,000-processor system is going
                   to be less reliable, scientific applications need to be able
                   to recover from occurring failures more efficiently. In this
                   paper, we adapt the present technique of diskless
                   checkpointing to such huge distributed systems in order to
                   equip existing scientific algorithms with super-scalable
                   fault-tolerance. First, we discuss the method of diskless
                   checkpointing, then we adapt this technique to super-scale
                   architectures and finally we present results from an
                   implementation of the Fast Fourier Transform that uses the
                   adapted technique to achieve super-scale fault-tolerance."
}

% Seminar talk at ORNL's Computer Science and Mathematics Division, January 2004.
@misc{engelmann04superscalable,
  author        = "Christian Engelmann",
  title         = "Super-scalable Algorithms -- {N}ext Generation Supercomputing
                   on 100,000 and more Processors",
  month         = jan # "~29, ",
  year          = "2004",
  howpublished  = "{Seminar at the \href{http://www.csm.ornl.gov}{Computer
                   Science and Mathematics Division}, \href{http://www.ornl.gov}
                   {Oak Ridge National Laboratory}, Oak Ridge, TN, USA}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann04superscalable.ppt.pdf",
  abstract      = "This talk discusses recent research into the issues and
                   potential problems of algorithm scalability and
                   fault-tolerance on next-generation high-performance computer
                   systems with tens and even hundreds of thousands of
                   processors. Such massively parallel computers, like the IBM
                   Blue Gene/L, are going to be deployed in the next five to ten
                   years and existing deficiencies in scalability and
                   fault-tolerance need to be addressed soon. Scientific
                   algorithms have shown poor scalability on 10,000-processor
                   systems that exist today. Furthermore, future systems will be
                   less reliable due to the large number of components.
                   Super-scalable algorithms, which have the properties of scale
                   invariance and natural fault-tolerance, are able to get the
                   correct answer despite multiple task failures and without
                   checkpointing. We will show that such algorithms exist for a
                   wide variety of problems, such as finite difference, finite
                   element, multigrid and global maximum. Despite these
                   findings, traditional algorithms may still be preferred due
                   to their known behavior, or simply because a super-scalable
                   algorithm does not exist or is hard to find for a particular
                   problem. In this case, we propose a peer-to-peer diskless
                   checkpointing algorithm that can provide scale invariant
                   fault-tolerance."
}

% Seminar talk at North Carolina State University, Raleigh, NC, USA.
% NOTE(review): the key suffix "03" disagrees with year = "2004" (dated
% Feb 11, 2004) -- confirm which is intended. The key is left unchanged
% here because renaming it would break existing \cite references.
@misc{engelmann03distributed,
  author        = "Christian Engelmann",
  title         = "Distributed Peer-to-Peer Control for {Harness}",
  month         = feb # "~11, ",
  year          = "2004",
  howpublished  = "{Seminar at the \href{http://www.csc.ncsu.edu}{Department of
                   Computer Science}, \href{http://www.ncsu.edu}{North Carolina
                   State University}, Raleigh, NC, USA}",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann03distributed.ppt.pdf",
  abstract      = "Harness is an adaptable fault-tolerant virtual machine
                   environment for next-generation heterogeneous distributed
                   computing developed as a follow on to PVM. It additionally
                   enables the assembly of applications from plug-ins and
                   provides fault-tolerance. This work describes the distributed
                   control, which manages global state replication to ensure a
                   high-availability of service. Group communication services
                   achieve an agreement on an initial global state and a linear
                   history of global state changes at all members of the
                   distributed virtual machine. This global state is replicated
                   to all members to easily recover from single, multiple and
                   cascaded faults. A peer-to-peer ring network architecture and
                   tunable multi-point failure conditions provide heterogeneity
                   and scalability. Finally, the integration of the distributed
                   control into the multi-threaded kernel architecture of
                   Harness offers a fault-tolerant global state database service
                   for plug-ins and applications."
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theses
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% PhD thesis, University of Reading, 2008 (research performed at ORNL).
% The nonstandard "url2" field (silently ignored by standard BibTeX styles)
% presumably links the defense slides -- confirm against the site generator.
@phdthesis{engelmann08symmetric3,
  author        = "Christian Engelmann",
  title         = "Symmetric Active/Active High Availability for
                   High-Performance Computing System Services",
  year          = "2008",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Advisor: Prof. Vassil N. Alexandrov (University of Reading)",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric3.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric3.ppt.pdf",
  abstract      = "In order to address anticipated high failure rates,
                   reliability, availability and serviceability have become an
                   urgent priority for next-generation high-performance
                   computing (HPC) systems. This thesis aims to pave the way for
                   highly available HPC systems by focusing on their most
                   critical components and by reinforcing them with appropriate
                   high availability solutions. Service components, such as head
                   and service nodes, are the Achilles heel of a HPC system.
                   A failure typically results in a complete system-wide outage.
                   This thesis targets efficient software state replication
                   mechanisms for service component redundancy to achieve high
                   availability as well as high performance. Its methodology
                   relies on defining a modern theoretical foundation for
                   providing service-level high availability, identifying
                   availability deficiencies of HPC systems, and comparing
                   various service-level high availability methods. This thesis
                   showcases several developed proof-of-concept prototypes
                   providing high availability for services running on HPC head
                   and service nodes using the symmetric active/active
                   replication method, i.e., state-machine replication, to
                   complement prior work in this area using active/standby and
                   asymmetric active/active configurations. Presented
                   contributions include a generic taxonomy for service high
                   availability, an insight into availability deficiencies of
                   HPC systems, and a unified definition of service-level high
                   availability methods. Further contributions encompass a fully
                   functional symmetric active/active high availability
                   prototype for a HPC job and resource management service that
                   does not require modification of service, a fully functional
                   symmetric active/active high availability prototype for a HPC
                   parallel file system metadata service that offers high
                   performance, and two preliminary prototypes for a transparent
                   symmetric active/active replication software framework for
                   client-service and dependent service scenarios that hide the
                   replication infrastructure from clients and services.
                   Assuming a mean-time to failure of 5,000 hours for a head or
                   service node, all presented prototypes improve service
                   availability from 99.285\% to 99.995\% in a two-node system,
                   and to 99.99996\% with three nodes."
}

@mastersthesis{engelmann01distributed2,
  author        = "Christian Engelmann",
  title         = "Distributed Peer-to-Peer Control for {Harness}",
  month         = jul # "~7, ",
  year          = "2001",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany. Advisors: Prof. Vassil N. Alexandrov (University of
                   Reading); George A. (Al) Geist (Oak Ridge National
                   Laboratory)",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed2.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed2.ppt.pdf",
  abstract      = "Parallel processing, the method of cutting down a large
                   computational problem into many small tasks which are solved
                   in parallel, is a field of increasing importance in science.
                   Cost-effective, flexible and efficient simulations of
                   mathematical models of physical, chemical or biological
                   real-world problems are replacing the traditional
                   experimental research. Current software solutions for
                   parallel and scientific computation, like Parallel Virtual
                   Machine and Message Passing Interface, have limitations in
                   handling faults and failures, in utilizing heterogeneous and
                   dynamically changing communication structures, and in
                   enabling migrating or cooperative applications. The current
                   research in heterogeneous adaptable reconfigurable networked
                   systems (Harness) aims to produce the next generation of
                   software solutions for distributed computing. A
                   high-available and light-weighted distributed virtual
                   machine service provides an encapsulation of a few hundred
                   to a few thousand physical machines in a virtual
                   heterogeneous large scale cluster. A high availability of
                   a service in distributed systems can be achieved by
                   replication of the service state on multiple server
                   processes. If one or more server processes fail, the
                   surviving ones continue to provide the service because they
                   know the state. Since every member of a distributed virtual
                   machine is part of the distributed virtual machine service
                   state and is able to change this state, a distributed control
                   is needed to replicate the state and maintain its
                   consistency. This distributed control manages state changes
                   as well as the state-replication and the detection of and
                   recovery from faults and failures of server processes. This
                   work analyzes system architectures currently used in
                   heterogeneous distributed computing by defining terms,
                   conditions and assumptions. It shows that such systems are
                   asynchronous and may use partially synchronous communication
                   to detect and to distinguish different classes of faults and
                   failures. It describes how a high availability of a large
                   scale distributed service on a huge number of servers
                   residing on different geographical locations can be realized.
                   Asynchronous group communication services, such as Reliable
                   Broadcast, Atomic Broadcast, Distributed Agreement and
                   Membership, are analyzed to develop linear scalable
                   algorithms in a unidirectional and in a bidirectional
                   connected asynchronous peer-to-peer ring architecture.
                   A Transaction Control group communication service is
                   introduced as state-replication service. The system analysis
                   distinguishes different types of distributed systems, where
                   active transactions execute state changes using
                   non-replicated data of one or more servers and inactive
                   transactions report state changes using replicated data only.
                   It is applicable for passive fault-tolerant distributed
                   databases as well as for active fault-tolerant distributed
                   control mechanisms. No control token is used and time stamps
                   are avoided, so that all members of a server group have equal
                   responsibilities and are independent from the system time.
                   A prototype which implements the most complicated Transaction
                   Control algorithm is realized due to the complexity of the
                   distributed system and the early development stage of the
                   introduced algorithms. The prototype is used to obtain
                   practical experience with the state-replication algorithm."
}

@mastersthesis{engelmann01distributed,
  author        = "Christian Engelmann",
  title         = "Distributed Peer-to-Peer Control for {Harness}",
  month         = feb # "~23, ",
  year          = "2001",
  school        = "\href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK. Advisors: Prof. Uwe Metzler (Technical College
                   for Engineering and Economics (FHTW) Berlin); George A. (Al)
                   Geist (Oak Ridge National Laboratory)",
  url           = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed.ppt.pdf",
  abstract      = "Parallel processing, the method of cutting down a large
                   computational problem into many small tasks which are solved
                   in parallel, is a field of increasing importance in science.
                   Cost-effective, flexible and efficient simulations of
                   mathematical models of physical, chemical or biological
                   real-world problems are replacing the traditional
                   experimental research. Current software solutions for
                   parallel and scientific computation, like Parallel Virtual
                   Machine and Message Passing Interface, have limitations in
                   handling faults and failures, in utilizing heterogeneous and
                   dynamically changing communication structures, and in
                   enabling migrating or cooperative applications. The current
                   research in heterogeneous adaptable reconfigurable networked
                   systems (Harness) aims to produce the next generation of
                   software solutions for distributed computing. A
                   high-available and light-weighted distributed virtual
                   machine service provides an encapsulation of a few hundred
                   to a few thousand physical machines in a virtual
                   heterogeneous large scale cluster. A high availability of
                   a service in distributed systems can be achieved by
                   replication of the service state on multiple server
                   processes. If one or more server processes fail, the
                   surviving ones continue to provide the service because they
                   know the state. Since every member of a distributed virtual
                   machine is part of the distributed virtual machine service
                   state and is able to change this state, a distributed control
                   is needed to replicate the state and maintain its
                   consistency. This distributed control manages state changes
                   as well as the state-replication and the detection of and
                   recovery from faults and failures of server processes. This
                   work analyzes system architectures currently used in
                   heterogeneous distributed computing by defining terms,
                   conditions and assumptions. It shows that such systems are
                   asynchronous and may use partially synchronous communication
                   to detect and to distinguish different classes of faults and
                   failures. It describes how a high availability of a large
                   scale distributed service on a huge number of servers
                   residing on different geographical locations can be realized.
                   Asynchronous group communication services, such as Reliable
                   Broadcast, Atomic Broadcast, Distributed Agreement and
                   Membership, are analyzed to develop linear scalable
                   algorithms in a unidirectional and in a bidirectional
                   connected asynchronous peer-to-peer ring architecture.
                   A Transaction Control group communication service is
                   introduced as state-replication service. The system analysis
                   distinguishes different types of distributed systems, where
                   active transactions execute state changes using
                   non-replicated data of one or more servers and inactive
                   transactions report state changes using replicated data only.
                   It is applicable for passive fault-tolerant distributed
                   databases as well as for active fault-tolerant distributed
                   control mechanisms. No control token is used and time stamps
                   are avoided, so that all members of a server group have equal
                   responsibilities and are independent from the system time.
                   A prototype which implements the most complicated Transaction
                   Control algorithm is realized due to the complexity of the
                   distributed system and the early development stage of the
                   introduced algorithms. The prototype is used to obtain
                   practical experience with the state-replication algorithm."
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Co-advised Theses
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@mastersthesis{koenning07virtualized,
  author        = "Bj{\"o}rn K{\"o}nning",
  title         = "Virtualized Environments for the {Harness Workbench}",
  month         = mar # "~14, ",
  year          = "2007",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Advisors: Prof. Vassil N. Alexandrov (University of Reading);
                   Christian Engelmann (Oak Ridge National Laboratory)",
  url           = "http://www.csm.ornl.gov/~engelman/students/koenning07virtualized.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/students/koenning07virtualized.ppt.pdf",
  abstract      = "The expanded use of computational sciences today leads to a
                   significant need of high performance computing systems. High
                   performance computing is currently undergoing vigorous
                   revival, and multiple efforts are underway to develop much
                   faster computing systems in the near future. New software
                   tools are required for the efficient use of petascale
                   computing systems. With the new Harness Workbench Project
                   the Oak Ridge National Laboratory intends to develop an
                   appropriate development and runtime environment for high
                   performance computing platforms. This dissertation project
                   is part of the Harness Workbench Project, and deals with the
                   development of a concept for virtualised environments and
                   various approaches to create and describe them. The developed
                   virtualisation approach is based on the \verb|chroot|
                   mechanism and uses platform-independent environment
                   descriptions. File structures and environment variables are
                   emulated to provide the portability of computational software
                   over diverse high performance computing platforms. Security
                   measures and sandbox characteristics are integrable."
}

@mastersthesis{weber07high,
  author        = "Matthias Weber",
  title         = "High Availability for the {Lustre} File System",
  month         = mar # "~14, ",
  year          = "2007",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany. Advisors: Prof. Vassil N. Alexandrov (University of
                   Reading); Christian Engelmann (Oak Ridge National
                   Laboratory)",
  url           = "http://www.csm.ornl.gov/~engelman/students/weber07high.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/students/weber07high.ppt.pdf",
  abstract      = "With the growing importance of high performance computing
                   and, more importantly, the fast growing size of sophisticated
                   high performance computing systems, research in the area of
                   high availability is essential to meet the needs to sustain
                   the current growth. This Master thesis project aims to
                   improve the availability of Lustre. Major concern of this
                   project is the metadata server of the file system. The
                   metadata server of Lustre suffers from the last single point
                   of failure in the file system. To overcome this single point
                   of failure an active/active high availability approach is
                   introduced. The new file system design with multiple MDS
                   nodes running in virtual synchrony leads to a significant
                   increase of availability. Two prototype implementations aim
                   to show how the proposed system design and its new realized
                   form of symmetric active/active high availability can be
                   accomplished in practice. The results of this work point out
                   the difficulties in adapting the file system to the
                   active/active high availability design. Tests identify
                   unachieved functionality and show performance problems of the
                   proposed solution. The findings of this dissertation may be
                   used for further work on high availability for distributed
                   file systems."
}

@mastersthesis{baumann06design,
  author        = "Ronald Baumann",
  title         = "Design and Development of Prototype Components for the
                   {Harness} High-Performance Computing Workbench",
  month         = mar # "~6, ",
  year          = "2006",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany. Advisors: Prof. Vassil N. Alexandrov (University of
                   Reading); George A. (Al) Geist and Christian Engelmann (Oak
                   Ridge National Laboratory)",
  url           = "http://www.csm.ornl.gov/~engelman/students/baumann06design.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/students/baumann06design.ppt.pdf",
  abstract      = "This master thesis examines plug-in technology, especially
                   the new field of parallel plug-ins. Plug-ins are popular
                   because they extend the capabilities of software packages
                   such as browsers and Photoshop, and allow an individual user
                   to add new functionality. Parallel plug-ins also provide the
                   above capabilities to a distributed set of resources, i.e.,
                   a plug-in now becomes a set of coordinating plug-ins. Second,
                   the set of plug-ins may be heterogeneous either in function
                   or because the underlying resources are heterogeneous. This
                   new dimension of complexity provides a rich research space
                   which is explored in this thesis. Experiences are collected
                   and presented as parallel plug-in paradigms and concepts. The
                   Harness framework was used in this project, in particular the
                   plug-in manager and available communication capabilities.
                   Plug-ins provide methods for users to extend Harness
                   according to their requirements. The result of this thesis is
                   a parallel plug-in paradigm and template for Harness. Users
                   of the Harness environment will be able to design and
                   implement their applications in the form of parallel plug-ins
                   easier and faster by using the paradigm resulting from this
                   project. Prototypes were implemented which handle different
                   aspects of parallel plug-ins. Parallel plug-in configurations
                   were tested on an appropriate number of Harness kernels,
                   including available communication and error-handling
                   capabilities. Furthermore, research was done in the area of
                   fault tolerance while parallel plug-ins are (un)loaded, as
                   well as while a task is performed."
}

@mastersthesis{uhlemann06high,
  author        = "Kai Uhlemann",
  title         = "High Availability for High-End Scientific Computing",
  month         = mar # "~6, ",
  year          = "2006",
  school        = "\href{http://www.cs.reading.ac.uk}{Department of Computer
                   Science}, \href{http://www.reading.ac.uk}{University of
                   Reading}, UK",
  note          = "Thesis research performed at Oak Ridge National Laboratory.
                   Double diploma in conjunction with the
                   \href{http://www.f1.fhtw-berlin.de}{Department of
                   Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
                   College for Engineering and Economics (FHTW) Berlin},
                   Germany. Advisors: Prof. Vassil N. Alexandrov (University of
                   Reading); George A. (Al) Geist and Christian Engelmann (Oak
                   Ridge National Laboratory)",
  url           = "http://www.csm.ornl.gov/~engelman/students/uhlemann06high.pdf",
  url2          = "http://www.csm.ornl.gov/~engelman/students/uhlemann06high.ppt.pdf",
  abstract      = "With the growing interest and popularity in high performance
                   cluster computing and, more importantly, the fast growing
                   size of compute clusters, research in the area of high
                   availability is essential to meet the needs to sustain the
                   current growth. This Master thesis project introduces a new
                   approach for high availability focusing on the head node of a
                   cluster system. This project's focus is on providing high
                   availability to the job scheduler service, which is the most
                   vital part of the traditional Beowulf-style cluster
                   architecture. This research seeks to add high availability to
                   the job scheduler service and resource management system,
                   typically running on the head node, leading to a significant
                   increase of availability for cluster computing. Also, this
                   software project takes advantage of the virtual synchrony
                   paradigm to achieve active/active replication, the highest
                   form of high availability. A proof-of-concept implementation
                   shows how high availability can be designed in software and
                   what results can be expected of such a system. The results
                   may be reused for future or existing projects to further
                   improve and extend the high availability of compute
                   clusters."
}