% BibTeX Citations
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Journal Publications
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@article{he07unified,
author = "Xubin (Ben) He
and Li Ou
and Martha J. Kosa
and Stephen L. Scott
and Christian Engelmann",
title = "A Unified Multiple-Level Cache for High Performance Cluster
Storage Systems",
journal = "\href{http://www.inderscience.com/browse/index.php?journalcode=ijhpcn}
{International Journal of High Performance Computing and
Networking (IJHPCN)}",
volume = "5",
number = "1--2",
pages = "97--109",
year = "2007",
publisher = "\href{http://www.inderscience.com}{Inderscience
Publishers, Geneve, Switzerland}",
issn = "1740-0562",
doi = "10.1504/IJHPCN.2007.015768",
url = "http://www.csm.ornl.gov/~engelman/publications/he07unified.pdf",
abstract = "Highly available data storage for high-performance computing
is becoming increasingly more critical as high-end computing
systems scale up in size and storage systems are developed
around network-centered architectures. A promising solution
is to harness the collective storage potential of individual
workstations much as we harness idle CPU cycles due to the
excellent price/performance ratio and low storage usage of
most commodity workstations. For such a storage system,
metadata consistency is a key issue assuring storage system
availability as well as data reliability. In this paper, we
present a decentralized metadata management scheme that
improves storage availability without sacrificing
performance."
}
@article{engelmann06symmetric,
author = "Christian Engelmann
and Stephen L. Scott
and Chokchai (Box) Leangsuksun
and Xubin (Ben) He",
title = "Symmetric Active/Active High Availability for
High-Performance Computing System Services",
journal = "\href{http://www.academypublisher.com/jcp}{Journal of
Computers (JCP)}",
volume = "1",
number = "8",
pages = "43--54",
year = "2006",
publisher = "\href{http://www.academypublisher.com}{Academy
Publisher, Oulu, Finland}",
issn = "1796-203X",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann06symmetric.pdf",
url2 = "http://www.academypublisher.com/jcp/vol01/no08/jcp01084354.html",
abstract = "This work aims to pave the way for high availability in
high-performance computing (HPC) by focusing on efficient
redundancy strategies for head and service nodes. These nodes
represent single points of failure and control for an entire
HPC system as they render it inaccessible and unmanageable in
case of a failure until repair. The presented approach
introduces two distinct replication methods, internal and
external, for providing symmetric active/active high
availability for multiple redundant head and service nodes
running in virtual synchrony utilizing an existing process
group communication system for service group membership
management and reliable, totally ordered message delivery.
Presented results of a prototype implementation that offers
symmetric active/active replication for HPC job and resource
management using external replication show that the highest
level of availability can be provided with an acceptable
performance trade-off."
}
@article{engelmann06molar,
author = "Christian Engelmann
and Stephen L. Scott
and David E. Bernholdt
and Narasimha R. Gottumukkala
and Chokchai (Box) Leangsuksun
and Jyothish Varma
and Chao Wang
and Frank Mueller
and Aniruddha G. Shet
and Ponnuswamy (Saday) Sadayappan",
title = "{MOLAR}: {A}daptive Runtime Support for High-End Computing
Operating and Runtime Systems",
journal = "\href{http://www.sigops.org/osr.html}{ACM SIGOPS Operating
Systems Review (OSR)}",
volume = "40",
number = "2",
pages = "63--72",
year = "2006",
publisher = "\href{http://www.acm.org}{ACM Press, New York, NY,
USA}",
issn = "0163-5980",
doi = "10.1145/1131322.1131337",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann06molar.pdf",
abstract = "MOLAR is a multi-institutional research effort that
concentrates on adaptive, reliable, and efficient operating
and runtime system (OS/R) solutions for ultra-scale,
high-end scientific computing on the next generation of
supercomputers. This research addresses the challenges
outlined in FAST-OS (forum to address scalable technology for
runtime and operating systems) and HECRTF (high-end computing
revitalization task force) activities by exploring the use of
advanced monitoring and adaptation to improve application
performance and predictability of system interruptions, and
by advancing computer reliability, availability and
serviceability (RAS) management systems to work cooperatively
with the OS/R to identify and preemptively resolve system
issues. This paper describes recent research of the MOLAR
team in advancing RAS for high-end computing OS/Rs."
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Conference Publications
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@inproceedings{engelmann09evaluating,
author = "Christian Engelmann
and Hong H. Ong
and Stephen L. Scott",
title = "Evaluating the Shared Root File System Approach for Diskless
High-Performance Computing Systems",
booktitle = "Proceedings of the
\href{http://www.linuxclustersinstitute.org/conferences}
{$10^{th}$ LCI International Conference on High-Performance
Clustered Computing (LCI) 2009}",
month = mar # "~9-12, ",
year = "2009",
address = "Boulder, CO, USA",
note = "To appear",
abstract = "Diskless high-performance computing (HPC) systems utilizing
networked storage have become popular in the last several
years. Removing disk drives significantly increases compute
node reliability as they are known to be a major source of
failures. Furthermore, networked storage solutions utilizing
parallel I/O and replication are able to provide increased
scalability and availability. Reducing a compute node to
processor(s), memory and network interface(s) greatly reduces
its physical size, which in turn allows for large-scale dense
HPC solutions. However, one major obstacle is the requirement
by certain operating systems (OSs), such as Linux, for a root
file system. While one solution is to remove this requirement
from the OS, another is to share the root file system over
the networked storage. This paper evaluates three networked
file system solutions, NFSv4, Lustre and PVFS2, with respect
to their performance, scalability, and availability features
for servicing a common root file system in a diskless HPC
configuration. Our findings indicate that Lustre is a viable
solution as it meets both, scaling and performance
requirements. However, certain availability issues regarding
single points of failure and control need to be considered."
}
@inproceedings{engelmann09proactive,
author = "Christian Engelmann
and Geoffroy R. Vall\'ee
and Thomas Naughton
and Stephen L. Scott",
title = "Proactive Fault Tolerance Using Preemptive Migration",
booktitle = "Proceedings of the \href{http://www.pdp2009.org}{$17^{th}$
Euromicro International Conference on Parallel, Distributed,
and network-based Processing (PDP) 2009}",
month = feb # "~18-20, ",
year = "2009",
address = "Weimar, Germany",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
note = "To appear",
abstract = "Proactive fault tolerance (FT) in high-performance computing
is a concept that prevents compute node failures from
impacting running parallel applications by preemptively
migrating application parts away from nodes that are about
to fail. This paper provides a foundation for proactive FT by
defining its architecture and classifying implementation
options. This paper further relates prior work to the
presented architecture and classification, and discusses the
challenges ahead for needed supporting technologies."
}
@inproceedings{valentini09high,
author = "Alessandro Valentini
and Christian Di Biagio
and Fabrizio Batino
and Guido Pennella
and Fabrizio Palma
and Christian Engelmann",
title = "High Performance Computing with {Harness} over {InfiniBand}",
booktitle = "Proceedings of the \href{http://www.pdp2009.org}{$17^{th}$
Euromicro International Conference on Parallel, Distributed,
and network-based Processing (PDP) 2009}",
month = feb # "~18-20, ",
year = "2009",
address = "Weimar, Germany",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
note = "To appear",
abstract = "Harness is an adaptable and plug-in-based middleware
framework able to support distributed parallel computing. By
now, it is based on the Ethernet protocol which cannot
guarantee high performance throughput and Real Time
(determinism) performance. During last years, both the
research and industry environments have developed both new
network architectures (InfiniBand, Myrinet, iWARP, etc.) to
avoid those limits. This paper concerns the integration
between Harness and InfiniBand focusing on two solutions: IP
over InfiniBand (IPoIB) and Socket Direct Protocol (SDP)
technology. Those allow Harness middleware to take advantage
of the enhanced features provided by InfiniBand."
}
@inproceedings{engelmann09case,
author = "Christian Engelmann
and Hong H. Ong
and Stephen L. Scott",
title = "The Case for Modular Redundancy in Large-Scale High
Performance Computing Systems",
booktitle = "Proceedings of the
\href{http://www.iasted.org/conferences/home-641.html}
{$27^{th}$ IASTED International Conference on Parallel and
Distributed Computing and Networks (PDCN) 2009}",
month = feb # "~16-18, ",
year = "2009",
address = "Innsbruck, Austria",
publisher = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB,
Canada}",
note = "To appear",
abstract = "Recent investigations into resilience of large-scale
high-performance computing (HPC) systems showed a continuous
trend of decreasing reliability and availability. Newly
installed systems have a lower mean-time to failure (MTTF)
and a higher mean-time to recover (MTTR) than their
predecessors. Modular redundancy is being used in many
mission critical systems today to provide for resilience,
such as for aerospace and command \& control systems. The
primary argument against modular redundancy for resilience
in HPC has always been that the capability of a HPC system,
and respective return on investment, would be significantly
reduced. We argue that modular redundancy can significantly
increase compute node availability as it removes the impact
of scale from single compute node MTTR. We further argue that
single compute nodes can be much less reliable, and therefore
less expensive, and still be highly available, if their
MTTR/MTTF ratio is maintained."
}
@inproceedings{wang08proactive,
author = "Chao Wang
and Frank Mueller
and Christian Engelmann
and Stephen L. Scott",
title = "Proactive Process-Level Live Migration in {HPC}
Environments",
booktitle = "Proceedings of the \href{http://sc08.supercomputing.org}
{IEEE/ACM International Conference on High Performance
Computing, Networking, Storage and Analysis (SC) 2008}",
month = nov # "~15-21, ",
year = "2008",
address = "Austin, TX, USA",
publisher = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
isbn = "978-1-4244-2835-9",
doi = "10.1145/1413370.1413414",
url = "http://www.csm.ornl.gov/~engelman/publications/wang08proactive.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/wang08proactive.ppt.pdf",
abstract = "As the number of nodes in high-performance computing
environments keeps increasing, faults are becoming common
place. Reactive fault tolerance (FT) often does not scale due
to massive I/O requirements and relies on manual job
resubmission. This work complements reactive with proactive
FT at the process level. Through health monitoring, a subset
of node failures can be anticipated when one's health
deteriorates. A novel process-level live migration mechanism
supports continued execution of applications during much of
processes migration. This scheme is integrated into an MPI
execution environment to transparently sustain
health-inflicted node failures, which eradicates the need to
restart and requeue MPI jobs. Experiments indicate that 1-6.5
seconds of prior warning are required to successfully trigger
live process migration while similar operating system
virtualization mechanisms require 13-24 seconds. This
self-healing approach complements reactive FT by nearly
cutting the number of checkpoints in half when 70\% of the
faults are handled proactively."
}
@inproceedings{vallee08virtual,
author = "Geoffroy R. Vall\'ee
and Thomas Naughton
and Hong H. Ong
and Anand Tikotekar
and Christian Engelmann
and Wesley Bland
and Ferrol Aderholt
and Stephen L. Scott",
title = "Virtual System Environments",
booktitle = "Lecture Notes in Computer Science: Proceedings of the
\href{http://www.dmtf.org/svm08}{$2^{nd}$ DMTF Academic
Alliance Workshop on Systems and Virtualization Management:
Standards and New Technologies (SVM) 2008}",
month = oct # "~21-22, ",
year = "2008",
address = "Munich, Germany",
publisher = "\href{http://www.springer.com}{Springer Verlag, Berlin,
Germany}",
url = "http://www.csm.ornl.gov/~engelman/publications/vallee08virtual.pdf",
note = "To appear",
abstract = "Distributed and parallel systems are typically managed with
static settings: the operating system (OS) and the runtime
environment (RTE) are specified at a given time and cannot be
changed to fit an application's needs. This means that every
time application developers want to use their application on
a new execution platform, the application has to be ported to
this new environment, which may be expensive in terms of
application modifications and developer time. However, the
science resides in the applications and not in the OS or the
RTE. Therefore, it should be beneficial to adapt the OS and
the RTE to the application instead of adapting the
applications to the OS and the RTE. This document presents
the concept of Virtual System Environments (VSE), which
enables application developers to specify and create a
virtual environment that properly fits their application's
needs. For that four challenges have to be addressed: (i)
definition of the VSE itself by the application developers,
(ii) deployment of the VSE, (iii) system administration for
the platform, and (iv) protection of the platform from the
running VSE. We therefore present an integrated tool for the
definition and deployment of VSEs on top of traditional and
virtual (i.e., using system-level virtualization) execution
platforms. This tool provides the capability to choose the
degree of delegation for system administration tasks and the
degree of protection from the application (e.g., using
virtual machines). To summarize, the VSE concept enables the
customization of the OS/RTE used for the execution of
application by users without compromising local system
administration rules and execution platform protection
constraints."
}
@inproceedings{tikotekar08analysis,
author = "Anand Tikotekar
and Geoffroy Vall\'ee
and Thomas Naughton
and Hong H. Ong
and Christian Engelmann
and Stephen L. Scott",
title = "An Analysis of {HPC} Benchmark Applications in Virtual
Machine Environments",
booktitle = "Lecture Notes in Computer Science: Proceedings of the
\href{http://europar2008.caos.uab.es}{$14^{th}$ European
Conference on Parallel and Distributed Computing (Euro-Par)
2008}: \href{http://scilytics.com/vhpc}{$3^{rd}$ Workshop on
Virtualization in High-Performance Cluster and Grid Computing
(VHPC) 2008}",
month = aug # "~26-29, ",
year = "2008",
address = "Las Palmas de Gran Canaria, Spain",
publisher = "\href{http://www.springer.com}{Springer Verlag, Berlin,
Germany}",
url = "http://www.csm.ornl.gov/~engelman/publications/tikotekar08analysis.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/tikotekar08analysis.ppt.pdf",
note = "To appear",
abstract = "Virtualization technology has been gaining acceptance in the
scientific community due to its overall flexibility in
running HPC applications. It has been reported that a
specific class of applications is better suited to a
particular type of virtualization scheme or implementation.
For example, Xen has been shown to perform with little
overhead for compute-bound applications. Such a study,
although useful, does not allow us to generalize conclusions
beyond the performance analysis of that application which is
explicitly executed. An explanation of why the generalization
described above is difficult, may be due to the versatility
in applications, which leads to different overheads in
virtual environments. For example, two similar applications
may spend disproportionate amount of time in their respective
library code when run in virtual environments. In this paper,
we aim to study such potential causes by investigating the
behavior and identifying patterns of various overheads for
HPC benchmark applications. Based on the investigation of the
overhead profiles for different benchmarks, we aim to address
questions such as: Are the overhead profiles for a particular
type of benchmarks (such as compute-bound) similar or are
there grounds to conclude otherwise?"
}
@inproceedings{engelmann08symmetric2,
author = "Christian Engelmann
and Stephen L. Scott
and Chokchai (Box) Leangsuksun
and Xubin (Ben) He",
title = "Symmetric Active/Active High Availability for
High-Performance Computing System Services: Accomplishments
and Limitations",
booktitle = "Proceedings of the
\href{http://www.ens-lyon.fr/LIP/RESO/ccgrid2008}{$8^{th}$
IEEE International Symposium on Cluster Computing and the
Grid (CCGrid) 2008}:
\href{http://xcr.cenit.latech.edu/resilience2008}{Workshop on
Resiliency in High Performance Computing (Resilience) 2008}",
pages = "813--818",
month = may # "~19-22, ",
year = "2008",
address = "Lyon, France",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
isbn = "978-0-7695-3156-4",
doi = "10.1109/CCGRID.2008.78",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric2.pdf",
abstract = "This paper summarizes our efforts over the last 3-4 years in
providing symmetric active/active high availability for
high-performance computing (HPC) system services. This work
paves the way for high-level reliability, availability and
serviceability in extreme-scale HPC systems by focusing on
the most critical components, head and service nodes, and by
reinforcing them with appropriate high availability
solutions. This paper presents our accomplishments in the
form of concepts and respective prototypes, discusses
existing limitations, outlines possible future work, and
describes the relevance of this research to other, planned
efforts."
}
@inproceedings{chen08online,
author = "Xin Chen
and Benjamin Eckart
and Xubin (Ben) He
and Christian Engelmann
and Stephen L. Scott",
title = "An Online Controller Towards Self-Adaptive File System
Availability and Performance",
booktitle = "Proceedings of the
\href{http://xcr.cenit.latech.edu/hapcw2008}{$5^{th}$ High
Availability and Performance Workshop (HAPCW) 2008}, in
conjunction with the \href{http://www.hpcsw.org}{$1^{st}$
High-Performance Computer Science Week (HPCSW) 2008}",
month = apr # "~3-4, ",
year = "2008",
address = "Denver, CO, USA",
url = "http://www.csm.ornl.gov/~engelman/publications/chen08online.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/chen08online.ppt.pdf",
abstract = "At the present time, it can be a significant challenge to
build a large-scale distributed file system that
simultaneously maintains both high availability and high
performance. Although many fault tolerance technologies have
been proposed and used in both commercial and academic
distributed file systems to achieve high availability, most
of them typically sacrifice performance for higher system
availability. Additionally, recent studies show that system
availability and performance are related to the system
workload. In this paper, we analyze the correlations among
availability, performance, and workloads based on a
replication strategy, and we discuss the trade off between
availability and performance with different workloads. Our
analysis leads to the design of an online controller that can
dynamically achieve optimal performance and availability by
tuning the system replication policy."
}
@inproceedings{tikotekar08effects,
author = "Anand Tikotekar
and Geoffroy Vall\'ee
and Thomas Naughton
and Hong H. Ong
and Christian Engelmann
and Stephen L. Scott
and Anthony M. Filippi",
title = "Effects of Virtualization on a Scientific Application --
{R}unning a Hyperspectral Radiative Transfer Code on Virtual
Machines",
booktitle = "Proceedings of the
\href{http://www.csm.ornl.gov/srt/hpcvirt08}{$2^{nd}$
Workshop on System-level Virtualization for High Performance
Computing (HPCVirt) 2008}, in conjunction with the
\href{http://www.eurosys.org/2008}{$3^{rd}$ ACM SIGOPS
European Conference on Computer Systems (EuroSys) 2008}",
month = mar # "~31, ",
year = "2008",
address = "Glasgow, UK",
url = "http://www.csm.ornl.gov/~engelman/publications/tikotekar08effects.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/tikotekar08effects.ppt.pdf",
abstract = "The topic of system-level virtualization has recently begun
to receive interest for high performance computing (HPC).
This is in part due to the isolation and encapsulation
offered by the virtual machine. These traits enable
applications to customize their environments and maintain
consistent software configurations in their virtual domains.
Additionally, there are mechanisms that can be used for fault
tolerance like live virtual machine migration. Given these
attractive benefits to virtualization, a fundamental question
arises, how does this effect my scientific application? We
use this as the premise for our paper and observe a
real-world scientific code running on a Xen virtual machine.
We studied the effects of running a radiative transfer
simulation, Hydrolight, on a virtual machine. We discuss our
methodology and report observations regarding the usage of
virtualization with this application."
}
@inproceedings{engelmann08symmetric,
author = "Christian Engelmann
and Stephen L. Scott
and Chokchai (Box) Leangsuksun
and Xubin (Ben) He",
title = "Symmetric Active/Active Replication for Dependent Services",
booktitle = "Proceedings of the
\href{http://www.ares-conference.eu/ares2008}{$3^{rd}$
International Conference on Availability, Reliability and
Security (ARES) 2008}",
pages = "260--267",
month = mar # "~4-7, ",
year = "2008",
address = "Barcelona, Spain",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
isbn = "978-0-7695-3102-1",
doi = "10.1109/ARES.2008.64",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric.ppt.pdf",
abstract = "During the last several years, we have established the
symmetric active/active replication model for service-level
high availability and implemented several proof-of-concept
prototypes. One major deficiency of our model is its
inability to deal with dependent services, since its original
architecture is based on the client-service model. This paper
extends our model to dependent services using its already
existing mechanisms and features. The presented concept is
based on the idea that a service may also be a client of
another service, and multiple services may be clients of each
other. A high-level abstraction is used to illustrate
dependencies between clients and services, and to decompose
dependencies between services into respective client-service
dependencies. This abstraction may be used for providing
high availability in distributed computing systems with
complex service-oriented architectures."
}
@inproceedings{vallee08framework,
author = "Geoffroy R. Vall\'ee
and Kulathep Charoenpornwattana
and Christian Engelmann
and Anand Tikotekar
and Chokchai (Box) Leangsuksun
and Thomas Naughton
and Stephen L. Scott",
title = "A Framework For Proactive Fault Tolerance",
booktitle = "Proceedings of the
\href{http://www.ares-conference.eu/ares2008}{$3^{rd}$
International Conference on Availability, Reliability and
Security (ARES) 2008}",
pages = "659--664",
month = mar # "~4-7, ",
year = "2008",
address = "Barcelona, Spain",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
isbn = "978-0-7695-3102-1",
doi = "10.1109/ARES.2008.171",
url = "http://www.csm.ornl.gov/~engelman/publications/vallee08framework.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/vallee08framework.ppt.pdf",
abstract = "Fault tolerance is a major concern to guarantee availability
of critical services as well as application execution.
Traditional approaches for fault tolerance include
checkpoint/restart or duplication. However it is also
possible to anticipate failures and proactively take action
before failures occur in order to minimize failure impact on
the system and application execution. This document presents
a proactive fault tolerance framework. This framework can use
different proactive fault tolerance mechanisms, i.e.
migration and pause/unpause. The framework also allows the
implementation of new proactive fault tolerance policies
thanks to a modular architecture. A first proactive fault
tolerance policy has been implemented and preliminary
experimentations have been done based on system-level
virtualization and compared with results obtained by
simulation."
}
@inproceedings{koenning08virtualized,
author = "Bj{\"o}rn K{\"o}nning
and Christian Engelmann
and Stephen L. Scott
and George A. (Al) Geist",
title = "Virtualized Environments for the {Harness} High Performance
Computing Workbench",
booktitle = "Proceedings of the \href{http://www.pdp2008.org}{$16^{th}$
Euromicro International Conference on Parallel, Distributed,
and network-based Processing (PDP) 2008}",
pages = "133--140",
month = feb # "~13-15, ",
year = "2008",
address = "Toulouse, France",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
isbn = "978-0-7695-3089-5",
doi = "10.1109/PDP.2008.14",
url = "http://www.csm.ornl.gov/~engelman/publications/koenning08virtualized.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/koenning08virtualized.ppt.pdf",
abstract = "This paper describes recent accomplishments in providing a
virtualized environment concept and prototype for scientific
application development and deployment as part of the Harness
High Performance Computing (HPC) Workbench research effort.
The presented work focuses on tools and mechanisms that
simplify scientific application development and deployment
tasks, such that only minimal adaptation is needed when
moving from one HPC system to another or after HPC system
upgrades. The overall technical approach focuses on the
concept of adapting the HPC system environment to the actual
needs of individual scientific applications instead of the
traditional scheme of adapting scientific applications to
individual HPC system environment properties. The presented
prototype implementation is based on the mature and
lightweight chroot virtualization approach for Unix-type
systems with a focus on virtualized file system structure
and virtualized shell environment variables utilizing
virtualized environment configuration descriptions in
Extensible Markup Language (XML) format. The presented work
can be easily extended to other virtualization technologies,
such as system-level virtualization solutions using
hypervisors."
}
@inproceedings{vallee08system,
author = "Geoffroy R. Vall\'ee
and Thomas Naughton
and Christian Engelmann
and Hong H. Ong
and Stephen L. Scott",
title = "System-level Virtualization for High Performance Computing",
booktitle = "Proceedings of the \href{http://www.pdp2008.org}{$16^{th}$
Euromicro International Conference on Parallel, Distributed,
and network-based Processing (PDP) 2008}",
pages = "636--643",
month = feb # "~13-15, ",
year = "2008",
address = "Toulouse, France",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
isbn = "978-0-7695-3089-5",
doi = "10.1109/PDP.2008.85",
url = "http://www.csm.ornl.gov/~engelman/publications/vallee08system.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/vallee08system.ppt.pdf",
abstract = "System-level virtualization has been a research topic since
the 70's but regained popularity during the past few years
because of the availability of efficient solution such as Xen
and the implementation of hardware support in commodity
processors (e.g. Intel-VT, AMD-V). However, a majority of
system-level virtualization projects is guided by the server
consolidation market. As a result, current virtualization
solutions appear to not be suitable for high performance
computing (HPC) which is typically based on large-scale
systems. On another hand there is significant interest in
exploiting virtual machines (VMs) within HPC for a number of
other reasons. By virtualizing the machine, one is able to
run a variety of operating systems and environments as needed
by the applications. Virtualization allows users to isolate
workloads, improving security and reliability. It is also
possible to support non-native environments and/or legacy
operating environments through virtualization. In addition,
it is possible to balance work loads, use migration
techniques to relocate applications from failing machines,
and isolate fault systems for repair. This document presents
the challenges for the implementation of a system-level
virtualization solution for HPC. It also presents a brief
survey of the different approaches and techniques to address
these challenges."
}
@inproceedings{ou07symmetric,
author = "Li Ou
and Christian Engelmann
and Xubin (Ben) He
and Xin Chen
and Stephen L. Scott",
title = "Symmetric Active/Active Metadata Service for Highly Available
Cluster Storage Systems",
booktitle = "Proceedings of the
\href{http://www.iasted.org/conferences/home-590.html}
{$19^{th}$ IASTED International Conference on Parallel and
Distributed Computing and Systems (PDCS) 2007}",
month = nov # "~19-21, ",
year = "2007",
address = "Cambridge, MA, USA",
publisher = "\href{http://www.actapress.com}{ACTA Press, Calgary, AB,
Canada}",
isbn = "978-0-88986-703-1",
url = "http://www.csm.ornl.gov/~engelman/publications/ou07symmetric.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/ou07symmetric.ppt.pdf",
abstract = "In a typical distributed storage system, metadata is stored
and managed by dedicated metadata servers. One way to improve
the availability of distributed storage systems is to deploy
multiple metadata servers. Past research focused on the
active/standby model, where each active server has at least
one redundant idle backup. However, interruption of service
and loss of service state may occur during a fail-over
depending on the used replication technique. The research in
this paper targets the symmetric active/active replication
model using multiple redundant service nodes running in
virtual synchrony. In this model, service node failures do
not cause a fail-over to a backup and there is no disruption
of service or loss of service state. We propose a fast
delivery protocol to reduce the latency of total order
broadcast. Our prototype implementation shows that high
availability of metadata servers can be achieved with an
acceptable performance trade-off using the active/active
metadata server solution."
}
@inproceedings{disaverio07distributed,
author    = "Emanuele Di Saverio
             and Marco Cesati
             and Christian Di Biagio
             and Guido Pennella
             and Christian Engelmann",
title     = "Distributed Real-Time Computing with {Harness}",
booktitle = "Lecture Notes in Computer Science: Proceedings of the
             \href{http://pvmmpi07.lri.fr}{$14^{th}$ European PVM/MPI
             Users' Group Meeting (EuroPVM/MPI) 2007}",
volume    = "4757",
pages     = "281--288",
month     = sep # "~30 - " # oct # "~3, ",
year      = "2007",
address   = "Paris, France",
publisher = "\href{http://www.springer.com}{Springer Verlag, Berlin,
             Germany}",
isbn      = "978-3-540-75415-2",
issn      = "0302-9743",
doi       = "10.1007/978-3-540-75416-9_39",
url       = "http://www.csm.ornl.gov/~engelman/publications/disaverio07distributed.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/disaverio07distributed.ppt.pdf",
abstract  = "Modern parallel and distributed computing solutions are often
             built onto a middleware software layer providing a higher
             and common level of service between computational nodes.
             Harness is an adaptable, plugin-based middleware framework
             for parallel and distributed computing. This paper reports
             recent research and development results of using Harness for
             real-time distributed computing applications in the context
             of an industrial environment with the needs to perform
             several safety critical tasks. The presented work exploits
             the modular architecture of Harness in conjunction with a
             lightweight threaded implementation to resolve several
             real-time issues by adding three new Harness plug-ins to
             provide a prioritized lightweight execution environment, low
             latency communication facilities, and local timestamped event
             logging."
}
@inproceedings{ou07fast,
author    = "Li Ou
             and Xubin (Ben) He
             and Christian Engelmann
             and Stephen L. Scott",
title     = "A Fast Delivery Protocol for Total Order Broadcasting",
booktitle = "Proceedings of the \href{http://www.icccn.org/icccn07}
             {$16^{th}$ IEEE International Conference on Computer
             Communications and Networks (ICCCN) 2007}",
pages     = "730--734",
month     = aug # "~13-16, ",
year      = "2007",
address   = "Honolulu, HI, USA",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
             Alamitos, CA, USA}",
isbn      = "978-1-42441-251-8",
issn      = "1095-2055",
doi       = "10.1109/ICCCN.2007.4317904",
url       = "http://www.csm.ornl.gov/~engelman/publications/ou07fast.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/ou07fast.ppt.pdf",
abstract  = "Sequencer, privilege-based, and communication history
             algorithms are popular approaches to implement total
             ordering, where communication history algorithms are most
             suitable for parallel computing systems, because they provide
             best performance under heavy work load. Unfortunately,
             post-transmission delay of communication history algorithms
             is most apparent when a system is idle. In this paper, we
             propose a fast delivery protocol to reduce the latency of
             message ordering. The protocol optimizes the total ordering
             process by waiting for messages only from a subset of the
             machines in the group, and by fast acknowledging messages on
             behalf of other machines. Our test results indicate that the
             fast delivery protocol is suitable for both idle and heavy
             load systems, while reducing the latency of message
             ordering."
}
@inproceedings{nagarajan07proactive,
author    = "Arun B. Nagarajan
             and Frank Mueller
             and Christian Engelmann
             and Stephen L. Scott",
title     = "Proactive Fault Tolerance for {HPC} with {Xen}
             Virtualization",
booktitle = "Proceedings of the \href{http://ics07.ac.upc.edu}{$21^{st}$
             ACM International Conference on Supercomputing (ICS) 2007}",
pages     = "23--32",
month     = jun # "~16-20, ",
year      = "2007",
address   = "Seattle, WA, USA",
publisher = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
isbn      = "978-1-59593-768-1",
doi       = "10.1145/1274971.1274978",
url       = "http://www.csm.ornl.gov/~engelman/publications/nagarajan07proactive.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/nagarajan07proactive.ppt.pdf",
abstract  = "Large-scale parallel computing is relying increasingly on
             clusters with thousands of processors. At such large counts
             of compute nodes, faults are becoming common place. Current
             techniques to tolerate faults focus on reactive schemes to
             recover from faults and generally rely on a
             checkpoint/restart mechanism. Yet, in today's systems, node
             failures can often be anticipated by detecting a
             deteriorating health status. Instead of a reactive scheme for
             fault tolerance (FT), we are promoting a proactive one where
             processes automatically migrate from unhealthy nodes to
             healthy ones. Our approach relies on operating system
             virtualization techniques exemplified by but not limited to
             Xen. This paper contributes an automatic and transparent
             mechanism for proactive FT for arbitrary MPI applications.
             It leverages virtualization techniques combined with health
             monitoring and load-based migration. We exploit Xen's live
             migration mechanism for a guest operating system (OS) to
             migrate an MPI task from a health-deteriorating node to a
             healthy one without stopping the MPI task during most of the
             migration. Our proactive FT daemon orchestrates the tasks of
             health monitoring, load determination and initiation of guest
             OS migration. Experimental results demonstrate that live
             migration hides migration costs and limits the overhead to
             only a few seconds making it an attractive approach to
             realize FT in HPC systems. Overall, our enhancements make
             proactive FT a valuable asset for long-running MPI
             application that is complementary to reactive FT using full
             checkpoint/restart schemes since checkpoint frequencies can
             be reduced as fewer unanticipated failures are encountered.
             In the context of OS virtualization, we believe that this is
             the first comprehensive study of proactive fault tolerance
             where live migration is actually triggered by health
             monitoring."
}
@inproceedings{engelmann07middleware,
author    = "Christian Engelmann
             and Hong H. Ong
             and Stephen L. Scott",
title     = "Middleware in Modern High Performance Computing System
             Architectures",
booktitle = "Lecture Notes in Computer Science: Proceedings of the
             \href{http://www.iccs-meeting.org/iccs2007}{$7^{th}$
             International Conference on Computational Science (ICCS)
             2007}, Part II: \href{http://www.gup.uni-linz.ac.at/cce2007}
             {$4^{th}$ Special Session on Collaborative and Cooperative
             Environments (CCE) 2007}",
volume    = "4488",
pages     = "784--791",
month     = may # "~27-30, ",
year      = "2007",
address   = "Beijing, China",
publisher = "\href{http://www.springer.com}{Springer Verlag, Berlin,
             Germany}",
isbn      = "3-5407-2585-5",
issn      = "0302-9743",
doi       = "10.1007/978-3-540-72586-2_111",
url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann07middleware.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/engelmann07middleware.ppt.pdf",
abstract  = "A recent trend in modern high performance computing (HPC)
             system architectures employs lean compute nodes running a
             lightweight operating system (OS). Certain parts of the OS as
             well as other system software services are moved to service
             nodes in order to increase performance and scalability. This
             paper examines the impact of this HPC system architecture
             trend on HPC middleware software solutions, which
             traditionally equip HPC systems with advanced features, such
             as parallel and distributed programming models, appropriate
             system resource management mechanisms, remote application
             steering and user interaction techniques. Since the approach
             of keeping the compute node software stack small and simple
             is orthogonal to the middleware concept of adding missing OS
             features between OS and application, the role and
             architecture of middleware in modern HPC systems needs to be
             revisited. The result is a paradigm shift in HPC middleware
             design, where single middleware services are moved to service
             nodes, while runtime environments (RTEs) continue to reside
             on compute nodes."
}
@inproceedings{engelmann07transparent,
author    = "Christian Engelmann
             and Stephen L. Scott
             and Chokchai (Box) Leangsuksun
             and Xubin (Ben) He",
title     = "Transparent Symmetric Active/Active Replication for
             Service-Level High Availability",
booktitle = "Proceedings of the \href{http://ccgrid07.lncc.br}{$7^{th}$
             IEEE International Symposium on Cluster Computing and the
             Grid (CCGrid) 2007}: \href{http://www.lri.fr/~fedak/gp2pc-07}
             {$7^{th}$ International Workshop on Global and Peer-to-Peer
             Computing (GP2PC) 2007}",
pages     = "755--760",
month     = may # "~14-17, ",
year      = "2007",
address   = "Rio de Janeiro, Brazil",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
             Alamitos, CA, USA}",
isbn      = "0-7695-2833-3",
doi       = "10.1109/CCGRID.2007.116",
url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann07transparent.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/engelmann07transparent.ppt.pdf",
abstract  = "As service-oriented architectures become more important in
             parallel and distributed computing systems, individual
             service instance reliability as well as appropriate service
             redundancy becomes an essential necessity in order to
             increase overall system availability. This paper focuses on
             providing redundancy strategies using service-level
             replication techniques. Based on previous research using
             symmetric active/active replication, this paper proposes a
             transparent symmetric active/active replication approach that
             allows for more reuse of code between individual
             service-level replication implementations by using a virtual
             communication layer. Service- and client-side interceptors
             are utilized in order to provide total transparency. Clients
             and servers are unaware of the replication infrastructure as
             it provides all necessary mechanisms internally."
}
@inproceedings{engelmann07programming,
author    = "Christian Engelmann
             and Stephen L. Scott
             and Chokchai (Box) Leangsuksun
             and Xubin (Ben) He",
title     = "On Programming Models for Service-Level High Availability",
booktitle = "Proceedings of the
             \href{http://www.ares-conference.eu/ares2007}{$2^{nd}$
             International Conference on Availability, Reliability and
             Security (ARES) 2007}",
pages     = "999--1006",
month     = apr # "~10-13, ",
year      = "2007",
address   = "Vienna, Austria",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
             Alamitos, CA, USA}",
isbn      = "0-7695-2775-2",
doi       = "10.1109/ARES.2007.109",
url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann07programming.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/engelmann07programming.ppt.pdf",
abstract  = "This paper provides an overview of existing programming
             models for service-level high availability and investigates
             their differences, similarities, advantages, and
             disadvantages. Its goal is to help to improve reuse of code
             and to allow adaptation to quality of service requirements by
             using a uniform programming model description. It further
             aims at encouraging a discussion about these programming
             models and their provided quality of service, such as
             availability, performance, serviceability, usability, and
             applicability. Within this context, the presented research
             focuses on providing high availability for services running
             on head and service nodes of high-performance computing
             systems."
}
@inproceedings{wang07job,
author    = "Chao Wang
             and Frank Mueller
             and Christian Engelmann
             and Stephen L. Scott",
title     = "A Job Pause Service under {LAM/MPI+BLCR} for Transparent
             Fault Tolerance",
booktitle = "Proceedings of the \href{http://www.ipdps.org/ipdps2007}
             {$21^{st}$ IEEE International Parallel and Distributed
             Processing Symposium (IPDPS) 2007}",
pages     = "1--10",
month     = mar # "~26-30, ",
year      = "2007",
address   = "Long Beach, CA, USA",
publisher = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
isbn      = "978-1-59593-768-1",
doi       = "10.1109/IPDPS.2007.370307",
url       = "http://www.csm.ornl.gov/~engelman/publications/wang07job.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/wang07job.ppt.pdf",
abstract  = "Checkpoint/restart (C/R) has become a requirement for
             long-running jobs in large-scale clusters due to a
             mean-time-to-failure (MTTF) in the order of hours. After a
             failure, C/R mechanisms generally require a complete restart
             of an MPI job from the last checkpoint. A complete restart,
             however, is unnecessary since all but one node are typically
             still alive. Furthermore, a restart may result in lengthy job
             requeuing even though the original job had not exceeded its
             time quantum. In this paper, we overcome these shortcomings.
             Instead of job restart, we have developed a transparent
             mechanism for job pause within LAM/MPI+BLCR. This mechanism
             allows live nodes to remain active and roll back to the last
             checkpoint while failed nodes are dynamically replaced by
             spares before resuming from the last checkpoint. Our
             methodology includes LAM/MPI enhancements in support of
             scalable group communication with fluctuating number of
             nodes, reuse of network connections, transparent coordinated
             checkpoint scheduling and a BLCR enhancement for job pause.
             Experiments in a cluster with the NAS Parallel Benchmark
             suite show that our overhead for job pause is comparable to
             that of a complete job restart. A minimal overhead of 5.6\%
             is only incurred in case migration takes place while the
             regular checkpoint overhead remains unchanged. Yet, our
             approach alleviates the need to reboot the LAM run-time
             environment, which accounts for considerable overhead
             resulting in net savings of our scheme in the experiments.
             Our solution further provides full transparency and
             automation with the additional benefit of reusing existing
             resources. Execution continues after failures within the
             scheduled job, \textit{i.e.}, the application staging
             overhead is not incurred again in contrast to a restart.
             Our scheme offers additional potential for savings through
             incremental checkpointing and proactive diskless live
             migration, which we are currently working on."
}
@inproceedings{engelmann07configurable,
author    = "Christian Engelmann
             and Stephen L. Scott
             and Hong H. Ong
             and Geoffroy R. Vall\'ee
             and Thomas Naughton",
title     = "Configurable Virtualized System Environments for High
             Performance Computing",
booktitle = "Proceedings of the
             \href{http://www.csm.ornl.gov/srt/hpcvirt07}{$1^{st}$
             Workshop on System-level Virtualization for High Performance
             Computing (HPCVirt) 2007}, in conjunction with the
             \href{http://www.eurosys.org/2008}{$2^{nd}$ ACM SIGOPS
             European Conference on Computer Systems (EuroSys) 2007}",
month     = mar # "~20, ",
year      = "2007",
address   = "Lisbon, Portugal",
url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann07configurable.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/engelmann07configurable.ppt.pdf",
abstract  = "Existing challenges for current terascale high performance
             computing (HPC) systems are increasingly hampering the
             development and deployment efforts of system software and
             scientific applications for next-generation petascale
             systems. The expected rapid system upgrade interval toward
             petascale scientific computing demands an incremental
             strategy for the development and deployment of legacy and new
             large-scale scientific applications that avoids excessive
             porting. Furthermore, system software developers as well as
             scientific application developers require access to
             large-scale testbed environments in order to test individual
             solutions at scale. This paper proposes to address these
             issues at the system software level through the development
             of a virtualized system environment (VSE) for scientific
             computing. The proposed VSE approach enables
             plug-and-play supercomputing through
             desktop-to-cluster-to-petaflop computer system-level
             virtualization based on recent advances in hypervisor
             virtualization technologies. This paper describes the VSE
             system architecture in detail, discusses needed tools for
             VSE system management and configuration, and presents
             respective VSE use case scenarios."
}
@inproceedings{engelmann06towards,
author    = "Christian Engelmann
             and Stephen L. Scott
             and Chokchai (Box) Leangsuksun
             and Xubin (Ben) He",
title     = "Towards High Availability for High-Performance Computing
             System Services: {A}ccomplishments and Limitations",
booktitle = "Proceedings of the
             \href{http://xcr.cenit.latech.edu/hapcw2006}{$4^{th}$ High
             Availability and Performance Workshop (HAPCW) 2006}, in
             conjunction with the \href{http://lacsi.krellinst.org}
             {$7^{th}$ Los Alamos Computer Science Institute (LACSI)
             Symposium 2006}",
month     = oct # "~17, ",
year      = "2006",
address   = "Santa Fe, NM, USA",
url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann06towards.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/engelmann06towards.ppt.pdf",
abstract  = "During the last several years, our teams at Oak Ridge
             National Laboratory, Louisiana Tech University, and Tennessee
             Technological University focused on efficient redundancy
             strategies for head and service nodes of high-performance
             computing (HPC) systems in order to pave the way for high
             availability (HA) in HPC. These nodes typically run critical
             HPC system services, like job and resource management, and
             represent single points of failure and control for an entire
             HPC system. The overarching goal of our research is to
             provide high-level reliability, availability, and
             serviceability (RAS) for HPC systems by combining HA and HPC
             technology. This paper summarizes our accomplishments, such
             as developed concepts and implemented proof-of-concept
             prototypes, and describes existing limitations, such as
             performance issues, which need to be dealt with for
             production-type deployment."
}
@inproceedings{ou06achieving,
author    = "Li Ou
             and Xin Chen
             and Xubin (Ben) He
             and Christian Engelmann
             and Stephen L. Scott",
title     = "Achieving Computational {I/O} Efficiency in a High Performance
             Cluster Using Multicore Processors",
booktitle = "Proceedings of the
             \href{http://xcr.cenit.latech.edu/hapcw2006}{$4^{th}$ High
             Availability and Performance Workshop (HAPCW) 2006}, in
             conjunction with the \href{http://lacsi.krellinst.org}
             {$7^{th}$ Los Alamos Computer Science Institute (LACSI)
             Symposium 2006}",
month     = oct # "~17, ",
year      = "2006",
address   = "Santa Fe, NM, USA",
url       = "http://www.csm.ornl.gov/~engelman/publications/ou06achieving.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/ou06achieving.ppt.pdf",
abstract  = "Cluster computing has become one of the most popular
             platforms for high-performance computing today. The recent
             popularity of multicore processors provides a flexible way to
             increase the computational capability of clusters. Although
             the system performance may improve with multicore processors
             in a cluster, I/O requests initiated by multiple cores may
             saturate the I/O bus, and furthermore increase the latency by
             issuing multiple non-contiguous disk accesses. In this
             paper, we propose an asymmetric collective I/O for multicore
             processors to improve multiple non-contiguous accesses. In
             our configuration, one core in each multicore processor is
             designated as the coordinator, and others serve as computing
             cores. The coordinator is responsible for aggregating I/O
             operations from computing cores and submitting a contiguous
             request. The coordinator allocates contiguous memory buffers
             on behalf of other cores to avoid redundant data copies."
}
@inproceedings{uhlemann06joshua,
author    = "Kai Uhlemann
             and Christian Engelmann
             and Stephen L. Scott",
title     = "{JOSHUA}: {S}ymmetric Active/Active Replication for Highly
             Available {HPC} Job and Resource Management",
booktitle = "Proceedings of the \href{http://cluster2006.org}{$8^{th}$
             IEEE International Conference on Cluster Computing (Cluster)
             2006}",
pages     = "1--10",
month     = sep # "~25-28, ",
year      = "2006",
address   = "Barcelona, Spain",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
             Alamitos, CA, USA}",
isbn      = "1-4244-0328-6",
issn      = "1552-5244",
doi       = "10.1109/CLUSTR.2006.311855",
url       = "http://www.csm.ornl.gov/~engelman/publications/uhlemann06joshua.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/uhlemann06joshua.ppt.pdf",
abstract  = "Most of today's HPC systems employ a single head node for
             control, which represents a single point of failure as it
             interrupts an entire HPC system upon failure. Furthermore, it
             is also a single point of control as it disables an entire
             HPC system until repair. One of the most important HPC system
             services running on the head node is the job and resource
             management. If it goes down, all currently running jobs lose
             the service they report back to. They have to be restarted
             once the head node is up and running again. With this paper,
             we present a generic approach for providing symmetric
             active/active replication for highly available HPC job and
             resource management. The JOSHUA solution provides a virtually
             synchronous environment for continuous availability without
             any interruption of service and without any loss of state.
             Replication is performed externally via the PBS service
             interface without the need to modify any service code. Test
             results as well as availability analysis of our
             proof-of-concept prototype implementation show that
             continuous availability can be provided by JOSHUA with an
             acceptable performance trade-off."
}
@inproceedings{baumann06parallel,
author    = "Ronald Baumann
             and Christian Engelmann
             and George A. (Al) Geist",
title     = "A Parallel Plug-in Programming Paradigm",
booktitle = "Lecture Notes in Computer Science: Proceedings of the
             \href{http://hpcc06.lrr.in.tum.de}{$7^{th}$ International
             Conference on High Performance Computing and Communications
             (HPCC) 2006}",
volume    = "4208",
pages     = "823--832",
month     = sep # "~13-15, ",
year      = "2006",
address   = "Munich, Germany",
publisher = "\href{http://www.springer.com}{Springer Verlag, Berlin,
             Germany}",
isbn      = "978-3-540-39368-9",
issn      = "0302-9743",
doi       = "10.1007/11847366_85",
url       = "http://www.csm.ornl.gov/~engelman/publications/baumann06parallel.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/baumann06parallel.ppt.pdf",
abstract  = "Software component architectures allow assembly of
             applications from individual software modules based on
             clearly defined programming interfaces, thus improving the
             reuse of existing solutions and simplifying application
             development. Furthermore, the plug-in programming paradigm
             additionally enables runtime reconfigurability, making it
             possible to adapt to changing application needs, such as
             different application phases, and system properties, like
             resource availability, by loading/unloading appropriate
             software modules. Similar to parallel programs, parallel
             plug-ins are an abstraction for a set of cooperating
             individual plug-ins within a parallel application utilizing
             a software component architecture. Parallel programming
             paradigms apply to parallel plug-ins in the same way they
             apply to parallel programs. The research presented in this
             paper targets the clear definition of parallel plug-ins and
             the development of a parallel plug-in programming paradigm."
}
@inproceedings{varma06scalable,
author    = "Jyothish Varma
             and Chao Wang
             and Frank Mueller
             and Christian Engelmann
             and Stephen L. Scott",
title     = "Scalable, Fault-Tolerant Membership for {MPI} Tasks on {HPC}
             Systems",
booktitle = "Proceedings of the \href{http://www.ics-conference.org/2006}
             {$20^{th}$ ACM International Conference on Supercomputing
             (ICS) 2006}",
pages     = "219--228",
month     = jun # "~28-30, ",
year      = "2006",
address   = "Cairns, Australia",
publisher = "\href{http://www.acm.org}{ACM Press, New York, NY, USA}",
isbn      = "1-59593-282-8",
doi       = "10.1145/1183401.1183433",
url       = "http://www.csm.ornl.gov/~engelman/publications/varma06scalable.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/varma06scalable.ppt.pdf",
abstract  = "Reliability is increasingly becoming a challenge for
             high-performance computing (HPC) systems with thousands of
             nodes, such as IBM's Blue Gene/L. A shorter
             mean-time-to-failure can be addressed by adding fault
             tolerance to reconfigure working nodes to ensure that
             communication and computation can progress. However, existing
             approaches fall short in providing scalability and small
             reconfiguration overhead within the fault-tolerant layer.
             This paper contributes a scalable approach to reconfigure the
             communication infrastructure after node failures. We propose
             a decentralized (peer-to-peer) protocol that maintains a
             consistent view of active nodes in the presence of faults.
             Our protocol shows response times in the order of hundreds of
             microseconds and single-digit milliseconds for
             reconfiguration using MPI over Blue Gene/L and TCP over
             Gigabit, respectively. The protocol can be adapted to match
             the network topology to further increase performance. We also
             verify experimental results against a performance model,
             which demonstrates the scalability of the approach. Hence,
             the membership service is suitable for deployment in the
             communication layer of MPI runtime systems, and we have
             integrated an early version into LAM/MPI."
}
@inproceedings{okunbor06exploring,
author    = "Daniel I. Okunbor
             and Christian Engelmann
             and Stephen L. Scott",
title     = "Exploring Process Groups for Reliability, Availability and
             Serviceability of Terascale Computing Systems",
booktitle = "Proceedings of the
             \href{http://www.atiner.gr/docs/2006AAAPROGRAM_COMP.htm}
             {$2^{nd}$ International Conference on Computer Science and
             Information Systems 2006}",
month     = jun # "~19-21, ",
year      = "2006",
address   = "Athens, Greece",
url       = "http://www.csm.ornl.gov/~engelman/publications/okunbor06exploring.pdf",
abstract  = "This paper presents various aspects of reliability,
             availability and serviceability (RAS) systems as they relate
             to group communication service, including reliable and total
             order multicast/broadcast, virtual synchrony, and failure
             detection. While the issue of availability, particularly
             high availability using replication-based architectures has
             recently received upsurge research interests, much still has
             to be done in understanding the basic underlying concepts for
             achieving RAS systems, especially in high-end and high
             performance computing (HPC) communities. Various attributes
             of group communication service and the prototype of symmetric
             active replication following ideas utilized in the Newtop
             protocol will be discussed. We explore the application of
             group communication service for RAS HPC, laying the
             groundwork for its integrated model."
}
@inproceedings{engelmann06rmix,
author    = "Christian Engelmann
             and George A. (Al) Geist",
title     = "{RMIX}: {A} Dynamic, Heterogeneous, Reconfigurable
             Communication Framework",
booktitle = "Lecture Notes in Computer Science: Proceedings of the
             \href{http://www.iccs-meeting.org/iccs2006}{$6^{th}$
             International Conference on Computational Science (ICCS)
             2006}, Part II: \href{http://www.gup.uni-linz.ac.at/cce2006}
             {$3^{rd}$ Special Session on Collaborative and Cooperative
             Environments (CCE) 2006}",
volume    = "3992",
pages     = "573--580",
month     = may # "~28-31, ",
year      = "2006",
address   = "Reading, UK",
publisher = "\href{http://www.springer.com}{Springer Verlag, Berlin,
             Germany}",
isbn      = "3-540-34381-4",
issn      = "0302-9743",
doi       = "10.1007/11758525_77",
url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann06rmix.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/engelmann06rmix.ppt.pdf",
abstract  = "RMIX is a dynamic, heterogeneous, reconfigurable
             communication framework that allows software components to
             communicate using various RMI/RPC protocols, such as ONC RPC,
             Java RMI and SOAP, by facilitating dynamically loadable
             provider plug-ins to supply different protocol stacks. With
             this paper, we present a native (C-based), flexible,
             adaptable, multi-protocol RMI/RPC communication framework
             that complements the Java-based RMIX variant previously
             developed by our partner team at Emory University. Our
             approach offers the same multi-protocol RMI/RPC services
             and advanced invocation semantics via a C-based interface
             that does not require an object-oriented programming
             language. This paper provides a detailed description of our
             RMIX framework architecture and some of its features. It
             describes the general use case of the RMIX framework and its
             integration into the Harness metacomputing environment in the
             form of a plug-in."
}
@inproceedings{engelmann06active,
author    = "Christian Engelmann
             and Stephen L. Scott
             and Chokchai (Box) Leangsuksun
             and Xubin (Ben) He",
title     = "Active/Active Replication for Highly Available {HPC} System
             Services",
booktitle = "Proceedings of the
             \href{http://www.ares-conference.eu/ares2006}{$1^{st}$
             International Conference on Availability, Reliability and
             Security (ARES) 2006}: $1^{st}$ International Workshop on
             Frontiers in Availability, Reliability and Security (FARES)
             2006",
pages     = "639--645",
month     = apr # "~20-22, ",
year      = "2006",
address   = "Vienna, Austria",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
             Alamitos, CA, USA}",
isbn      = "0-7695-2567-9",
doi       = "10.1109/ARES.2006.23",
url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann06active.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/engelmann06active.ppt.pdf",
abstract  = "Today's high performance computing systems have several
             reliability deficiencies resulting in availability and
             serviceability issues. Head and service nodes represent a
             single point of failure and control for an entire system as
             they render it inaccessible and unmanageable in case of a
             failure until repair, causing a significant downtime. This
             paper introduces two distinct replication methods (internal
             and external) for providing symmetric active/active high
             availability for multiple head and service nodes running in
             virtual synchrony. It presents a comparison of both methods
             in terms of expected correctness, ease-of-use and performance
             based on early results from ongoing work in providing
             symmetric active/active high availability for two HPC system
             services (TORQUE and PVFS metadata server). It continues with
             a short description of a distributed mutual exclusion
             algorithm and a brief statement regarding the handling of
             Byzantine failures. This paper concludes with an overview of
             past and ongoing work, and a short summary of the presented
             research."
}
@inproceedings{engelmann05concepts,
author    = "Christian Engelmann
             and Stephen L. Scott",
title     = "Concepts for High Availability in Scientific High-End
             Computing",
booktitle = "Proceedings of the
             \href{http://xcr.cenit.latech.edu/hapcw2005}{$3^{rd}$ High
             Availability and Performance Workshop (HAPCW) 2005}, in
             conjunction with the
             \href{http://lacsi.rice.edu/symposium/agenda_2005}{$6^{th}$
             Los Alamos Computer Science Institute (LACSI) Symposium
             2005}",
month     = oct # "~11, ",
year      = "2005",
address   = "Santa Fe, NM, USA",
url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann05concepts.pdf",
url2      = "http://www.csm.ornl.gov/~engelman/publications/engelmann05concepts.ppt.pdf",
abstract  = "Scientific high-end computing (HEC) has become an important
             tool for scientists world-wide to understand problems, such
             as in nuclear fusion, human genomics and nanotechnology.
             Every year, new HEC systems emerge on the market with better
             performance and higher scale. With only very few exceptions,
             the overall availability of recently installed systems has
             been lower in comparison to the same deployment phase of
             their predecessors. In contrast to the experienced loss of
             availability, the demand for continuous availability has
             risen dramatically due to the recent trend towards capability
             computing. In this paper, we analyze the existing
             deficiencies of current HEC systems and present several high
             availability concepts to counter the experienced loss of
             availability and to alleviate the expected impact on
             next-generation systems. We explain the application of these
             concepts to current and future HEC systems and list past and
             ongoing related research. This paper closes with a short
             summary of the presented work and a brief discussion of
             future efforts."
}
@conference{limaye05jobsite,
author = "Kshitij Limaye
and Chokchai (Box) Leangsuksun
and Zeno Greenwood
and Stephen L. Scott
and Christian Engelmann
and Richard M. Libby
and Kasidit Chanchio",
title = "Job-Site Level Fault Tolerance for Cluster and {Grid}
Environments",
booktitle = "Proceedings of the \href{http://cluster2005.org}{$7^{th}$
IEEE International Conference on Cluster Computing (Cluster)
2005}",
pages = "1--9",
month = sep # "~26-30, ",
year = "2005",
address = "Boston, MA, USA",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
isbn = "0-7803-9486-0, ISSN 1552-5244",
doi = "http://doi.ieeecomputersociety.org/10.1109/CLUSTR.2005.347043",
url = "http://www.csm.ornl.gov/~engelman/publications/limaye05job-site.pdf",
abstract = "In order to adopt high performance clusters and Grid
computing for mission critical applications, fault tolerance
is a necessity. Common fault tolerance techniques in
distributed systems are normally achieved with
checkpoint-recovery and job replication on alternative
resources, in cases of a system outage. The first approach
depends on the system's MTTR while the latter approach
depends on the availability of alternative sites to run
replicas. There is a need for complementing these approaches
by proactively handling failures at a job-site level,
ensuring the system high availability with no loss of user
submitted jobs. This paper discusses a novel fault tolerance
technique that enables the job-site recovery in Beowulf
cluster-based grid environments, whereas existing techniques
give up a failed system by seeking alternative resources.
Our results suggest sizable aggregate performance improvement
during an implementation of our method in Globus-enabled
HA-OSCAR. The technique called Smart Failover provides a
transparent and graceful recovery mechanism that saves job
states in a local job-manager queue and transfers those
states to the backup server periodically, and in critical
system events. Thus whenever a failover occurs, the backup
server is able to restart the jobs from their last saved
state."
}
@conference{song05umlbased,
author = "Hertong Song
and Chokchai (Box) Leangsuksun
and Raja Nassar
and Yudan Liu
and Christian Engelmann
and Stephen L. Scott",
title = "{UML-based} {Beowulf} Cluster Availability Modeling",
booktitle = "\href{http://www.world-academy-of-science.org/IMCSE2005/ws/SERP}
{International Conference on Software Engineering Research
and Practice (SERP) 2005}",
pages = "161--167",
month = jun # "~27-30, ",
year = "2005",
address = "Las Vegas, NV, USA",
publisher = "CSREA Press",
isbn = "1-932415-49-1"
}
@conference{engelmann05high,
author = "Christian Engelmann
and Stephen L. Scott",
title = "High Availability for Ultra-Scale High-End Scientific
Computing",
booktitle = "Proceedings of the \href{http://coset.irisa.fr}{$2^{nd}$
International Workshop on Operating Systems, Programming
Environments and Management Tools for High-Performance
Computing on Clusters (COSET-2) 2005}, in conjunction with
the \href{http://ics05.csail.mit.edu}{$19^{th}$ ACM
International Conference on Supercomputing (ICS) 2005}",
month = jun # "~19, ",
year = "2005",
address = "Cambridge, MA, USA",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high.ppt.pdf",
abstract = "Ultra-scale architectures for scientific high-end computing
with tens to hundreds of thousands of processors, such as the
IBM Blue Gene/L and the Cray X1, suffer from availability
deficiencies, which impact the efficiency of running
computational jobs by forcing frequent checkpointing of
applications. Most systems are unable to handle runtime
system configuration changes caused by failures and require
a complete restart of essential system services, such as the
job scheduler or MPI, or even of the entire machine. In this
paper, we present a flexible, pluggable and component-based
high availability framework that expands today's effort in
high availability computing of keeping a single server alive
to include all machines cooperating in a high-end scientific
computing environment, while allowing adaptation to system
properties and application needs."
}
@conference{leangsuksun05asymmetric,
author = "Chokchai (Box) Leangsuksun
and Venkata K. Munganuru
and Tong Liu
and Stephen L. Scott
and Christian Engelmann",
title = "Asymmetric Active-Active High Availability for High-end
Computing",
booktitle = "Proceedings of the \href{http://coset.irisa.fr}{$2^{nd}$
International Workshop on Operating Systems, Programming
Environments and Management Tools for High-Performance
Computing on Clusters (COSET-2) 2005}, in conjunction with
the \href{http://ics05.csail.mit.edu}{$19^{th}$ ACM
International Conference on Supercomputing (ICS) 2005}",
month = jun # "~19, ",
year = "2005",
address = "Cambridge, MA, USA",
url = "http://www.csm.ornl.gov/~engelman/publications/leangsuksun05asymmetric.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/leangsuksun05asymmetric.ppt.pdf",
abstract = "Linux clusters have become very popular for scientific
computing at research institutions world-wide, because they
can be easily deployed at a fairly low cost. However, the
most pressing issues of today's cluster solutions are
availability and serviceability. The conventional Beowulf
cluster architecture has a single head node connected to a
group of compute nodes. This head node is a typical single
point of failure and control, which severely limits
availability and serviceability by effectively cutting off
healthy compute nodes from the outside world upon overload
or failure. In this paper, we describe a paradigm that
addresses this issue using asymmetric active-active high
availability. Our framework comprises of n + 1 head nodes,
where n head nodes are active in the sense that they provide
services to simultaneously incoming user requests. One
standby server monitors all active servers and performs a
fail-over in case of a detected outage. We present a
prototype implementation based on a 2 + 1 solution and
discuss initial results."
}
@conference{engelmann05superscalable,
author = "Christian Engelmann
and George A. (Al) Geist",
title = "Super-Scalable Algorithms for Computing on 100,000
Processors",
booktitle = "Lecture Notes in Computer Science: Proceedings of the
\href{http://www.iccs-meeting.org/iccs2005}{$5^{th}$
International Conference on Computational Science (ICCS)
2005}, Part I",
volume = "3514",
pages = "313--320",
month = may # "~22-25, ",
year = "2005",
address = "Atlanta, GA, USA",
publisher = "\href{http://www.springer.com}{Springer Verlag, Berlin,
Germany}",
isbn = "978-3-540-26032-5, ISSN 0302-9743",
doi = "http://dx.doi.org/10.1007/11428831_39",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05superscalable.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann05superscalable.ppt.pdf",
abstract = "In the next five years, the number of processors in high-end
systems for scientific computing is expected to rise to tens
and even hundreds of thousands. For example, the IBM Blue
Gene/L can have up to 128,000 processors and the delivery of
the first system is scheduled for 2005. Existing deficiencies
in scalability and fault-tolerance of scientific applications
need to be addressed soon. If the number of processors grows
by a magnitude and efficiency drops by a magnitude, the
overall effective computing performance stays the same.
Furthermore, the mean time to interrupt of high-end computer
systems decreases with scale and complexity. In a
100,000-processor system, failures may occur every couple of
minutes and traditional checkpointing may no longer be
feasible. With this paper, we summarize our recent research
in super-scalable algorithms for computing on 100,000
processors. We introduce the algorithm properties of scale
invariance and natural fault tolerance, and discuss how they
can be applied to two different classes of algorithms. We
also describe a super-scalable diskless checkpointing
algorithm for problems that can't be transformed into a
super-scalable variant, or where other solutions are more
efficient. Finally, a 100,000-processor simulator is
presented as a platform for testing and experimentation."
}
@conference{engelmann05lightweight,
author = "Christian Engelmann
and George A. (Al) Geist",
title = "A Lightweight Kernel for the Harness Metacomputing
Framework",
booktitle = "Proceedings of the
\href{http://www.ipdps.org/ipdps2005}{$19^{th}$ IEEE
International Parallel and Distributed Processing Symposium
(IPDPS) 2005}: \href{http://www.cs.umass.edu/~rsnbrg/hcw2005}
{$14^{th}$ Heterogeneous Computing Workshop (HCW) 2005}",
month = apr # "~4, ",
year = "2005",
address = "Denver, CO, USA",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
isbn = "0-7695-2312-9, ISSN 1530-2075",
doi = "http://doi.ieeecomputersociety.org/10.1109/IPDPS.2005.34",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05lightweight.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann05lightweight.ppt.pdf",
abstract = "Harness is a pluggable heterogeneous Distributed Virtual
Machine (DVM) environment for parallel and distributed
scientific computing. This paper describes recent
improvements in the Harness kernel design. By using a
lightweight approach and moving previously integrated system
services into software modules, the software becomes more
versatile and adaptable. This paper outlines these changes
and explains the major Harness kernel components in more
detail. A short overview is given of ongoing efforts in
integrating RMIX, a dynamic heterogeneous reconfigurable
communication framework, into the Harness environment as a
new plug-in software module. We describe the overall impact
of these changes and how they relate to other ongoing work."
}
@conference{engelmann04high,
author = "Christian Engelmann
and Stephen L. Scott
and George A. (Al) Geist",
title = "High Availability through Distributed Control",
booktitle = "Proceedings of the
\href{http://xcr.cenit.latech.edu/hapcw2004}{$2^{nd}$ High
Availability and Performance Workshop (HAPCW) 2004}, in
conjunction with the
\href{http://lacsi.rice.edu/symposium/agenda_2004}{$5^{th}$
Los Alamos Computer Science Institute (LACSI) Symposium
2004}",
month = oct # "~12, ",
year = "2004",
address = "Santa Fe, NM, USA",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann04high.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann04high.ppt.pdf",
abstract = "Cost-effective, flexible and efficient scientific simulations
in cutting-edge research areas utilize huge high-end
computing resources with thousands of processors. In the next
five to ten years the number of processors in such computer
systems will rise to tens of thousands, while scientific
application running times are expected to increase further
beyond the Mean-Time-To-Interrupt (MTTI) of hardware and
system software components. This paper describes the ongoing
research in heterogeneous adaptable reconfigurable networked
systems (Harness) and its recent achievements in the area of
high availability distributed virtual machine environments
for parallel and distributed scientific computing. It shows
how a distributed control algorithm is able to steer a
distributed virtual machine process in virtual synchrony
while maintaining consistent replication for high
availability. It briefly illustrates ongoing work in
heterogeneous reconfigurable communication frameworks and
security mechanisms. The paper continues with a short
overview of similar research in reliable group communication
frameworks, fault-tolerant process groups and highly
available distributed virtual processes. It closes with a
brief discussion of possible future research directions."
}
@conference{he04highly,
author = "Xubin (Ben) He
and Li Ou
and Stephen L. Scott
and Christian Engelmann",
title = "A Highly Available Cluster Storage System using Scavenging",
booktitle = "Proceedings of the
\href{http://xcr.cenit.latech.edu/hapcw2004}{$2^{nd}$ High
Availability and Performance Workshop (HAPCW) 2004}, in
conjunction with the
\href{http://lacsi.rice.edu/symposium/agenda_2004}{$5^{th}$
Los Alamos Computer Science Institute (LACSI) Symposium
2004}",
month = oct # "~12, ",
year = "2004",
address = "Santa Fe, NM, USA",
url = "http://www.csm.ornl.gov/~engelman/publications/he04highly.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/he04highly.ppt.pdf",
abstract = "Highly available data storage for high-performance computing
is becoming increasingly more critical as high-end computing
systems scale up in size and storage systems are developed
around network-centered architectures. A promising solution
is to harness the collective storage potential of individual
workstations much as we harness idle CPU cycles due to the
excellent price/performance ratio and low storage usage of
most commodity workstations. For such a storage system,
metadata consistency is a key issue assuring storage system
availability as well as data reliability. In this paper, we
present a decentralized metadata management scheme that
improves storage availability without sacrificing
performance."
}
@conference{engelmann03diskless,
author = "Christian Engelmann
and George A. (Al) Geist",
title = "A Diskless Checkpointing Algorithm for Super-scale
Architectures Applied to the Fast Fourier Transform",
booktitle = "Proceedings of the
\href{http://www.cs.msstate.edu/~clade2003}{Challenges of
Large Applications in Distributed Environments Workshop
(CLADE) 2003}, in conjunction with the
\href{http://csag.ucsd.edu/HPDC-12}{$12^{th}$ IEEE
International Symposium on High Performance Distributed
Computing (HPDC) 2003}",
pages = "47",
month = jun # "~21, ",
year = "2003",
address = "Seattle, WA, USA",
publisher = "\href{http://www.computer.org}{IEEE Computer Society, Los
Alamitos, CA, USA}",
isbn = "0-7695-1984-9",
doi = "http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4159902",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann03diskless.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann03diskless.ppt.pdf",
abstract = "This paper discusses the issue of fault-tolerance in
distributed computer systems with tens or hundreds of
thousands of diskless processor units. Such systems, like the
IBM Blue Gene/L, are predicted to be deployed in the next
five to ten years. Since a 100,000-processor system is going
to be less reliable, scientific applications need to be able
to recover from occurring failures more efficiently. In this
paper, we adapt the present technique of diskless
checkpointing to such huge distributed systems in order to
equip existing scientific algorithms with super-scalable
fault-tolerance. First, we discuss the method of diskless
checkpointing, then we adapt this technique to super-scale
architectures and finally we present results from an
implementation of the Fast Fourier Transform that uses the
adapted technique to achieve super-scale fault-tolerance."
}
@conference{engelmann02distributed,
author = "Christian Engelmann
and Stephen L. Scott
and George A. (Al) Geist",
title = "Distributed Peer-to-Peer Control in {Harness}",
booktitle = "Lecture Notes in Computer Science: Proceedings of the
\href{http://www.science.uva.nl/events/ICCS2002}{$2^{nd}$
International Conference on Computational Science (ICCS)
2002}, Part II: Workshop on Global and Collaborative
Computing",
volume = "2330",
pages = "720--727",
month = apr # "~21-24, ",
year = "2002",
address = "Amsterdam, The Netherlands",
publisher = "\href{http://www.springer.com}{Springer Verlag, Berlin,
Germany}",
isbn = "3-540-43593-X, ISSN 0302-9743",
doi = "http://www.springerlink.com/content/l537ujfwt8yta2dp",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann02distributed.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann02distributed.ppt.pdf",
abstract = "Harness is an adaptable fault-tolerant virtual machine
environment for next-generation heterogeneous distributed
computing developed as a follow on to PVM. It additionally
enables the assembly of applications from plug-ins and
provides fault-tolerance. This work describes the distributed
control, which manages global state replication to ensure a
high-availability of service. Group communication services
achieve an agreement on an initial global state and a linear
history of global state changes at all members of the
distributed virtual machine. This global state is replicated
to all members to easily recover from single, multiple and
cascaded faults. A peer-to-peer ring network architecture and
tunable multi-point failure conditions provide heterogeneity
and scalability. Finally, the integration of the distributed
control into the multi-threaded kernel architecture of
Harness offers a fault-tolerant global state database service
for plug-ins and applications."
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Conference Poster Presentations
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@misc{scott09tunable,
author = "Stephen L. Scott
and Christian Engelmann
and Geoffroy R. Vall\'ee
and Thomas Naughton
and Anand Tikotekar
and George Ostrouchov
and Chokchai (Box) Leangsuksun
and Nichamon Naksinehaboon
and Raja Nassar
and Mihaela Paun
and Frank Mueller
and Chao Wang
and Arun B. Nagarajan
and Jyothish Varma",
title = "A Tunable Holistic Resiliency Approach for High-Performance
Computing Systems",
month = feb # "~14-18, ",
year = "2009",
howpublished = "{Poster at the \href{http://ppopp09.rice.edu}{$14^{th}$ ACM
SIGPLAN Symposium on Principles and Practice of Parallel
Programming (PPoPP) 2009}, Raleigh, NC, USA}",
url = "http://www.csm.ornl.gov/~engelman/publications/scott09tunable.pdf",
note = "To appear",
abstract = "In order to address anticipated high failure rates,
resiliency characteristics have become an urgent priority for
next-generation extreme-scale high-performance computing
(HPC) systems. This poster describes our past and ongoing
efforts in novel fault resilience technologies for HPC.
Presented work includes proactive fault resilience
techniques, system and application reliability models and
analyses, failure prediction, transparent process- and
virtual-machine-level migration, and trade-off models for
combining preemptive migration with checkpoint/restart. This
poster summarizes our work and puts all individual
technologies into context with a proposed holistic fault
resilience framework."
}
@misc{geist08harness,
author = "George A. (Al) Geist
and Christian Engelmann
and Jack J. Dongarra
and George Bosilca
and Magdalena M. S\l{}awi\'nska
and Jaros\l{}aw K. S\l{}awi\'nski",
title = "The {Harness} Workbench: {U}nified and Adaptive Access to
Diverse High-Performance Computing Platforms",
month = mar # "~30 - " # apr # "~5, ",
year = "2008",
howpublished = "{Poster at the \href{http://www.hpcsw.org}{$1^{st}$
High-Performance Computer Science Week (HPCSW) 2008}, Denver,
CO, USA}",
url = "http://www.csm.ornl.gov/~engelman/publications/geist08harness.pdf",
abstract = "This poster summarizes our past and ongoing research and
development efforts in novel software solutions for providing
unified and adaptive access to diverse high-performance
computing (HPC) platforms. The poster showcases developed
proof-of-concept implementations of tools and mechanisms that
simplify scientific application development and deployment
tasks, such that only minimal adaptation is needed when
moving from one HPC system to another or after HPC system
upgrades."
}
@misc{scott08resiliency,
author = "Stephen L. Scott
and Christian Engelmann
and Hong H. Ong
and Geoffroy R. Vall\'ee
and Thomas Naughton
and Anand Tikotekar
and George Ostrouchov
and Chokchai (Box) Leangsuksun
and Nichamon Naksinehaboon
and Raja Nassar
and Mihaela Paun
and Frank Mueller
and Chao Wang
and Arun B. Nagarajan
and Jyothish Varma
and Xubin (Ben) He
and Li Ou
and Xin Chen",
title = "Resiliency for High-Performance Computing Systems",
month = mar # "~30 - " # apr # "~5, ",
year = "2008",
howpublished = "{Poster at the \href{http://www.hpcsw.org}{$1^{st}$
High-Performance Computer Science Week (HPCSW) 2008}, Denver,
CO, USA}",
url = "http://www.csm.ornl.gov/~engelman/publications/scott08resiliency.pdf",
abstract = "This poster summarizes our past and ongoing research and
development efforts in novel system software solutions for
providing high-level reliability, availability and
serviceability (RAS) for next-generation extreme-scale
high-performance computing (HPC) systems and beyond. The
poster showcases results of developed proof-of-concept
implementations and performed theoretical analyses, outlines
planned research and development activities, and presents
respective initial results."
}
@misc{scott08systemlevel,
author = "Stephen L. Scott
and Geoffroy R. Vall\'ee
and Thomas Naughton
and Anand Tikotekar
and Christian Engelmann
and Hong H. Ong",
title = "System-level Virtualization for High-Performance
Computing",
month = mar # "~30 - " # apr # "~5, ",
year = "2008",
howpublished = "{Poster at the \href{http://www.hpcsw.org}{$1^{st}$
High-Performance Computer Science Week (HPCSW) 2008}, Denver,
CO, USA}",
url = "http://www.csm.ornl.gov/~engelman/publications/scott08systemlevel.pdf",
abstract = "This poster summarizes our past and ongoing research and
development efforts in novel system software solutions for
providing a virtual system environment (VSE) for
next-generation extreme-scale high-performance computing
(HPC) systems and beyond. The poster showcases results of
developed proof-of-concept implementations and performed
theoretical analyses, outlines planned research and
development activities, and presents respective initial
results."
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Invited Talks and Lectures
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@misc{engelmann08high,
author = "Christian Engelmann",
title = "High-Performance Computing Research at Oak Ridge National
Laboratory",
month = dec # "~8, ",
year = "2008",
howpublished = "{Invited talk at the Reading Annual Computational Science
Workshop, Reading, United Kingdom}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann08high.pdf",
abstract = "Oak Ridge National Laboratory (ORNL) is the largest energy
laboratory in the United States. Its National Center for
Computational Sciences (NCCS) provides the most powerful
computing resources in the world for open scientific
research. Jaguar, a Cray XT5 system at NCCS, is the second
HPC system to exceed 1 PFlop/s (10^15 Floating Point
Operations Per Second), and the fastest open science
supercomputer in the world. It recently ranked \#2 in the Top
500 List of Supercomputer Sites with a maximal LINPACK
benchmark performance of 1.059 PFlop/s and a theoretical peak
performance of 1.3814 PFlop/s. Annually, 80 percent of
Jaguar's resources are allocated through the U.S. Department
of Energy's Innovative and Novel Computational Impact on
Theory and Experiment (INCITE) program, a competitively
selected, peer reviewed process open to researchers from
universities, industry, government and non-profit
organizations. These allocations address some of the most
challenging scientific problems in areas such as climate
modeling, renewable energy, materials science, fusion and
combustion. In conjunction with NCCS, the Computer Science
and Mathematics Division at ORNL performs basic and applied
research in HPC, mathematics, and intelligent systems. This
talk gives a summary of the HPC research performed at ORNL.
It provides details about the Jaguar peta-scale computing
resource, an overview of the computational science research
carried out using ORNL's computing resources, and a
description of various computer science efforts targeting
solutions for next-generation HPC systems."
}
@misc{engelmann08modular,
author = "Christian Engelmann",
title = "Modular Redundancy in HPC Systems: Why, Where, When and How?",
month = oct # "~15, ",
year = "2008",
howpublished = "{Invited talk at the $1^{st}$ HPC Resiliency Summit: Workshop
on Resiliency for Petascale HPC 2008, in conjunction with the
\href{http://www.lanl.gov/conferences/lacss/2008}{$1^{st}$
Los Alamos Computer Science Symposium (LACSS) 2008}, Santa
Fe, NM}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann08modular.ppt.pdf",
abstract = "The continuing growth in high-performance computing (HPC)
system scale poses a challenge for system software and
scientific applications with respect to reliability,
availability and serviceability (RAS). With only very few
exceptions, the availability of recently installed systems
has been lower in comparison to the same deployment phase of
their predecessors. As a result, sites lower allowable job
run times in order to force applications to store
intermediate results (checkpoints) as insurance against lost
computation time. However, checkpoints themselves waste
valuable computation time and resources. In contrast to the
experienced loss of availability, the demand for continuous
availability has risen dramatically with the trend towards
capability computing, which drives the race for scientific
discovery by running applications on the fastest machines
available while desiring significant amounts of time (weeks
and months) without interruption. These machines must be able
to run in the event of frequent interrupts in such a manner
that the capability is not severely degraded. Thus, research
and development of scalable RAS technologies is paramount to
the success of future extreme-scale systems. This talk
summarizes our past accomplishments, ongoing work, and future
plans in the area of high-level RAS for HPC."
}
@misc{engelmann08resiliency,
author = "Christian Engelmann",
title = "Resiliency for High-Performance Computing",
month = apr # "~10-12, ",
year = "2008",
howpublished = "{Invited talk at the
\href{http://acet.rdg.ac.uk/events/details/cancun.php}
{$2^{nd}$ Collaborative and Grid Computing Technologies
Workshop (CGCTW) 2008}, Cancun, Mexico}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann08resiliency.ppt.pdf",
abstract = "In order to address anticipated high failure rates,
resiliency characteristics have become an urgent priority for
next-generation high-performance computing (HPC) systems. One
major source of concern are non-recoverable soft errors,
i.e., bit flips in memory, cache, registers, and logic. The
probability of such errors not only grows with system size,
but also with increasing architectural vulnerability caused
by employing accelerators and by shrinking nanometer
technology. Reactive fault tolerance technologies, such as
checkpoint/restart, are unable to handle high failure rates
due to associated overheads, while proactive resiliency
technologies, such as preemptive migration, simply fail as
random soft errors can't be predicted. This talk proposes a
new, bold direction in resiliency for HPC as it targets
resiliency for next-generation extreme-scale HPC systems at
the system software level through computational redundancy
strategies, i.e., dual- and triple-modular redundancy."
}
@misc{engelmann08advanced,
author = "Christian Engelmann",
title = "Advanced Fault Tolerance Solutions for High Performance
Computing",
month = feb # "~11, ",
year = "2008",
howpublished = "{Seminar at the \href{http://www.laas.fr}{Laboratoire
d'Analyse et d'Architecture des Syst\`emes},
\href{http://www.cnrs.fr}{Centre National de la Recherche
Scientifique}, Toulouse, France}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann08advanced.ppt.pdf",
abstract = "The continuing growth in high performance computing (HPC)
system scale poses a challenge for system software and
scientific applications with respect to reliability,
availability and serviceability (RAS). With only very few
exceptions, the availability of recently installed systems
has been lower in comparison to the same deployment phase of
their predecessors. As a result, sites lower allowable job
run times in order to force applications to store
intermediate results (checkpoints) as insurance against lost
computation time. However, checkpoints themselves waste
valuable computation time and resources. In contrast to the
experienced loss of availability, the demand for continuous
availability has risen dramatically with the trend towards
capability computing, which drives the race for scientific
discovery by running applications on the fastest machines
available while desiring significant amounts of time (weeks
and months) without interruption. These machines must be able
to run in the event of frequent interrupts in such a manner
that the capability is not severely degraded. Thus, research
and development of scalable RAS technologies is paramount to
the success of future extreme-scale systems. This talk
summarizes our accomplishments in the area of high-level RAS
for HPC, such as developed concepts and implemented
proof-of-concept prototypes, and describes existing
limitations, such as performance issues, which need to be
dealt with for production-type deployment."
}
@misc{engelmann07service,
author = "Christian Engelmann",
title = "Service-Level High Availability in Parallel and Distributed
Systems",
month = oct # "~10, ",
year = "2007",
howpublished = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
of Computer Science}, \href{http://www.reading.ac.uk}
{University of Reading}, Reading, United Kingdom}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07service.pdf",
abstract = "As service-oriented architectures become more important in
parallel and distributed computing systems, individual
service instance reliability as well as appropriate service
redundancy are essential to increase overall system
availability. This talk focuses on redundancy strategies
using service-level replication techniques. An overview of
existing programming models for service-level high
availability is presented and their differences,
similarities, advantages, and disadvantages are discussed.
Recent advances in providing service-level symmetric
active/active high availability are discussed. While the
primary target of the presented research is high availability
for service nodes in tightly-coupled extreme-scale
high-performance computing (HPC) systems, it is also
applicable to loosely-coupled distributed computing
scenarios."
}
@misc{engelmann07advanced2,
author = "Christian Engelmann",
title = "Advanced Fault Tolerance Solutions for High Performance
Computing",
month = jun # "~8, ",
year = "2007",
howpublished = "{Invited talk at the
\href{http://www.thaigrid.or.th/wttc2007}{Workshop on Trends,
Technologies and Collaborative Opportunities in High
Performance and Grid Computing (WTTC) 2007}, Khon Kaen,
Thailand}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07advanced2.ppt.pdf",
abstract = "The continuing growth in high performance computing (HPC)
system scale poses a challenge for system software and
scientific applications with respect to reliability,
availability and serviceability (RAS). With only very few
exceptions, the availability of recently installed systems
has been lower in comparison to the same deployment phase of
their predecessors. As a result, sites lower allowable job
run times in order to force applications to store
intermediate results (checkpoints) as insurance against lost
computation time. However, checkpoints themselves waste
valuable computation time and resources. In contrast to the
experienced loss of availability, the demand for continuous
availability has risen dramatically with the trend towards
capability computing, which drives the race for scientific
discovery by running applications on the fastest machines
available while desiring significant amounts of time (weeks
and months) without interruption. These machines must be able
to run in the event of frequent interrupts in such a manner
that the capability is not severely degraded. Thus, research
and development of scalable RAS technologies is paramount to
the success of future extreme-scale systems. This talk
summarizes our accomplishments in the area of high-level RAS
for HPC, such as developed concepts and implemented
proof-of-concept prototypes, and describes existing
limitations, such as performance issues, which need to be
dealt with for production-type deployment."
}
@misc{engelmann07advanced,
author = "Christian Engelmann",
title = "Advanced Fault Tolerance Solutions for High Performance
Computing",
month = jun # "~4-5, ",
year = "2007",
howpublished = "{Invited talk at the
\href{http://www.thaigrid.or.th/wttc2007}{Workshop on Trends,
Technologies and Collaborative Opportunities in High
Performance and Grid Computing (WTTC) 2007}, Khon Kaen,
Thailand}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07advanced.ppt.pdf",
abstract = "The continuing growth in high performance computing (HPC)
system scale poses a challenge for system software and
scientific applications with respect to reliability,
availability and serviceability (RAS). With only very few
exceptions, the availability of recently installed systems
has been lower in comparison to the same deployment phase of
their predecessors. As a result, sites lower allowable job
run times in order to force applications to store
intermediate results (checkpoints) as insurance against lost
computation time. However, checkpoints themselves waste
valuable computation time and resources. In contrast to the
experienced loss of availability, the demand for continuous
availability has risen dramatically with the trend towards
capability computing, which drives the race for scientific
discovery by running applications on the fastest machines
available while desiring significant amounts of time (weeks
and months) without interruption. These machines must be
able to run in the event of frequent interrupts in such a
manner that the capability is not severely degraded. Thus,
research and development of scalable RAS technologies is
paramount to the success of future extreme-scale systems.
This talk summarizes our accomplishments in the area of
high-level RAS for HPC, such as developed concepts and
implemented proof-of-concept prototypes, and describes
existing limitations, such as performance issues, which
need to be dealt with for production-type deployment."
}
@misc{engelmann07operating,
author = "Christian Engelmann",
title = "Operating System Research at {ORNL}: {S}ystem-level
Virtualization",
month = apr # "~10, ",
year = "2007",
howpublished = "{Seminar at the \href{http://www.gup.uni-linz.ac.at}
{Institute of Graphics and Parallel Processing},
\href{http://www.uni-linz.ac.at}{Johannes Kepler University},
Linz, Austria}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07operating.ppt.pdf",
abstract = "The emergence of virtualization enabled hardware, such as the
latest generation AMD and Intel processors, has raised
significant interest in the High Performance Computing (HPC)
community. In particular, system-level virtualization
provides an opportunity to advance the design and development
of operating systems, programming environments,
administration practices, and resource management tools. This
leads to some potential research topics for HPC, such as
failure tolerance, system management, and solutions for
application porting to new HPC platforms. This talk will
present an overview of the research in System-level
Virtualization taking place by the Systems Research Team in
the Computer Science Research Group at Oak Ridge National
Laboratory."
}
% NOTE(review): url ends in ".pdf" while the sibling talk entries use the
% ".ppt.pdf" naming pattern -- presumably intentional, but confirm the file name.
@misc{engelmann07towards,
author = "Christian Engelmann",
title = "Towards High Availability for High-Performance Computing
System Services: {A}ccomplishments and Limitations",
month = mar # "~14, ",
year = "2007",
howpublished = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
of Computer Science}, \href{http://www.reading.ac.uk}
{University of Reading}, Reading, United Kingdom}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07towards.pdf",
abstract = "During the last several years, our teams at Oak Ridge
National Laboratory, Louisiana Tech University, and Tennessee
Technological University focused on efficient redundancy
strategies for head and service nodes of high-performance
computing (HPC) systems in order to pave the way for high
availability (HA) in HPC. These nodes typically run critical
HPC system services, like job and resource management, and
represent single points of failure and control for an entire
HPC system. The overarching goal of our research is to
provide high-level reliability, availability, and
serviceability (RAS) for HPC systems by combining HA and HPC
technology. This talk summarizes our accomplishments, such as
developed concepts and implemented proof-of-concept
prototypes, and describes existing limitations, such as
performance issues, which need to be dealt with for
production-type deployment."
}
@misc{engelmann06high,
author = "Christian Engelmann",
title = "High Availability for Ultra-Scale High-End Scientific
Computing",
month = jun # "~9, ",
year = "2006",
howpublished = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
of Computer Science}, \href{http://www.reading.ac.uk}
{University of Reading}, Reading, United Kingdom}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann06high.ppt.pdf",
abstract = "A major concern in exploiting ultra-scale architectures for
scientific high-end computing (HEC) with tens to hundreds of
thousands of processors, such as the IBM Blue Gene/L and the
Cray X1, is the potential inability to identify problems and
take preemptive action before a failure impacts a running
job. In fact, in systems of this scale, predictions estimate
the mean time to interrupt in terms of hours. Current
solutions for fault-tolerance in HEC focus on dealing with
the result of a failure. However, most are unable to handle
runtime system configuration changes caused by failures and
require a complete restart of essential system services
(e.g. MPI) or even of the entire machine. High availability
(HA) computing strives to avoid the problems of unexpected
failures through preemptive measures. There are various
techniques to implement high availability. In contrast to
active/hot-standby high availability with its fail-over
model, active/active high availability with its virtual
synchrony model is superior in many areas including
scalability, throughput, availability and responsiveness.
However, it is significantly more complex. The overall goal
of our research is to expand today's effort in HA for HEC,
so that systems that have the ability to hot-swap hardware
components can be kept alive by an OS runtime environment
that understands the concept of dynamic system configuration.
This talk will present an overview of recent research at Oak
Ridge National Laboratory in high availability solutions for
ultra-scale scientific high-end computing."
}
@misc{scott06advancing,
author = "Stephen L. Scott
and Christian Engelmann",
title = "Advancing Reliability, Availability and Serviceability for
High-Performance Computing",
month = apr # "~19, ",
year = "2006",
howpublished = "{Seminar at the \href{http://www.gup.uni-linz.ac.at}
{Institute of Graphics and Parallel Processing},
\href{http://www.uni-linz.ac.at}{Johannes Kepler University},
Linz, Austria}",
url = "http://www.csm.ornl.gov/~engelman/publications/scott06advancing.ppt.pdf",
abstract = "Today's high performance computing systems have several
reliability deficiencies resulting in noticeable availability
and serviceability issues. For example, head and service
nodes represent a single point of failure and control for an
entire system as they render it inaccessible and unmanageable
in case of a failure until repair, causing a significant
downtime. Furthermore, current solutions for fault-tolerance
focus on dealing with the result of a failure. However, most
are unable to transparently mask runtime system configuration
changes caused by failures and require a complete restart of
essential system services, such as MPI, in case of a failure.
High availability computing strives to avoid the problems of
unexpected failures through preemptive measures. The overall
goal of our research is to expand today's effort in high
availability for high-performance computing, so that systems
can be kept alive by an OS runtime environment that
understands the concepts of dynamic system configuration and
degraded operation mode. This talk will present an overview
of recent research performed at Oak Ridge National Laboratory
in collaboration with Louisiana Tech University, North
Carolina State University and the University of Reading in
developing core technologies and proof-of-concept prototypes
that improve the overall reliability, availability and
serviceability of high-performance computing systems."
}
@misc{engelmann05high4,
author = "Christian Engelmann",
title = "High Availability for Ultra-Scale High-End Scientific
Computing",
month = oct # "~18, ",
year = "2005",
howpublished = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
of Computer Science}, \href{http://www.reading.ac.uk}
{University of Reading}, Reading, United Kingdom}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high4.ppt.pdf",
abstract = "A major concern in exploiting ultra-scale architectures for
scientific high-end computing (HEC) with tens to hundreds of
thousands of processors, such as the IBM Blue Gene/L and the
Cray X1, is the potential inability to identify problems and
take preemptive action before a failure impacts a running
job. In fact, in systems of this scale, predictions estimate
the mean time to interrupt in terms of hours. Current
solutions for fault-tolerance in HEC focus on dealing with
the result of a failure. However, most are unable to handle
runtime system configuration changes caused by failures and
require a complete restart of essential system services (e.g.
MPI) or even of the entire machine. High availability (HA)
computing strives to avoid the problems of unexpected
failures through preemptive measures. There are various
techniques to implement high availability. In contrast to
active/hot-standby high availability with its fail-over
model, active/active high availability with its virtual
synchrony model is superior in many areas including
scalability, throughput, availability and responsiveness.
However, it is significantly more complex. The overall goal
of our research is to expand today's effort in HA for HEC, so
that systems that have the ability to hot-swap hardware
components can be kept alive by an OS runtime environment
that understands the concept of dynamic system configuration.
This talk will present an overview of recent research at Oak
Ridge National Laboratory in high availability solutions for
ultra-scale scientific high-end computing."
}
@misc{engelmann05high3,
author = "Christian Engelmann",
title = "High Availability for Ultra-Scale High-End Scientific
Computing",
month = sep # "~26, ",
year = "2005",
howpublished = "{Seminar at the \href{http://www.uncfsu.edu/macsc}{Department
of Mathematics and Computer Science},
\href{http://www.uncfsu.edu}{Fayetteville State University},
Fayetteville, NC, USA}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high3.ppt.pdf",
abstract = "A major concern in exploiting ultra-scale architectures for
scientific high-end computing (HEC) with tens to hundreds of
thousands of processors, such as the IBM Blue Gene/L and the
Cray X1, is the potential inability to identify problems and
take preemptive action before a failure impacts a running
job. In fact, in systems of this scale, predictions estimate
the mean time to interrupt in terms of hours. Current
solutions for fault-tolerance in HEC focus on dealing with
the result of a failure. However, most are unable to handle
runtime system configuration changes caused by failures and
require a complete restart of essential system services (e.g.
MPI) or even of the entire machine. High availability (HA)
computing strives to avoid the problems of unexpected
failures through preemptive measures. There are various
techniques to implement high availability. In contrast to
active/hot-standby high availability with its fail-over
model, active/active high availability with its virtual
synchrony model is superior in many areas including
scalability, throughput, availability and responsiveness.
However, it is significantly more complex. The overall goal
of our research is to expand today's effort in HA for HEC, so
that systems that have the ability to hot-swap hardware
components can be kept alive by an OS runtime environment
that understands the concept of dynamic system configuration.
This talk will present an overview of recent research at Oak
Ridge National Laboratory in fault tolerance and high
availability solutions for ultra-scale scientific high-end
computing."
}
@misc{engelmann05high2,
author = "Christian Engelmann",
title = "High Availability for Ultra-Scale High-End Scientific
Computing",
month = may # "~13, ",
year = "2005",
howpublished = "{Seminar at the \href{http://www.cs.reading.ac.uk}{Department
of Computer Science}, \href{http://www.reading.ac.uk}
{University of Reading}, Reading, United Kingdom}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high2.ppt.pdf",
abstract = "A major concern in exploiting ultra-scale architectures for
scientific high-end computing (HEC) with tens to hundreds of
thousands of processors, such as the IBM Blue Gene/L and the
Cray X1, is the potential inability to identify problems and
take preemptive action before a failure impacts a running
job. In fact, in systems of this scale, predictions estimate
the mean time to interrupt in terms of hours. Current
solutions for fault-tolerance in HEC focus on dealing with
the result of a failure. However, most are unable to handle
runtime system configuration changes caused by failures and
require a complete restart of essential system services (e.g.
MPI) or even of the entire machine. High availability (HA)
computing strives to avoid the problems of unexpected
failures through preemptive measures. There are various
techniques to implement high availability. In contrast to
active/hot-standby high availability with its fail-over
model, active/active high availability with its virtual
synchrony model is superior in many areas including
scalability, throughput, availability and responsiveness.
However, it is significantly more complex. The overall goal
of our research is to expand today's effort in HA for HEC,
so that systems that have the ability to hot-swap hardware
components can be kept alive by an OS runtime environment
that understands the concept of dynamic system configuration.
This talk will present an overview of recent research at Oak
Ridge National Laboratory in fault-tolerant heterogeneous
metacomputing, advanced super-scalable algorithms and high
availability system software for ultra-scale scientific
high-end computing."
}
@misc{engelmann05high1,
author = "Christian Engelmann",
title = "High Availability for Ultra-Scale High-End Scientific
Computing",
month = apr # "~15, ",
year = "2005",
howpublished = "{Seminar at the \href{http://cenit.latech.edu}{Center for
Entrepreneurship and Information Technology},
\href{http://www.latech.edu}{Louisiana Tech University},
Ruston, LA, USA}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high1.ppt.pdf",
abstract = "A major concern in exploiting ultra-scale architectures for
scientific high-end computing (HEC) with tens to hundreds of
thousands of processors is the potential inability to
identify problems and take preemptive action before a failure
impacts a running job. In fact, in systems of this scale,
predictions estimate the mean time to interrupt in terms of
hours. Current solutions for fault-tolerance in HEC focus on
dealing with the result of a failure. However, most are
unable to handle runtime system configuration changes caused
by failures and require a complete restart of essential
system services (e.g. MPI) or even of the entire machine.
High availability (HA) computing strives to avoid the
problems of unexpected failures through preemptive measures.
There are various techniques to implement high availability.
In contrast to active/hot-standby high availability with its
fail-over model, active/active high availability with its
virtual synchrony model is superior in many areas including
scalability, throughput, availability and responsiveness.
However, it is significantly more complex. The overall goal
of this research is to expand today's effort in HA for HEC,
so that systems that have the ability to hot-swap hardware
components can be kept alive by an OS runtime environment
that understands the concept of dynamic system configuration.
With the aim of addressing the future challenges of high
availability in ultra-scale HEC, this project intends to
develop a proof-of-concept implementation of an active/active
high availability system software framework."
}
% NOTE(review): the abstract says "In this paper" although this entry is a
% conference talk -- presumably the abstract was copied verbatim from the
% companion paper; confirm the intended wording before changing it.
@misc{engelmann04diskless,
author = "Christian Engelmann",
title = "Diskless Checkpointing on Super-scale Architectures --
{A}pplied to the Fast Fourier Transform",
month = feb # "~25, ",
year = "2004",
howpublished = "{Invited talk at the \href{http://www.siam.org/meetings/pp04}
{$11^{th}$ SIAM Conference on Parallel Processing for
Scientific Computing (SIAM PP) 2004}, San Francisco, CA,
USA}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann04diskless.ppt.pdf",
abstract = "This talk discusses the issue of fault-tolerance in
distributed computer systems with tens or hundreds of
thousands of diskless processor units. Such systems, like the
IBM Blue Gene/L, are predicted to be deployed in the next
five to ten years. Since a 100,000-processor system is going
to be less reliable, scientific applications need to be able
to recover from occurring failures more efficiently. In this
paper, we adapt the present technique of diskless
checkpointing to such huge distributed systems in order to
equip existing scientific algorithms with super-scalable
fault-tolerance. First, we discuss the method of diskless
checkpointing, then we adapt this technique to super-scale
architectures and finally we present results from an
implementation of the Fast Fourier Transform that uses the
adapted technique to achieve super-scale fault-tolerance."
}
% Seminar talk at the ORNL Computer Science and Mathematics Division,
% January 29, 2004 (see howpublished/month/year fields below).
@misc{engelmann04superscalable,
author = "Christian Engelmann",
title = "Super-scalable Algorithms -- {N}ext Generation Supercomputing
on 100,000 and more Processors",
month = jan # "~29, ",
year = "2004",
howpublished = "{Seminar at the \href{http://www.csm.ornl.gov}{Computer
Science and Mathematics Division}, \href{http://www.ornl.gov}
{Oak Ridge National Laboratory}, Oak Ridge, TN, USA}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann04superscalable.ppt.pdf",
abstract = "This talk discusses recent research into the issues and
potential problems of algorithm scalability and
fault-tolerance on next-generation high-performance computer
systems with tens and even hundreds of thousands of
processors. Such massively parallel computers, like the IBM
Blue Gene/L, are going to be deployed in the next five to ten
years and existing deficiencies in scalability and
fault-tolerance need to be addressed soon. Scientific
algorithms have shown poor scalability on 10,000-processor
systems that exist today. Furthermore, future systems will be
less reliable due to the large number of components.
Super-scalable algorithms, which have the properties of scale
invariance and natural fault-tolerance, are able to get the
correct answer despite multiple task failures and without
checkpointing. We will show that such algorithms exist for a
wide variety of problems, such as finite difference, finite
element, multigrid and global maximum. Despite these
findings, traditional algorithms may still be preferred due
to their known behavior, or simply because a super-scalable
algorithm does not exist or is hard to find for a particular
problem. In this case, we propose a peer-to-peer diskless
checkpointing algorithm that can provide scale invariant
fault-tolerance."
}
% NOTE(review): citation key reads "03" but the entry is dated February 2004
% (month/year fields below) -- verify the date. Do not rename the key: that
% would break any existing \cite{engelmann03distributed} commands.
@misc{engelmann03distributed,
author = "Christian Engelmann",
title = "Distributed Peer-to-Peer Control for {Harness}",
month = feb # "~11, ",
year = "2004",
howpublished = "{Seminar at the \href{http://www.csc.ncsu.edu}{Department of
Computer Science}, \href{http://www.ncsu.edu}{North Carolina
State University}, Raleigh, NC, USA}",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann03distributed.ppt.pdf",
abstract = "Harness is an adaptable fault-tolerant virtual machine
environment for next-generation heterogeneous distributed
computing developed as a follow on to PVM. It additionally
enables the assembly of applications from plug-ins and
provides fault-tolerance. This work describes the distributed
control, which manages global state replication to ensure a
high-availability of service. Group communication services
achieve an agreement on an initial global state and a linear
history of global state changes at all members of the
distributed virtual machine. This global state is replicated
to all members to easily recover from single, multiple and
cascaded faults. A peer-to-peer ring network architecture and
tunable multi-point failure conditions provide heterogeneity
and scalability. Finally, the integration of the distributed
control into the multi-threaded kernel architecture of
Harness offers a fault-tolerant global state database service
for plug-ins and applications."
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Theses
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@phdthesis{engelmann08symmetric3,
author = "Christian Engelmann",
title = "Symmetric Active/Active High Availability for
High-Performance Computing System Services",
year = "2008",
school = "\href{http://www.cs.reading.ac.uk}{Department of Computer
Science}, \href{http://www.reading.ac.uk}{University of
Reading}, UK",
note = "Thesis research performed at Oak Ridge National Laboratory.
Advisor: Prof. Vassil N. Alexandrov (University of Reading)",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric3.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric3.ppt.pdf",
abstract = "In order to address anticipated high failure rates,
reliability, availability and serviceability have become an
urgent priority for next-generation high-performance
computing (HPC) systems. This thesis aims to pave the way for
highly available HPC systems by focusing on their most
critical components and by reinforcing them with appropriate
high availability solutions. Service components, such as head
and service nodes, are the Achilles heel of a HPC system.
A failure typically results in a complete system-wide outage.
This thesis targets efficient software state replication
mechanisms for service component redundancy to achieve high
availability as well as high performance. Its methodology
relies on defining a modern theoretical foundation for
providing service-level high availability, identifying
availability deficiencies of HPC systems, and comparing
various service-level high availability methods. This thesis
showcases several developed proof-of-concept prototypes
providing high availability for services running on HPC head
and service nodes using the symmetric active/active
replication method, i.e., state-machine replication, to
complement prior work in this area using active/standby and
asymmetric active/active configurations. Presented
contributions include a generic taxonomy for service high
availability, an insight into availability deficiencies of
HPC systems, and a unified definition of service-level high
availability methods. Further contributions encompass a fully
functional symmetric active/active high availability
prototype for a HPC job and resource management service that
does not require modification of the service, a fully
functional symmetric active/active high availability
prototype for a HPC parallel file system metadata service
that offers high performance, and two preliminary prototypes
for a transparent symmetric active/active replication
software framework for client-service and dependent service
scenarios that hide the replication infrastructure from
clients and services. Assuming a mean-time to failure of
5,000 hours for a head or service node, all presented
prototypes improve service availability from 99.285\% to
99.995\% in a two-node system, and to 99.99996\% with three
nodes."
}
@mastersthesis{engelmann01distributed2,
author = "Christian Engelmann",
title = "Distributed Peer-to-Peer Control for {Harness}",
month = jul # "~7, ",
year = "2001",
school = "\href{http://www.cs.reading.ac.uk}{Department of Computer
Science}, \href{http://www.reading.ac.uk}{University of
Reading}, UK",
note = "Thesis research performed at Oak Ridge National Laboratory.
Double diploma in conjunction with the
\href{http://www.f1.fhtw-berlin.de}{Department of
Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
College for Engineering and Economics (FHTW) Berlin},
Germany. Advisors: Prof. Vassil N. Alexandrov (University of
Reading); George A. (Al) Geist (Oak Ridge National
Laboratory)",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed2.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed2.ppt.pdf",
abstract = "Parallel processing, the method of cutting down a large
computational problem into many small tasks which are solved
in parallel, is a field of increasing importance in science.
Cost-effective, flexible and efficient simulations of
mathematical models of physical, chemical or biological
real-world problems are replacing the traditional
experimental research. Current software solutions for
parallel and scientific computation, like Parallel Virtual
Machine and Message Passing Interface, have limitations in
handling faults and failures, in utilizing heterogeneous and
dynamically changing communication structures, and in
enabling migrating or cooperative applications. The current
research in heterogeneous adaptable reconfigurable networked
systems (Harness) aims to produce the next generation of
software solutions for distributed computing. A
highly available and lightweight distributed virtual
machine service provides an encapsulation of a few hundred
to a few thousand physical machines in a virtual
heterogeneous large scale cluster. A high availability of
a service in distributed systems can be achieved by
replication of the service state on multiple server
processes. If one or more server processes fail, the
surviving ones continue to provide the service because they
know the state. Since every member of a distributed virtual
machine is part of the distributed virtual machine service
state and is able to change this state, a distributed control
is needed to replicate the state and maintain its
consistency. This distributed control manages state changes
as well as the state-replication and the detection of and
recovery from faults and failures of server processes. This
work analyzes system architectures currently used in
heterogeneous distributed computing by defining terms,
conditions and assumptions. It shows that such systems are
asynchronous and may use partially synchronous communication
to detect and to distinguish different classes of faults and
failures. It describes how a high availability of a large
scale distributed service on a huge number of servers
residing on different geographical locations can be realized.
Asynchronous group communication services, such as Reliable
Broadcast, Atomic Broadcast, Distributed Agreement and
Membership, are analyzed to develop linear scalable
algorithms in a unidirectional and in a bidirectional
connected asynchronous peer-to-peer ring architecture.
A Transaction Control group communication service is
introduced as state-replication service. The system analysis
distinguishes different types of distributed systems, where
active transactions execute state changes using
non-replicated data of one or more servers and inactive
transactions report state changes using replicated data only.
It is applicable for passive fault-tolerant distributed
databases as well as for active fault-tolerant distributed
control mechanisms. No control token is used and time stamps
are avoided, so that all members of a server group have equal
responsibilities and are independent from the system time.
A prototype which implements the most complicated Transaction
Control algorithm is realized due to the complexity of the
distributed system and the early development stage of the
introduced algorithms. The prototype is used to obtain
practical experience with the state-replication algorithm."
}
@mastersthesis{engelmann01distributed,
author = "Christian Engelmann",
title = "Distributed Peer-to-Peer Control for {Harness}",
month = feb # "~23, ",
year = "2001",
school = "\href{http://www.f1.fhtw-berlin.de}{Department of
Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
College for Engineering and Economics (FHTW) Berlin},
Germany",
note = "Thesis research performed at Oak Ridge National Laboratory.
Double diploma in conjunction with the
\href{http://www.cs.reading.ac.uk}{Department of Computer
Science}, \href{http://www.reading.ac.uk}{University of
Reading}, UK. Advisors: Prof. Uwe Metzler (Technical College
for Engineering and Economics (FHTW) Berlin); George A. (Al)
Geist (Oak Ridge National Laboratory)",
url = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed.ppt.pdf",
abstract = "Parallel processing, the method of cutting down a large
computational problem into many small tasks which are solved
in parallel, is a field of increasing importance in science.
Cost-effective, flexible and efficient simulations of
mathematical models of physical, chemical or biological
real-world problems are replacing the traditional
experimental research. Current software solutions for
parallel and scientific computation, like Parallel Virtual
Machine and Message Passing Interface, have limitations in
handling faults and failures, in utilizing heterogeneous and
dynamically changing communication structures, and in
enabling migrating or cooperative applications. The current
research in heterogeneous adaptable reconfigurable networked
systems (Harness) aims to produce the next generation of
software solutions for distributed computing. A
highly available and lightweight distributed virtual
machine service provides an encapsulation of a few hundred
to a few thousand physical machines in a virtual
heterogeneous large scale cluster. A high availability of
a service in distributed systems can be achieved by
replication of the service state on multiple server
processes. If one or more server processes fail, the
surviving ones continue to provide the service because they
know the state. Since every member of a distributed virtual
machine is part of the distributed virtual machine service
state and is able to change this state, a distributed control
is needed to replicate the state and maintain its
consistency. This distributed control manages state changes
as well as the state-replication and the detection of and
recovery from faults and failures of server processes. This
work analyzes system architectures currently used in
heterogeneous distributed computing by defining terms,
conditions and assumptions. It shows that such systems are
asynchronous and may use partially synchronous communication
to detect and to distinguish different classes of faults and
failures. It describes how a high availability of a large
scale distributed service on a huge number of servers
residing on different geographical locations can be realized.
Asynchronous group communication services, such as Reliable
Broadcast, Atomic Broadcast, Distributed Agreement and
Membership, are analyzed to develop linear scalable
algorithms in a unidirectional and in a bidirectional
connected asynchronous peer-to-peer ring architecture.
A Transaction Control group communication service is
introduced as state-replication service. The system analysis
distinguishes different types of distributed systems, where
active transactions execute state changes using
non-replicated data of one or more servers and inactive
transactions report state changes using replicated data only.
It is applicable for passive fault-tolerant distributed
databases as well as for active fault-tolerant distributed
control mechanisms. No control token is used and time stamps
are avoided, so that all members of a server group have equal
responsibilities and are independent from the system time.
A prototype which implements the most complicated Transaction
Control algorithm is realized due to the complexity of the
distributed system and the early development stage of the
introduced algorithms. The prototype is used to obtain
practical experience with the state-replication algorithm."
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Co-advised Theses
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% M.Sc. thesis, University of Reading, March 2007; research performed at Oak
% Ridge National Laboratory (see "note"). "url2" (slides) is a non-standard
% field that standard BibTeX styles silently ignore.
@mastersthesis{koenning07virtualized,
author = "Bj{\"o}rn K{\"o}nning",
title = "Virtualized Environments for the {Harness Workbench}",
month = mar # "~14, ",
year = "2007",
school = "\href{http://www.cs.reading.ac.uk}{Department of Computer
Science}, \href{http://www.reading.ac.uk}{University of
Reading}, UK",
note = "Thesis research performed at Oak Ridge National Laboratory.
Advisors: Prof. Vassil N. Alexandrov (University of Reading);
Christian Engelmann (Oak Ridge National Laboratory)",
url = "http://www.csm.ornl.gov/~engelman/students/koenning07virtualized.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/students/koenning07virtualized.ppt.pdf",
abstract = "The expanded use of computational sciences today leads to a
significant need of high performance computing systems. High
performance computing is currently undergoing vigorous
revival, and multiple efforts are underway to develop much
faster computing systems in the near future. New software
tools are required for the efficient use of petascale
computing systems. With the new Harness Workbench Project
the Oak Ridge National Laboratory intends to develop an
appropriate development and runtime environment for high
performance computing platforms. This dissertation project
is part of the Harness Workbench Project, and deals with the
development of a concept for virtualised environments and
various approaches to create and describe them. The developed
virtualisation approach is based on the \verb|chroot|
mechanism and uses platform-independent environment
descriptions. File structures and environment variables are
emulated to provide the portability of computational software
over diverse high performance computing platforms. Security
measures and sandbox characteristic are integrable."
}
% M.Sc. thesis, University of Reading, March 2007 (double diploma with FHTW
% Berlin; research performed at ORNL — see "note"). "url2" (slides) is a
% non-standard field that standard BibTeX styles silently ignore.
@mastersthesis{weber07high,
author = "Matthias Weber",
title = "High Availability for the {Lustre} File System",
month = mar # "~14, ",
year = "2007",
school = "\href{http://www.cs.reading.ac.uk}{Department of Computer
Science}, \href{http://www.reading.ac.uk}{University of
Reading}, UK",
note = "Thesis research performed at Oak Ridge National Laboratory.
Double diploma in conjunction with the
\href{http://www.f1.fhtw-berlin.de}{Department of
Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
College for Engineering and Economics (FHTW) Berlin},
Germany. Advisors: Prof. Vassil N. Alexandrov (University of
Reading); Christian Engelmann (Oak Ridge National
Laboratory)",
url = "http://www.csm.ornl.gov/~engelman/students/weber07high.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/students/weber07high.ppt.pdf",
abstract = "With the growing importance of high performance computing
and, more importantly, the fast growing size of sophisticated
high performance computing systems, research in the area of
high availability is essential to meet the needs to sustain
the current growth. This Master thesis project aims to
improve the availability of Lustre. Major concern of this
project is the metadata server of the file system. The
metadata server of Lustre suffers from the last single point
of failure in the file system. To overcome this single point
of failure an active/active high availability approach is
introduced. The new file system design with multiple MDS
nodes running in virtual synchrony leads to a significant
increase of availability. Two prototype implementations aim
to show how the proposed system design and its new realized
form of symmetric active/active high availability can be
accomplished in practice. The results of this work point out
the difficulties in adapting the file system to the
active/active high availability design. Tests identify not
achieved functionality and show performance problems of the
proposed solution. The findings of this dissertation may be
used for further work on high availability for distributed
file systems."
}
% M.Sc. thesis, University of Reading, March 2006 (double diploma with FHTW
% Berlin; research performed at ORNL — see "note"). "url2" (slides) is a
% non-standard field that standard BibTeX styles silently ignore.
@mastersthesis{baumann06design,
author = "Ronald Baumann",
title = "Design and Development of Prototype Components for the
{Harness} High-Performance Computing Workbench",
month = mar # "~6, ",
year = "2006",
school = "\href{http://www.cs.reading.ac.uk}{Department of Computer
Science}, \href{http://www.reading.ac.uk}{University of
Reading}, UK",
note = "Thesis research performed at Oak Ridge National Laboratory.
Double diploma in conjunction with the
\href{http://www.f1.fhtw-berlin.de}{Department of
Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
College for Engineering and Economics (FHTW) Berlin},
Germany. Advisors: Prof. Vassil N. Alexandrov (University of
Reading); George A. (Al) Geist and Christian Engelmann (Oak
Ridge National Laboratory)",
url = "http://www.csm.ornl.gov/~engelman/students/baumann06design.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/students/baumann06design.ppt.pdf",
abstract = "This master thesis examines plug-in technology, especially
the new field of parallel plug-ins. Plug-ins are popular
because they extend the capabilities of software packages
such as browsers and Photoshop, and allow an individual user
to add new functionality. Parallel plug-ins also provide the
above capabilities to a distributed set of resources, i.e.,
a plug-in now becomes a set of coordinating plug-ins. Second,
the set of plugins may be heterogeneous either in function or
because the underlying resources are heterogeneous. This new
dimension of complexity provides a rich research space which
is explored in this thesis. Experiences are collected and
presented as parallel plug-in paradigms and concepts. The
Harness framework was used in this project, in particular the
plugin manager and available communication capabilities.
Plug-ins provide methods for users to extend Harness
according to their requirements. The result of this thesis is
a parallel plug-in paradigm and template for Harness. Users
of the Harness environment will be able to design and
implement their applications in the form of parallel plug-ins
easier and faster by using the paradigm resulting from this
project. Prototypes were implemented which handle different
aspects of parallel plug-ins. Parallel plug-in configurations
were tested on an appropriate number of Harness kernels,
including available communication and error-handling
capabilities. Furthermore, research was done in the area of
fault tolerance while parallel plug-ins are (un)loaded, as
well as while a task is performed."
}
% M.Sc. thesis, University of Reading, March 2006 (double diploma with FHTW
% Berlin; research performed at ORNL — see "note"). "url2" (slides) is a
% non-standard field that standard BibTeX styles silently ignore.
% FIX(review): the entry declared "school" twice (an \href variant and a plain
% variant); BibTeX warns about and ignores a repeated field. The plain
% duplicate was removed, the \href variant kept, and the field order
% normalized to match the sibling thesis entries. Two typos in the abstract
% corrected ("projects focus" -> "project's focus", "extent" -> "extend").
@mastersthesis{uhlemann06high,
author = "Kai Uhlemann",
title = "High Availability for High-End Scientific Computing",
month = mar # "~6, ",
year = "2006",
school = "\href{http://www.cs.reading.ac.uk}{Department of Computer
Science}, \href{http://www.reading.ac.uk}{University of
Reading}, UK",
note = "Thesis research performed at Oak Ridge National Laboratory.
Double diploma in conjunction with the
\href{http://www.f1.fhtw-berlin.de}{Department of
Engineering~I}, \href{http://www.f1.fhtw-berlin.de}{Technical
College for Engineering and Economics (FHTW) Berlin},
Germany. Advisors: Prof. Vassil N. Alexandrov (University of
Reading); George A. (Al) Geist and Christian Engelmann (Oak
Ridge National Laboratory)",
url = "http://www.csm.ornl.gov/~engelman/students/uhlemann06high.pdf",
url2 = "http://www.csm.ornl.gov/~engelman/students/uhlemann06high.ppt.pdf",
abstract = "With the growing interest and popularity in high performance
cluster computing and, more importantly, the fast growing
size of compute clusters, research in the area of high
availability is essential to meet the needs to sustain the
current growth. This Master thesis project introduces a new
approach for high availability focusing on the head node of a
cluster system. This project's focus is on providing high
availability to the job scheduler service, which is the most
vital part of the traditional Beowulf-style cluster
architecture. This research seeks to add high availability to
the job scheduler service and resource management system,
typically running on the head node, leading to a significant
increase of availability for cluster computing. Also, this
software project takes advantage of the virtual synchrony
paradigm to achieve active/active replication, the highest
form of high availability. A proof-of-concept implementation
shows how high availability can be designed in software and
what results can be expected of such a system. The results
may be reused for future or existing projects to further
improve and extend the high availability of compute
clusters."
}