#!/bin/bash
set -eu
DEBUG="true" # set false if the verbosity is a problem
SCRIPT_NAME=$(basename $0)
function log_debug {
if [[ $DEBUG = "true" ]]; then
echo "`date` $SCRIPT_NAME tripleo-upgrade $(facter hostname) $1"
fi
}
function is_bootstrap_node {
if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid | tr '[:upper:]' '[:lower:]')" = "$(facter hostname | tr '[:upper:]' '[:lower:]')" ]; then
log_debug "Node is bootstrap"
echo "true"
fi
}
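# Illustrative usage (hypothetical caller): the function prints "true" on
# the node whose hostname matches hiera's bootstrap_nodeid and prints
# nothing elsewhere, so callers test its output with -n/-z rather than
# the exit code, e.g.:
#   if [[ -n $(is_bootstrap_node) ]]; then pcs resource cleanup; fi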
function check_resource_pacemaker {
if [ "$#" -ne 3 ]; then
echo_error "ERROR: check_resource function expects 3 parameters, $# given"
exit 1
fi
local service=$1
local state=$2
local timeout=$3
if [[ -z $(is_bootstrap_node) ]] ; then
log_debug "Node isn't bootstrap, skipping check for $service to be $state here "
return
else
log_debug "Node is bootstrap checking $service to be $state here"
fi
if [ "$state" = "stopped" ]; then
match_for_incomplete='Started'
else # started
match_for_incomplete='Stopped'
fi
nodes_local=$(pcs status | grep ^Online | sed 's/.*\[ \(.*\) \]/\1/g' | sed 's/ /\|/g')
if timeout -k 10 $timeout crm_resource --wait; then
node_states=$(pcs status --full | grep "$service" | grep -v Clone | { egrep "$nodes_local" || true; } )
if echo "$node_states" | grep -q "$match_for_incomplete"; then
echo_error "ERROR: cluster finished transition but $service was not in $state state, exiting."
exit 1
else
echo "$service has $state"
fi
else
echo_error "ERROR: cluster remained unstable for more than $timeout seconds, exiting."
exit 1
fi
}
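# Illustrative call (hypothetical resource name and timeout):
#   check_resource_pacemaker haproxy stopped 600
# This waits up to 600 seconds for the cluster to settle, then fails if
# any Online node still reports haproxy as Started.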
function pcmk_running {
if [[ $(systemctl is-active pacemaker) = "active" ]] ; then
echo "true"
fi
}
function is_systemd_unknown {
local service=$1
if [[ $(systemctl is-active "$service") = "unknown" ]]; then
log_debug "$service found to be unkown to systemd"
echo "true"
fi
}
function grep_is_cluster_controlled {
local service=$1
if [[ -n $(systemctl status $service -l | grep Drop-In -A 5 | grep pacemaker) ||
-n $(systemctl status $service -l | grep "Cluster Controlled $service") ]] ; then
log_debug "$service is pcmk managed from systemctl grep"
echo "true"
fi
}
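# For reference, a pacemaker-controlled unit typically shows a drop-in
# like the following in `systemctl status` (illustrative output):
#   Drop-In: /run/systemd/system/openstack-cinder-volume.service.d
#            └─50-pacemaker.conf
# which is what the Drop-In/pacemaker grep above keys on.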
function is_systemd_managed {
local service=$1
# if we have pcmk, check to see if the service is managed there
if [[ -n $(pcmk_running) ]]; then
if [[ -z $(pcs status --full | grep $service) && -z $(is_systemd_unknown $service) ]] ; then
log_debug "$service found to be systemd managed from pcs status"
echo "true"
fi
else
# if it is "unknown" to systemd, then it is pacemaker managed
if [[ -n $(is_systemd_unknown $service) ]] ; then
return
elif [[ -z $(grep_is_cluster_controlled $service) ]] ; then
echo "true"
fi
fi
}
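# Illustrative (hypothetical service names): with pacemaker running,
#   is_systemd_managed openstack-nova-api   # prints "true" when the service
#                                           # is absent from pcs status
#   is_systemd_managed rabbitmq             # prints nothing when the service
#                                           # shows up in pcs status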
function is_pacemaker_managed {
local service=$1
# if we have pcmk, check to see if the service is managed there
if [[ -n $(pcmk_running) ]]; then
if [[ -n $(pcs status --full | grep $service) ]]; then
log_debug "$service found to be pcmk managed from pcs status"
echo "true"
fi
else
# if it is unknown to systemd, then it is pcmk managed
if [[ -n $(is_systemd_unknown $service) ]]; then
echo "true"
elif [[ -n $(grep_is_cluster_controlled $service) ]] ; then
echo "true"
fi
fi
}
function is_managed {
local service=$1
if [[ -n $(is_pacemaker_managed $service) || -n $(is_systemd_managed $service) ]]; then
echo "true"
fi
}
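# Illustrative guard (hypothetical service name):
#   if [[ -n $(is_managed openstack-glance-api) ]]; then
#     restart_service openstack-glance-api
#   fi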
function check_resource_systemd {
if [ "$#" -ne 3 ]; then
echo_error "ERROR: check_resource function expects 3 parameters, $# given"
exit 1
fi
local service=$1
local state=$2
local timeout=$3
local check_interval=3
if [ "$state" = "stopped" ]; then
match_for_incomplete='active'
else # started
match_for_incomplete='inactive'
fi
log_debug "Going to check_resource_systemd for $service to be $state"
# sanity check that the service really is systemd managed:
if [[ -z $(is_systemd_managed $service) ]]; then
echo "ERROR - $service not found to be systemd managed."
exit 1
fi
tstart=$(date +%s)
tend=$(( $tstart + $timeout ))
while (( $(date +%s) < $tend )); do
if [[ "$(systemctl is-active $service)" = $match_for_incomplete ]]; then
echo "$service not yet $state, sleeping $check_interval seconds."
sleep $check_interval
else
echo "$service is $state"
return
fi
done
echo "Timed out waiting for $service to go to $state after $timeout seconds"
exit 1
}
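# Illustrative call (hypothetical):
#   check_resource_systemd openstack-nova-api started 600
# polls `systemctl is-active openstack-nova-api` every 3 seconds for up
# to 600 seconds and exits 1 on timeout.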
function check_resource {
local service=$1
local pcmk_managed=$(is_pacemaker_managed $service)
local systemd_managed=$(is_systemd_managed $service)
if [[ -n $pcmk_managed && -n $systemd_managed ]] ; then
log_debug "ERROR $service managed by both systemd and pcmk - SKIPPING"
return
fi
if [[ -n $pcmk_managed ]]; then
check_resource_pacemaker $@
return
elif [[ -n $systemd_managed ]]; then
check_resource_systemd $@
return
fi
log_debug "ERROR cannot check_resource for $service, not managed here?"
}
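# Illustrative: callers do not need to know which manager owns the
# service, e.g. (hypothetical):
#   check_resource rabbitmq started 600
# dispatches to check_resource_pacemaker or check_resource_systemd.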
function manage_systemd_service {
local action=$1
local service=$2
log_debug "Going to systemctl $action $service"
systemctl $action $service
}
function manage_pacemaker_service {
local action=$1
local service=$2
# not if pacemaker isn't running!
if [[ -z $(pcmk_running) ]]; then
echo "$(facter hostname) pacemaker not active, skipping $action $service here"
elif [[ -n $(is_bootstrap_node) ]]; then
log_debug "Going to pcs resource $action $service"
pcs resource $action $service
fi
}
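# Illustrative (hypothetical resource): on the bootstrap node
#   manage_pacemaker_service disable galera
# runs `pcs resource disable galera`; other cluster nodes skip the call,
# since pcs resource actions are cluster-wide and only need to run once.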
function stop_or_disable_service {
local service=$1
local pcmk_managed=$(is_pacemaker_managed $service)
local systemd_managed=$(is_systemd_managed $service)
if [[ -n $pcmk_managed && -n $systemd_managed ]] ; then
log_debug "Skipping stop_or_disable $service due to management conflict"
return
fi
log_debug "Stopping or disabling $service"
if [[ -n $pcmk_managed ]]; then
manage_pacemaker_service disable $service
return
elif [[ -n $systemd_managed ]]; then
manage_systemd_service stop $service
return
fi
log_debug "ERROR: $service not managed here?"
}
function start_or_enable_service {
local service=$1
local pcmk_managed=$(is_pacemaker_managed $service)
local systemd_managed=$(is_systemd_managed $service)
if [[ -n $pcmk_managed && -n $systemd_managed ]] ; then
log_debug "Skipping start_or_enable $service due to management conflict"
return
fi
log_debug "Starting or enabling $service"
if [[ -n $pcmk_managed ]]; then
manage_pacemaker_service enable $service
return
elif [[ -n $systemd_managed ]]; then
manage_systemd_service start $service
return
fi
log_debug "ERROR $service not managed here?"
}
function restart_service {
local service=$1
local pcmk_managed=$(is_pacemaker_managed $service)
local systemd_managed=$(is_systemd_managed $service)
if [[ -n $pcmk_managed && -n $systemd_managed ]] ; then
log_debug "ERROR $service managed by both systemd and pcmk - SKIPPING"
return
fi
log_debug "Restarting $service"
if [[ -n $pcmk_managed ]]; then
manage_pacemaker_service restart $service
return
elif [[ -n $systemd_managed ]]; then
manage_systemd_service restart $service
return
fi
log_debug "ERROR $service not managed here?"
}
function echo_error {
# print the message to stdout and duplicate it on stderr
echo "$@" | tee /dev/fd/2
}
# swift is a special case because it is/was never handled by pacemaker
# when stand-alone swift is used, only swift-proxy is running on controllers
function systemctl_swift {
services=( openstack-swift-account-auditor openstack-swift-account-reaper openstack-swift-account-replicator openstack-swift-account \
openstack-swift-container-auditor openstack-swift-container-replicator openstack-swift-container-updater openstack-swift-container \
openstack-swift-object-auditor openstack-swift-object-replicator openstack-swift-object-updater openstack-swift-object openstack-swift-proxy )
local action=$1
case $action in
stop)
services=$(systemctl | grep openstack-swift- | grep running | awk '{print $1}')
;;
start)
enable_swift_storage=$(hiera -c /etc/puppet/hiera.yaml tripleo::profile::base::swift::storage::enable_swift_storage)
if [[ $enable_swift_storage != "true" ]]; then
services=( openstack-swift-proxy )
fi
;;
*) echo "Unknown action $action passed to systemctl_swift"
exit 1
;; # shouldn't ever happen...
esac
for service in ${services[@]}; do
manage_systemd_service $action $service
done
}
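# Illustrative usage:
#   systemctl_swift stop    # stops whichever openstack-swift-* units are running
#   systemctl_swift start   # starts the full list, or only the proxy when
#                           # stand-alone swift storage is disabled in hiera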
# Special-case OVS for https://bugs.launchpad.net/tripleo/+bug/1635205
# Update condition and add --notriggerun for +bug/1669714
function special_case_ovs_upgrade_if_needed {
if rpm -qa | grep "^openvswitch-2.5.0-14" || rpm -q --scripts openvswitch | awk '/postuninstall/,/*/' | grep "systemctl.*try-restart" ; then
echo "Manual upgrade of openvswitch - ovs-2.5.0-14 or restart in postun detected"
rm -rf OVS_UPGRADE
mkdir OVS_UPGRADE && pushd OVS_UPGRADE
echo "Attempting to downloading latest openvswitch with yumdownloader"
yumdownloader --resolve openvswitch
for pkg in $(ls -1 *.rpm); do
if rpm -U --test $pkg 2>&1 | grep "already installed" ; then
echo "Looks like newer version of $pkg is already installed, skipping"
else
echo "Updating $pkg with --nopostun --notriggerun"
rpm -U --replacepkgs --nopostun --notriggerun $pkg
fi
done
popd
else
echo "Skipping manual upgrade of openvswitch - no restart in postun detected"
fi
}
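# For reference (illustrative output and version): `rpm -U --test <pkg>` prints
#   package openvswitch-2.6.1-10.el7.x86_64 is already installed
# and exits non-zero when nothing newer is available, which is how the
# update loop above decides whether to skip a package.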
# This code is meant to fix https://bugs.launchpad.net/tripleo/+bug/1686357 on
# existing setups via a minor update workflow and be idempotent. We need to
# run this before the yum update because we fix this up even when there are no
# packages to update on the system (in which case the script exits).
# This code must be called with set +eu (due to the ocf scripts being sourced)
function fixup_wrong_ipv6_vip {
# This XPath query identifies all the VIPs in pacemaker that have a /64 netmask. Those are IPv6-only resources that have the wrong netmask
# This gives the address of the resource in the CIB, one address per line. For example:
# /cib/configuration/resources/primitive[@id='ip-2001.db8.ca2.4..10']/instance_attributes[@id='ip-2001.db8.ca2.4..10-instance_attributes']\
# /nvpair[@id='ip-2001.db8.ca2.4..10-instance_attributes-cidr_netmask']
vip_xpath_query="//resources/primitive[@type='IPaddr2']/instance_attributes/nvpair[@name='cidr_netmask' and @value='64']"
vip_xpath_xml_addresses=$(cibadmin --query --xpath "$vip_xpath_query" -e 2>/dev/null)
# The following extracts the @id value of the resource
vip_resources_to_fix=$(echo -e "$vip_xpath_xml_addresses" | sed -n "s/.*primitive\[@id='\([^']*\)'.*/\1/p")
# Running this in a subshell so that the sourced files cannot affect the running script
(
OCF_PATH="/usr/lib/ocf/lib/heartbeat"
if [ -n "$vip_resources_to_fix" -a -f $OCF_PATH/ocf-shellfuncs -a -f $OCF_PATH/findif.sh ]; then
source $OCF_PATH/ocf-shellfuncs
source $OCF_PATH/findif.sh
for resource in $vip_resources_to_fix; do
echo "Updating IPv6 VIP $resource with a /128 and a correct addrlabel"
# The following will give us something like:
# <nvpair id="ip-2001.db8.ca2.4..10-instance_attributes-ip" name="ip" value="2001:db8:ca2:4::10"/>
ip_cib_nvpair=$(cibadmin --query --xpath "//resources/primitive[@type='IPaddr2' and @id='$resource']/instance_attributes/nvpair[@name='ip']")
# Extract the value of the nvpair to get the ip address
ip_address=$(echo $ip_cib_nvpair | xmllint --xpath 'string(//nvpair/@value)' -)
OCF_RESKEY_cidr_netmask="64"
OCF_RESKEY_ip="$ip_address"
# Unfortunately due to https://bugzilla.redhat.com/show_bug.cgi?id=1445628
# we need to find out the appropriate nic given the ip address.
nic=$(findif $ip_address | awk '{ print $1 }')
ret=$?
if [ -z "$nic" -o $ret -ne 0 ]; then
echo "NIC autodetection failed for VIP $ip_address, not updating VIPs"
# Only exits the subshell
exit 1
fi
ocf_run -info pcs resource update --wait "$resource" ip="$ip_address" cidr_netmask=128 nic="$nic" lvs_ipv6_addrlabel=true lvs_ipv6_addrlabel_value=99
ret=$?
if [ $ret -ne 0 ]; then
echo "pcs resource update for VIP $resource failed, not updating VIPs"
# Only exits the subshell
exit 1
fi
done
fi
)
}
# https://bugs.launchpad.net/tripleo/+bug/1704131 guard against yum update
# waiting for an existing process until the heat stack time out
function check_for_yum_lock {
if [[ -f /var/run/yum.pid ]] ; then
ERR="ERROR existing yum.pid detected - can't continue! Please ensure
there is no other package update process for the duration of the minor update
worfklow. Exiting."
echo $ERR
exit 1
fi
}
# This function tries to resolve an RPM dependency issue that can arise when
# updating ceph packages on nodes that do not run the ceph-osd service. These
# nodes do not require the ceph-osd package, and updates will fail if the
# ceph-osd package cannot be updated because it's not available in any enabled
# repo. The dependency issue is resolved by removing the ceph-osd package from
# nodes that don't require it.
#
# No change is made to nodes that use the ceph-osd service (e.g. ceph storage
# nodes, and hyperconverged nodes running ceph-osd and compute services). The
# ceph-osd package is left in place, and the currently enabled repos will be
# used to update all ceph packages.
function yum_pre_update {
echo "Checking for ceph-osd dependency issues"
# No need to proceed if the ceph-osd package isn't installed
if ! rpm -q ceph-osd >/dev/null 2>&1; then
echo "ceph-osd package is not installed"
# Downstream only: ensure the Ceph OSD product key is removed if the
# ceph-osd package was previously removed.
rm -f /etc/pki/product/288.pem
return
fi
# Do not proceed if there's any sign that the ceph-osd package is in use:
# - Are there OSD entries in /var/lib/ceph/osd?
# - Are any ceph-osd processes running?
# - Are there any ceph data disks (as identified by 'ceph-disk')
if [ -n "$(ls -A /var/lib/ceph/osd 2>/dev/null)" ]; then
echo "ceph-osd package is required (there are OSD entries in /var/lib/ceph/osd)"
return
fi
if [ "$(pgrep -xc ceph-osd)" != "0" ]; then
echo "ceph-osd package is required (there are ceph-osd processes running)"
return
fi
if ceph-disk list |& grep -q "ceph data"; then
echo "ceph-osd package is required (ceph data disks detected)"
return
fi
# Get a list of all ceph packages available from the currently enabled
# repos. Use "--showduplicates" to ensure the list includes installed
# packages that happen to be up to date.
local ceph_pkgs="$(yum list available --showduplicates 'ceph-*' |& awk '/^ceph/ {print $1}' | sort -u)"
# No need to proceed if no ceph packages are available from the currently
# enabled repos.
if [ -z "$ceph_pkgs" ]; then
echo "ceph packages are not available from any enabled repo"
return
fi
# No need to proceed if the ceph-osd package *is* available
if [[ $ceph_pkgs =~ ceph-osd ]]; then
echo "ceph-osd package is available from an enabled repo"
return
fi
echo "ceph-osd package is not required, but is preventing updates to other ceph packages"
echo "Removing ceph-osd package to allow updates to other ceph packages"
# Check yum's exit status directly; under set -e a separate $? test is dead code
if yum -y remove ceph-osd; then
# Downstream only: remove the Ceph OSD product key (rhbz#1500594)
rm -f /etc/pki/product/288.pem
fi
}
#!/bin/bash
# A heat-config-script which runs yum update during a stack-update.
# Inputs:
# deploy_action - yum will only be run if this is UPDATE
# update_identifier - yum will only run for previously unused values of update_identifier
# command - yum sub-command to run, defaults to "update"
# command_arguments - yum command arguments, defaults to ""
echo "Started yum_update.sh on server $deploy_server_id at `date`"
echo -n "false" > $heat_outputs_path.update_managed_packages
if [ -f /.dockerenv ]; then
echo "Not running due to running inside a container"
exit 0
fi
if [[ -z "$update_identifier" ]]; then
echo "Not running due to unset update_identifier"
exit 0
fi
timestamp_dir=/var/lib/overcloud-yum-update
mkdir -p $timestamp_dir
# sanitise to remove unusual characters
update_identifier=${update_identifier//[^a-zA-Z0-9-_]/}
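# e.g. (illustrative) an update_identifier of "update 2017-08-01T12:00"
# becomes "update2017-08-01T1200": spaces and colons are stripped so the
# timestamp file name stays shell-safe.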
# seconds to wait for this node to rejoin the cluster after update
cluster_start_timeout=600
galera_sync_timeout=1800
cluster_settle_timeout=1800
timestamp_file="$timestamp_dir/$update_identifier"
if [[ -a "$timestamp_file" ]]; then
echo "Not running for already-run timestamp \"$update_identifier\""
exit 0
fi
touch "$timestamp_file"
pacemaker_status=""
# We include word boundaries in order to not match pacemaker_remote
if hiera -c /etc/puppet/hiera.yaml service_names | grep -q '\bpacemaker\b'; then
pacemaker_status=$(systemctl is-active pacemaker)
fi
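# e.g. (illustrative) a service_names list containing only
# "pacemaker_remote" leaves pacemaker_status empty, while one containing
# "pacemaker" sets it from `systemctl is-active pacemaker`.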
# (NB: when backporting this s/pacemaker_short_bootstrap_node_name/bootstrap_nodeid)
# This runs before the yum_update so we are guaranteed to run it even in the absence
# of packages to update (the check for -z "$update_identifier" guarantees that this
# is run only on overcloud stack update -i)
if [[ "$pacemaker_status" == "active" && \
"$(hiera -c /etc/puppet/hiera.yaml pacemaker_short_bootstrap_node_name | tr '[:upper:]' '[:lower:]')" == "$(facter hostname | tr '[:upper:]' '[:lower:]')" ]] ; then \
# OCF scripts don't cope with -eu
echo "Verifying if we need to fix up any IPv6 VIPs"
set +eu
fixup_wrong_ipv6_vip
ret=$?
set -eu
if [ $ret -ne 0 ]; then
echo "Fixing up IPv6 VIPs failed. Stopping here. (See https://bugs.launchpad.net/tripleo/+bug/1686357 for more info)"
exit 1
fi
fi
command_arguments=${command_arguments:-}
# Always ensure yum has full cache
check_for_yum_lock
yum makecache || echo "Yum makecache failed. This can cause failure later on."
# yum check-update exits 100 if updates are available
check_for_yum_lock
set +e
check_update=$(yum check-update 2>&1)
check_update_exit=$?
set -e
if [[ "$check_update_exit" == "1" ]]; then
echo "Failed to check for package updates"
echo "$check_update"
exit 1
elif [[ "$check_update_exit" != "100" ]]; then
echo "No packages require updating"
exit 0
fi
# special case https://bugs.launchpad.net/tripleo/+bug/1635205 +bug/1669714
special_case_ovs_upgrade_if_needed
# Resolve any RPM dependency issues before attempting the update
check_for_yum_lock
yum_pre_update
if [[ "$pacemaker_status" == "active" ]] ; then
echo "Pacemaker running, stopping cluster node and doing full package update"
node_count=$(pcs status xml | grep -o "<nodes_configured.*/>" | grep -o 'number="[0-9]*"' | grep -o "[0-9]*")
if [[ "$node_count" == "1" ]] ; then
echo "Active node count is 1, stopping node with --force"
pcs cluster stop --force
else
pcs cluster stop
fi
else
echo "Upgrading Puppet modules and dependencies"
check_for_yum_lock
yum -q -y update puppet-tripleo
yum deplist puppet-tripleo | awk '/dependency/{print $2}' | xargs yum -q -y update
echo "Upgrading other packages is handled by config management tooling"
echo -n "true" > $heat_outputs_path.update_managed_packages
exit 0
fi
command=${command:-update}
full_command="yum -q -y $command $command_arguments"
echo "Running: $full_command"
check_for_yum_lock
# capture yum's exit code without tripping set -e
set +e
result=$($full_command)
return_code=$?
set -e
echo "$result"
echo "yum return code: $return_code"
if [[ "$pacemaker_status" == "active" ]] ; then
echo "Starting cluster node"
pcs cluster start
hostname=$(hostname -s)
tstart=$(date +%s)
while [[ "$(pcs status | grep "^Online" | grep -F -o $hostname)" == "" ]]; do
sleep 5
tnow=$(date +%s)
if (( tnow-tstart > cluster_start_timeout )) ; then
echo "ERROR $hostname failed to join cluster in $cluster_start_timeout seconds"
pcs status
exit 1
fi
done
RETVAL=$( pcs resource show galera-master | grep wsrep_cluster_address | grep -q `crm_node -n` ; echo $? )
if [[ $RETVAL -eq 0 && -e /etc/sysconfig/clustercheck ]]; then
tstart=$(date +%s)
while ! clustercheck; do
sleep 5
tnow=$(date +%s)
if (( tnow-tstart > galera_sync_timeout )) ; then
echo "ERROR galera sync timed out"
exit 1
fi
done
fi
echo "Waiting for pacemaker cluster to settle"
if ! timeout -k 10 $cluster_settle_timeout crm_resource --wait; then
echo "ERROR timed out while waiting for the cluster to settle"
exit 1
fi
pcs status
fi
echo "Finished yum_update.sh on server $deploy_server_id at `date` with return code: $return_code"
exit $return_code