#!/bin/bash
#
# Copyright (c) 2010-2013 SUSE LLC, All Rights Reserved.
#
# Author: Tim Serong <tserong@suse.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it would be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# Further, this software is distributed without any warranty that it is
# free of the rightful claim of any third person regarding infringement
# or the like.  Any license provided herein, whether implied or
# otherwise, applies only to this software file.  Patent licenses, if
# any, provided herein do not apply to combinations of this program with
# other software, or any other product whatsoever.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA.
#

. /usr/lib/ha-cluster-functions

# Script-wide settings, declared up front without a value.  They are filled
# in by the command line options (-p, -s, -o and -t respectively); the two
# device variables are also assigned by init_storage.
declare SHARED_DEVICE SBD_DEVICE OCFS2_DEVICE TEMPLATE

# Print the command line help to stdout and terminate the script with
# exit status 0.  Never returns to the caller.
usage()
{
	# Only the first line needs an expansion (the script name); the rest
	# of the text is emitted literally.
	printf 'Usage: %s [options] [stage]\n' "$0"
	cat <<'END'

Options:
    -h,--help   Display this usage information
    -q          Be quiet (don't describe what's happening, just do it)
    -y          Answer "yes" to all prompts (use with caution, this is
                destructive, especially during the "storage" stage)

    -i <if>     Default to IP address on interface 'if' instead of eth0
    -s <dev>    Block device to use for SBD fencing

    -t <name>   Optionally configure cluster with template 'name'
                (currently only "ocfs2" is valid here)

Options for ocfs2 template:
    -p <dev>    Partition this shared storage device (only used in
                "storage" stage)
    -o <dev>    Block device to use for OCFS2 (only used in "vgfs" stage)

Stage can be one of:
    ssh         Create SSH keys for passwordless SSH between cluster nodes
    csync2      Configure csync2
    corosync    Configure corosync
    storage     Partition shared storage (ocfs2 template only)
    sbd         Configure SBD (requires -s <dev>)
    cluster     Bring the cluster online
    vgfs        Create volume group and filesystem (ocfs2 template only,
                requires -o <dev>)

Note:
  - If stage is not specified, the script will run through each stage
    in sequence, with prompts for required information.
  - If using the ocfs2 template, the storage stage will partition a block
    device into two pieces, one for SBD, the remainder for OCFS2.  This is
    good for testing and demonstration, but not ideal for production.
    To use storage you have already configured, pass -s and -o to specify
    the block devices for SBD and OCFS2, and the automatic partitioning
    will be skipped.
END
	exit 0
}

# "ssh" stage: start sshd, create root's RSA key pair (asking before an
# existing private key is replaced) and add the new public key to root's
# authorized_keys.  Returns early, with confirm's status, if the user
# declines to overwrite an existing key.
init_ssh()
{
	local ssh_dir=/root/.ssh
	local priv_key=$ssh_dir/id_rsa

	start_service sshd.service

	invoke mkdir -m 700 -p "$ssh_dir"

	if [ -f "$priv_key" ]; then
		confirm "$priv_key already exists - overwrite?" || return
		invoke rm -f "$priv_key"
	fi

	status "Generating ssh key"
	# Empty passphrase (-N '') so the key can be used non-interactively.
	if ! invoke ssh-keygen -q -f "$priv_key" -C 'Cluster Internal' -N ''; then
		error "Failed to generate SSH key"
	fi
	append "$priv_key.pub" "$ssh_dir/authorized_keys"
}

# This handles the slightly obscure case where the seed node has ssh keys,
# but those keys aren't present in the seed node's authorized_keys file.
#
# For every private key type found in /root/.ssh, the matching public key
# is appended to /root/.ssh/authorized_keys unless it is already listed.
init_ssh_remote()
{
	local key pub pubkey
	local auth=/root/.ssh/authorized_keys

	for key in id_rsa id_dsa id_ecdsa id_ed25519 ; do
		[ -f "/root/.ssh/$key" ] || continue

		# A private key without a readable, non-empty public half gives us
		# nothing to authorize (and an empty pattern would match any line).
		pub=/root/.ssh/$key.pub
		[ -s "$pub" ] || continue
		pubkey=$(cat "$pub") || continue
		[ -n "$pubkey" ] || continue

		# -F: the key is literal text, not a regular expression.
		grep -q -s -F -- "$pubkey" "$auth" || append "$pub" "$auth"
	done
}

# "csync2" stage: generate a fresh csync2 shared key, write a csync2
# configuration listing this host and the files to synchronise, start the
# csync2 socket and run an initial check.
#
# Globals:  CSYNC2_KEY (path of the key file), CSYNC2_CFG (path of the
#           csync2 configuration file)
# Returns early, with confirm's status, if a key or a non-trivial
# configuration already exists and the user declines to overwrite it.
init_csync2()
{
	local tmp_conf

	status "Configuring csync2"

	# Not automatically updating /etc/hosts - risky in the general case.
	#etc_hosts_add_me
	#[ -n "$(etc_hosts_get_me)" ] || error "No valid entry for $(hostname) in /etc/hosts - csync2 can't work"

	# "Already configured" means: key exists, or the config file has at
	# least one line that is neither blank nor a comment.
	if [ -f "$CSYNC2_KEY" ] || \
		grep -v -q -s '^[[:space:]]*\(#.*\)*$' "$CSYNC2_CFG"
	then
		confirm 'csync2 is already configured - overwrite?' || return
	fi

	invoke rm -f "$CSYNC2_KEY"
	status_long "Generating csync2 shared key (this may take a while)"
	# On a quiet VM, "a while" can be "way too long" (not enough entropy)
	invoke csync2 -k "$CSYNC2_KEY" \
		|| error "Can't create csync2 key $CSYNC2_KEY"
	status_done

	# Build the new config next to the real one, then install it.
	tmp_conf=${CSYNC2_CFG}.$$
	cat > "$tmp_conf" <<END
group ha_group
{
	key /etc/csync2/key_hagroup;
	host $(hostname);
	include /etc/booth/booth.conf;
	include /etc/corosync/corosync.conf;
	include /etc/corosync/authkey;
	include /etc/csync2/csync2.cfg;
	include /etc/csync2/key_hagroup;
	include /etc/ctdb/nodes;
	include /etc/drbd.conf;
	include /etc/drbd.d;
	include /etc/ha.d/ldirectord.cf;
	include /etc/lvm/lvm.conf;
	include /etc/multipath.conf;
	include /etc/samba/smb.conf;
	include /etc/sysconfig/pacemaker;
	include /etc/sysconfig/sbd;
}
END
	install_tmp "$tmp_conf" "$CSYNC2_CFG"

	start_service csync2.socket

	status "csync2 checking files"
	invoke csync2 -cr /
}

# It would be nice if we could just have csync2.cfg include a directory,
# which in turn included one file per node which would be referenced via
# something like "group ha_group { ... config: /etc/csync2/hosts/*; }"
# That way, adding a new node would just mean adding a single new file
# to that directory.  Unfortunately, the 'config' statement only allows
# inclusion of specific individual files, not multiple files via wildcard.
# So we have this function which is called by ha-cluster-join to add the new
# remote node to csync2 config on some existing node.  It is intentionally
# not documented in ha-cluster-init's user-visible usage information.
#
# Arguments: $1 - hostname of the node to add (required)
# Globals:   CSYNC2_CFG (read, file is rewritten), BE_QUIET (set to true)
init_csync2_remote()
{
	local newhost=$1
	#local newline="$2"
	local tmp_conf
	[ -n "$newhost" ] || error "Hostname not specified"

	BE_QUIET=true

	# Not automatically updating /etc/hosts - risky in the general case.
	#if [ -n "$newline" ]; then
	#	etc_hosts_add_one $newhost "$newline"
	#else
	#	log ": Not updating /etc/hosts - remote host line not specified"
	#fi

	# If host doesn't already exist in csync2 config, add it
	if ! grep -E -q -s \
		"^[[:space:]]*host.*[[:space:]]+${newhost}[[:space:];]" \
		"$CSYNC2_CFG"
	then
		tmp_conf=${CSYNC2_CFG}.$$
		# Insert the new host line just before the first existing host
		# line.  The hostname is handed to awk as data (-v) rather than
		# being spliced into the program text.
		awk -v newhost="$newhost" '
			/^[[:space:]]*host[[:space:]]/ && !seen {
				print "\thost " newhost ";"
				seen = 1
			}
			{ print }
		' "$CSYNC2_CFG" > "$tmp_conf"
		install_tmp "$tmp_conf" "$CSYNC2_CFG"
		invoke csync2 -c "$CSYNC2_CFG"
	else
		log ": Not updating $CSYNC2_CFG - remote host $newhost already exists"
	fi
}

# "corosync" stage: ask for the bind network address, multicast address and
# multicast port, write a corosync configuration built from those values
# and push it out with csync2.
#
# Globals:  YES_TO_ALL, NET_IF, IP_NETWORK, COROSYNC_CONF (all read)
# Returns early, with confirm's status, if the configuration file exists
# and the user declines to overwrite it.
init_corosync()
{
	local bindnetaddr
	local mcastaddr
	local mcastport
	local tmp_conf
	local csync2_flag

	if ! $YES_TO_ALL; then
		status "
Configure Corosync:
  This will configure the cluster messaging layer.  You will need
  to specify a network address over which to communicate (default
  is ${NET_IF}'s network, but you can use the network address of any
  active interface), a multicast address and multicast port.
"
	else
		status 'Configuring corosync'
	fi

	if [ -f "$COROSYNC_CONF" ]; then
		confirm "$COROSYNC_CONF already exists - overwrite?" || return
	fi

	bindnetaddr=$(prompt_for_string \
		'Network address to bind to (e.g.: 192.168.1.0)' \
		'([0-9]+\.){3}[0-9]+' "$IP_NETWORK")
	if [ -z "$bindnetaddr" ]; then
		error 'No value for bindnetaddr'
	fi

	# Default multicast address: 239. followed by three random octets.
	mcastaddr=$(prompt_for_string \
		'Multicast address (e.g.: 239.x.x.x)' \
		'([0-9]+\.){3}[0-9]+' \
		"239.$(random_256).$(random_256).$(random_256)")
	if [ -z "$mcastaddr" ]; then
		error 'No value for mcastaddr'
	fi

	mcastport=$(prompt_for_string 'Multicast port' '[0-9]+' 5405)
	if [ -z "$mcastport" ]; then
		error 'No value for mcastport'
	fi

	# The configuration is always generated from the inline text below
	# (which forces quorum expected_votes); an earlier alternative that
	# patched bindnetaddr/mcastaddr/mcastport into
	# /etc/corosync/corosync.conf.example with sed is not used for now.
	tmp_conf=${COROSYNC_CONF}.$$
	cat > "$tmp_conf" <<END
# Please read the corosync.conf.5 manual page

totem {
	version:	2
	secauth:	off
	cluster_name:	hacluster
	clear_node_high_bit: yes

# Following are old corosync 1.4.x defaults from SLES
#	token:		5000
#	token_retransmits_before_loss_const: 10
#	join:		60
#	consensus:	6000
#	vsftype:	none
#	max_messages:	20
#	threads:	0

	crypto_cipher:	none
	crypto_hash:	none

	interface {
		ringnumber:	0
		bindnetaddr:	$bindnetaddr
		mcastaddr:	$mcastaddr
		mcastport:	$mcastport
		ttl:		1
	}
}
logging {
	fileline:	off
	to_stderr:	no
	to_logfile:	no
	logfile:	/var/log/cluster/corosync.log
	to_syslog:	yes
	debug:		off
	timestamp:	on
	logger_subsys {
		subsys:	QUORUM
		debug:	off
	}
}
quorum {
	# Enable and configure quorum subsystem (default: off)
	# see also corosync.conf.5 and votequorum.5
	provider: corosync_votequorum
	expected_votes: 1
	two_node: 0
}
END
	install_tmp "$tmp_conf" "$COROSYNC_CONF"

	# Three csync2 passes over the new file, in this order: -m, -f, -xv.
	for csync2_flag in -m -f -xv; do
		invoke csync2 $csync2_flag "$COROSYNC_CONF"
	done
}

# Non-generic, i.e. not for use with every type of cluster you can imagine:
# requires shared storage, and specifically configures one block device as
# sbd partition + remainder for OCFS2 filesystem (created later).
#
# Globals:  SHARED_DEVICE, YES_TO_ALL (read); SBD_DEVICE and OCFS2_DEVICE
#           (set to the first and second partition reported by fdisk)
init_storage()
{
	local disk="$SHARED_DEVICE"
	local existing
	local num
	local notice
	local question
	local -a nodes
	local -i accepted=0

	if $YES_TO_ALL || [ -n "$SHARED_DEVICE" ]; then
		status 'Configuring shared storage'
	else
		status "
Configure Shared Storage:
  You will need to provide the path to a shared storage device,
  for example a SAN volume or iSCSI target.  The device path must
  be persistent and consistent across all nodes in the cluster,
  so /dev/disk/by-id/* devices are a good choice.  This device
  will be automatically paritioned into two pieces, 1MB for SBD
  fencing, and the remainder for an OCFS2 filesystem.
"
	fi

	# Keep asking until the user has confirmed a block device.
	until [ $accepted -ne 0 ]; do
		disk=$(prompt_for_string \
			'Path to storage device (e.g. /dev/disk/by-id/...)' \
			'\/.*' "$disk")
		# This will be empty if -y was specified but -p wasn't
		if [ -z "$disk" ]; then
			error "No value for shared storage device"
		fi

		if [ -b "$disk" ]; then
			#
			# Got something that looks like a block device, there
			# are four possibilities now:
			#
			#  1) It's completely broken/inaccessible
			#  2) No recognizable partition table
			#  3) Empty partition table
			#  4) Non-empty partition table
			#
			# Only case 4 yields partition numbers here.
			existing=$(parted -s "$disk" print | awk '/^[[:space:]]*[0-9]+/ { print $1; }')
			if [ -n "$existing" ]; then
				# Partitions exist
				notice="WARNING: Partitions exist on $disk!"
				question='Are you ABSOLUTELY SURE you want to overwrite?'
			else
				# It's either broken, no partition table, or empty partition table
				notice="$disk appears to be empty"
				question='Are you sure you wish to use this device'
			fi
			status "$notice"
			if confirm "$question"; then
				accepted=1
			else
				# Forget the rejected path so it isn't offered again
				disk=
			fi
		elif ! { $YES_TO_ALL && error "$disk is not a block device"; }; then
			# Interactive mode: complain and prompt again
			echo "    That doesn't look like a block device" >&2
		fi
	done

	if [ -n "$existing" ]; then
		confirm 'Really?' || exit
		status_long "Erasing existing partitions..."
		for num in $existing; do
			invoke parted -s "$disk" rm "$num" \
				|| error "Failed to remove partition $num"
		done
		status_done
	fi

	status_long "Creating partitions..."
	invoke parted -s "$disk" mklabel msdos || error "Failed to create partition table"

	# This is a bit rough, and probably won't result in great performance,
	# but it's fine for test/demo purposes to carve off 1MB for SBD.  Note
	# we have to specify the size of the first partition in bytes rather
	# than MB, or parted's rounding gives us a ~30Kb partition
	# (see rhbz#623268).
	invoke parted -s "$disk" mkpart primary 0 1048576B || error "Failed to create first partition"
	invoke parted -s "$disk" mkpart primary 1M 100% || error "Failed to create second partition"

	status_done

	# TODO: May not be strictly necessary, but...
	probe_partitions

	# TODO: THIS IS *WRONG* FOR MULTIPATH! (but possibly nothing we can do about it)
	nodes=( $(fdisk -l "$disk" | awk '/^\/dev/ { print $1; }') )

	SBD_DEVICE=${nodes[0]}
	if [ -z "$SBD_DEVICE" ]; then
		error "Unable to determine device path for SBD partition"
	fi

	OCFS2_DEVICE=${nodes[1]}
	if [ -z "$OCFS2_DEVICE" ]; then
		error "Unable to determine device path for OCFS2 partition"
	fi

	status
	status "Created $SBD_DEVICE for SBD partition"
	status "Created $OCFS2_DEVICE for OCFS2 partition"
}

# "sbd" stage: initialise an SBD device and record it in the SBD sysconfig
# file, then push that file out with csync2.
#
# If SBD_DEVICE is already set (-s option, or the storage stage) it is used
# as is.  Otherwise the user is asked whether to use SBD and for a device;
# with -y and no device nothing is configured, and if the user declines,
# every active line in the sysconfig file is commented out.
#
# Globals:  SBD_DEVICE (read/set), SBD_WATCHDOG (set), SYSCONFIG_SBD,
#           YES_TO_ALL (read)
init_sbd()
{
	local dev
	local tmp_conf
	local -i dev_looks_sane=0

	if [ -z "$SBD_DEVICE" ]; then
		# SBD device not set up by init_storage (ocfs2 template) and
		# also not passed in as command line argument - prompt user
		if $YES_TO_ALL; then
			warn "Not configuring SBD ($SYSCONFIG_SBD left untouched)."
			return
		fi
		status "
Configure SBD:
  If you have shared storage, for example a SAN or iSCSI target,
  you can use it avoid split-brain scenarios by configuring SBD.
  This requires a 1 MB partition, accessible to all nodes in the
  cluster.  The device path must be persistent and consistent
  across all nodes in the cluster, so /dev/disk/by-id/* devices
  are a good choice.  Note that all data on the partition you
  specify here will be destroyed.
"

		if [ -n "$(configured_sbd_device)" ]; then
			confirm \
				"SBD is already configured - overwrite?" || return
		fi

		if ! confirm "Do you wish to use SBD?"; then
			warn "Not configuring SBD - STONITH will be disabled."
			# Comment out every active (non-comment, non-empty) line
			if [ -f "$SYSCONFIG_SBD" ]; then
				tmp_conf=$SYSCONFIG_SBD.$$
				sed -e 's/^\([^#].*\)$/#\1/g' "$SYSCONFIG_SBD" > "$tmp_conf"
				install_tmp "$tmp_conf" "$SYSCONFIG_SBD"
				invoke csync2 -m "$SYSCONFIG_SBD"
				invoke csync2 -f "$SYSCONFIG_SBD"
				invoke csync2 -xv "$SYSCONFIG_SBD"
			fi
			return
		fi
		while [ $dev_looks_sane -eq 0 ]; do
			dev=$(prompt_for_string \
				'Path to storage device (e.g. /dev/disk/by-id/...)' \
				'\/.*' "$dev")
			if [ ! -b "$dev" ]; then
				echo "    That doesn't look like a block device" >&2
			else
				status "All data on $dev will be destroyed"
				confirm 'Are you sure you wish to use this device' \
					&& dev_looks_sane=1 || dev=
			fi
		done
		SBD_DEVICE="$dev"

		SBD_WATCHDOG="yes"
		if ! check_watchdog; then
			warn "No watchdog device detected! SBD will not work without a watchdog."
		fi
	fi

	[ -b "$SBD_DEVICE" ] || error "SBD device $SBD_DEVICE does not exist"

	# When the device came from -s or from the storage stage, the block
	# above is skipped and SBD_WATCHDOG may never have been set.  Default
	# it so we never write SBD_WATCHDOG="" into the sysconfig file.
	: "${SBD_WATCHDOG:=yes}"

	# TODO: need to ensure watchdog is available
	# (actually, should work if watchdog unavailable, it'll just whine in the logs...)
	# TODO: what about timeouts for multipath devices?
	status_long 'Initializing SBD...'
	invoke sbd -d "$SBD_DEVICE" create || error "Failed to initialize SBD device"
	status_done

	# Update the settings in place if the file already has them, append
	# them if it doesn't, or create the file from scratch.
	if [ -f "$SYSCONFIG_SBD" ]; then
		if grep -q -s -E '^SBD_DEVICE=.*' "$SYSCONFIG_SBD"; then
			sed -i -e "s%^SBD_DEVICE=.*%SBD_DEVICE=\"$SBD_DEVICE\"%g" "$SYSCONFIG_SBD"
		else
			echo "SBD_DEVICE=\"$SBD_DEVICE\"" >> "$SYSCONFIG_SBD"
		fi
		if grep -q -s -E '^SBD_WATCHDOG=.*' "$SYSCONFIG_SBD"; then
			sed -i -e "s/^SBD_WATCHDOG=.*/SBD_WATCHDOG=\"$SBD_WATCHDOG\"/g" "$SYSCONFIG_SBD"
		else
			echo "SBD_WATCHDOG=\"$SBD_WATCHDOG\"" >> "$SYSCONFIG_SBD"
		fi
	else
		echo "SBD_DEVICE=\"$SBD_DEVICE\"" > "$SYSCONFIG_SBD"
		echo "SBD_WATCHDOG=\"$SBD_WATCHDOG\"" >> "$SYSCONFIG_SBD"
	fi

	invoke csync2 -m "$SYSCONFIG_SBD"
	invoke csync2 -f "$SYSCONFIG_SBD"
	invoke csync2 -xv "$SYSCONFIG_SBD"
}

# "cluster" stage: bring the local cluster stack up (init_cluster_local),
# make sure this is a fresh single-node cluster, load the base CRM
# configuration and, if an SBD device is configured, add the SBD STONITH
# resource and enable STONITH.
init_cluster()
{
	local -i node_count
	local tmp_conf
	local sbd_device

	init_cluster_local

	# Count the lines of crmadmin's node listing that mention "node"
	node_count=$(crmadmin -N | grep -c 'node')
	[ "$node_count" -lt 1 ] && error "No nodes found in cluster"
	[ "$node_count" -gt 1 ] && error "Joined existing cluster - will not reconfigure."

	status "Loading initial configuration"

	# base config
	# (mktemp rather than a guessable /tmp/crm.<pid>: this runs as root,
	# and a pre-created file or symlink of that name would be written to)
	tmp_conf=$(mktemp /tmp/crm.XXXXXX) \
		|| error "Can't create temporary file for cluster configuration"
	cat > "$tmp_conf" <<END
property \$id="cib-bootstrap-options" \\
	stonith-enabled="false" \\
	no-quorum-policy="ignore" \\
	placement-strategy="balanced"
op_defaults \$id="op-options" \\
	timeout="600" \\
	record-pending="true"
rsc_defaults \$id="rsc-options" \\
	resource-stickiness="1" \\
	migration-threshold="3"
END
	crm_configure_load replace "$tmp_conf"

	# sbd fencing if applicable (getting local sbd_device here direct
	# from SBD config, in case init_cluster is invoked on its own).
	# The file is sourced inside the command substitution's subshell, so
	# nothing it sets leaks into this script.
	sbd_device=$([ -f /etc/sysconfig/sbd ] && \
		. /etc/sysconfig/sbd ; echo $SBD_DEVICE)
	if [ -n "$sbd_device" ]; then
		# Can't do crm configure load update here with only
		# stonith-enabled, or it wipes out the entire
		# cib-bootstrap-options section
		# TODO: find out if this is a bug in the crm shell
		invoke crm configure primitive stonith-sbd stonith:external/sbd \
			|| error "Can't create stonith-sbd primitive"
		invoke crm configure property stonith-enabled="true" \
			|| error "Can't enable STONITH"
	fi
}

# Non-generic, i.e. not for use with every type of cluster you can imagine:
# creates specific cluster config (TODO: would be nice to just directly
# use Hawk's wizard template instead of including config directly here)
#
# "vgfs" stage: load the DLM and cluster filesystem resources (filesystem
# initially stopped), create an OCFS2 filesystem on OCFS2_DEVICE and then
# let the filesystem resource start.  If a filesystem is already present and
# the user declines to destroy it, the resources are stopped and removed
# again and the function returns.
#
# Globals:  OCFS2_DEVICE (read; must be a block device)
init_vgfs()
{
	local tmp_conf
	local res

	[ -b "$OCFS2_DEVICE" ] || error "OCFS2 device $OCFS2_DEVICE does not exist"

	# TODO: configurable mountpoint and vg name
	# (mktemp rather than a guessable /tmp/crm.<pid>: this runs as root,
	# and a pre-created file or symlink of that name would be written to)
	tmp_conf=$(mktemp /tmp/crm.XXXXXX) \
		|| error "Can't create temporary file for cluster configuration"
	cat > "$tmp_conf" <<END
primitive dlm ocf:pacemaker:controld \\
	op start timeout="90" op stop timeout="100" \\
	op monitor interval="60" timeout="60"
primitive clusterfs ocf:heartbeat:Filesystem \\
	params directory="/srv/clusterfs" fstype="ocfs2" device="$OCFS2_DEVICE" \\
	op monitor interval="20" timeout="40" \\
	op start timeout="60" op stop timeout="60" \\
	meta target-role="Stopped"
clone base-clone dlm meta interleave="true"
clone c-clusterfs clusterfs meta interleave="true"
order base-then-clusterfs inf: base-clone c-clusterfs
colocation clusterfs-with-base inf: c-clusterfs base-clone
END
	crm_configure_load update "$tmp_conf"

	wait_for_resource "Waiting for DLM" dlm:0
	wait_for_stop "Making sure filesystem is not active" clusterfs:0

	if blkid "$OCFS2_DEVICE" 2>/dev/null | grep -q TYPE; then
		if ! confirm "Existing filesystem found on $OCFS2_DEVICE - destroy?"; then
			for res in base-clone c-clusterfs ; do
				invoke crm resource stop $res
				wait_for_stop "Waiting for resource to stop" $res
			done
			# Remove exactly the objects loaded above (there is no
			# base-group in this configuration).
			invoke crm configure delete \
				dlm clusterfs \
				base-clone c-clusterfs \
				base-then-clusterfs clusterfs-with-base
			return
		fi
	fi

	status_long "Creating OCFS2 filesystem"
	# TODO: want "-T vmstore", but this'll only fly on >2GB partition
	# Note: using undocumented '-x' switch to avoid prompting if overwriting
	# existing partition.  For the commit that introduced this, see:
	# http://oss.oracle.com/git/?p=ocfs2-tools.git;a=commit;h=8345a068479196172190f4fa287052800fa2b66f
	invoke mkfs.ocfs2 --cluster-stack pcmk --cluster-name hacluster -x "$OCFS2_DEVICE" \
		|| error "Failed to create OCFS2 filesystem on $OCFS2_DEVICE"
	status_done

	# TODO: refactor, maybe
	invoke mkdir -p /srv/clusterfs || error "Can't create mountpoint /srv/clusterfs"
	# Dropping target-role="Stopped" lets the filesystem resource start
	invoke crm resource meta clusterfs delete target-role \
		|| error "Can't start cluster filesystem clone"
	wait_for_resource "Waiting for /srv/clusterfs to be mounted" clusterfs:0
}

#------------------------------------------------------------------------------
# Main script: parse options, check the cluster state and prerequisites,
# then run either the single requested stage or all stages in sequence.

# for --help option
[ "$1" == "--help" ] && usage

while getopts 'hi:o:p:qs:t:y' o; do
	case $o in
	h) usage;;
	i) NET_IF=$OPTARG;;
	o) OCFS2_DEVICE=$OPTARG;;
	p) SHARED_DEVICE=$OPTARG;;
	q) BE_QUIET=true;;
	s) SBD_DEVICE=$OPTARG;;
	t) TEMPLATE=$OPTARG;;
	y) YES_TO_ALL=true;;
	*)	# Unknown option or missing argument (getopts has already
		# printed a diagnostic): don't carry on with a command line
		# we didn't understand.  usage exits, hence the subshell.
		( usage ) >&2
		exit 1
		;;
	esac
done

shift $((OPTIND - 1))

# First remaining argument is the stage; anything after it is handed to
# the stage function.
stage=$1
[ $# -gt 0 ] && shift

# vgfs stage requires running cluster, everything else requires inactive cluster,
# except ssh and csync2 (which don't care) and csync2_remote (which mustn't care,
# just in case this breaks ha-cluster-join on another node).
systemctl -q is-active corosync.service
rc=$?
case $stage in
vgfs)
	[ $rc -ne 0 ] && error "Cluster is inactive  - can't run vgfs stage"
	;;
ssh|ssh_remote|csync2|csync2_remote)
	;;
*)
	[ $rc -eq 0 ] && error "Cluster is currently active - can't run"
	;;
esac

# Need hostname resolution to work, want NTP (but don't block ssh_remote or csync2_remote)
if [ "$stage" != "ssh_remote" ] && [ "$stage" != "csync2_remote" ]; then
	check_prereqs
fi

case $TEMPLATE in
ocfs2)	;;
"")	;;
*)	error "Invalid template ($TEMPLATE)"
esac

case $stage in
ssh|ssh_remote|csync2|csync2_remote|corosync|storage|sbd|cluster|vgfs)
	init
	# Remaining arguments are passed through to the stage function
	# (csync2_remote takes the new node's hostname this way).
	init_$stage "$@"
	;;
"")
	init
	init_ssh
	init_csync2
	init_corosync
	if [ "$TEMPLATE" == "ocfs2" ]; then
		# Skip automatic partitioning when both devices were given
		if [ -z "$SBD_DEVICE" ] || [ -z "$OCFS2_DEVICE" ]; then
			init_storage
		fi
	fi
	init_sbd
	init_cluster
	if [ "$TEMPLATE" == "ocfs2" ]; then
		init_vgfs
	fi
	;;
*)
	# Report the stage that was actually given ($1 has been shifted
	# away by now) and fail rather than exiting 0 via usage.
	echo -e "Invalid stage ($stage)\n" >&2
	( usage ) >&2
	exit 1
	;;
esac

status "Done (log saved to $LOG_FILE)"
exit 0
