Commit a100814c authored by: Daniel Gustafsson

Add test script for running pg_upgrade in ICW

This adds a test runner to pg_upgrade intended to run at the end of
ICW. The running gpdemo cluster is converted from the current GPDB
version to the same version, which should result in an identical
cluster. The script first dumps the ICW cluster, then upgrades it
into a new gpdemo cluster and diffs the dump of that with the
original dump. In case the cluster needs to be tweaked before the
test, a _pre.sql file can be supplied which will be executed against
the old cluster before its schema is dumped. This file currently
drops the relations which hold constraints not yet supported by
pg_upgrade.

An optional smoketest mode provides a quick check that Oid
synchronization is maintained for new objects.

The new cluster is brought up with fsync turned off to speed up
the test.

This is inspired by the upstream test runner for pg_upgrade.
Parent f51f2f57
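
In essence, the test_gpdb.sh script added below performs the following
steps (a condensed sketch; flags and paths are abbreviated here, see the
script itself for the exact invocations):

    pg_dumpall --schema-only -f dump1.sql   # dump the running ICW cluster
    demo_cluster.sh                         # create a fresh gpdemo cluster to upgrade into
    pg_upgrade --dispatcher-mode ...        # upgrade the QD first
    pg_upgrade ...                          # then each segment and mirror
    pg_dumpall --schema-only -f dump2.sql   # dump the upgraded cluster
    diff -w dump1.sql dump2.sql             # the dumps must match for the test to pass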
@@ -152,6 +152,7 @@ installcheck-world:
	$(MAKE) -C src/test/kerberos installcheck
	$(MAKE) -C gpMgmt/bin installcheck
	gpcheckcat -A
	$(MAKE) -C contrib/pg_upgrade check

installcheck-resgroup:
	$(MAKE) -C src/test/isolation2 $@
@@ -3,6 +3,8 @@
#
# $PostgreSQL: pgsql/contrib/pg_upgrade/Makefile,v 1.4 2010/07/03 14:23:13 momjian Exp $
top_builddir = ../..
PGFILEDESC = "pg_upgrade - an in-place binary upgrade utility"
PGAPPICON = win32
@@ -22,7 +24,12 @@ PGXS := $(shell $(PG_CONFIG) --pgxs)
include $(PGXS)
else
subdir = contrib/pg_upgrade
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif
installcheck: test_gpdb.sh all
	$(SHELL) $< -o $(top_builddir)/gpAux/gpdemo/datadirs/ -b $(DESTDIR)$(bindir)

check: test_gpdb.sh all
	$(SHELL) $< -C -s -o $(top_builddir)/gpAux/gpdemo/datadirs/ -b $(DESTDIR)$(bindir)
@@ -78,3 +78,20 @@ synchronization.
is not allowed in utility mode. All external partitions must be either moved
out of the partition hierarchy with ALTER TABLE EXCHANGE, or dropped, prior
to the upgrade.
Testing Greenplum pg_upgrade
----------------------------
There is a test runner script in contrib/pg_upgrade which runs a full upgrade of
the gpdemo cluster and its current contents. The idea is that it can be used
to test upgrades by running it as the final step of ICW. The test can be invoked
by "make installcheck" in the contrib/pg_upgrade directory.
The test script also has a "smoketest" option which only tests upgrading a
single node. This is *not* a test of an actual Greenplum cluster upgrade,
but a test of the Oid synchronization required to handle a node upgrade. The
intention is for this to be a quick test that all objects are handled by
pg_upgrade; upgrade testing still requires the full run across all
segments/mirrors. The smoketest can be invoked by "make check" in the
contrib/pg_upgrade directory.
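
For reference, the make targets above boil down to roughly the following
direct invocations of the test script (a sketch; the -b argument is an
example and depends on where the GPDB binaries are installed):

    # Full upgrade test of the gpdemo ICW cluster ("make installcheck")
    ./test_gpdb.sh -o ../../gpAux/gpdemo/datadirs/ -b /usr/local/gpdb/bin

    # QD-only Oid synchronization smoketest, skipping gpcheckcat ("make check")
    ./test_gpdb.sh -C -s -o ../../gpAux/gpdemo/datadirs/ -b /usr/local/gpdb/bin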
#!/bin/bash
# contrib/pg_upgrade/test_gpdb.sh
#
# Test driver for upgrading a Greenplum cluster with pg_upgrade. For test data,
# this script assumes the gpdemo cluster in gpAux/gpdemo/datadirs contains the
# end-state of an ICW test run. Performs a pg_dumpall, initializes a parallel
# gpdemo cluster, upgrades the ICW cluster into it, and then performs
# another pg_dumpall. If the two dumps match then the upgrade created a new
# identical copy of the cluster.
OLD_BINDIR=
OLD_DATADIR=
NEW_BINDIR=
NEW_DATADIR=
qddir=
# The normal ICW run has a gpcheckcat call, so allow this testrunner to skip
# running it in case it was just executed to save time.
gpcheckcat=1
# Smoketesting pg_upgrade is done by just upgrading the QD without diffing the
# results. This is *NOT* a test of whether pg_upgrade can successfully upgrade
# a cluster but a test intended to catch when objects aren't properly handled
# in pg_dump/pg_upgrade wrt Oid synchronization
smoketest=0
# Not all platforms have a realpath binary in PATH, most notably macOS doesn't,
# so provide an alternative implementation. Returns an absolute path in the
# variable reference passed as the first parameter. Code inspired by:
# http://stackoverflow.com/questions/3572030/bash-script-absolute-path-with-osx
realpath()
{
	local __ret=$1
	local path

	if [[ $2 = /* ]]; then
		path="$2"
	else
		path="$PWD/${2#./}"
	fi

	eval $__ret="'$path'"
}
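
# Example (illustrative): when invoked as 'realpath OLD_DATADIR ./gpdemo/datadirs'
# from contrib/pg_upgrade, OLD_DATADIR ends up set to "$PWD/gpdemo/datadirs".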
restore_cluster()
{
	# Reset the pg_control files from the old cluster which were renamed
	# .old by pg_upgrade to avoid booting up an upgraded cluster.
	find ${OLD_DATADIR} -type f -name 'pg_control.old' |
	while read control_file; do
		mv "${control_file}" "${control_file%.old}"
	done

	# Remove the copied lalshell unless we're running in the gpdemo
	# directory where it's version controlled
	if ! git ls-files lalshell --error-unmatch >/dev/null 2>&1; then
		rm -f lalshell
	fi
}
upgrade_qd()
{
	mkdir -p $1

	# Run pg_upgrade
	pushd $1
	time ${NEW_BINDIR}/pg_upgrade --old-bindir=${OLD_BINDIR} --old-datadir=$2 --new-bindir=${NEW_BINDIR} --new-datadir=$3 --dispatcher-mode
	if (( $? )) ; then
		echo "ERROR: Failure encountered in upgrading qd node"
		exit 1
	fi
	popd

	# Remember where we were when we upgraded the QD node. pg_upgrade generates
	# some files there that we need to copy to QE nodes.
	qddir=$1
}
upgrade_segment()
{
	mkdir -p $1

	# Copy the OID files from the QD to segments.
	cp $qddir/pg_upgrade_dump_*_oids.sql $1

	# Run pg_upgrade
	pushd $1
	time ${NEW_BINDIR}/pg_upgrade --old-bindir=${OLD_BINDIR} --old-datadir=$2 --new-bindir=${NEW_BINDIR} --new-datadir=$3
	if (( $? )) ; then
		echo "ERROR: Failure encountered in upgrading node"
		exit 1
	fi
	popd
}
usage()
{
	appname=`basename $0`
	echo "$appname usage:"
	echo " -o <dir> Directory containing old datadir"
	echo " -b <dir> Directory containing binaries"
	echo " -s Run smoketest only"
	echo " -C Skip gpcheckcat test"
	exit 0
}
# Main
temp_root=`pwd`/tmp_check
while getopts ":o:b:sC" opt; do
	case ${opt} in
		o )
			realpath OLD_DATADIR "${OPTARG}"
			;;
		b )
			realpath NEW_BINDIR "${OPTARG}"
			realpath OLD_BINDIR "${OPTARG}"
			;;
		s )
			smoketest=1
			;;
		C )
			gpcheckcat=0
			;;
		* )
			usage
			;;
	esac
done
if [ -z "${OLD_DATADIR}" ] || [ -z "${NEW_BINDIR}" ]; then
	usage
fi

rm -rf "$temp_root"
mkdir -p "$temp_root"
if [ ! -d "$temp_root" ]; then
	echo "ERROR: unable to create workdir: $temp_root"
	exit 1
fi
trap restore_cluster EXIT
# The cluster should be running by now, but in case it isn't, issue a restart.
# Worst case we powercycle once for no reason, but it's better than failing
# due to not having a cluster to work with.
gpstart -a
# Run any pre-upgrade tasks to prep the cluster
if [ -f "test_gpdb_pre.sql" ]; then
	psql -f test_gpdb_pre.sql regression
fi
# Ensure that the catalog is sane before attempting an upgrade. While there is
# (limited) catalog checking inside pg_upgrade, it won't catch all issues, and
# upgrading a faulty catalog won't work.
if (( $gpcheckcat )) ; then
	gpcheckcat
	if (( $? )) ; then
		echo "ERROR: gpcheckcat reported catalog issues, fix before upgrading"
		exit 1
	fi
fi
if (( !$smoketest )) ; then
	${NEW_BINDIR}/pg_dumpall --schema-only -f "$temp_root/dump1.sql"
fi
gpstop -a
# Create a new gpdemo cluster in the temproot. Using the old datadir for the
# path to demo_cluster.sh is a bit of a hack, but since this test relies on
# gpdemo having been used for ICW it will do for now.
export MASTER_DEMO_PORT=17432
export DEMO_PORT_BASE=27432
export NUM_PRIMARY_MIRROR_PAIRS=3
export MASTER_DATADIR=${temp_root}
cp ${OLD_DATADIR}/../lalshell .
BLDWRAP_POSTGRES_CONF_ADDONS=fsync=off ${OLD_DATADIR}/../demo_cluster.sh
NEW_DATADIR="${temp_root}/datadirs"
export MASTER_DATA_DIRECTORY="${NEW_DATADIR}/qddir/demoDataDir-1"
export PGPORT=17432
gpstop -a
MASTER_DATA_DIRECTORY=""; unset MASTER_DATA_DIRECTORY
PGPORT=""; unset PGPORT
PGOPTIONS=""; unset PGOPTIONS
# Start by upgrading the master
upgrade_qd "${temp_root}/upgrade/qd" "${OLD_DATADIR}/qddir/demoDataDir-1/" "${NEW_DATADIR}/qddir/demoDataDir-1/"
# If this is a minimal smoketest to ensure that we are pulling the Oids across
# from the old cluster to the new, then exit here as we have now successfully
# upgraded a node (the QD).
if (( $smoketest )) ; then
	restore_cluster
	exit
fi
# Upgrade all the segments and mirrors. In a production setup the segments
# would be upgraded first and then the mirrors once the segments are verified.
# In this scenario we can cut corners since we don't have any important data
# in the test cluster and we only concern ourselves with 100% success rate.
for i in 1 2 3
do
	j=$(($i-1))
	upgrade_segment "${temp_root}/upgrade/dbfast$i" "${OLD_DATADIR}/dbfast$i/demoDataDir$j/" "${NEW_DATADIR}/dbfast$i/demoDataDir$j/"
	upgrade_segment "${temp_root}/upgrade/dbfast_mirror$i" "${OLD_DATADIR}/dbfast_mirror$i/demoDataDir$j/" "${NEW_DATADIR}/dbfast_mirror$i/demoDataDir$j/"
done
. ${NEW_BINDIR}/../greenplum_path.sh
# Start the new cluster, dump it and stop it again when done. We need to bump
# the exports to the new cluster for starting it but reset back to the old
# when done. Set the same variables as gpdemo-env.sh exports. Since creation
# of that file can collide between the gpdemo clusters, perform it manually
export PGPORT=17432
export MASTER_DATA_DIRECTORY="${NEW_DATADIR}/qddir/demoDataDir-1"
gpstart -a
# Run any post-upgrade tasks to prep the cluster for diffing
if [ -f "test_gpdb_post.sql" ]; then
	psql -f test_gpdb_post.sql regression
fi
${NEW_BINDIR}/pg_dumpall --schema-only -f "$temp_root/dump2.sql"
gpstop -a
export PGPORT=15432
export MASTER_DATA_DIRECTORY="${OLD_DATADIR}/qddir/demoDataDir-1"
# Since we've used the same pg_dumpall binary to create both dumps, whitespace
# shouldn't be a cause of difference in the files but it is. Partitioning info
# is generated via backend functionality in the cluster being dumped, and not
# in pg_dump, so whitespace changes can trip up the diff.
if diff -w "$temp_root/dump1.sql" "$temp_root/dump2.sql" >/dev/null; then
	echo "Passed"
	exit 0
else
	# To aid debugging in pipelines, print the diff to stdout
	diff "$temp_root/dump1.sql" "$temp_root/dump2.sql"
	echo "Error: before and after dumps differ"
	exit 1
fi
DROP TABLE IF EXISTS alter_ao_part_tables_column.sto_altap3 CASCADE;
DROP TABLE IF EXISTS alter_ao_part_tables_row.sto_altap3 CASCADE;
DROP TABLE IF EXISTS co_cr_sub_partzlib8192_1_2 CASCADE;
DROP TABLE IF EXISTS co_cr_sub_partzlib8192_1 CASCADE;
DROP TABLE IF EXISTS co_wt_sub_partrle_type8192_1_2 CASCADE;
DROP TABLE IF EXISTS co_wt_sub_partrle_type8192_1 CASCADE;
DROP TABLE IF EXISTS ao_wt_sub_partzlib8192_5 CASCADE;
DROP TABLE IF EXISTS ao_wt_sub_partzlib8192_5_2 CASCADE;
DROP TABLE IF EXISTS constraint_pt1 CASCADE;
DROP TABLE IF EXISTS constraint_pt2 CASCADE;
DROP TABLE IF EXISTS constraint_pt3 CASCADE;
DROP TABLE IF EXISTS contest_inherit CASCADE;
-- The indexes on mpp3033a partitions don't have their default names,
-- presumably because the default names are taken when the tests
-- are run. That's a problem, because in QD, pg_dump and restore will
-- create them with new, default, names, as part of the CREATE INDEX
-- command on the parent table. But when we do pg_dump and restore
-- on a QE node, it doesn't have the partition hierarchy available,
-- and will dump and restore each index separately, with the original name.
DROP TABLE IF EXISTS mpp3033a CASCADE;
DROP TABLE IF EXISTS mpp3033b CASCADE;
DROP TABLE IF EXISTS mpp17707 CASCADE;
-- These partitioned tables have different indexes on different
-- partitions. pg_dump cannot currently reconstruct that situation
-- correctly.
DROP TABLE IF EXISTS mpp7635_aoi_table2 CASCADE;
DROP TABLE IF EXISTS partition_pruning.pt_lt_tab CASCADE;
DROP TABLE IF EXISTS dcl_messaging_test CASCADE;
DROP TABLE IF EXISTS my_tq_agg_opt_part CASCADE;
DROP TABLE IF EXISTS pt_indx_tab CASCADE;
-- These partitioned tables have a SERIAL column. That's also not
-- reconstructed correctly by pg_dump + restore.
DROP TABLE IF EXISTS ao_wt_sub_partzlib8192_5_2_uncompr CASCADE;
DROP TABLE IF EXISTS ao_wt_sub_partzlib8192_5_uncompr CASCADE;
DROP TABLE IF EXISTS co_cr_sub_partzlib8192_1_2_uncompr CASCADE;
DROP TABLE IF EXISTS co_cr_sub_partzlib8192_1_uncompr CASCADE;
DROP TABLE IF EXISTS co_wt_sub_partrle_type8192_1_2_uncompr CASCADE;
DROP TABLE IF EXISTS co_wt_sub_partrle_type8192_1_uncompr CASCADE;