#!/usr/bin/env bash

# contrib/pg_upgrade/test_gpdb.sh
#
# Test driver for upgrading a Greenplum cluster with pg_upgrade within the same
# major version. Upgrading within the same major version obviously doesn't
# exercise the full functionality of pg_upgrade, but it's a good compromise for
# being able to test pg_upgrade in a normal CI pipeline. Testing an actual
# major version upgrade needs another setup. For test data, this script assumes
# the gpdemo cluster in gpAux/gpdemo/datadirs contains the end-state of an ICW
# test run. The test first performs a pg_dumpall, then initializes a parallel
# gpdemo cluster and upgrades it from the ICW cluster. After the upgrade it
# performs another pg_dumpall; if the two dumps match, the upgrade produced an
# identical copy of the cluster.
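#
# An illustrative invocation, assuming an ICW gpdemo cluster under a GPDB
# checkout in $HOME/gpdb and the new binaries installed under /usr/local/gpdb
# (both paths are examples only; adjust to your environment):
#
#   ./test_gpdb.sh -o $HOME/gpdb/gpAux/gpdemo/datadirs -b /usr/local/gpdb/bin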

OLD_BINDIR=
OLD_DATADIR=
NEW_BINDIR=
NEW_DATADIR=

DEMOCLUSTER_OPTS=
PGUPGRADE_OPTS=

# The normal ICW run has a gpcheckcat call, so allow this test runner to skip
# running it, to save time, in case it was just executed.
gpcheckcat=1

# gpdemo can create a cluster without mirrors; if such a cluster is to be
# upgraded, mirror upgrading must be turned off, as it would otherwise report
# a failure.
mirrors=0

# Smoketesting pg_upgrade is done by just upgrading the QD without diffing the
# results. This is *NOT* a test of whether pg_upgrade can successfully upgrade
# a cluster but a test intended to catch when objects aren't properly handled
# in pg_dump/pg_upgrade wrt Oid synchronization.
smoketest=0

# For debugging purposes it can be handy to keep the temporary directory around
# after the test. If set to 1, the directory isn't removed when the test
# script exits.
retain_tempdir=0

# Not all platforms have a realpath binary in PATH, most notably macOS doesn't,
# so provide an alternative implementation. Returns an absolute path in the
# variable reference passed as the first parameter.  Code inspired by:
# http://stackoverflow.com/questions/3572030/bash-script-absolute-path-with-osx
realpath()
{
	local __ret=$1
	local path

	if [[ $2 = /* ]]; then
		path="$2"
	else
		path="$PWD/${2#./}"
	fi

	eval $__ret="'$path'"
}
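
# For example (illustrative): `realpath ABS "./foo"` sets ABS to "$PWD/foo",
# while `realpath ABS "/tmp/foo"` leaves the already-absolute path untouched.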

restore_cluster()
{
	pushd $base_dir
	# Reset the pg_control files from the old cluster which were renamed
	# .old by pg_upgrade to avoid booting up an upgraded cluster.
	find ${OLD_DATADIR} -type f -name 'pg_control.old' |
	while read control_file; do
		mv "${control_file}" "${control_file%.old}"
	done

	# Remove the copied lalshell unless we're running in the gpdemo
	# directory, where it's version controlled.
	if ! git ls-files lalshell --error-unmatch >/dev/null 2>&1; then
		rm -f lalshell
	fi

	# Remove the temporary cluster, and associated files, if requested
	if (( !$retain_tempdir )) ; then
		# If we are asked to blow away the temp root, echo any potential error
		# files to the output channel to aid debugging
		find ${temp_root} -type f -name "*.txt" | grep -v share |
		while read error_file; do
			cat ${error_file}
		done
		# Remove configuration files created by setting up the new cluster
		rm -f "clusterConfigPostgresAddonsFile"
		rm -f "clusterConfigFile"
		rm -f "gpdemo-env.sh"
		rm -f "hostfile"
		# Remove temporary cluster
		rm -rf "$temp_root"
	fi
}

# Test for a nasty regression -- if VACUUM FREEZE doesn't work correctly during
# upgrade, things fail later in mysterious ways. As a litmus test, check to make
# sure that catalog tables have been frozen. (We use gp_segment_configuration
# because the upgrade shouldn't have touched it after the freeze.)
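#
# With the freeze applied, each row's xmin reads back as the permanent frozen
# transaction id, for which age() returns the maximum value, so the
# utility-mode query used below is expected to return a single row
# (illustrative output):
#
#   SELECT age(xmin) FROM pg_catalog.gp_segment_configuration GROUP BY age(xmin);
#    2147483647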
check_vacuum_worked()
{
	local datadir=$1
	local contentid=$2

	echo "Verifying VACUUM FREEZE using gp_segment_configuration xmins..."

	# Start the instance using the same pg_ctl invocation used by pg_upgrade.
	"${NEW_BINDIR}/pg_ctl" -w -l /dev/null -D "${datadir}" \
		-o "-p 18432 --gp_dbid=1 --gp_num_contents_in_cluster=0 --gp_contentid=${contentid} --xid_warn_limit=10000000 -b" \
		start

	# Query for the xmin ages.
	local xmin_ages=$( \
		PGOPTIONS='-c gp_session_role=utility' \
		"${NEW_BINDIR}/psql" -c 'SELECT age(xmin) FROM pg_catalog.gp_segment_configuration GROUP BY age(xmin);' \
			 -p 18432 -t -A template1 \
	)

	# Stop the instance.
	"${NEW_BINDIR}/pg_ctl" -l /dev/null -D "${datadir}" stop

	# Check to make sure all the xmins are frozen (maximum age).
	while read age; do
		if [ "$age" -ne "2147483647" ]; then
			echo "ERROR: gp_segment_configuration has an entry of age $age"
			return 1
		fi
	done <<< "$xmin_ages"

	return 0
}

upgrade_qd()
{
	mkdir -p $1

	# Run pg_upgrade
	pushd $1
	time ${NEW_BINDIR}/pg_upgrade --mode=dispatcher --old-bindir=${OLD_BINDIR} --old-datadir=$2 --new-bindir=${NEW_BINDIR} --new-datadir=$3 ${PGUPGRADE_OPTS}
	if (( $? )) ; then
		echo "ERROR: Failure encountered in upgrading qd node"
		exit 1
	fi
	popd

	if ! check_vacuum_worked "$3" -1; then
		echo "ERROR: VACUUM FREEZE appears to have failed during QD upgrade"
		exit 1
	fi

	# Remember where we were when we upgraded the QD node. pg_upgrade generates
	# some files there that we need to copy to QE nodes.
	qddir=$1
}

upgrade_segment()
{
	mkdir -p $1

	# Run pg_upgrade
	pushd $1
	time ${NEW_BINDIR}/pg_upgrade --mode=segment --old-bindir=${OLD_BINDIR} --old-datadir=$2 --new-bindir=${NEW_BINDIR} --new-datadir=$3 ${PGUPGRADE_OPTS}
	if (( $? )) ; then
		echo "ERROR: Failure encountered in upgrading node"
		exit 1
	fi
	popd

	# TODO: run check_vacuum_worked on each segment, too, once we have a good
	# candidate catalog table (gp_segment_configuration doesn't exist on
	# segments).
}

usage()
{
	appname=`basename $0`
	echo "$appname usage:"
	echo " -o <dir>     Directory containing old datadir"
	echo " -b <dir>     Directory containing binaries"
	echo " -s           Run smoketest only"
	echo " -C           Skip gpcheckcat test"
	echo " -k           Add checksums to new cluster"
	echo " -K           Remove checksums during upgrade"
	echo " -m           Upgrade mirrors"
	echo " -r           Retain temporary installation after test"
	exit 0
}

# Main
temp_root=`pwd`/tmp_check
base_dir=`pwd`

while getopts ":o:b:sCkKmr" opt; do
	case ${opt} in
		o )
			realpath OLD_DATADIR "${OPTARG}"
			;;
		b )
			realpath NEW_BINDIR "${OPTARG}"
			realpath OLD_BINDIR "${OPTARG}"
			;;
		s )
			smoketest=1
			;;
		C )
			gpcheckcat=0
			;;
		k )
			add_checksums=1
			PGUPGRADE_OPTS+=' --add-checksum '
			;;
		K )
			remove_checksums=1
			DEMOCLUSTER_OPTS=' -K '
			PGUPGRADE_OPTS+=' --remove-checksum '
			;;
		m )
			mirrors=1
			;;
		r )
			retain_tempdir=1
			PGUPGRADE_OPTS+=' --retain '
			;;
		* )
			usage
			;;
	esac
done

if [ -z "${OLD_DATADIR}" ] || [ -z "${NEW_BINDIR}" ]; then
	usage
fi

# This should be rejected by pg_upgrade as well, but this test is not concerned
# with testing incorrect option handling in pg_upgrade, so we'll error out
# early instead.
if [ ! -z "${add_checksums}" ] && [ ! -z "${remove_checksums}" ]; then
	echo "ERROR: adding and removing checksums are mutually exclusive"
	exit 1
fi

rm -rf "$temp_root"
mkdir -p "$temp_root"
if [ ! -d "$temp_root" ]; then
	echo "ERROR: unable to create workdir: $temp_root"
	exit 1
fi

trap restore_cluster EXIT

# The cluster should be running by now, but in case it isn't, issue a restart.
# Since we expect the test cluster to be a stock standard gpdemo, we test for
# its presence. Worst case we power-cycle once for no reason, but that's better
# than failing due to not having a cluster to work with.
if [ -f "/tmp/.s.PGSQL.15432.lock" ]; then
	ps aux | grep  `head -1 /tmp/.s.PGSQL.15432.lock` | grep -q postgres
	if (( $? )) ; then
		gpstart -a
	fi
else
	gpstart -a
fi

# Run any pre-upgrade tasks to prep the cluster
if [ -f "test_gpdb_pre.sql" ]; then
	psql -f test_gpdb_pre.sql regression
	psql -f test_gpdb_pre.sql isolation2test
fi

# Ensure that the catalog is sane before attempting an upgrade. While there is
# (limited) catalog checking inside pg_upgrade, it won't catch all issues, and
# upgrading a faulty catalog won't work.
if (( $gpcheckcat )) ; then
	gpcheckcat
	if (( $? )) ; then
		echo "ERROR: gpcheckcat reported catalog issues, fix before upgrading"
		exit 1
	fi
fi

if (( !$smoketest )) ; then
	${NEW_BINDIR}/pg_dumpall --schema-only -f "$temp_root/dump1.sql"
fi

gpstop -a

# Create a new gpdemo cluster in the temproot. Using the old datadir for the
# path to demo_cluster.sh is a bit of a hack, but since this test relies on
# gpdemo having been used for ICW, it will do for now.
export MASTER_DEMO_PORT=17432
export DEMO_PORT_BASE=27432
export NUM_PRIMARY_MIRROR_PAIRS=3
export MASTER_DATADIR=${temp_root}
cp ${OLD_DATADIR}/../lalshell .
BLDWRAP_POSTGRES_CONF_ADDONS=fsync=off ${OLD_DATADIR}/../demo_cluster.sh ${DEMOCLUSTER_OPTS}

NEW_DATADIR="${temp_root}/datadirs"

export MASTER_DATA_DIRECTORY="${NEW_DATADIR}/qddir/demoDataDir-1"
export PGPORT=17432
gpstop -a
MASTER_DATA_DIRECTORY=""; unset MASTER_DATA_DIRECTORY
PGPORT=""; unset PGPORT
PGOPTIONS=""; unset PGOPTIONS

# Start by upgrading the master
upgrade_qd "${temp_root}/upgrade/qd" "${OLD_DATADIR}/qddir/demoDataDir-1/" "${NEW_DATADIR}/qddir/demoDataDir-1/"

# If this is a minimal smoketest to ensure that we are handling all objects
# properly, then exit here as we have now successfully upgraded the QD.
if (( $smoketest )) ; then
	restore_cluster
	exit
fi

# Upgrade all the segments and mirrors. In a production setup the segments
# would be upgraded first and then the mirrors once the segments are verified.
# In this scenario we can cut corners since we don't have any important data
# in the test cluster and we only concern ourselves with a 100% success rate.
for i in 1 2 3
do
	j=$(($i-1))
	k=$(($i+1))

	# Replace the QE datadir with a copy of the QD datadir, in order to
	# bootstrap the QE upgrade so that we don't need to dump/restore
	mv "${NEW_DATADIR}/dbfast$i/demoDataDir$j/" "${NEW_DATADIR}/dbfast$i/demoDataDir$j.old/"
	cp -rp "${NEW_DATADIR}/qddir/demoDataDir-1/" "${NEW_DATADIR}/dbfast$i/demoDataDir$j/"
	# Retain the segment configuration
	cp "${NEW_DATADIR}/dbfast$i/demoDataDir$j.old/postgresql.conf" "${NEW_DATADIR}/dbfast$i/demoDataDir$j/postgresql.conf"
	cp "${NEW_DATADIR}/dbfast$i/demoDataDir$j.old/pg_hba.conf" "${NEW_DATADIR}/dbfast$i/demoDataDir$j/pg_hba.conf"
	cp "${NEW_DATADIR}/dbfast$i/demoDataDir$j.old/postmaster.opts" "${NEW_DATADIR}/dbfast$i/demoDataDir$j/postmaster.opts"
	cp "${NEW_DATADIR}/dbfast$i/demoDataDir$j.old/gp_replication.conf" "${NEW_DATADIR}/dbfast$i/demoDataDir$j/gp_replication.conf"
	# Remove QD only files
	rm -f "${NEW_DATADIR}/dbfast$i/demoDataDir$j/gp_dbid"
	rm -f "${NEW_DATADIR}/dbfast$i/demoDataDir$j/gpssh.conf"
	rm -rf "${NEW_DATADIR}/dbfast$i/demoDataDir$j/gpperfmon"
	# Upgrade the segment data files without dump/restore of the schema
	upgrade_segment "${temp_root}/upgrade/dbfast$i" "${OLD_DATADIR}/dbfast$i/demoDataDir$j/" "${NEW_DATADIR}/dbfast$i/demoDataDir$j/"

	if (( $mirrors )) ; then
		upgrade_segment "${temp_root}/upgrade/dbfast_mirror$i" "${OLD_DATADIR}/dbfast_mirror$i/demoDataDir$j/" "${NEW_DATADIR}/dbfast_mirror$i/demoDataDir$j/"
	fi
done

. ${NEW_BINDIR}/../greenplum_path.sh

# Start the new cluster, dump it and stop it again when done. We need to bump
# the exports to point at the new cluster to start it, and reset them back to
# the old cluster when done. Set the same variables that gpdemo-env.sh exports;
# since creation of that file can collide between the gpdemo clusters, perform
# it manually.
export PGPORT=17432
export MASTER_DATA_DIRECTORY="${NEW_DATADIR}/qddir/demoDataDir-1"
gpstart -a

# Run any post-upgrade tasks to prep the cluster for diffing
if [ -f "test_gpdb_post.sql" ]; then
	psql -f test_gpdb_post.sql regression
fi

${NEW_BINDIR}/pg_dumpall --schema-only -f "$temp_root/dump2.sql"
gpstop -a
export PGPORT=15432
export MASTER_DATA_DIRECTORY="${OLD_DATADIR}/qddir/demoDataDir-1"

# Since we've used the same pg_dumpall binary to create both dumps, whitespace
# shouldn't be a source of differences between the files, but it is. Partitioning info
# is generated via backend functionality in the cluster being dumped, and not
# in pg_dump, so whitespace changes can trip up the diff.
if diff -w "$temp_root/dump1.sql" "$temp_root/dump2.sql" >/dev/null; then
	echo "Passed"
	exit 0
else
	# To aid debugging in pipelines, print the diff to stdout
	diff "$temp_root/dump1.sql" "$temp_root/dump2.sql"
	echo "Error: before and after dumps differ"
	exit 1
fi