From 994cc7ce996044a59225b4a121a1069cad480911 Mon Sep 17 00:00:00 2001 From: Paul Guo Date: Mon, 27 Apr 2020 08:03:51 +0800 Subject: [PATCH] Enlarge timeout in isolation2:pg_ctl UDF (#9991) Currently this UDF might report a false positive if the node is still starting up after the timeout, since pg_ctl currently returns 0 in this case. This behavior was changed upstream by the patch below: commit f13ea95f9e473a43ee4e1baeb94daaf83535d37c Author: Tom Lane Date: Wed Jun 28 17:31:24 2017 -0400 Change pg_ctl to detect server-ready by watching status in postmaster.pid. We've seen some test flakiness due to this issue since pg_ctl restart sometimes needs more time on the pipeline (by default the pg_ctl timeout is 60 seconds). Yesterday I found on a hung job that a primary needed ~4 minutes to finish recovery during 'pg_ctl restart' (it was the test ao_same_trans_truncate_crash, which enables fsync. Even though it launches a checkpoint before pg_ctl restart, pg_ctl restart still needs a lot of time). Enlarge the timeout of pg_ctl to 600 seconds now and add a pg_ctl stdout check before returning OK in the UDF (this check could be removed after the PG 12 merge finishes, so I added a FIXME there). Here is the output of the pg_ctl experiment: $ pg_ctl -l postmaster.log -D /data/gpdb7/gpAux/gpdemo/datadirs/dbfast1/demoDataDir0 -w -m immediate restart -t 1 waiting for server to shut down.... done server stopped waiting for server to start.... stopped waiting server is still starting up $ echo $? 
0 Reviewed-by: Asim R P Cherry-picked from 934d87c6addf7c38f49dbbc38269db1eda944157 --- src/test/isolation2/helpers/server_helpers.sql | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/test/isolation2/helpers/server_helpers.sql b/src/test/isolation2/helpers/server_helpers.sql index 3f02aed657..76ec774f5b 100644 --- a/src/test/isolation2/helpers/server_helpers.sql +++ b/src/test/isolation2/helpers/server_helpers.sql @@ -20,14 +20,22 @@ returns text as $$ cmd = 'pg_ctl promote -D %s' % datadir elif command in ('stop', 'restart'): cmd = 'pg_ctl -l postmaster.log -D %s ' % datadir - cmd = cmd + '-w -m %s %s' % (command_mode, command) + cmd = cmd + '-w -t 600 -m %s %s' % (command_mode, command) else: return 'Invalid command input' proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) stdout, stderr = proc.communicate() - if proc.returncode == 0: + + # GPDB_12_MERGE_FIXME: upstream patch f13ea95f9e473a43ee4e1baeb94daaf83535d37c + # (Change pg_ctl to detect server-ready by watching status in postmaster.pid.) + # makes pg_ctl return 1 when the postgres is still starting up after timeout + # so there is only need of checking of returncode then. For now we still + # need to check stdout additionally since if the postgres is starting up + # pg_ctl still returns 0 after timeout. + + if proc.returncode == 0 and stdout.find("server is still starting up") == -1: return 'OK' else: raise PgCtlError(stdout+'|'+stderr) -- GitLab