From 937187e978996f824743bc67956f327d5bdc039f Mon Sep 17 00:00:00 2001 From: Ashwin Agrawal Date: Tue, 27 Oct 2020 09:54:34 -0700 Subject: [PATCH] Validate cluster state during regression tests It is often observed in CI that a test that leaves the cluster in an inconsistent state (e.g. primary-mirror pair is not in sync, or primary has not finished crash recovery) cause several following tests in the schedule to fail. What's worse, the culprit test itself may be reported as passed because its validation criteria did not include the state of the cluster. This has found to mislead debugging efforts, ultimately waste of time. To make debugging CI failures more efficient, this patch enhances pg_regress to perform the validation internally. If the validation fails, further testing is aborted. The validation is performed before running a group of tests specified on one line in the schedule file. Also, validation is performed before every single test, if running in serialized fashion. When the cluster validation is found to fail, the culprit is in the previously run test group. This patch is built on the ground work and analysis laid out in PR #9865 and PR #10825 by Wu Hao and Asim R P. Reviewed-by: Asim R P --- src/test/regress/pg_regress.c | 89 ++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 13 deletions(-) diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 0f6c23f15a..2882e53608 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -136,6 +136,8 @@ static int success_count = 0; static int fail_count = 0; static int fail_ignore_count = 0; +static bool halt_work = false; + static bool directory_exists(const char *dir); static void make_directory(const char *dir); @@ -149,6 +151,7 @@ static int run_diff(const char *cmd, const char *filename); static char *content_zero_hostname = NULL; static char *get_host_name(int16 contentid, char role); +static bool cluster_healthy(void); /* * allow core files if possible. @@ -2173,6 +2176,9 @@ run_schedule(const char *schedule, test_function tfunc) exit(2); } + if (!cluster_healthy()) + break; + gettimeofday(&start_time, NULL); if (num_tests == 1) { @@ -2336,6 +2342,9 @@ run_single_test(const char *test, test_function tfunc) *tl; bool differ = false; + if (!cluster_healthy()) + return; + status(_("test %-28s ... "), test); pid = (tfunc) (test, &resultfiles, &expectfiles, &tags); INSTR_TIME_SET_CURRENT(starttime); @@ -3212,17 +3221,17 @@ regression_main(int argc, char *argv[], init_function ifunc, test_function tfunc */ header(_("running regression test queries")); - for (sl = setup_tests; sl != NULL; sl = sl->next) + for (sl = setup_tests; sl != NULL && !halt_work; sl = sl->next) { run_single_test(sl->str, tfunc); } - for (sl = schedulelist; sl != NULL; sl = sl->next) + for (sl = schedulelist; sl != NULL && !halt_work; sl = sl->next) { run_schedule(sl->str, tfunc); } - for (sl = extra_tests; sl != NULL; sl = sl->next) + for (sl = extra_tests; sl != NULL && !halt_work; sl = sl->next) { run_single_test(sl->str, tfunc); } @@ -3308,21 +3317,44 @@ regression_main(int argc, char *argv[], init_function ifunc, test_function tfunc return 0; } -static char * -get_host_name(int16 contentid, char role) +/* + * Issue a command via psql, connecting to the specified database + * + */ +static void +psql_command_output(const char *database, char *buffer, int buf_len, const char *query,...) { - char psql_cmd[MAXPGPATH]; - FILE *fp; - char line[1024]; + char query_formatted[1024]; + char query_escaped[2048]; + char psql_cmd[MAXPGPATH + 2048]; + va_list args; + char *s; + char *d; + FILE *fp; int len; - char *hostname = NULL; + /* Generate the query with insertion of sprintf arguments */ + va_start(args, query); + vsnprintf(query_formatted, sizeof(query_formatted), query, args); + va_end(args); + + /* Now escape any shell double-quote metacharacters */ + d = query_escaped; + for (s = query_formatted; *s; s++) + { + if (strchr("\\\"$`", *s)) + *d++ = '\\'; + *d++ = *s; + } + *d = '\0'; + + /* And now we can build and execute the shell command */ len = snprintf(psql_cmd, sizeof(psql_cmd), - "\"%s%spsql\" -X -t -c \"select hostname from gp_segment_configuration where role=\'%c\' and content = %d;\" -d \"postgres\"", + "\"%s%spsql\" -X -t -c \"%s\" \"%s\"", bindir ? bindir : "", bindir ? "/" : "", - role, - contentid); + query_escaped, + database); if (len >= sizeof(psql_cmd)) exit(2); @@ -3334,7 +3366,7 @@ get_host_name(int16 contentid, char role) exit(2); } - if (fgets(line, sizeof(line), fp) == NULL) + if (fgets(buffer, buf_len, fp) == NULL) { fprintf(stderr, "%s: cannot read the result\n", progname); (void) pclose(fp); @@ -3346,6 +3378,37 @@ get_host_name(int16 contentid, char role) fprintf(stderr, "%s: cannot close shell command\n", progname); exit(2); } +} + +static bool +cluster_healthy(void) +{ + char line[1024]; + psql_command_output("postgres", line, 1024, + "SELECT * FROM gp_segment_configuration WHERE status = 'd' OR preferred_role != role;"); + + halt_work = false; + if (strcmp(line, "\n") != 0) + { + fprintf(stderr, _("\n==================================\n")); + fprintf(stderr, _(" Cluster validation failed:\n%s"), line); + fprintf(stderr, _("==================================\n")); + halt_work = true; + } + + return !halt_work; +} + +static char * +get_host_name(int16 contentid, char role) +{ + char line[1024]; + char *hostname = NULL; + + psql_command_output("postgres", line, 1024, + "SELECT hostname FROM gp_segment_configuration WHERE role=\'%c\' AND content = %d;", + role, + contentid); hostname = psprintf("%s", trim_white_space(line)); -- GitLab