提交 937187e9 编写于 作者: A Ashwin Agrawal

Validate cluster state during regression tests

It is often observed in CI that a test that leaves the cluster in an
inconsistent state (e.g. primary-mirror pair is not in sync, or
primary has not finished crash recovery) cause several following tests
in the schedule to fail.  What's worse, the culprit test itself may be
reported as passed because its validation criteria did not include the
state of the cluster.  This has found to mislead debugging efforts,
ultimately waste of time.

To make debugging CI failures more efficient, this patch enhances
pg_regress to perform the validation internally. If the validation
fails, further testing is aborted.

The validation is performed before running a group of tests specified
on one line in the schedule file. Also, validation is performed before
every single test, if running in serialized fashion.

When the cluster validation is found to fail, the culprit is in the
previously run test group.

This patch is built on the ground work and analysis laid out in
PR #9865 and PR #10825 by Wu Hao and Asim R P.
Reviewed-by: NAsim R P <pasim@vmware.com>
上级 8299a524
......@@ -136,6 +136,8 @@ static int success_count = 0;
static int fail_count = 0;
static int fail_ignore_count = 0;
static bool halt_work = false;
static bool directory_exists(const char *dir);
static void make_directory(const char *dir);
......@@ -149,6 +151,7 @@ static int run_diff(const char *cmd, const char *filename);
static char *content_zero_hostname = NULL;
static char *get_host_name(int16 contentid, char role);
static bool cluster_healthy(void);
/*
* allow core files if possible.
......@@ -2173,6 +2176,9 @@ run_schedule(const char *schedule, test_function tfunc)
exit(2);
}
if (!cluster_healthy())
break;
gettimeofday(&start_time, NULL);
if (num_tests == 1)
{
......@@ -2336,6 +2342,9 @@ run_single_test(const char *test, test_function tfunc)
*tl;
bool differ = false;
if (!cluster_healthy())
return;
status(_("test %-28s ... "), test);
pid = (tfunc) (test, &resultfiles, &expectfiles, &tags);
INSTR_TIME_SET_CURRENT(starttime);
......@@ -3212,17 +3221,17 @@ regression_main(int argc, char *argv[], init_function ifunc, test_function tfunc
*/
header(_("running regression test queries"));
for (sl = setup_tests; sl != NULL; sl = sl->next)
for (sl = setup_tests; sl != NULL && !halt_work; sl = sl->next)
{
run_single_test(sl->str, tfunc);
}
for (sl = schedulelist; sl != NULL; sl = sl->next)
for (sl = schedulelist; sl != NULL && !halt_work; sl = sl->next)
{
run_schedule(sl->str, tfunc);
}
for (sl = extra_tests; sl != NULL; sl = sl->next)
for (sl = extra_tests; sl != NULL && !halt_work; sl = sl->next)
{
run_single_test(sl->str, tfunc);
}
......@@ -3308,21 +3317,44 @@ regression_main(int argc, char *argv[], init_function ifunc, test_function tfunc
return 0;
}
static char *
get_host_name(int16 contentid, char role)
/*
* Issue a command via psql, connecting to the specified database
*
*/
static void
psql_command_output(const char *database, char *buffer, int buf_len, const char *query,...)
{
char psql_cmd[MAXPGPATH];
FILE *fp;
char line[1024];
char query_formatted[1024];
char query_escaped[2048];
char psql_cmd[MAXPGPATH + 2048];
va_list args;
char *s;
char *d;
FILE *fp;
int len;
char *hostname = NULL;
/* Generate the query with insertion of sprintf arguments */
va_start(args, query);
vsnprintf(query_formatted, sizeof(query_formatted), query, args);
va_end(args);
/* Now escape any shell double-quote metacharacters */
d = query_escaped;
for (s = query_formatted; *s; s++)
{
if (strchr("\\\"$`", *s))
*d++ = '\\';
*d++ = *s;
}
*d = '\0';
/* And now we can build and execute the shell command */
len = snprintf(psql_cmd, sizeof(psql_cmd),
"\"%s%spsql\" -X -t -c \"select hostname from gp_segment_configuration where role=\'%c\' and content = %d;\" -d \"postgres\"",
"\"%s%spsql\" -X -t -c \"%s\" \"%s\"",
bindir ? bindir : "",
bindir ? "/" : "",
role,
contentid);
query_escaped,
database);
if (len >= sizeof(psql_cmd))
exit(2);
......@@ -3334,7 +3366,7 @@ get_host_name(int16 contentid, char role)
exit(2);
}
if (fgets(line, sizeof(line), fp) == NULL)
if (fgets(buffer, buf_len, fp) == NULL)
{
fprintf(stderr, "%s: cannot read the result\n", progname);
(void) pclose(fp);
......@@ -3346,6 +3378,37 @@ get_host_name(int16 contentid, char role)
fprintf(stderr, "%s: cannot close shell command\n", progname);
exit(2);
}
}
static bool
cluster_healthy(void)
{
char line[1024];
psql_command_output("postgres", line, 1024,
"SELECT * FROM gp_segment_configuration WHERE status = 'd' OR preferred_role != role;");
halt_work = false;
if (strcmp(line, "\n") != 0)
{
fprintf(stderr, _("\n==================================\n"));
fprintf(stderr, _(" Cluster validation failed:\n%s"), line);
fprintf(stderr, _("==================================\n"));
halt_work = true;
}
return !halt_work;
}
static char *
get_host_name(int16 contentid, char role)
{
char line[1024];
char *hostname = NULL;
psql_command_output("postgres", line, 1024,
"SELECT hostname FROM gp_segment_configuration WHERE role=\'%c\' AND content = %d;",
role,
contentid);
hostname = psprintf("%s", trim_white_space(line));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册