提交 db41b9b9 编写于 作者: P Pengzhou Tang

add test cases for dispatch gang creation.

To test corner cases, we use faultinjector utility to simulate
segment recovery, segment FATAL&ERROR level errors when gangs are creating.
上级 39ed6031
......@@ -359,7 +359,7 @@ void cdbconn_doConnect(SegmentDatabaseDescriptor *segdbDesc,
if (!segdbDesc->errcode)
segdbDesc->errcode = ERRCODE_GP_INTERCONNECTION_ERROR;
appendPQExpBuffer(&segdbDesc->error_message, "%s\n", PQerrorMessage(segdbDesc->conn));
appendPQExpBuffer(&segdbDesc->error_message, "%s", PQerrorMessage(segdbDesc->conn));
/* Don't use elog, it's not thread-safe */
if (gp_log_gang >= GPVARS_VERBOSITY_DEBUG)
......
......@@ -22,6 +22,7 @@
#include "utils/guc.h"
#include "utils/lsyscache.h"
#include "utils/memutils.h"
#include "utils/faultinjector.h"
#include "miscadmin.h"
#include "cdb/cdbdisp.h"
......@@ -1286,6 +1287,8 @@ cdbdisp_dispatchX(DispatchCommandQueryParms *pQueryParms,
ds->primaryResults->writer_gang = primaryGang;
cdbdisp_dispatchToGang(ds, primaryGang, si, &direct);
SIMPLE_FAULT_INJECTOR(AfterOneSliceDispatched);
}
pfree(sliceVector);
......
......@@ -27,7 +27,6 @@
#include "miscadmin.h"
#include "utils/gp_atomic.h"
static int getTimeout(const struct timeval* startTS);
static Gang *createGang_async(GangType type, int gang_id, int size, int content);
CreateGangFunc pCreateGangFuncAsync = createGang_async;
......@@ -48,6 +47,7 @@ createGang_async(GangType type, int gang_id, int size, int content)
int in_recovery_mode_count = 0;
int successful_connections = 0;
bool retry = false;
bool timeout = false;
ELOG_DISPATCHER_DEBUG("createGang type = %d, gang_id = %d, size = %d, content = %d",
type, gang_id, size, content);
......@@ -63,7 +63,7 @@ createGang_async(GangType type, int gang_id, int size, int content)
/* Check writer gang firstly*/
if (type != GANGTYPE_PRIMARY_WRITER && !isPrimaryWriterGangAlive())
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to create gang on one or more segments"),
errmsg("failed to acquire resources on one or more segments"),
errdetail("writer gang got broken before creating reader gangs")));
create_gang_retry:
......@@ -82,7 +82,6 @@ create_gang_retry:
MemoryContextSwitchTo(newGangDefinition->perGangContext);
struct pollfd *fds;
struct timeval startTS;
PG_TRY();
{
......@@ -115,7 +114,7 @@ create_gang_retry:
if(cdbconn_isBadConnection(segdbDesc))
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to create gang on one or more segments"),
errmsg("failed to acquire resources on one or more segments"),
errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
}
......@@ -124,13 +123,11 @@ create_gang_retry:
* timeout clock (= get the start timestamp), and poll until they're
* all completed or we reach timeout.
*/
gettimeofday(&startTS, NULL);
fds = (struct pollfd *) palloc0(sizeof(struct pollfd) * size);
for(;;)
{
int nready;
int timeout;
int nfds = 0;
for (i = 0; i < size; i++)
......@@ -153,7 +150,7 @@ create_gang_retry:
errmsg("failed to acquire resources on one or more segments"),
errdetail("Internal error: No motion listener port (%s)", segdbDesc->whoami)));
successful_connections++;
break;
continue;
case PGRES_POLLING_READING:
fds[nfds].fd = PQsocket(segdbDesc->conn);
......@@ -176,18 +173,22 @@ create_gang_retry:
else
{
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to create gang on one or more segments"),
errmsg("failed to acquire resources on one or more segments"),
errdetail("%s (%s)", PQerrorMessage(segdbDesc->conn), segdbDesc->whoami)));
}
break;
default:
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to create gang on one or more segments"),
errdetail("unknown pollStatus")));
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to acquire resources on one or more segments"),
errdetail("unknow pollstatus (%s)", segdbDesc->whoami)));
break;
}
if (timeout)
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to acquire resources on one or more segments"),
errdetail("timeout expired\n (%s)", segdbDesc->whoami)));
}
if (nfds == 0)
......@@ -195,10 +196,9 @@ create_gang_retry:
CHECK_FOR_INTERRUPTS();
timeout = getTimeout(&startTS);
/* Wait until something happens */
nready = poll(fds, nfds, timeout);
nready = poll(fds, nfds, gp_segment_connect_timeout ?
gp_segment_connect_timeout : -1);
if (nready < 0)
{
int sock_errno = SOCK_ERRNO;
......@@ -206,18 +206,11 @@ create_gang_retry:
continue;
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to create gang on one or more segments"),
errmsg("failed to acquire resources on one or more segments"),
errdetail("poll() failed: errno = %d", sock_errno)));
}
else if (nready == 0)
{
if (timeout != 0)
continue;
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to create gang on one or more segments"),
errdetail("createGang timeout after %d seconds", gp_segment_connect_timeout)));
}
timeout = true;
}
ELOG_DISPATCHER_DEBUG("createGang: %d processes requested; %d successful connections %d in recovery",
......@@ -234,14 +227,14 @@ create_gang_retry:
if (isFTSEnabled() &&
FtsTestSegmentDBIsDown(newGangDefinition->db_descriptors, size))
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to create gang on one or more segments"),
errmsg("failed to acquire resources on one or more segments"),
errdetail("FTS detected one or more segments are down")));
if ( gp_gang_creation_retry_count <= 0 ||
create_gang_retry_counter++ >= gp_gang_creation_retry_count ||
type != GANGTYPE_PRIMARY_WRITER)
ereport(ERROR, (errcode(ERRCODE_GP_INTERCONNECTION_ERROR),
errmsg("failed to create gang on one or more segments"),
errmsg("failed to acquire resources on one or more segments"),
errdetail("segments is in recovery mode")));
ELOG_DISPATCHER_DEBUG("createGang: gang creation failed, but retryable.");
......@@ -280,26 +273,3 @@ create_gang_retry:
return newGangDefinition;
}
/*
 * Compute the timeout argument for poll(), in milliseconds, i.e. how much
 * of the gp_segment_connect_timeout budget (seconds) remains since *startTS.
 *
 * Returns:
 *   -1  when gp_segment_connect_timeout <= 0 (no limit: poll blocks forever)
 *    0  when the deadline has already passed
 *   >0  milliseconds remaining until the deadline
 *
 * NOTE(review): this helper is being removed by this patch; the new code
 * passes gp_segment_connect_timeout to poll() directly (see the hunk
 * around the nready = poll(...) call above).
 */
static int getTimeout(const struct timeval* startTS)
{
struct timeval now;
int timeout;	/* result in ms, matching poll()'s timeout parameter */
int64 diff_us;	/* elapsed time since *startTS, in microseconds */
gettimeofday(&now, NULL);
if (gp_segment_connect_timeout > 0)
{
/* elapsed microseconds = (now - startTS) */
diff_us = (now.tv_sec - startTS->tv_sec) * 1000000;
diff_us += (int) now.tv_usec - (int) startTS->tv_usec;
if (diff_us > (int64) gp_segment_connect_timeout * 1000000)
timeout = 0;	/* deadline already expired */
else
timeout = gp_segment_connect_timeout * 1000 - diff_us / 1000;
}
else
timeout = -1;	/* no timeout configured: wait indefinitely */
return timeout;
}
......@@ -2953,6 +2953,8 @@ retry1:
break;
}
SIMPLE_FAULT_INJECTOR(ProcessStartupPacketFault);
return STATUS_OK;
}
......
......@@ -3361,6 +3361,7 @@ drop_unnamed_stmt(void)
void
quickdie(SIGNAL_ARGS)
{
SIMPLE_FAULT_INJECTOR(QuickDie);
quickdie_impl();
}
......
......@@ -313,6 +313,12 @@ FaultInjectorIdentifierEnumToString[] = {
/* inject fault while translating relcache entries */
_("send_qe_details_init_backend"),
/* inject fault before sending QE details during backend initialization */
_("process_startup_packet"),
/* inject fault in ProcessStartupPacket() */
_("quickdie"),
/* inject fault in quickdie*/
_("after_one_slice_dispatched"),
/* inject fault in cdbdisp_dispatchX*/
_("not recognized"),
};
......
......@@ -209,6 +209,9 @@ typedef enum FaultInjectorIdentifier_e {
OptRelcacheTranslatorCatalogAccess,
SendQEDetailsInitBackend,
ProcessStartupPacketFault,
QuickDie,
AfterOneSliceDispatched,
/* INSERT has to be done before that line */
FaultInjectorIdMax,
......
-- Misc tests related to dispatching queries to segments.
-- Test quoting of GUC values and database names when they're sent to segments
-- There used to be a bug in the quoting when the search_path setting was sent
-- to the segment. It was not easily visible when search_path was set with a
-- SET command, only when the setting was sent as part of the startup packet.
-- Set search_path as a per-user setting so that we can test that.
CREATE DATABASE "dispatch test db";
ALTER DATABASE "dispatch test db" SET search_path="my schema",public;
NOTICE: schema "my schema" does not exist
\c "dispatch test db"
CREATE SCHEMA "my schema";
-- Create a table with the same name in both schemas, "my schema" and public.
CREATE TABLE "my table" (t text);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 't' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO "my table" VALUES ('myschema.mytable');
CREATE TABLE public."my table" (t text);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 't' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO public."my table" VALUES ('public.mytable');
SELECT t as unquoted FROM "my table";
unquoted
------------------
myschema.mytable
(1 row)
SELECT t as myschema FROM "my schema"."my table";
myschema
------------------
myschema.mytable
(1 row)
SELECT t as public FROM public."my table";
public
----------------
public.mytable
(1 row)
DROP TABLE "my table";
DROP TABLE public."my table";
-- Create another table with the same name. To make sure the DROP worked
-- and dropped the correct table.
CREATE TABLE "my table" (id integer);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
DROP TABLE "my table";
-- Clean up
\c regression
DROP DATABASE "dispatch test db";
--
-- test QD will report failure if QE fails to send its motion_listener back
-- during backend initialization
--
-- start_ignore
\! gpfaultinjector -f send_qe_details_init_backend -y reset -s 2
20160823:15:12:59:015496 gpfaultinjector:localhost:gpadmin-[INFO]:-Starting gpfaultinjector with args: -f send_qe_details_init_backend -y reset -s 2
20160823:15:12:59:015496 gpfaultinjector:localhost:gpadmin-[INFO]:-local Greenplum Version: 'postgres (Greenplum Database) 4.3.99.00 build dev'
20160823:15:12:59:015496 gpfaultinjector:localhost:gpadmin-[INFO]:-Obtaining Segment details from master...
20160823:15:12:59:015496 gpfaultinjector:localhost:gpadmin-[INFO]:-Injecting fault on 1 segment(s)
20160823:15:12:59:015496 gpfaultinjector:localhost:gpadmin-[INFO]:-Injecting fault on localhost.localdomain:/home/gpadmin/workspace/data/single_debug/primary/gpseg0:content=0:dbid=2:mode=s:status=u
20160823:15:12:59:015496 gpfaultinjector:localhost:gpadmin-[INFO]:-DONE
-- inject a 'skip' fault before QE sends its motion_listener
\! gpfaultinjector -f send_qe_details_init_backend -y skip -s 2 -o 0
20160823:15:12:59:015508 gpfaultinjector:localhost:gpadmin-[INFO]:-Starting gpfaultinjector with args: -f send_qe_details_init_backend -y skip -s 2 -o 0
20160823:15:12:59:015508 gpfaultinjector:localhost:gpadmin-[INFO]:-local Greenplum Version: 'postgres (Greenplum Database) 4.3.99.00 build dev'
20160823:15:12:59:015508 gpfaultinjector:localhost:gpadmin-[INFO]:-Obtaining Segment details from master...
20160823:15:12:59:015508 gpfaultinjector:localhost:gpadmin-[INFO]:-Injecting fault on 1 segment(s)
20160823:15:12:59:015508 gpfaultinjector:localhost:gpadmin-[INFO]:-Injecting fault on localhost.localdomain:/home/gpadmin/workspace/data/single_debug/primary/gpseg0:content=0:dbid=2:mode=s:status=u
20160823:15:12:59:015508 gpfaultinjector:localhost:gpadmin-[INFO]:-DONE
-- end_ignore
-- terminate exiting QEs first
\c
-- verify failure will be reported
SELECT 1 FROM gp_dist_random('gp_id');
ERROR: failed to acquire resources on one or more segments
DETAIL: Internal error: No motion listener port (seg0 10.152.10.117:25432)
-- reset fault injector
-- start_ignore
\! gpfaultinjector -f send_qe_details_init_backend -y reset -s 2
20160823:15:12:59:015522 gpfaultinjector:localhost:gpadmin-[INFO]:-Starting gpfaultinjector with args: -f send_qe_details_init_backend -y reset -s 2
20160823:15:12:59:015522 gpfaultinjector:localhost:gpadmin-[INFO]:-local Greenplum Version: 'postgres (Greenplum Database) 4.3.99.00 build dev'
20160823:15:12:59:015522 gpfaultinjector:localhost:gpadmin-[INFO]:-Obtaining Segment details from master...
20160823:15:12:59:015522 gpfaultinjector:localhost:gpadmin-[INFO]:-Injecting fault on 1 segment(s)
20160823:15:12:59:015522 gpfaultinjector:localhost:gpadmin-[INFO]:-Injecting fault on localhost.localdomain:/home/gpadmin/workspace/data/single_debug/primary/gpseg0:content=0:dbid=2:mode=s:status=u
20160823:15:12:59:015522 gpfaultinjector:localhost:gpadmin-[INFO]:-DONE
-- end_ignore
......@@ -25,7 +25,7 @@ ignore: leastsquares
test: opr_sanity_gp decode_expr bitmapscan bitmapscan_ao case_gp limit_gp notin percentile naivebayes join_gp union_gp gpcopy gp_create_table
test: filter gpctas gpdist matrix toast sublink table_functions olap_setup complex opclass_ddl bitmap_index information_schema
test: indexjoin as_alias regex_gp gpparams with_clause transient_types gang_mgmt
# dispatch should always run seperately from other cases.
test: dispatch
# 'segspace' relies on the segment spill space to be 0, and uses fault injectors
......
......@@ -118,6 +118,13 @@ s/Table "pg_temp_\d+.temp/Table "pg_temp_#####/
m/^LOG.*\"Feature/
s/^LOG.*\"Feature/\"Feature/
m/^.*(seg\d.*:.*)/
s/^.*(seg\d.*:.*)//
m/^DETAIL: Internal error: No motion listener port \(seg\d.*:.*\)/
s/^DETAIL: Internal error: No motion listener port \(seg\d.*:.*\)//
# Mask out gp_debug_linger HINT message for dispatch
m/^HINT: Process \d+ will wait for gp_debug_linger=\d+ seconds before termination\./
s/^HINT: Process \d+ will wait for gp_debug_linger=\d+ seconds before termination\.//
m/^ \(seg\d .*:\d+\)/
s/^ \(seg\d .*:\d+\)//
-- end_matchsubs
-- Misc tests related to dispatching queries to segments.
-- Test quoting of GUC values and database names when they're sent to segments
-- There used to be a bug in the quoting when the search_path setting was sent
-- to the segment. It was not easily visible when search_path was set with a
-- SET command, only when the setting was sent as part of the startup packet.
-- Set search_path as a per-user setting so that we can test that.
CREATE DATABASE "dispatch test db";
ALTER DATABASE "dispatch test db" SET search_path="my schema",public;
\c "dispatch test db"
CREATE SCHEMA "my schema";
-- Create a table with the same name in both schemas, "my schema" and public.
CREATE TABLE "my table" (t text);
INSERT INTO "my table" VALUES ('myschema.mytable');
CREATE TABLE public."my table" (t text);
INSERT INTO public."my table" VALUES ('public.mytable');
SELECT t as unquoted FROM "my table";
SELECT t as myschema FROM "my schema"."my table";
SELECT t as public FROM public."my table";
DROP TABLE "my table";
DROP TABLE public."my table";
-- Create another table with the same name. To make sure the DROP worked
-- and dropped the correct table.
CREATE TABLE "my table" (id integer);
DROP TABLE "my table";
-- Clean up
\c regression
DROP DATABASE "dispatch test db";
--
-- test QD will report failure if QE fails to send its motion_listener back
-- during backend initialization
--
-- start_ignore
\! gpfaultinjector -q -f send_qe_details_init_backend -y reset -s 2
-- inject a 'skip' fault before QE sends its motion_listener
\! gpfaultinjector -q -f send_qe_details_init_backend -y skip -s 2 -o 0
-- end_ignore
-- terminate exiting QEs first
\c
-- verify failure will be reported
SELECT 1 FROM gp_dist_random('gp_id');
-- reset fault injector
-- start_ignore
\! gpfaultinjector -q -f send_qe_details_init_backend -y reset -s 2
-- end_ignore
--
-- Test suite: test gang creation and commands dispatching
--
--start_ignore
drop table if exists dispatch_test;
drop table if exists dispatch_test_t1;
drop table if exists dispatch_test_t2;
drop table if exists dispatch_test_t3;
--end_ignore
create table dispatch_test as select i as c1 from generate_series(1, 10) i;
create table dispatch_test_t1 (c1 int, c2 int, c3 int);
create table dispatch_test_t2 (c1 int, c2 int, c3 int);
create table dispatch_test_t3 (c1 int, c2 int, c3 int);
insert into dispatch_test_t1 values (1,1,2);
insert into dispatch_test_t2 values (2,1,2);
insert into dispatch_test_t3 values (3,1,2);
CREATE OR REPLACE FUNCTION cleanupAllGangs() RETURNS BOOL
AS '@abs_builddir@/regress@DLSUFFIX@', 'cleanupAllGangs' LANGUAGE C;
-- check if segments has backend running
CREATE OR REPLACE FUNCTION numBackendsOnSegment() RETURNS INTEGER
AS '@abs_builddir@/regress@DLSUFFIX@', 'numBackendsOnSegment' LANGUAGE C;
-- check if QD has reusable gangs
CREATE OR REPLACE FUNCTION hasGangsExist() RETURNS BOOL
AS '@abs_builddir@/regress@DLSUFFIX@', 'hasGangsExist' LANGUAGE C;
CREATE VIEW v_hasBackendsOnSegment as select sum(numBackendsOnSegment()) > 0 from
gp_dist_random('gp_id');
-- disable debug linger to get immediate feedback from FATAL errors.
set gp_debug_linger to 0;
-- test log debug related code within dispatch
set gp_log_gang to debug;
set log_min_messages to DEBUG;
-- Case 1.1
-- A segment in recovery mode, writer gang retry gp_gang_creation_retry_count times and finally success
-- set maximum retry time to 60 seconds, this should be long enough for segment
-- recovery back. otherwise it should be bug somewhere
set gp_gang_creation_retry_count to 120;
set gp_gang_creation_retry_timer to 500;
select cleanupAllGangs();
-- trigger fault and put segment 0 into recovery mode
\! gpfaultinjector -q -f process_startup_packet -y segv --seg_dbid 2
--start_ignore
select 'trigger fault' from gp_dist_random('gp_id');
--end_ignore
-- should success after retry
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
-- Case 1.2
-- A segment in recovery mode for long time, writer gang retry gp_gang_creation_retry_count times and finally failed
-- set maximum retry time to 0.4s, so we can test if gp_gang_creation_retry_count
-- is actually work
set gp_gang_creation_retry_count to 2;
set gp_gang_creation_retry_timer to 200;
select cleanupAllGangs();
-- trigger fault and put segment 0 into recovery mode
\! gpfaultinjector -q -f process_startup_packet -y segv --seg_dbid 2
\! gpfaultinjector -q -f quickdie -y suspend --seg_dbid 2
--start_ignore
select 'trigger fault' from gp_dist_random('gp_id');
--end_ignore
-- should failed after 2 times
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
\! gpfaultinjector -q -f quickdie -y resume --seg_dbid 2
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
\! gpfaultinjector -q -f quickdie -y reset --seg_dbid 2
--start_ignore
-- enlarge the retry count
set gp_gang_creation_retry_count to 128 ;
-- this will block until segment 0 recovery back, or report an error
-- after 24 seconds.
select 'wait recovery finish' from gp_dist_random('gp_id');
--end_ignore
-- cleanup all reusable gangs
select cleanupAllGangs();
-- expect no zombie backends left on segments
select * from v_hasBackendsOnSegment;
-- should success
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
-- Case 1.3
-- segment has non in-recovery-mode errors
-- when creating writer gang, an error reported and all gangs should be cleaned.
-- when creating reader gang, an error reported and only current gang is cleaned.
select cleanupAllGangs();
-- segment 0 report an error when get a request
\! gpfaultinjector -q -f process_startup_packet -y error --seg_dbid 2
-- expect failure
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
-- expect no reusable gang exist
select * from hasGangsExist();
-- expect no zombie backends left on segments.
select * from v_hasBackendsOnSegment;
-- cleanupAllGangs();
select cleanupAllGangs();
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
-- segment 0 report an error when get the second request (reader gang creation request)
\! gpfaultinjector -q -f process_startup_packet -y error --seg_dbid 2 -o 3
-- expect failure
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
-- expect reusable gang exist
select * from hasGangsExist();
-- expect QEs exist.
select * from v_hasBackendsOnSegment;
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
-- Case 1.4
-- Test createGang timeout.
-- gp_segment_connect_timeout = 0 : wait forever
-- gp_segment_connect_timeout = 1 : wait 1 second
set gp_segment_connect_timeout to 1;
select cleanupAllGangs();
\! gpfaultinjector -q -f process_startup_packet -y suspend --seg_dbid 2
-- expect timeout failure
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
\! gpfaultinjector -q -f process_startup_packet -y resume --seg_dbid 2
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
set gp_segment_connect_timeout to 0;
select cleanupAllGangs();
\! gpfaultinjector -q -f process_startup_packet -y sleep --seg_dbid 2 -z 1
-- expect success
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
-- Case 1.5
-- query was cancelled when dispatching commands to one gang.
-- query should be cancelled as expected.
-- must set log_min_messages to default when using interrupt, there is a bug in fault injection.
set log_min_messages to default;
\! gpfaultinjector -q -f after_one_slice_dispatched -y interrupt --seg_dbid 1
-- should fail and report error
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
\! gpfaultinjector -q -f after_one_slice_dispatched -y reset --seg_dbid 1
-- Misc tests related to dispatching queries to segments.
-- Test quoting of GUC values and database names when they're sent to segments
-- There used to be a bug in the quoting when the search_path setting was sent
-- to the segment. It was not easily visible when search_path was set with a
-- SET command, only when the setting was sent as part of the startup packet.
-- Set search_path as a per-user setting so that we can test that.
CREATE DATABASE "dispatch test db";
ALTER DATABASE "dispatch test db" SET search_path="my schema",public;
NOTICE: schema "my schema" does not exist
\c "dispatch test db"
CREATE SCHEMA "my schema";
-- Create a table with the same name in both schemas, "my schema" and public.
CREATE TABLE "my table" (t text);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 't' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO "my table" VALUES ('myschema.mytable');
CREATE TABLE public."my table" (t text);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 't' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
INSERT INTO public."my table" VALUES ('public.mytable');
SELECT t as unquoted FROM "my table";
unquoted
------------------
myschema.mytable
(1 row)
SELECT t as myschema FROM "my schema"."my table";
myschema
------------------
myschema.mytable
(1 row)
SELECT t as public FROM public."my table";
public
----------------
public.mytable
(1 row)
DROP TABLE "my table";
DROP TABLE public."my table";
-- Create another table with the same name. To make sure the DROP worked
-- and dropped the correct table.
CREATE TABLE "my table" (id integer);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
DROP TABLE "my table";
-- Clean up
\c regression
DROP DATABASE "dispatch test db";
--
-- test QD will report failure if QE fails to send its motion_listener back
-- during backend initialization
--
-- start_ignore
\! gpfaultinjector -q -f send_qe_details_init_backend -y reset -s 2
-- inject a 'skip' fault before QE sends its motion_listener
\! gpfaultinjector -q -f send_qe_details_init_backend -y skip -s 2 -o 0
-- end_ignore
-- terminate exiting QEs first
\c
-- verify failure will be reported
SELECT 1 FROM gp_dist_random('gp_id');
ERROR: failed to acquire resources on one or more segments
DETAIL: Internal error: No motion listener port (seg0 127.0.0.1:40000)
-- reset fault injector
-- start_ignore
\! gpfaultinjector -q -f send_qe_details_init_backend -y reset -s 2
-- end_ignore
--
-- Test suite: test gang creation and commands dispatching
--
--start_ignore
drop table if exists dispatch_test;
NOTICE: table "dispatch_test" does not exist, skipping
drop table if exists dispatch_test_t1;
NOTICE: table "dispatch_test_t1" does not exist, skipping
drop table if exists dispatch_test_t2;
NOTICE: table "dispatch_test_t2" does not exist, skipping
drop table if exists dispatch_test_t3;
NOTICE: table "dispatch_test_t3" does not exist, skipping
--end_ignore
create table dispatch_test as select i as c1 from generate_series(1, 10) i;
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'c1' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table dispatch_test_t1 (c1 int, c2 int, c3 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table dispatch_test_t2 (c1 int, c2 int, c3 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create table dispatch_test_t3 (c1 int, c2 int, c3 int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'c1' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
insert into dispatch_test_t1 values (1,1,2);
insert into dispatch_test_t2 values (2,1,2);
insert into dispatch_test_t3 values (3,1,2);
CREATE OR REPLACE FUNCTION cleanupAllGangs() RETURNS BOOL
AS '@abs_builddir@/regress@DLSUFFIX@', 'cleanupAllGangs' LANGUAGE C;
-- check if segments has backend running
CREATE OR REPLACE FUNCTION numBackendsOnSegment() RETURNS INTEGER
AS '@abs_builddir@/regress@DLSUFFIX@', 'numBackendsOnSegment' LANGUAGE C;
-- check if QD has reusable gangs
CREATE OR REPLACE FUNCTION hasGangsExist() RETURNS BOOL
AS '@abs_builddir@/regress@DLSUFFIX@', 'hasGangsExist' LANGUAGE C;
CREATE VIEW v_hasBackendsOnSegment as select sum(numBackendsOnSegment()) > 0 from
gp_dist_random('gp_id');
-- disable debug linger to get immediate feedback from FATAL errors.
set gp_debug_linger to 0;
-- test log debug related code within dispatch
set gp_log_gang to debug;
set log_min_messages to DEBUG;
-- Case 1.1
-- A segment in recovery mode, writer gang retry gp_gang_creation_retry_count times and finally success
-- set maximum retry time to 60 seconds, this should be long enough for segment
-- recovery back. otherwise it should be bug somewhere
set gp_gang_creation_retry_count to 120;
set gp_gang_creation_retry_timer to 500;
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
-- trigger fault and put segment 0 into recovery mode
\! gpfaultinjector -q -f process_startup_packet -y segv --seg_dbid 2
--start_ignore
select 'trigger fault' from gp_dist_random('gp_id');
ERROR: failed to acquire resources on one or more segments
DETAIL: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
(seg0 127.0.0.1:40000)
--end_ignore
-- should success after retry
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
c1 | c2 | c3 | c1 | c2 | c3 | c1 | c2 | c3
----+----+----+----+----+----+----+----+----
1 | 1 | 2 | 2 | 1 | 2 | 3 | 1 | 2
(1 row)
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
-- Case 1.2
-- A segment in recovery mode for long time, writer gang retry gp_gang_creation_retry_count times and finally failed
-- set maximum retry time to 0.4s, so we can test if gp_gang_creation_retry_count
-- is actually work
set gp_gang_creation_retry_count to 2;
set gp_gang_creation_retry_timer to 200;
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
-- trigger fault and put segment 0 into recovery mode
\! gpfaultinjector -q -f process_startup_packet -y segv --seg_dbid 2
\! gpfaultinjector -q -f quickdie -y suspend --seg_dbid 2
--start_ignore
select 'trigger fault' from gp_dist_random('gp_id');
ERROR: failed to acquire resources on one or more segments
DETAIL: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
(seg0 127.0.0.1:40000)
--end_ignore
-- should failed after 2 times
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
ERROR: failed to acquire resources on one or more segments
DETAIL: segments is in recovery mode
\! gpfaultinjector -q -f quickdie -y resume --seg_dbid 2
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
\! gpfaultinjector -q -f quickdie -y reset --seg_dbid 2
--start_ignore
-- enlarge the retry count
set gp_gang_creation_retry_count to 128 ;
-- this will block until segment 0 recovery back, or report an error
-- after 24 seconds.
select 'wait recovery finish' from gp_dist_random('gp_id');
?column?
----------------------
wait recovery finish
wait recovery finish
wait recovery finish
(3 rows)
--end_ignore
-- cleanup all reusable gangs
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
-- expect no zombie backends left on segments
select * from v_hasBackendsOnSegment;
?column?
----------
f
(1 row)
-- should success
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
c1 | c2 | c3 | c1 | c2 | c3 | c1 | c2 | c3
----+----+----+----+----+----+----+----+----
1 | 1 | 2 | 2 | 1 | 2 | 3 | 1 | 2
(1 row)
-- Case 1.3
-- segment has non in-recovery-mode errors
-- when creating writer gang, an error reported and all gangs should be cleaned.
-- when creating reader gang, an error reported and only current gang is cleaned.
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
-- segment 0 report an error when get a request
\! gpfaultinjector -q -f process_startup_packet -y error --seg_dbid 2
-- expect failure
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
ERROR: failed to acquire resources on one or more segments
DETAIL: FATAL: fault triggered, fault name:'process_startup_packet' fault type:'error' (faultinjector.c:683)
HINT: Process 8632 will wait for gp_debug_linger=120 seconds before termination.
Note that its locks and other resources will not be released until then.
(seg0 127.0.0.1:40000)
-- expect no reusable gang to exist
select * from hasGangsExist();
hasgangsexist
---------------
f
(1 row)
-- expect no zombie backends left on segments.
select * from v_hasBackendsOnSegment;
?column?
----------
f
(1 row)
-- cleanupAllGangs();
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
-- segment 0 report an error when get the second request (reader gang creation request)
\! gpfaultinjector -q -f process_startup_packet -y error --seg_dbid 2 -o 3
-- expect failure
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
ERROR: failed to acquire resources on one or more segments
DETAIL: FATAL: fault triggered, fault name:'process_startup_packet' fault type:'error' (faultinjector.c:683)
HINT: Process 8685 will wait for gp_debug_linger=120 seconds before termination.
Note that its locks and other resources will not be released until then.
(seg0 127.0.0.1:40000)
-- expect reusable gang to exist
select * from hasGangsExist();
hasgangsexist
---------------
t
(1 row)
-- expect QEs exist.
select * from v_hasBackendsOnSegment;
?column?
----------
t
(1 row)
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
-- Case 1.4
-- Test createGang timeout.
-- gp_segment_connect_timeout = 0 : wait forever
-- gp_segment_connect_timeout = 1 : wait 1 second
set gp_segment_connect_timeout to 1;
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
\! gpfaultinjector -q -f process_startup_packet -y suspend --seg_dbid 2
-- expect timeout failure
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
ERROR: failed to acquire resources on one or more segments
DETAIL: timeout expired
(seg0 10.22.22.22:40000)
\! gpfaultinjector -q -f process_startup_packet -y resume --seg_dbid 2
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
set gp_segment_connect_timeout to 0;
select cleanupAllGangs();
cleanupallgangs
-----------------
t
(1 row)
\! gpfaultinjector -q -f process_startup_packet -y sleep --seg_dbid 2 -z 1
-- expect success
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
c1 | c2 | c3 | c1 | c2 | c3 | c1 | c2 | c3
----+----+----+----+----+----+----+----+----
1 | 1 | 2 | 2 | 1 | 2 | 3 | 1 | 2
(1 row)
\! gpfaultinjector -q -f process_startup_packet -y reset --seg_dbid 2
-- Case 1.5
-- query was cancelled when dispatching commands to one gang.
-- query should be cancelled as expected.
-- must set log_min_messages to default when using interrupt, there is a bug in fault injection.
set log_min_messages to default;
\! gpfaultinjector -q -f after_one_slice_dispatched -y interrupt --seg_dbid 1
-- should fail and report error
select * from dispatch_test_t1, dispatch_test_t2, dispatch_test_t3
where dispatch_test_t1.c2 = dispatch_test_t2.c2 and dispatch_test_t2.c3 = dispatch_test_t3.c3;
ERROR: canceling statement due to user request
\! gpfaultinjector -q -f after_one_slice_dispatched -y reset --seg_dbid 1
......@@ -8,12 +8,15 @@
#include <float.h>
#include <math.h>
#include <unistd.h>
#include "pgstat.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/pg_language.h"
#include "catalog/pg_type.h"
#include "cdb/memquota.h"
#include "cdb/cdbgang.h"
#include "commands/sequence.h"
#include "commands/trigger.h"
#include "executor/executor.h"
......@@ -74,6 +77,15 @@ extern Datum checkRelationAfterInvalidation(PG_FUNCTION_ARGS);
/* Gang management test support */
extern Datum gangRaiseInfo(PG_FUNCTION_ARGS);
/* brutally cleanup all gangs */
extern Datum cleanupAllGangs(PG_FUNCTION_ARGS);
/* check if QD has gangs exist */
extern Datum hasGangsExist(PG_FUNCTION_ARGS);
/* get number of backends on segments except myself */
extern Datum numBackendsOnSegment(PG_FUNCTION_ARGS);
/*
* test_atomic_ops was backported from 9.5. This prototype doesn't appear
* in the upstream version, because the PG_FUNCTION_INFO_V1() macro includes
......@@ -2433,6 +2445,40 @@ gangRaiseInfo(PG_FUNCTION_ARGS)
PG_RETURN_BOOL(true);
}
/*
 * cleanupAllGangs - SQL-callable test helper (fmgr V1 entry point).
 *
 * Brutally tears down every gang cached by this QD session by calling
 * disconnectAndDestroyAllGangs(), then returns true unconditionally.
 * Used by the dispatch regression tests to reset gang state between cases.
 */
PG_FUNCTION_INFO_V1(cleanupAllGangs);
Datum
cleanupAllGangs(PG_FUNCTION_ARGS)
{
/* NOTE(review): 'false' presumably selects non-graceful destruction — confirm against disconnectAndDestroyAllGangs() */
disconnectAndDestroyAllGangs(false);
PG_RETURN_BOOL(true);
}
/*
 * hasGangsExist - SQL-callable test helper (fmgr V1 entry point).
 *
 * Reports whether this QD session currently holds any gangs, by
 * forwarding the answer from gangsExist().  Used by the dispatch
 * regression tests to check for leftover reusable gangs.
 */
PG_FUNCTION_INFO_V1(hasGangsExist);
Datum
hasGangsExist(PG_FUNCTION_ARGS)
{
	PG_RETURN_BOOL(gangsExist());
}
/*
 * numBackendsOnSegment - SQL-callable test helper (fmgr V1 entry point).
 *
 * Counts live backends reported by the pgstat beentry array, excluding
 * the calling backend itself.  A beentry with st_procpid <= 0 is an
 * empty/dead slot and is skipped.  Used by the dispatch regression tests
 * to detect zombie QEs left behind on a segment.
 */
PG_FUNCTION_INFO_V1(numBackendsOnSegment);
Datum
numBackendsOnSegment(PG_FUNCTION_ARGS)
{
	int32		live_count = 0;
	int			my_pid = getpid();
	int			num_beentries = pgstat_fetch_stat_numbackends();
	int			idx;

	/* beentry indexes are 1-based */
	for (idx = 1; idx <= num_beentries; idx++)
	{
		PgBackendStatus *entry = pgstat_fetch_stat_beentry(idx);

		if (entry == NULL)
			continue;
		/* skip empty slots and our own backend */
		if (entry->st_procpid > 0 && entry->st_procpid != my_pid)
			live_count++;
	}

	PG_RETURN_INT32(live_count);
}
#ifndef PG_HAVE_ATOMIC_FLAG_SIMULATION
static void
......
-- Misc tests related to dispatching queries to segments.
-- Test quoting of GUC values and database names when they're sent to segments
-- There used to be a bug in the quoting when the search_path setting was sent
-- to the segment. It was not easily visible when search_path was set with a
-- SET command, only when the setting was sent as part of the startup packet.
-- Set search_path as a per-user setting so that we can test that.
CREATE DATABASE "dispatch test db";
ALTER DATABASE "dispatch test db" SET search_path="my schema",public;
\c "dispatch test db"
CREATE SCHEMA "my schema";
-- Create a table with the same name in both schemas, "my schema" and public.
CREATE TABLE "my table" (t text);
INSERT INTO "my table" VALUES ('myschema.mytable');
CREATE TABLE public."my table" (t text);
INSERT INTO public."my table" VALUES ('public.mytable');
SELECT t as unquoted FROM "my table";
SELECT t as myschema FROM "my schema"."my table";
SELECT t as public FROM public."my table";
DROP TABLE "my table";
DROP TABLE public."my table";
-- Create another table with the same name. To make sure the DROP worked
-- and dropped the correct table.
CREATE TABLE "my table" (id integer);
DROP TABLE "my table";
-- Clean up
\c regression
DROP DATABASE "dispatch test db";
--
-- test QD will report failure if QE fails to send its motion_listener back
-- during backend initialization
--
-- start_ignore
\! gpfaultinjector -f send_qe_details_init_backend -y reset -s 2
-- inject a 'skip' fault before QE sends its motion_listener
\! gpfaultinjector -f send_qe_details_init_backend -y skip -s 2 -o 0
-- end_ignore
-- terminate exiting QEs first
\c
-- verify failure will be reported
SELECT 1 FROM gp_dist_random('gp_id');
-- reset fault injector
-- start_ignore
\! gpfaultinjector -f send_qe_details_init_backend -y reset -s 2
-- end_ignore
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册