提交 69ebd66c 编写于 作者: A Ashwin Agrawal

Make crash_recovery_dtm test stable.

crash_recovery_dtm test has a scenario which intends to test if QE
undergoes crash recovery after writing prepare record but before
responding to QD, the abort processing is completed fine. For the same
test used GUC `debug_abort_after_segment_prepared` to PANIC all the QE
at that specific point for DELETE. Next test executes SELECT query to
validate the DELETE was aborted. But flakiness comes if this SELECT
query gets executed while PANIC processing is still underway as test
had no way to wait till PANIC and restart completed before running the
SELECT.

Now the test instead uses fault injector to sleep at intended point
and uses pg_ctl restart -w to make sure recovery is completed and only
after that the SELECT query will be executed.

So, as a result removing the test only GUC
`debug_abort_after_segment_prepared` and related code for it.
上级 0132983b
......@@ -1159,14 +1159,6 @@ EndPrepare(GlobalTransaction gxact)
XLogFlush(gxact->prepare_lsn);
/* If we crash now, we have prepared: WAL replay will fix things */
if (Debug_abort_after_segment_prepared)
{
ereport(PANIC,
(errcode(ERRCODE_FAULT_INJECT),
errmsg("Raise an error as directed by Debug_abort_after_segment_prepared")));
}
/*
* Now we may update the CLOG, if we wrote COMMIT record above
*/
......@@ -1201,7 +1193,7 @@ EndPrepare(GlobalTransaction gxact)
*/
MyPgXact->delayChkpt = false;
SIMPLE_FAULT_INJECTOR(EndPreparedTwoPhaseSleep);
SIMPLE_FAULT_INJECTOR(EndPreparedTwoPhase);
/*
* Wait for synchronous replication, if required.
......
......@@ -107,7 +107,6 @@ bool Debug_print_full_dtm = false;
bool Debug_print_snapshot_dtm = false;
bool Debug_disable_distributed_snapshot = false;
bool Debug_abort_after_distributed_prepared = false;
bool Debug_abort_after_segment_prepared = false;
bool Debug_print_server_processes = false;
bool Debug_print_control_checkpoints = false;
bool Debug_appendonly_print_insert = false;
......@@ -1304,17 +1303,6 @@ struct config_bool ConfigureNamesBool_gp[] =
NULL, NULL, NULL
},
{
{"debug_abort_after_segment_prepared", PGC_SUSET, DEVELOPER_OPTIONS,
gettext_noop("Cause an abort after segment has written prepared XLOG record."),
NULL,
GUC_SUPERUSER_ONLY | GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
},
&Debug_abort_after_segment_prepared,
false,
NULL, NULL, NULL
},
{
{"debug_appendonly_print_blockdirectory", PGC_SUSET, DEVELOPER_OPTIONS,
gettext_noop("Print log messages for append-only block directory."),
......
......@@ -106,8 +106,8 @@ FI_IDENT(DtmBroadcastAbortPrepared, "dtm_broadcast_abort_prepared")
FI_IDENT(DtmXLogDistributedCommit, "dtm_xlog_distributed_commit")
/* inject fault before initializing dtm */
FI_IDENT(DtmInit, "dtm_init")
/* inject sleep after creation of two phase file */
FI_IDENT(EndPreparedTwoPhaseSleep, "end_prepare_two_phase_sleep")
/* inject fault after writing prepare record */
FI_IDENT(EndPreparedTwoPhase, "end_prepare_two_phase")
/* inject fault after segment receives state transition request (sleep after creating two phase files) */
FI_IDENT(SegmentTransitionRequest, "segment_transition_request")
/* inject fault after segment is probed by FTS */
......
......@@ -7,8 +7,14 @@
-- m/(PANIC):.*unable to complete*/
-- s/gid \=\s*\d+-\d+/gid \= DUMMY/gm
--
-- m/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/
-- s/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/
--
-- end_matchsubs
include: helpers/server_helpers.sql;
CREATE
-- This function is used to loop until master shutsdown, to make sure
-- next command executed is only after restart and doesn't go through
-- while PANIC is still being processed by master, as master continues
......@@ -186,8 +192,13 @@ INSERT 10
11: CHECKPOINT;
CHECKPOINT
-- Set to maximum number of 2PC retries to avoid any failures.
11: SET dtx_phase2_retry_count=9;
SET
11: alter system set dtx_phase2_retry_count to 15;
ALTER
11: select pg_reload_conf();
pg_reload_conf
----------------
t
(1 row)
-- skip FTS probes always
11: SELECT gp_inject_fault_infinite('fts_probe', 'skip', 1);
gp_inject_fault_infinite
......@@ -204,17 +215,46 @@ SET
-------------------------------
t
(1 row)
11: SET debug_abort_after_segment_prepared = true;
SET
11: DELETE FROM QE_panic_test_table;
ERROR: Raise an error as directed by Debug_abort_after_segment_prepared (seg0 127.0.0.1:40000 pid=547)
11: SELECT count(*) from QE_panic_test_table;
11: SELECT gp_inject_fault('end_prepare_two_phase', 'infinite_loop', dbid) from gp_segment_configuration where role = 'p' and content = 0;
gp_inject_fault
-----------------
t
(1 row)
-- statement to trigger fault after writing prepare record
12&: DELETE FROM QE_panic_test_table; <waiting ...>
11: SELECT gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid) from gp_segment_configuration where role = 'p' and content = 0;
gp_wait_until_triggered_fault
-------------------------------
t
(1 row)
11: SELECT pg_ctl(datadir, 'restart') from gp_segment_configuration where role = 'p' and content = 0;
pg_ctl
------------------------------------------------------------------------------------------------------
waiting for server to shut down done
server stopped
waiting for server to start done
server started
(1 row)
12<: <... completed>
ERROR: Error on receive from seg0 127.0.1.1:25432 pid=16146: server closed the connection unexpectedly
DETAIL:
This probably means the server terminated abnormally
before or while processing the request.
13: SELECT count(*) from QE_panic_test_table;
count
-------
10
(1 row)
11: SELECT gp_inject_fault('fts_probe', 'reset', 1);
13: SELECT gp_inject_fault('fts_probe', 'reset', 1);
gp_inject_fault
-----------------
t
(1 row)
13: alter system reset dtx_phase2_retry_count;
ALTER
13: select pg_reload_conf();
pg_reload_conf
----------------
t
(1 row)
......@@ -7,8 +7,13 @@
-- m/(PANIC):.*unable to complete*/
-- s/gid \=\s*\d+-\d+/gid \= DUMMY/gm
--
-- m/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/
-- s/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/
--
-- end_matchsubs
include: helpers/server_helpers.sql;
-- This function is used to loop until master shutsdown, to make sure
-- next command executed is only after restart and doesn't go through
-- while PANIC is still being processed by master, as master continues
......@@ -99,12 +104,19 @@ $$ LANGUAGE plpgsql;
-- To help speedy recovery
11: CHECKPOINT;
-- Set to maximum number of 2PC retries to avoid any failures.
11: SET dtx_phase2_retry_count=9;
11: alter system set dtx_phase2_retry_count to 15;
11: select pg_reload_conf();
-- skip FTS probes always
11: SELECT gp_inject_fault_infinite('fts_probe', 'skip', 1);
11: SELECT gp_request_fts_probe_scan();
11: select gp_wait_until_triggered_fault('fts_probe', 1, 1);
11: SET debug_abort_after_segment_prepared = true;
11: DELETE FROM QE_panic_test_table;
11: SELECT count(*) from QE_panic_test_table;
11: SELECT gp_inject_fault('fts_probe', 'reset', 1);
11: SELECT gp_inject_fault('end_prepare_two_phase', 'infinite_loop', dbid) from gp_segment_configuration where role = 'p' and content = 0;
-- statement to trigger fault after writing prepare record
12&: DELETE FROM QE_panic_test_table;
11: SELECT gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid) from gp_segment_configuration where role = 'p' and content = 0;
11: SELECT pg_ctl(datadir, 'restart') from gp_segment_configuration where role = 'p' and content = 0;
12<:
13: SELECT count(*) from QE_panic_test_table;
13: SELECT gp_inject_fault('fts_probe', 'reset', 1);
13: alter system reset dtx_phase2_retry_count;
13: select pg_reload_conf();
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册