From 69ebd66c91cf3e0c8161568bc349e3c3a394eb59 Mon Sep 17 00:00:00 2001 From: Ashwin Agrawal Date: Tue, 12 Feb 2019 00:38:47 -0800 Subject: [PATCH] Make crash_recovery_dtm test stable. crash_recovery_dtm test has a scenario which intends to test if QE undergoes crash recovery after writing prepare record but before responding to QD, the abort processing is completed fine. For the same test used GUC `debug_abort_after_segment_prepared` to PANIC all the QE at that specific point for DELETE. Next test executes SELECT query to validate the DELETE was aborted. But flakiness comes if this SELECT query gets executed while PANIC processing is still underway as test had no way to wait till PANIC and restart completed before running the SELECT. Now the test instead uses fault injector to sleep at intended point and uses pg_ctl restart -w to make sure recovery is completed and only after that the SELECT query will be executed. So, as a result removing the test only GUC `debug_abort_after_segment_prepared` and related code for it. --- src/backend/access/transam/twophase.c | 10 +--- src/backend/utils/misc/guc_gp.c | 12 ---- src/include/utils/faultinjector_lists.h | 4 +- .../expected/crash_recovery_dtm.out | 56 ++++++++++++++++--- .../isolation2/sql/crash_recovery_dtm.sql | 22 ++++++-- 5 files changed, 68 insertions(+), 36 deletions(-) diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index 03a8828ae1..14cbc800ab 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -1159,14 +1159,6 @@ EndPrepare(GlobalTransaction gxact) XLogFlush(gxact->prepare_lsn); - /* If we crash now, we have prepared: WAL replay will fix things */ - if (Debug_abort_after_segment_prepared) - { - ereport(PANIC, - (errcode(ERRCODE_FAULT_INJECT), - errmsg("Raise an error as directed by Debug_abort_after_segment_prepared"))); - } - /* * Now we may update the CLOG, if we wrote COMMIT record above */ @@ -1201,7 +1193,7 @@ EndPrepare(GlobalTransaction gxact) */ MyPgXact->delayChkpt = false; - SIMPLE_FAULT_INJECTOR(EndPreparedTwoPhaseSleep); + SIMPLE_FAULT_INJECTOR(EndPreparedTwoPhase); /* * Wait for synchronous replication, if required. diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index f780811c4e..fa023bdd4b 100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -107,7 +107,6 @@ bool Debug_print_full_dtm = false; bool Debug_print_snapshot_dtm = false; bool Debug_disable_distributed_snapshot = false; bool Debug_abort_after_distributed_prepared = false; -bool Debug_abort_after_segment_prepared = false; bool Debug_print_server_processes = false; bool Debug_print_control_checkpoints = false; bool Debug_appendonly_print_insert = false; @@ -1304,17 +1303,6 @@ struct config_bool ConfigureNamesBool_gp[] = NULL, NULL, NULL }, - { - {"debug_abort_after_segment_prepared", PGC_SUSET, DEVELOPER_OPTIONS, - gettext_noop("Cause an abort after segment has written prepared XLOG record."), - NULL, - GUC_SUPERUSER_ONLY | GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE - }, - &Debug_abort_after_segment_prepared, - false, - NULL, NULL, NULL - }, - { {"debug_appendonly_print_blockdirectory", PGC_SUSET, DEVELOPER_OPTIONS, gettext_noop("Print log messages for append-only block directory."), diff --git a/src/include/utils/faultinjector_lists.h b/src/include/utils/faultinjector_lists.h index b09c91dbf5..a0de09ca40 100644 --- a/src/include/utils/faultinjector_lists.h +++ b/src/include/utils/faultinjector_lists.h @@ -106,8 +106,8 @@ FI_IDENT(DtmBroadcastAbortPrepared, "dtm_broadcast_abort_prepared") FI_IDENT(DtmXLogDistributedCommit, "dtm_xlog_distributed_commit") /* inject fault before initializing dtm */ FI_IDENT(DtmInit, "dtm_init") -/* inject sleep after creation of two phase file */ -FI_IDENT(EndPreparedTwoPhaseSleep, "end_prepare_two_phase_sleep") +/* inject fault after writing prepare record */ +FI_IDENT(EndPreparedTwoPhase, "end_prepare_two_phase") /* inject fault after segment receives state transition request (sleep after creating two phase files) */ FI_IDENT(SegmentTransitionRequest, "segment_transition_request") /* inject fault after segment is probed by FTS */ diff --git a/src/test/isolation2/expected/crash_recovery_dtm.out b/src/test/isolation2/expected/crash_recovery_dtm.out index 8b037df90c..c1c7be0d89 100644 --- a/src/test/isolation2/expected/crash_recovery_dtm.out +++ b/src/test/isolation2/expected/crash_recovery_dtm.out @@ -7,8 +7,14 @@ -- m/(PANIC):.*unable to complete*/ -- s/gid \=\s*\d+-\d+/gid \= DUMMY/gm -- +-- m/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/ +-- s/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/ +-- -- end_matchsubs +include: helpers/server_helpers.sql; +CREATE + -- This function is used to loop until master shutsdown, to make sure -- next command executed is only after restart and doesn't go through -- while PANIC is still being processed by master, as master continues @@ -186,8 +192,13 @@ INSERT 10 11: CHECKPOINT; CHECKPOINT -- Set to maximum number of 2PC retries to avoid any failures. -11: SET dtx_phase2_retry_count=9; -SET +11: alter system set dtx_phase2_retry_count to 15; +ALTER +11: select pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) -- skip FTS probes always 11: SELECT gp_inject_fault_infinite('fts_probe', 'skip', 1); gp_inject_fault_infinite @@ -204,17 +215,46 @@ SET ------------------------------- t (1 row) -11: SET debug_abort_after_segment_prepared = true; -SET -11: DELETE FROM QE_panic_test_table; -ERROR: Raise an error as directed by Debug_abort_after_segment_prepared (seg0 127.0.0.1:40000 pid=547) -11: SELECT count(*) from QE_panic_test_table; +11: SELECT gp_inject_fault('end_prepare_two_phase', 'infinite_loop', dbid) from gp_segment_configuration where role = 'p' and content = 0; + gp_inject_fault +----------------- + t +(1 row) +-- statement to trigger fault after writing prepare record +12&: DELETE FROM QE_panic_test_table; +11: SELECT gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid) from gp_segment_configuration where role = 'p' and content = 0; + gp_wait_until_triggered_fault +------------------------------- + t +(1 row) +11: SELECT pg_ctl(datadir, 'restart') from gp_segment_configuration where role = 'p' and content = 0; + pg_ctl +------------------------------------------------------------------------------------------------------ + waiting for server to shut down done +server stopped +waiting for server to start done +server started + +(1 row) +12<: <... completed> +ERROR: Error on receive from seg0 127.0.1.1:25432 pid=16146: server closed the connection unexpectedly +DETAIL: + This probably means the server terminated abnormally + before or while processing the request. +13: SELECT count(*) from QE_panic_test_table; count ------- 10 (1 row) -11: SELECT gp_inject_fault('fts_probe', 'reset', 1); +13: SELECT gp_inject_fault('fts_probe', 'reset', 1); gp_inject_fault ----------------- t (1 row) +13: alter system reset dtx_phase2_retry_count; +ALTER +13: select pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) diff --git a/src/test/isolation2/sql/crash_recovery_dtm.sql b/src/test/isolation2/sql/crash_recovery_dtm.sql index 0cd60c04f1..70a8524155 100644 --- a/src/test/isolation2/sql/crash_recovery_dtm.sql +++ b/src/test/isolation2/sql/crash_recovery_dtm.sql @@ -7,8 +7,13 @@ -- m/(PANIC):.*unable to complete*/ -- s/gid \=\s*\d+-\d+/gid \= DUMMY/gm -- +-- m/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/ +-- s/^ERROR: Error on receive from seg0.*: server closed the connection unexpectedly/ERROR: server closed the connection unexpectedly/ +-- -- end_matchsubs +include: helpers/server_helpers.sql; + -- This function is used to loop until master shutsdown, to make sure -- next command executed is only after restart and doesn't go through -- while PANIC is still being processed by master, as master continues @@ -99,12 +104,19 @@ $$ LANGUAGE plpgsql; -- To help speedy recovery 11: CHECKPOINT; -- Set to maximum number of 2PC retries to avoid any failures. -11: SET dtx_phase2_retry_count=9; +11: alter system set dtx_phase2_retry_count to 15; +11: select pg_reload_conf(); -- skip FTS probes always 11: SELECT gp_inject_fault_infinite('fts_probe', 'skip', 1); 11: SELECT gp_request_fts_probe_scan(); 11: select gp_wait_until_triggered_fault('fts_probe', 1, 1); -11: SET debug_abort_after_segment_prepared = true; -11: DELETE FROM QE_panic_test_table; -11: SELECT count(*) from QE_panic_test_table; -11: SELECT gp_inject_fault('fts_probe', 'reset', 1); +11: SELECT gp_inject_fault('end_prepare_two_phase', 'infinite_loop', dbid) from gp_segment_configuration where role = 'p' and content = 0; +-- statement to trigger fault after writing prepare record +12&: DELETE FROM QE_panic_test_table; +11: SELECT gp_wait_until_triggered_fault('end_prepare_two_phase', 1, dbid) from gp_segment_configuration where role = 'p' and content = 0; +11: SELECT pg_ctl(datadir, 'restart') from gp_segment_configuration where role = 'p' and content = 0; +12<: +13: SELECT count(*) from QE_panic_test_table; +13: SELECT gp_inject_fault('fts_probe', 'reset', 1); +13: alter system reset dtx_phase2_retry_count; +13: select pg_reload_conf(); -- GitLab