From db60b003c9a6e20db69ee2bf5bb5b232861767fb Mon Sep 17 00:00:00 2001 From: "(Jerome)Junfeng Yang" Date: Mon, 13 Jul 2020 09:50:54 +0800 Subject: [PATCH] Fix flaky test for replication_keeps_crash. (#10423) Remove the setting `gp_fts_probe_retries to 1`, which may cause the FTS probe to fail. It was first added to reduce the test time, but a lower retry value may cause the test's FTS probe to fail to update the segment configuration. Since reducing `gp_fts_replication_attempt_count` also saves test time, skip altering `gp_fts_probe_retries`. Also found an assertion that may not hold when marking the mirror down happens before the walsender exits: the replication status is freed before the walsender exits and tries to record disconnect info, which leads the segment to crash and start recovery. --- src/backend/replication/gp_replication.c | 9 ++++++--- .../expected/segwalrep/replication_keeps_crash.out | 10 ++++++---- .../sql/segwalrep/replication_keeps_crash.sql | 4 ++-- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/backend/replication/gp_replication.c b/src/backend/replication/gp_replication.c index 59457fdea9..2dcc8cbd68 100644 --- a/src/backend/replication/gp_replication.c +++ b/src/backend/replication/gp_replication.c @@ -291,10 +291,13 @@ FTSReplicationStatusMarkDisconnectForReplication(const char *app_name) LWLockAcquire(FTSReplicationStatusLock, LW_SHARED); - replication_status = RetrieveFTSReplicationStatus(app_name, false /* skip_warn */); + /* + * FTS may already mark the mirror down and free the replication status. + * For this case, a NULL pointer will return. 
+ */ + replication_status = RetrieveFTSReplicationStatus(app_name, true /* skip_warn */); - /* replication_status must exist */ - Assert(replication_status); + /* if replication_status is NULL, do nothing */ FTSReplicationStatusMarkDisconnect(replication_status); LWLockRelease(FTSReplicationStatusLock); diff --git a/src/test/isolation2/expected/segwalrep/replication_keeps_crash.out b/src/test/isolation2/expected/segwalrep/replication_keeps_crash.out index 01039af38c..6f0aec6f3b 100644 --- a/src/test/isolation2/expected/segwalrep/replication_keeps_crash.out +++ b/src/test/isolation2/expected/segwalrep/replication_keeps_crash.out @@ -15,8 +15,6 @@ CREATE -- modify fts gucs to speed up the test. 1: alter system set gp_fts_probe_interval to 10; ALTER -1: alter system set gp_fts_probe_retries to 1; -ALTER 1: alter system set gp_fts_replication_attempt_count to 3; ALTER 1: select pg_reload_conf(); @@ -49,6 +47,12 @@ select gp_inject_fault_infinite('wal_sender_loop', 'error', dbid) from gp_segmen -- LSN to be flushed on mirror. 1&: create table mirror_block_t1 (a int) distributed by (a); +select gp_wait_until_triggered_fault('wal_sender_loop', 1, dbid) from gp_segment_configuration where role='p' and content=0; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) + -- trigger fts to mark mirror down. 
select gp_request_fts_probe_scan(); gp_request_fts_probe_scan @@ -108,8 +112,6 @@ DROP 1: alter system reset gp_fts_probe_interval; ALTER -1: alter system reset gp_fts_probe_retries; -ALTER 1: alter system reset gp_fts_replication_attempt_count; ALTER 1: select pg_reload_conf(); diff --git a/src/test/isolation2/sql/segwalrep/replication_keeps_crash.sql b/src/test/isolation2/sql/segwalrep/replication_keeps_crash.sql index 9439f6ce20..ec36a360ec 100644 --- a/src/test/isolation2/sql/segwalrep/replication_keeps_crash.sql +++ b/src/test/isolation2/sql/segwalrep/replication_keeps_crash.sql @@ -13,7 +13,6 @@ include: helpers/server_helpers.sql; -- modify fts gucs to speed up the test. 1: alter system set gp_fts_probe_interval to 10; -1: alter system set gp_fts_probe_retries to 1; 1: alter system set gp_fts_replication_attempt_count to 3; 1: select pg_reload_conf(); @@ -27,6 +26,8 @@ select gp_inject_fault_infinite('wal_sender_loop', 'error', dbid) -- LSN to be flushed on mirror. 1&: create table mirror_block_t1 (a int) distributed by (a); +select gp_wait_until_triggered_fault('wal_sender_loop', 1, dbid) from gp_segment_configuration where role='p' and content=0; + -- trigger fts to mark mirror down. select gp_request_fts_probe_scan(); @@ -55,6 +56,5 @@ SELECT role, preferred_role, content, mode, status FROM gp_segment_configuration drop table mirror_block_t1; 1: alter system reset gp_fts_probe_interval; -1: alter system reset gp_fts_probe_retries; 1: alter system reset gp_fts_replication_attempt_count; 1: select pg_reload_conf(); -- GitLab