提交 705ac7cc 编写于 作者: A Ashwin Agrawal

Remove flaky negative walrep test.

The scenario this test was intended to cover is: if the primary (walsender) exits,
the walreceiver must exit as well. Then the WAL receiver should come up, try to
reconnect, and keep failing until the primary is brought back. Once the primary is
up, it should be able to connect.

The way it is coded today is extremely hacky: it simulates the scenario via fault
injection by creating the file `wal_rcv_test` and providing an option for the point
at which to suspend. Then signal.SIGUSR2 is sent to notify the standby to inject the
fault. There exists no mechanism to check whether the fault was hit or not. So, the
tests tend to be a little unreliable at times since we don't know if the fault was
hit, hence large sleeps are used in this test.

Plus, the file wal_rcv.pid is also created in the code just for testing purposes,
to validate some behaviors.

Hence, removing this flaky time-consuming test.

(cherry picked from commit ff9c80cc)
上级 a88112e3
......@@ -37,134 +37,6 @@ import signal
class neg_test(StandbyRunMixin, MPPTestCase):
def test_negative(self):
    """Run ``connection_scenario`` for every suspend point / shutdown mode pair.

    Each suspend point is exercised against all three master shutdown modes
    before the next suspend point is tried.
    """
    suspend_points = ['wait_before_send', 'wait_before_rcv']
    shutdown_modes = ['immediate', 'fast', 'smart']
    # Suspend point varies slowest, shutdown mode fastest.
    scenarios = [(point, mode)
                 for point in suspend_points
                 for mode in shutdown_modes]
    for point, mode in scenarios:
        self.connection_scenario(point, mode)
    logger.info('All tests passed!')
def get_pid_having_keyword(self, keyword):
    """Return the PID of the first process whose ``ps -ef`` line contains ``keyword``.

    Args:
        keyword: substring to look for in each line of ``ps -ef`` output.

    Returns:
        The second whitespace-separated field of the first matching line
        (the PID column of ``ps -ef``), converted to ``int``.

    Fails the test (via ``self.assertTrue``) when no line matches or the
    matching line has no second field, instead of dying on an unbound
    variable or an IndexError.
    """
    # universal_newlines makes the output text on both Python 2 and 3, so
    # it can be split on '\n' and searched with a str keyword.
    proc = subprocess.Popen(['ps', '-ef'], stdout=subprocess.PIPE,
                            universal_newlines=True)
    ps_output = proc.communicate()[0]

    fields = None
    for line in ps_output.split('\n'):
        # Plain containment: a match at column 0 counts as well.
        if keyword in line:
            fields = re.split(r'\s+', line.strip())
            break

    self.assertTrue(fields is not None and len(fields) > 1,
                    'no process matching %r found in `ps -ef` output' % keyword)
    return int(fields[1])
def generate_trigger_file(self, filepath, filename, content):
    """Write ``content`` to the file ``filename`` inside directory ``filepath``.

    Any existing file of that name is overwritten.

    Args:
        filepath: existing directory in which the file is created.
        filename: non-empty name of the file to write.
        content: non-empty payload; text is encoded as UTF-8, bytes are
            written unchanged.

    Fails the test if ``filepath`` is not an existing directory or if
    ``filename`` / ``content`` is empty.
    """
    self.assertTrue(os.path.exists(filepath))
    self.assertTrue(os.path.isdir(filepath))
    self.assertTrue(filename and content)
    # The file is opened in binary mode, so text must be encoded first. On
    # Python 2 ``str`` is already ``bytes`` and passes through untouched.
    if not isinstance(content, bytes):
        content = content.encode('utf-8')
    # ``with`` guarantees the handle is closed even if the write fails.
    with open(os.path.join(filepath, filename), "wb") as trigger_file:
        trigger_file.write(content)
def connection_scenario(self, trigger_content, master_shutdown_mode):
    """Run one negative WAL-replication scenario against a fresh standby.

    Flow:
      1. Verify the system is up, then re-create and start a standby.
      2. Once the WAL receiver is running, signal it to suspend at the
         point named by ``trigger_content``.
      3. Shut down the master (primary) using ``master_shutdown_mode``.
      4. Resume the WAL receiver. It should fail (die), later retry the
         connection to the master and keep failing until the master is
         actually back; the presence of ``wal_rcv.pid`` is the evidence.
      5. Stop the standby, restart the cluster and remove the standby
         data directory for the next iteration.

    Note: the sleeps used here are a little larger than normal to cope with
    events such as the spawning of the WAL receiver, which depends entirely
    on when the startup process signals the postmaster to do it.

    Args:
        trigger_content: text written to the ``wal_rcv_test`` trigger file
            to select where the WAL receiver suspends.
        master_shutdown_mode: one of 'immediate', 'smart' or 'fast'.
    """
    # (description, command line) of the gpstop call for each shutdown mode.
    gpstop_by_mode = {
        'immediate': ("gpstop master immediate", "gpstop -aim"),
        'smart': ("gpstop master smart", "gpstop -am"),
        'fast': ("gpstop master fast", "gpstop -afm"),
    }
    # Reject an unsupported mode before any setup is done, rather than
    # failing later on an unbound command object.
    self.assertTrue(master_shutdown_mode in gpstop_by_mode,
                    'unsupported master shutdown mode: ' + str(master_shutdown_mode))

    # Verify if the database is up. Run some sql.
    PSQL.run_sql_command('DROP table if exists foo')
    Command('remove standby', 'gpinitstandby -ra').run()
    self.assertEqual(self.standby.create(), 0)

    # NOTE(review): assumes self.standby.datadir does not change after
    # create(); the paths are computed once here instead of at every use.
    pid_file = os.path.join(self.standby.datadir, 'wal_rcv.pid')
    trigger_file = os.path.join(self.standby.datadir, 'wal_rcv_test')

    # Trigger & evidence files cleanup
    if os.path.exists(pid_file):
        os.remove(pid_file)
    if os.path.exists(trigger_file):
        os.remove(trigger_file)

    # Setup a standby
    res = self.standby.start()
    self.assertTrue(res.wasSuccessful())

    # Wait for the walreceiver to start
    num_walsender = self.wait_for_walsender()
    self.assertEqual(num_walsender, 1)
    logger.info('Activated WAL Receiver...')

    # Cleanup the standby configuration from Master catalog.
    # This is to avoid re-start of the standby on Master re-start.
    dburl = dbconn.DbURL()
    self.standby.remove_catalog_standby(dburl)

    # Once the WAL receiver starts, signal it to suspend based on where the
    # input parameter wants.
    wal_rcv_pid = self.get_pid_having_keyword('wal receiver process')
    logger.info('Suspending WAL Receiver(' + str(wal_rcv_pid) +') ' + 'with ' + trigger_content)
    self.generate_trigger_file(self.standby.datadir, 'wal_rcv_test', trigger_content)
    os.kill(wal_rcv_pid, signal.SIGUSR2)
    time.sleep(10)
    self.assertTrue(not os.path.exists(pid_file))

    # Once suspended, shutdown the Master(primary) based on the input mode.
    logger.info('Shutdown the Master in ' + master_shutdown_mode + ' mode')
    description, command_line = gpstop_by_mode[master_shutdown_mode]
    cmd = Command(description, command_line)
    cmd.run()
    self.assertEqual(cmd.get_results().rc, 0, str(cmd))

    # Release (resume) the WAL receiver and it should fail (dead). But later
    # after waiting for some time it should re-try to connect to the Master
    # and fail again till the actual Master comes up again.
    logger.info('Resume the WAL Receiver(' + str(wal_rcv_pid) + ')')
    self.generate_trigger_file(self.standby.datadir, 'wal_rcv_test', "resume")
    os.kill(wal_rcv_pid, signal.SIGUSR2)
    time.sleep(10)

    # The pid file should exist. This is a proof that the WAL receiver came up
    # but did not get a chance to connect to the Master and hence did not
    # clean up the pid file.
    self.assertTrue(os.path.exists(pid_file))
    logger.info('The WAL receiver pid file exists which means it restarted\n'
                'but still could not connect to the Master (primary) and hence the\n'
                'pid file was not cleared')

    # Stop the standby as it is of no use anymore. Wait for pg_ctl to finish
    # (and drain its output) so the stop cannot race with the gpstart below.
    # The exit status is only logged: stopping stays best-effort.
    stop_standby = subprocess.Popen('pg_ctl stop -D ' + self.standby.datadir + ' -m immediate',
                                    shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    stop_standby.communicate()
    logger.info('pg_ctl stop for the standby exited with rc=' + str(stop_standby.returncode))

    # TODO RKT - Ideally, only the Primary should have been started here. But
    # given the current nature of gpstart supporting Master start only in
    # utility mode and WAL repl not supporting utility connection to the
    # Master, normal gpstart (Master, Standby and Segment restart) will be
    # used for time being. This will be changed once utility support is added
    # to WAL based repl.
    cmd = Command("gpstart", "gpstart -a")
    cmd.run()
    self.assertTrue(cmd.get_results().rc in (0, 1), str(cmd))
    logger.info('Pass (' + trigger_content + ',' + master_shutdown_mode + ')')

    # Cleanup for the next iteration
    shutil.rmtree(self.standby.datadir, True)
def test_fail_back(self):
"""
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册