提交 9254b1d4 编写于 作者: C Chumki Roy

Fix persistent rebuild tool issue when mirror data dir is empty

During pt rebuild when the mirror is down and the mirror data directory has some missing files,
persistent rebuild will fail with the error "missing files from source".
To mitigate this, we will be skipping the persistent rebuild for the mirror if it is down.
We don't need to back up the files if the segment is already down anyway.

Add unit tests and behave test.

Authors: Chumki Roy & Marbin Tan
上级 333d859c
......@@ -170,7 +170,7 @@ class DbIdInfo:
"""
Stores all the information regarding a single dbid
"""
def __init__(self, content, role, dbid, port, hostname, filespace_dirs, fs_to_ts_map, ts_to_dboid_map):
def __init__(self, content, role, dbid, port, hostname, filespace_dirs, fs_to_ts_map, ts_to_dboid_map, is_down):
self.content = content
self.role = role
self.dbid = dbid
......@@ -179,13 +179,14 @@ class DbIdInfo:
self.filespace_dirs = filespace_dirs
self.fs_to_ts_map = fs_to_ts_map
self.ts_to_dboid_map = ts_to_dboid_map
self.is_down = is_down
def __eq__(self, other):
    # Two DbIdInfo instances are equal when every stored attribute matches.
    # (vars(x) is just x.__dict__, so this is equivalent to the previous form.)
    return self.__dict__ == other.__dict__
def __str__(self):
    """Return a colon-separated dump of all fields, including is_down.

    The stale 8-field format (without is_down) that preceded this return
    was unreachable dead code left over from the previous revision and
    has been removed.
    """
    return '%s:%s:%s:%s:%s:%s:%s:%s:%s' % (self.content, self.role, self.dbid,
                                           self.port, self.hostname,
                                           self.filespace_dirs,
                                           self.fs_to_ts_map,
                                           self.ts_to_dboid_map, self.is_down)
class GetDbIdInfo:
"""
......@@ -241,6 +242,15 @@ class GetDbIdInfo:
for seg in self.gparray.getDbList():
if seg.getSegmentContentId() in self.content_id:
is_down = seg.isSegmentDown()
role = seg.getSegmentRole()
# We don't want to run the rebuild on the segments that
# are down. This can cause issues, especially when the segment
# in question has missing data/files.
if is_down and role == 'm':
continue
fs_to_ts_map = self._get_filespace_to_tablespace_map(seg)
ts_oids = []
for fsoid, ts in fs_to_ts_map.items():
......@@ -253,7 +263,8 @@ class GetDbIdInfo:
hostname=seg.getSegmentHostName(),
filespace_dirs=seg.getSegmentFilespaces(),
fs_to_ts_map=fs_to_ts_map,
ts_to_dboid_map=ts_to_dboid_map)
ts_to_dboid_map=ts_to_dboid_map,
is_down=is_down)
dbid_info.append(di)
return dbid_info
......@@ -1192,9 +1203,8 @@ class RebuildPersistentTables(Operation):
We also need to backup for mirrors and standby if they are configured
"""
if self.has_mirrors or self.has_standby:
# TODO: is this where we check if the mirror is down?
for dbidinfo in self.dbid_info:
if dbidinfo.role == 'm':
if dbidinfo.role == 'm' and not dbidinfo.is_down: # Checking if the mirror is down
content = dbidinfo.content
mirror_dbid = dbidinfo.dbid
mirror_hostname = dbidinfo.hostname
......
# Regression test for the "skip down mirrors" fix: with a mirror killed and its
# data directory moved aside, gppersistent_rebuild must still succeed (exit 0)
# instead of failing with "missing files from source", and the persistent
# tables must remain consistent per gpcheckcat -R persistent.
@gppersistent_rebuid
Feature: persistent rebuild tests
Scenario: Persistent rebuild tools should not error out when mirror is marked down and files are missing in the data directory for single node
Given the database is running
And the information of a "mirror" segment on any host is saved
When user kills a mirror process with the saved information
And user temporarily moves the data directory of the killed mirror
And wait until the mirror is down
Then run gppersistent_rebuild with the saved content id
And gppersistent_rebuild should return a return code of 0
And the user runs command "$GPHOME/bin/lib/gpcheckcat -R persistent -A"
And gpcheckcat should return a return code of 0
# Cleanup: put the data directory back and recover the mirror so later
# scenarios start from a healthy cluster.
And user returns the data directory to the default location of the killed mirror
And the user runs command "gprecoverseg -a"
And gprecoverseg should return a return code of 0
......@@ -3890,3 +3890,57 @@ def impl(_):
for file in os.listdir(repair_dir):
if not timestamp in file:
raise Exception("file found containing inconsistent timestamp")
@when('user kills a mirror process with the saved information')
def impl(context):
    """Kill the mirror postmaster recorded by the save-segment-info step.

    Sends SIGABRT so the mirror is brought down hard, which lets the test
    exercise the rebuild-with-down-mirror path.
    """
    # 'grep -v grep' excludes the grep process itself: its own command line
    # contains the port number, so without the filter the first pid returned
    # could be the grep's pid rather than the mirror's.
    cmdStr = "ps ux | grep 'mirror process' | grep -v grep | grep %s | awk '{print $2}'" % context.mirror_port
    cmd = Command(name='get mirror pid: %s' % cmdStr, cmdStr=cmdStr)
    cmd.run(validateAfter=True)
    pid_lines = cmd.get_stdout_lines()
    # Fail with a clear message instead of an IndexError when no pid is found.
    if not pid_lines or not pid_lines[0].strip():
        raise Exception("Unable to find mirror process pid for port %s" % context.mirror_port)
    kill_process(int(pid_lines[0]), context.mirror_segdbname, sig=signal.SIGABRT)
@when('user temporarily moves the data directory of the killed mirror')
@then('user temporarily moves the data directory of the killed mirror')
def impl(context):
    # Stash the mirror's data directory under a ".bk" suffix so the rebuild
    # tool sees it as missing; the companion "returns the data directory"
    # step moves it back.
    move_cmd = "mv %s{,.bk}" % context.mirror_datadir
    Command(name='Move mirror data directory', cmdStr=move_cmd).run(validateAfter=True)
@when('user returns the data directory to the default location of the killed mirror')
@then('user returns the data directory to the default location of the killed mirror')
def impl(context):
    # Undo the ".bk" rename done by the "temporarily moves" step, restoring
    # the mirror's data directory to its original location.
    restore_cmd = "mv %s{.bk,}" % context.mirror_datadir
    Command(name='Move mirror data directory', cmdStr=restore_cmd).run(validateAfter=True)
@when('wait until the mirror is down')
@then('wait until the mirror is down')
def impl(context):
    """Poll gp_segment_configuration until the killed mirror is marked down.

    Checks every 5 seconds for up to 120 seconds. Raises on timeout:
    previously the loop fell through silently, so later steps failed with
    confusing, unrelated errors when the mirror never went down.
    """
    qry = "select status from gp_segment_configuration where dbid='%s' and status='d' " % context.mirror_segdbId
    start_time = current_time = datetime.now()
    while (current_time - start_time).seconds < 120:
        # One row means the mirror's status is 'd' (down) in the catalog.
        if len(getRows('template1', qry)) == 1:
            return
        sleep(5)
        current_time = datetime.now()
    raise Exception("Mirror segment (dbid %s) was not marked down within 120 seconds" % context.mirror_segdbId)
@when('run gppersistent_rebuild with the saved content id')
@then('run gppersistent_rebuild with the saved content id')
def impl(context):
    # Pipe two 'y' answers into the tool to satisfy its confirmation prompts,
    # then record the exit code for the return-code assertion step.
    rebuild_cmd = "echo -e 'y\ny\n' | $GPHOME/sbin/gppersistentrebuild -c %s" % context.mirror_segcid
    cmd = Command(name='Run gppersistentrebuild', cmdStr=rebuild_cmd)
    cmd.run(validateAfter=True)
    context.ret_code = cmd.get_results().rc
@given('the information of a "{seg}" segment on any host is saved')
@when('the information of a "{seg}" segment on any host is saved')
@then('the information of a "{seg}" segment on any host is saved')
def impl(context, seg):
    """Save dbid, content id, hostname, data directory and port of the first
    segment of the requested kind onto the behave context.

    Only seg == "mirror" is currently supported; any other value is a no-op
    (kept for backward compatibility with existing feature files).
    """
    if seg == "mirror":
        gparray = GpArray.initFromCatalog(dbconn.DbURL())
        # Use a distinct loop variable: the original comprehension shadowed
        # the 'seg' step parameter.
        mirror_segs = [db for db in gparray.getDbList() if db.isSegmentMirror()]
        if not mirror_segs:
            raise Exception("No mirror segments configured; cannot save mirror information")
        mirror = mirror_segs[0]
        context.mirror_segdbId = mirror.getSegmentDbId()
        context.mirror_segcid = mirror.getSegmentContentId()
        context.mirror_segdbname = mirror.getSegmentHostName()
        context.mirror_datadir = mirror.getSegmentDataDirectory()
        context.mirror_port = mirror.getSegmentPort()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册