未验证 提交 e31c96ac 编写于 作者: C chenjian 提交者: GitHub

Fix a bug where a crash in the child (writer) thread caused the main thread to deadlock (#1013)

上级 4943b793
...@@ -59,7 +59,8 @@ class FileFactory(object): ...@@ -59,7 +59,8 @@ class FileFactory(object):
if not HDFS_ENABLED: if not HDFS_ENABLED:
raise RuntimeError('Please install module named "hdfs".') raise RuntimeError('Please install module named "hdfs".')
try: try:
default_file_factory.register_filesystem("hdfs", HDFileSystem()) default_file_factory.register_filesystem(
"hdfs", HDFileSystem())
except hdfs.util.HdfsError: except hdfs.util.HdfsError:
raise RuntimeError( raise RuntimeError(
"Please initialize `~/.hdfscli.cfg` for HDFS.") "Please initialize `~/.hdfscli.cfg` for HDFS.")
...@@ -182,8 +183,9 @@ class HDFileSystem(object): ...@@ -182,8 +183,9 @@ class HDFileSystem(object):
encoding = None if binary_mode else "utf-8" encoding = None if binary_mode else "utf-8"
try: try:
with self.cli.read(hdfs_path=filename[7:], offset=offset, with self.cli.read(
encoding=encoding) as reader: hdfs_path=filename[7:], offset=offset,
encoding=encoding) as reader:
data = reader.read() data = reader.read()
continue_from_token = {"last_offset": offset + len(data)} continue_from_token = {"last_offset": offset + len(data)}
return data, continue_from_token return data, continue_from_token
...@@ -214,7 +216,8 @@ class BosConfigClient(object): ...@@ -214,7 +216,8 @@ class BosConfigClient(object):
def __init__(self, bos_ak, bos_sk, bos_sts, bos_host="bj.bcebos.com"): def __init__(self, bos_ak, bos_sk, bos_sts, bos_host="bj.bcebos.com"):
self.config = BceClientConfiguration( self.config = BceClientConfiguration(
credentials=BceCredentials(bos_ak, bos_sk), credentials=BceCredentials(bos_ak, bos_sk),
endpoint=bos_host, security_token=bos_sts) endpoint=bos_host,
security_token=bos_sts)
self.bos_client = BosClient(self.config) self.bos_client = BosClient(self.config)
def exists(self, path): def exists(self, path):
...@@ -234,11 +237,12 @@ class BosConfigClient(object): ...@@ -234,11 +237,12 @@ class BosConfigClient(object):
if not object_key.endswith('/'): if not object_key.endswith('/'):
object_key += '/' object_key += '/'
init_data = b'' init_data = b''
self.bos_client.append_object(bucket_name=bucket_name, self.bos_client.append_object(
key=object_key, bucket_name=bucket_name,
data=init_data, key=object_key,
content_md5=content_md5(init_data), data=init_data,
content_length=len(init_data)) content_md5=content_md5(init_data),
content_length=len(init_data))
@staticmethod @staticmethod
def join(path, *paths): def join(path, *paths):
...@@ -255,9 +259,8 @@ class BosConfigClient(object): ...@@ -255,9 +259,8 @@ class BosConfigClient(object):
# if not object_key.endswith('/'): # if not object_key.endswith('/'):
# object_key += '/' # object_key += '/'
print('Uploading file `%s`' % filename) print('Uploading file `%s`' % filename)
self.bos_client.put_object_from_file(bucket=bucket_name, self.bos_client.put_object_from_file(
key=object_key, bucket=bucket_name, key=object_key, file_name=filename)
file_name=filename)
class BosFileSystem(object): class BosFileSystem(object):
...@@ -288,14 +291,36 @@ class BosFileSystem(object): ...@@ -288,14 +291,36 @@ class BosFileSystem(object):
bos_sts = os.getenv("BOS_STS") bos_sts = os.getenv("BOS_STS")
self.config = BceClientConfiguration( self.config = BceClientConfiguration(
credentials=BceCredentials(access_key_id, secret_access_key), credentials=BceCredentials(access_key_id, secret_access_key),
endpoint=bos_host, security_token=bos_sts) endpoint=bos_host,
security_token=bos_sts)
def set_bos_config(self, bos_ak, bos_sk, bos_sts, bos_host="bj.bcebos.com"): def set_bos_config(self, bos_ak, bos_sk, bos_sts,
bos_host="bj.bcebos.com"):
self.config = BceClientConfiguration( self.config = BceClientConfiguration(
credentials=BceCredentials(bos_ak, bos_sk), credentials=BceCredentials(bos_ak, bos_sk),
endpoint=bos_host, security_token=bos_sts) endpoint=bos_host,
security_token=bos_sts)
self.bos_client = BosClient(self.config) self.bos_client = BosClient(self.config)
def renew_bos_client_from_server(self):
    """Fetch fresh STS credentials from the server and rebuild the BOS client.

    Reads ``server_url`` from the JSON file at ``CONFIG_PATH``, POSTs to
    ``<server_url>/sts/`` and inspects the JSON reply. When the reply's
    ``code`` equals ``'000000'``, the ``sts_ak``, ``sts_sk`` and ``token``
    entries of its ``msg`` field are passed to ``set_bos_config``. For any
    other code the ``msg`` is printed and the current client is left as is.

    The request is bounded by a timeout: this method is invoked from the
    append error path, so a server that never answers must raise
    (``requests.exceptions.Timeout``) rather than block the caller forever.

    Returns:
        None
    """
    # Imported locally, as in the original, so these modules are only
    # needed when a renewal is actually attempted.
    import requests
    import json
    from visualdl.utils.dir import CONFIG_PATH

    # Upper bound, in seconds, for connecting to / reading from the server.
    request_timeout = 30

    with open(CONFIG_PATH, 'r') as fp:
        server_url = json.load(fp)['server_url']
    url = server_url + '/sts/'
    res = requests.post(url=url, timeout=request_timeout).json()
    err_code = res.get('code')
    msg = res.get('msg')
    if '000000' == err_code:
        sts_ak = msg.get('sts_ak')
        sts_sk = msg.get('sts_sk')
        sts_token = msg.get('token')
        self.set_bos_config(sts_ak, sts_sk, sts_token)
    else:
        print('Renew bos client error. Error msg: {}'.format(msg))
def isfile(self, filename): def isfile(self, filename):
return exists(filename) return exists(filename)
...@@ -324,11 +349,12 @@ class BosFileSystem(object): ...@@ -324,11 +349,12 @@ class BosFileSystem(object):
if not object_key.endswith('/'): if not object_key.endswith('/'):
object_key += '/' object_key += '/'
init_data = b'' init_data = b''
self.bos_client.append_object(bucket_name=bucket_name, self.bos_client.append_object(
key=object_key, bucket_name=bucket_name,
data=init_data, key=object_key,
content_md5=content_md5(init_data), data=init_data,
content_length=len(init_data)) content_md5=content_md5(init_data),
content_length=len(init_data))
@staticmethod @staticmethod
def join(path, *paths): def join(path, *paths):
...@@ -344,10 +370,10 @@ class BosFileSystem(object): ...@@ -344,10 +370,10 @@ class BosFileSystem(object):
length = int( length = int(
self.get_meta(bucket_name, object_key).metadata.content_length) self.get_meta(bucket_name, object_key).metadata.content_length)
if offset < length: if offset < length:
data = self.bos_client.get_object_as_string(bucket_name=bucket_name, data = self.bos_client.get_object_as_string(
key=object_key, bucket_name=bucket_name,
range=[offset, key=object_key,
length - 1]) range=[offset, length - 1])
else: else:
data = b'' data = b''
...@@ -371,29 +397,45 @@ class BosFileSystem(object): ...@@ -371,29 +397,45 @@ class BosFileSystem(object):
bucket_name, object_key = get_object_info(filename) bucket_name, object_key = get_object_info(filename)
if not self.exists(filename): if not self.exists(filename):
init_data = b'' init_data = b''
self.bos_client.append_object(bucket_name=bucket_name, try:
key=object_key, self.bos_client.append_object(
data=init_data, bucket_name=bucket_name,
content_md5=content_md5(init_data), key=object_key,
content_length=len(init_data)) data=init_data,
content_md5=content_md5(init_data),
content_length=len(init_data))
except (exception.BceServerError, exception.BceHttpClientError):
self.renew_bos_client_from_server()
self.bos_client.append_object(
bucket_name=bucket_name,
key=object_key,
data=init_data,
content_md5=content_md5(init_data),
content_length=len(init_data))
return
content_length = len(file_content) content_length = len(file_content)
try: try:
offset = self.get_meta(bucket_name, offset = self.get_meta(bucket_name,
object_key).metadata.content_length object_key).metadata.content_length
self.bos_client.append_object(bucket_name=bucket_name, self.bos_client.append_object(
key=object_key, bucket_name=bucket_name,
data=file_content, key=object_key,
content_md5=content_md5(file_content), data=file_content,
content_length=content_length, content_md5=content_md5(file_content),
offset=offset) content_length=content_length,
offset=offset)
except (exception.BceServerError, exception.BceHttpClientError): except (exception.BceServerError, exception.BceHttpClientError):
init_data = b'' self.renew_bos_client_from_server()
self.bos_client.append_object(bucket_name=bucket_name, offset = self.get_meta(bucket_name,
key=object_key, object_key).metadata.content_length
data=init_data, self.bos_client.append_object(
content_md5=content_md5(init_data), bucket_name=bucket_name,
content_length=len(init_data)) key=object_key,
data=file_content,
content_md5=content_md5(file_content),
content_length=content_length,
offset=offset)
self._file_contents_to_add = b'' self._file_contents_to_add = b''
self._file_contents_count = 0 self._file_contents_count = 0
...@@ -435,9 +477,10 @@ class BosFileSystem(object): ...@@ -435,9 +477,10 @@ class BosFileSystem(object):
contents_map[key] = [value] contents_map[key] = [value]
temp_walk = [] temp_walk = []
for key, value in contents_map.items(): for key, value in contents_map.items():
temp_walk.append( temp_walk.append([
[BosFileSystem.join('bos://' + self.bucket, key), [], BosFileSystem.join('bos://' + self.bucket, key), [],
value]) value
])
self.length = len(temp_walk) self.length = len(temp_walk)
self.contents = temp_walk self.contents = temp_walk
...@@ -458,8 +501,7 @@ class BosFileSystem(object): ...@@ -458,8 +501,7 @@ class BosFileSystem(object):
else: else:
prefix = object_key if object_key.endswith( prefix = object_key if object_key.endswith(
'/') else object_key + '/' '/') else object_key + '/'
response = self.bos_client.list_objects(bucket_name, response = self.bos_client.list_objects(bucket_name, prefix=prefix)
prefix=prefix)
contents = [content.key for content in response.contents] contents = [content.key for content in response.contents]
return WalkGenerator(bucket_name, contents) return WalkGenerator(bucket_name, contents)
...@@ -633,7 +675,8 @@ class BFile(object): ...@@ -633,7 +675,8 @@ class BFile(object):
def close(self): def close(self):
if isinstance(self.fs, BosFileSystem): if isinstance(self.fs, BosFileSystem):
try: try:
self.fs.append(self._filename, b'', self.binary_mode, force=True) self.fs.append(
self._filename, b'', self.binary_mode, force=True)
except Exception: except Exception:
pass pass
self.flush() self.flush()
......
...@@ -30,6 +30,7 @@ if isinstance(QUEUE_TIMEOUT, str): ...@@ -30,6 +30,7 @@ if isinstance(QUEUE_TIMEOUT, str):
class RecordWriter(object): class RecordWriter(object):
"""Package data with crc32 or not. """Package data with crc32 or not.
""" """
def __init__(self, writer): def __init__(self, writer):
self._writer = writer self._writer = writer
...@@ -77,8 +78,13 @@ class RecordFileWriter(object): ...@@ -77,8 +78,13 @@ class RecordFileWriter(object):
directory and asynchronously writes `Record` protocol buffers to this directory and asynchronously writes `Record` protocol buffers to this
file. file.
""" """
def __init__(self, logdir, max_queue_size=10, flush_secs=120,
filename_suffix='', filename=''): def __init__(self,
logdir,
max_queue_size=10,
flush_secs=120,
filename_suffix='',
filename=''):
self._logdir = logdir self._logdir = logdir
if not bfile.exists(logdir): if not bfile.exists(logdir):
bfile.makedirs(logdir) bfile.makedirs(logdir)
...@@ -93,16 +99,19 @@ class RecordFileWriter(object): ...@@ -93,16 +99,19 @@ class RecordFileWriter(object):
else: else:
fn = "vdlrecords.%010d.log%s" % (time.time(), filename_suffix) fn = "vdlrecords.%010d.log%s" % (time.time(), filename_suffix)
self._file_name = bfile.join(logdir, fn) self._file_name = bfile.join(logdir, fn)
print( print('Since the log filename should contain `vdlrecords`, '
'Since the log filename should contain `vdlrecords`, the filename is invalid and `{}` will replace `{}`'.format( # noqa: E501 'the filename is invalid and `{}` will replace `{}`'.
fn, filename)) format( # noqa: E501
fn, filename))
else: else:
self._file_name = bfile.join(logdir, "vdlrecords.%010d.log%s" % ( self._file_name = bfile.join(
time.time(), filename_suffix)) logdir,
"vdlrecords.%010d.log%s" % (time.time(), filename_suffix))
self._general_file_writer = bfile.BFile(self._file_name, "wb") self._general_file_writer = bfile.BFile(self._file_name, "wb")
self._async_writer = _AsyncWriter(RecordWriter( self._async_writer = _AsyncWriter(
self._general_file_writer), max_queue_size, flush_secs) RecordWriter(self._general_file_writer), max_queue_size,
flush_secs)
# TODO(shenyuhan) Maybe file_version in future. # TODO(shenyuhan) Maybe file_version in future.
# _record = record_pb2.Record() # _record = record_pb2.Record()
# self.add_record(_record) # self.add_record(_record)
...@@ -140,8 +149,7 @@ class _AsyncWriter(object): ...@@ -140,8 +149,7 @@ class _AsyncWriter(object):
self._closed = False self._closed = False
self._bytes_queue = queue.Queue(max_queue_size) self._bytes_queue = queue.Queue(max_queue_size)
self._worker = _AsyncWriterThread(self._bytes_queue, self._worker = _AsyncWriterThread(self._bytes_queue,
self._record_writer, self._record_writer, flush_secs)
flush_secs)
self._lock = threading.Lock() self._lock = threading.Lock()
self._worker.start() self._worker.start()
...@@ -188,6 +196,7 @@ class _AsyncWriterThread(threading.Thread): ...@@ -188,6 +196,7 @@ class _AsyncWriterThread(threading.Thread):
self.join() self.join()
def run(self): def run(self):
has_unresolved_bug = False
while True: while True:
now = time.time() now = time.time()
queue_wait_duration = self._next_flush_time - now queue_wait_duration = self._next_flush_time - now
...@@ -205,6 +214,14 @@ class _AsyncWriterThread(threading.Thread): ...@@ -205,6 +214,14 @@ class _AsyncWriterThread(threading.Thread):
self._has_pending_data = True self._has_pending_data = True
except queue.Empty: except queue.Empty:
pass pass
except Exception as e:
# prevent the main thread from deadlock due to writing error.
if not has_unresolved_bug:
print('Warning: Writing data Error, Due to unresolved Exception {}'.format(e))
print('Warning: Writing data to FileSystem failed since {}.'.format(
time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
has_unresolved_bug = True
pass
finally: finally:
if data: if data:
self._queue.task_done() self._queue.task_done()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册