From 6b54ae80d6b66c63c4ea4d38902e918feafdaf9e Mon Sep 17 00:00:00 2001
From: Kaipeng Deng
Date: Wed, 4 Aug 2021 15:18:24 +0800
Subject: [PATCH] fix download hang (#3881)

---
 ppdet/utils/download.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/ppdet/utils/download.py b/ppdet/utils/download.py
index 50be46c6a..4f9bfc2ce 100644
--- a/ppdet/utils/download.py
+++ b/ppdet/utils/download.py
@@ -409,7 +409,7 @@ def _download_dist(url, path, md5sum=None):
                     os.remove(lock_path)
                 else:
                     while os.path.exists(lock_path):
-                        time.sleep(1)
+                        time.sleep(0.5)
             return fullname
     else:
         return _download(url, path, md5sum)
@@ -504,14 +504,26 @@ def _decompress_dist(fname):
             from paddle.distributed import ParallelEnv
             unique_endpoints = _get_unique_endpoints(ParallelEnv()
                                                      .trainer_endpoints[:])
-            with open(lock_path, 'w'):  # touch
-                os.utime(lock_path, None)
+            # NOTE(dkp): _decompress_dist is always performed after
+            # _download_dist, where sub-trainers wait for the download
+            # lock file to be released by sleeping. If decompression is
+            # very fast and finishes within the sleeping gap, e.g. for
+            # tiny datasets such as coco_ce or spine_coco, the main
+            # trainer may finish decompressing and release the lock file
+            # before sub-trainers start polling. So the lock file is
+            # only created in the main trainer, and all sub-trainers
+            # wait 1s (twice the sleeping gap) for the main trainer to
+            # create it, which keeps all trainer pipelines in order.
+            # **change this if you have a more elegant method**
             if ParallelEnv().current_endpoint in unique_endpoints:
+                with open(lock_path, 'w'):  # touch
+                    os.utime(lock_path, None)
                 _decompress(fname)
                 os.remove(lock_path)
             else:
+                time.sleep(1)
                 while os.path.exists(lock_path):
-                    time.sleep(1)
+                    time.sleep(0.5)
     else:
         _decompress(fname)
--
GitLab
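
For reference, the coordination pattern introduced by the patch can be sketched in
isolation. The code below is a minimal illustration, not the PaddleDetection
implementation itself; is_main, lock_path and do_decompress are hypothetical
stand-ins for the ParallelEnv() endpoint check and _decompress(fname).

    # Minimal standalone sketch of the lock-file handshake described in the
    # NOTE above. `is_main`, `lock_path` and `do_decompress` are hypothetical
    # stand-ins; this is not the PaddleDetection code itself.
    import os
    import time


    def decompress_with_lock(lock_path, is_main, do_decompress):
        if is_main:
            # Main trainer: create the lock file, do the work, then remove
            # the lock so waiting trainers can proceed.
            with open(lock_path, 'w'):  # touch
                os.utime(lock_path, None)
            do_decompress()
            os.remove(lock_path)
        else:
            # Sub-trainers: sleep 1s (twice the 0.5s polling gap) so the main
            # trainer has time to create the lock file even when the work
            # finishes very quickly, then poll until the lock disappears.
            time.sleep(1)
            while os.path.exists(lock_path):
                time.sleep(0.5)

The 1s head start for sub-trainers is the core of the fix: without it, a
sub-trainer polling every 0.5s could miss a lock file that the main trainer
creates and removes within that gap, and the trainers would fall out of step.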