From 6b54ae80d6b66c63c4ea4d38902e918feafdaf9e Mon Sep 17 00:00:00 2001
From: Kaipeng Deng
Date: Wed, 4 Aug 2021 15:18:24 +0800
Subject: [PATCH] fix download hang (#3881)

---
 ppdet/utils/download.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/ppdet/utils/download.py b/ppdet/utils/download.py
index 50be46c6a..4f9bfc2ce 100644
--- a/ppdet/utils/download.py
+++ b/ppdet/utils/download.py
@@ -409,7 +409,7 @@ def _download_dist(url, path, md5sum=None):
                     os.remove(lock_path)
                 else:
                     while os.path.exists(lock_path):
-                        time.sleep(1)
+                        time.sleep(0.5)
             return fullname
     else:
         return _download(url, path, md5sum)
@@ -504,14 +504,26 @@ def _decompress_dist(fname):
             from paddle.distributed import ParallelEnv
             unique_endpoints = _get_unique_endpoints(ParallelEnv()
                                                      .trainer_endpoints[:])
-            with open(lock_path, 'w'):  # touch
-                os.utime(lock_path, None)
+            # NOTE(dkp): _decompress_dist is always performed after
+            # _download_dist, where sub-trainers wait for the download
+            # lock file to be released by sleeping. If decompression is
+            # very fast and finishes within the sleeping gap, e.g. for
+            # tiny datasets such as coco_ce or spine_coco, the main
+            # trainer may finish decompressing and release the lock file
+            # before sub-trainers start polling. So the lock file is
+            # only created in the main trainer, and all sub-trainers
+            # wait 1s (twice the sleeping gap) for the main trainer to
+            # create it, which keeps all trainer pipelines in order.
+            # **change this if you have a more elegant method**
             if ParallelEnv().current_endpoint in unique_endpoints:
+                with open(lock_path, 'w'):  # touch
+                    os.utime(lock_path, None)
                 _decompress(fname)
                 os.remove(lock_path)
             else:
+                time.sleep(1)
                 while os.path.exists(lock_path):
-                    time.sleep(1)
+                    time.sleep(0.5)
     else:
         _decompress(fname)
--
GitLab
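
For reference, the coordination pattern introduced by the patch can be sketched in
isolation. The code below is a minimal illustration, not the PaddleDetection
implementation itself; is_main, lock_path and do_decompress are hypothetical
stand-ins for the ParallelEnv() endpoint check and _decompress(fname).

    # Minimal standalone sketch of the lock-file handshake described in the
    # NOTE above. `is_main`, `lock_path` and `do_decompress` are hypothetical
    # stand-ins; this is not the PaddleDetection code itself.
    import os
    import time


    def decompress_with_lock(lock_path, is_main, do_decompress):
        if is_main:
            # Main trainer: create the lock file, do the work, then remove
            # the lock so waiting trainers can proceed.
            with open(lock_path, 'w'):  # touch
                os.utime(lock_path, None)
            do_decompress()
            os.remove(lock_path)
        else:
            # Sub-trainers: sleep 1s (twice the 0.5s polling gap) so the main
            # trainer has time to create the lock file even when the work
            # finishes very quickly, then poll until the lock disappears.
            time.sleep(1)
            while os.path.exists(lock_path):
                time.sleep(0.5)

The 1s head start for sub-trainers is the core of the fix: without it, a
sub-trainer polling every 0.5s could miss a lock file that the main trainer
creates and removes within that gap, and the trainers would fall out of step.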