yolov3 is a special model: if num_trainers > 1, each process trains on the complete dataset (#2605)

553659a5 · chengduo · GitHub · 931b0135 · 553659a5 · 553659a5
隐藏空白更改
内联并排

Showing 13 additions and 12 deletions

PaddleCV/yolov3/reader.py PaddleCV/yolov3/reader.py +9 -7

PaddleCV/yolov3/train.py PaddleCV/yolov3/train.py +4 -5

未找到文件。
--- a/PaddleCV/yolov3/reader.py
+++ b/PaddleCV/yolov3/reader.py
@@ -272,13 +272,15 @@ class DataSetReader(object):
                batch_out = [(im, im_id, im_shape)]
                yield batch_out
-        num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
+        # NOTE: yolov3 is a special model, if num_trainers > 1, each process 
-        if mode == 'train' and num_trainers > 1:
+        # trains on the complete dataset.
-            assert shuffle_seed is not None, \
+        # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
-                "If num_trainers > 1, the shuffle_seed must be set, because " \
+        # if mode == 'train' and num_trainers > 1:
-                "the order of batch data generated by reader " \
+        #     assert shuffle_seed is not None, \
-                "must be the same in the respective processes."
+        #         "If num_trainers > 1, the shuffle_seed must be set, because " \
-            reader = fluid.contrib.reader.distributed_batch_reader(reader)
+        #         "the order of batch data generated by reader " \
+        #         "must be the same in the respective processes."
+        #     reader = fluid.contrib.reader.distributed_batch_reader(reader)
        return reader

--- a/PaddleCV/yolov3/train.py
+++ b/PaddleCV/yolov3/train.py
@@ -47,7 +47,6 @@ import dist_utils
 num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
 def get_device_num():
    # NOTE(zcd): for multi-process training, each process uses one GPU card.
    if num_trainers > 1: return 1
@@ -141,10 +140,10 @@ def train():
    shuffle = True
    if args.enable_ce:
        shuffle = False
-    # NOTE: If num_trainers > 1, the shuffle_seed must be set, because
+    shuffle_seed = None
-    # the order of batch data generated by reader
+    # NOTE: yolov3 is a special model, if num_trainers > 1, each process
-    # must be the same in the respective processes.
+    # trains on the complete dataset.
-    shuffle_seed = 1 if num_trainers > 1 else None
+    # if num_trainers > 1: shuffle_seed  = 1
    train_reader = reader.train(
        input_size,
        batch_size=cfg.batch_size,