commit for test

9e23a870 · typhoonzero · ad9d219a · 9e23a870 · 9e23a870
Showing with 55 additions and 37 deletions

PaddleCV/image_classification/dist_train/dist_train.py PaddleCV/image_classification/dist_train/dist_train.py +20 -7

PaddleCV/image_classification/reader.py PaddleCV/image_classification/reader.py +35 -30

未找到文件。
--- a/PaddleCV/image_classification/dist_train/dist_train.py
+++ b/PaddleCV/image_classification/dist_train/dist_train.py
@@ -86,8 +86,12 @@ def get_device_num():
    return device_num
 def prepare_reader(is_train, pyreader, args, pass_id=0):
+    # NOTE: always make the reader infinite in nccl2 mode to balance data
+    # between ranks.
+    is_infinite = (args.update_method == "nccl2")
    if is_train:
-        reader = train(data_dir=args.data_dir, pass_id_as_seed=pass_id)
+        reader = train(data_dir=args.data_dir, pass_id_as_seed=pass_id,
+                       infinite=is_infinite)
    else:
        reader = val(data_dir=args.data_dir)
    if is_train:
@@ -138,6 +142,9 @@ def build_program(is_train, main_prog, startup_prog, args):
                    end_lr /= device_num_per_worker
                total_images = args.total_images / trainer_count
+                if os.getenv("FLAGS_selected_gpus"):
+                    step = int(total_images / (args.batch_size / device_num_per_worker * args.multi_batch_repeat) + 1)
+                else:
                    step = int(total_images / (args.batch_size * args.multi_batch_repeat) + 1)
                warmup_steps = step * 5  # warmup 5 passes
                epochs = [30, 60, 80]
@@ -264,7 +271,7 @@ def train_parallel(args):
    # num_iteration_per_drop_scope indicates how
    # many iterations to clean up the temp variables which
    # is generated during execution. It may make the execution faster,
-    #  because the temp variable's shape maybe the same between two iterations
+    # because the temp variables' shapes are the same between two iterations.
    strategy.num_iteration_per_drop_scope = 30
    build_strategy = fluid.BuildStrategy()
@@ -317,7 +324,13 @@ def train_parallel(args):
    over_all_start = time.time()
    fetch_list = [train_cost.name, train_acc1.name, train_acc5.name]
+    # 1. MP mode, batch size for current process should be args.batch_size / GPUs
+    # 2. SP/PG mode, batch size for each process should be original args.batch_size
+    if os.getenv("FLAGS_selected_gpus"):
+        steps_per_pass = args.total_images / (args.batch_size / get_device_num()) / args.dist_env["num_trainers"]
+    else:
        steps_per_pass = args.total_images / args.batch_size / args.dist_env["num_trainers"]
    for pass_id in range(args.num_epochs):
        num_samples = 0
        start_time = time.time()
@@ -342,7 +355,7 @@ def train_parallel(args):
                break
            num_samples += args.batch_size
            batch_id += 1
-            if args.skip_unbalanced_data and batch_id >= steps_per_pass:
+            if (args.skip_unbalanced_data or args.update_method == "nccl2") and batch_id >= steps_per_pass:
                break
        print_train_time(start_time, time.time(), num_samples)

--- a/PaddleCV/image_classification/reader.py
+++ b/PaddleCV/image_classification/reader.py
@@ -131,10 +131,12 @@ def _reader_creator(file_list,
                    color_jitter=False,
                    rotate=False,
                    data_dir=DATA_DIR,
-                    pass_id_as_seed=0):
+                    pass_id_as_seed=0,
+                    infinite=False):
    def reader():
        with open(file_list) as flist:
            full_lines = [line.strip() for line in flist]
+            while True:
                if shuffle:
                    if pass_id_as_seed:
                        np.random.seed(pass_id_as_seed)
@@ -161,8 +163,11 @@ def _reader_creator(file_list,
                    elif mode == 'test':
                        img_path, label = line.split()
                        img_path = os.path.join(data_dir, img_path)
                        yield [img_path]
+                if not infinite:
+                    break
+                pass_id_as_seed += 1
+                print("passid ++, current: ", pass_id_as_seed)
    mapper = functools.partial(
        process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
@@ -170,7 +175,7 @@ def _reader_creator(file_list,
    return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
-def train(data_dir=DATA_DIR, pass_id_as_seed=0):
+def train(data_dir=DATA_DIR, pass_id_as_seed=0, infinite=False):
    file_list = os.path.join(data_dir, 'train_list.txt')
    return _reader_creator(
        file_list,