未验证 提交 4d1a6f70 编写于 作者: Z zhaoyuchen2018 提交者: GitHub

Transfer resnext50 multi-process multi-gpu from benchmark to models (#2604)

* Transfer resnext50 multi-process multi-gpu changes from benchmark to models
Signed-off-by: zhaoyuchen <zhaoyuchen01@baidu.com>

* refine code
Signed-off-by: zhaoyuchen <zhaoyuchen01@baidu.com>
上级 59aef8d8
......@@ -12,6 +12,9 @@
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import paddle.fluid as fluid
......@@ -23,7 +26,8 @@ def nccl2_prepare(args, startup_prog, main_prog):
envs = args.dist_env
t.transpile(envs["trainer_id"],
t.transpile(
envs["trainer_id"],
trainers=','.join(envs["trainer_endpoints"]),
current_endpoint=envs["current_endpoint"],
startup_program=startup_prog,
......@@ -47,7 +51,9 @@ def pserver_prepare(args, train_prog, startup_prog):
if training_role == "PSERVER":
pserver_program = t.get_pserver_program(envs["current_endpoint"])
pserver_startup_program = t.get_startup_program(
envs["current_endpoint"], pserver_program, startup_program=startup_prog)
envs["current_endpoint"],
pserver_program,
startup_program=startup_prog)
return pserver_program, pserver_startup_program
elif training_role == "TRAINER":
train_program = t.get_trainer_program()
......@@ -56,3 +62,32 @@ def pserver_prepare(args, train_prog, startup_prog):
raise ValueError(
'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
)
def nccl2_prepare_paddle(trainer_id, startup_prog, main_prog):
    """Transpile ``main_prog``/``startup_prog`` for NCCL2 distributed training.

    Reads the trainer endpoint list and this process's endpoint from the
    ``PADDLE_TRAINER_ENDPOINTS`` / ``PADDLE_CURRENT_ENDPOINT`` environment
    variables (set by the multi-process launcher).

    Args:
        trainer_id: integer rank of this trainer process.
        startup_prog: fluid startup Program to transpile in place.
        main_prog: fluid main (training) Program to transpile in place.
    """
    transpiler_config = fluid.DistributeTranspilerConfig()
    transpiler_config.mode = "nccl2"
    transpiler = fluid.DistributeTranspiler(config=transpiler_config)
    # Endpoints are provided by the launcher via the environment.
    all_endpoints = os.environ.get('PADDLE_TRAINER_ENDPOINTS')
    this_endpoint = os.environ.get('PADDLE_CURRENT_ENDPOINT')
    transpiler.transpile(
        trainer_id,
        trainers=all_endpoints,
        current_endpoint=this_endpoint,
        startup_program=startup_prog,
        program=main_prog)
def prepare_for_multi_process(exe, build_strategy, train_prog):
    """Set up multi-process (one GPU per process) distributed training.

    No-op when fewer than two trainers are configured. Otherwise stamps the
    trainer count/rank onto ``build_strategy``, transpiles a fresh startup
    program for NCCL2 and runs it on ``exe``.

    Args:
        exe: fluid Executor used to run the NCCL2 startup program.
        build_strategy: fluid BuildStrategy mutated with trainer info.
        train_prog: the main training Program to transpile.
    """
    trainer_id = int(os.environ.get('PADDLE_TRAINER_ID', 0))
    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    # Single-process run: nothing to prepare.
    if num_trainers < 2:
        return
    print("PADDLE_TRAINERS_NUM", num_trainers)
    print("PADDLE_TRAINER_ID", trainer_id)
    build_strategy.num_trainers = num_trainers
    build_strategy.trainer_id = trainer_id
    # NOTE(zcd): use multi processes to train the model,
    # and each process use one GPU card.
    nccl_startup_prog = fluid.Program()
    nccl2_prepare_paddle(trainer_id, nccl_startup_prog, train_prog)
    # the startup_prog are run two times, but it doesn't matter.
    exe.run(nccl_startup_prog)
......@@ -140,75 +140,80 @@ def process_image(sample, mode, color_jitter, rotate):
return [img]
def process_batch_data(input_data, mode, color_jitter, rotate):
    """Run ``process_image`` over every sample of one batch.

    Args:
        input_data: iterable of raw samples for a single batch.
        mode: reader mode string ('train' / 'val' / 'test').
        color_jitter: whether to apply color jitter augmentation.
        rotate: whether to apply rotation augmentation.

    Returns:
        list of per-sample results from ``process_image``.
    """
    return [
        process_image(sample, mode, color_jitter, rotate)
        for sample in input_data
    ]
def _reader_creator(file_list,
batch_size,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
data_dir=DATA_DIR,
pass_id_as_seed=1,
shuffle_seed=0,
infinite=False):
def reader():
def read_file_list():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
pass_id_as_seed_counter = pass_id_as_seed
while True:
if shuffle:
if pass_id_as_seed_counter:
np.random.seed(pass_id_as_seed_counter)
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
np.random.shuffle(full_lines)
if mode == 'train' and os.getenv('PADDLE_TRAINING_ROLE'):
# distributed mode if the env var `PADDLE_TRAINING_ROLE` exits
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
per_node_lines = len(full_lines) // trainer_count
lines = full_lines[trainer_id * per_node_lines:(
trainer_id + 1) * per_node_lines]
print(
"read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines,
len(lines), len(full_lines)))
else:
lines = full_lines
for line in lines:
if mode == 'train' or mode == 'val':
batch_data = []
for line in full_lines:
img_path, label = line.split()
img_path = os.path.join(data_dir, img_path)
yield img_path, int(label)
batch_data.append([img_path, int(label)])
if len(batch_data) == batch_size:
if mode == 'train' or mode == 'val':
yield batch_data
elif mode == 'test':
img_path, label = line.split()
img_path = os.path.join(data_dir, img_path)
yield [img_path]
if not infinite:
break
pass_id_as_seed_counter += 1
print("passid ++, current: ", pass_id_as_seed_counter)
yield [sample[0] for sample in batch_data]
batch_data = []
return read_file_list
data_reader = reader()
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
if mode == 'train' and num_trainers > 1:
assert shuffle_seed is not None, \
"If num_trainers > 1, the shuffle_seed must be set, because " \
"the order of batch data generated by reader " \
"must be the same in the respective processes."
data_reader = fluid.contrib.reader.distributed_batch_reader(data_reader)
mapper = functools.partial(
process_image, mode=mode, color_jitter=color_jitter, rotate=rotate)
process_batch_data, mode=mode, color_jitter=color_jitter, rotate=rotate)
return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE)
return paddle.reader.xmap_readers(mapper, data_reader, THREAD, BUF_SIZE)
def train(data_dir=DATA_DIR, pass_id_as_seed=1, infinite=False):
def train(batch_size, data_dir=DATA_DIR, shuffle_seed=0, infinite=False):
file_list = os.path.join(data_dir, 'train_list.txt')
return _reader_creator(
file_list,
batch_size,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
data_dir=data_dir,
pass_id_as_seed=pass_id_as_seed,
shuffle_seed=shuffle_seed,
infinite=infinite)
def val(data_dir=DATA_DIR):
def val(batch_size, data_dir=DATA_DIR):
file_list = os.path.join(data_dir, 'val_list.txt')
return _reader_creator(file_list, 'val', shuffle=False, data_dir=data_dir)
return _reader_creator(
file_list, batch_size, 'val', shuffle=False, data_dir=data_dir)
def test(data_dir=DATA_DIR):
def test(batch_size, data_dir=DATA_DIR):
file_list = os.path.join(data_dir, 'val_list.txt')
return _reader_creator(file_list, 'test', shuffle=False, data_dir=data_dir)
return _reader_creator(
file_list, batch_size, 'test', shuffle=False, data_dir=data_dir)
......@@ -21,6 +21,7 @@ import cv2
import io
import paddle
import paddle.fluid as fluid
random.seed(0)
np.random.seed(0)
......@@ -150,7 +151,8 @@ def create_mixup_reader(settings, rd):
def mixup_reader():
for context.tmp_mix, context.tmp_l1, context.tmp_l2, context.tmp_lam in mixup_data():
for context.tmp_mix, context.tmp_l1, context.tmp_l2, context.tmp_lam in mixup_data(
):
for i in range(len(context.tmp_mix)):
mixed_l = context.tmp_mix[i]
l1 = context.tmp_l1[i]
......@@ -210,82 +212,99 @@ def image_mapper(**kwargs):
return functools.partial(process_image, **kwargs)
def process_batch_data(input_data, settings, mode, color_jitter, rotate):
    """Run ``process_image`` over every sample of one batch.

    Args:
        input_data: iterable of raw samples for a single batch.
        settings: reader/augmentation settings forwarded to ``process_image``.
        mode: reader mode string ('train' / 'val' / 'test').
        color_jitter: whether to apply color jitter augmentation.
        rotate: whether to apply rotation augmentation.

    Returns:
        list of per-sample results from ``process_image``.
    """
    return [
        process_image(sample, settings, mode, color_jitter, rotate)
        for sample in input_data
    ]
def _reader_creator(settings,
file_list,
batch_size,
mode,
shuffle=False,
color_jitter=False,
rotate=False,
data_dir=DATA_DIR,
pass_id_as_seed=0):
shuffle_seed=0):
def reader():
def read_file_list():
with open(file_list) as flist:
full_lines = [line.strip() for line in flist]
if shuffle:
if pass_id_as_seed:
np.random.seed(pass_id_as_seed)
if shuffle_seed is not None:
np.random.seed(shuffle_seed)
np.random.shuffle(full_lines)
if mode == 'train' and os.getenv('PADDLE_TRAINING_ROLE'):
# distributed mode if the env var `PADDLE_TRAINING_ROLE` exits
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
per_node_lines = len(full_lines) // trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
* per_node_lines]
print(
"read images from %d, length: %d, lines length: %d, total: %d"
% (trainer_id * per_node_lines, per_node_lines, len(lines),
len(full_lines)))
else:
lines = full_lines
for line in lines:
if mode == 'train' or mode == 'val':
batch_data = []
for line in full_lines:
img_path, label = line.split()
img_path = os.path.join(data_dir, img_path)
yield img_path, int(label)
batch_data.append([img_path, int(label)])
if len(batch_data) == batch_size:
if mode == 'train' or mode == 'val':
yield batch_data
elif mode == 'test':
img_path, label = line.split()
img_path = os.path.join(data_dir, img_path)
yield [img_path]
crop_size = int(settings.image_shape.split(",")[2])
image_mapper = functools.partial(
process_image,
yield [sample[0] for sample in batch_data]
batch_data = []
return read_file_list
data_reader = reader()
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
if mode == 'train' and num_trainers > 1:
assert shuffle_seed is not None, \
"If num_trainers > 1, the shuffle_seed must be set, because " \
"the order of batch data generated by reader " \
"must be the same in the respective processes."
data_reader = fluid.contrib.reader.distributed_batch_reader(data_reader)
mapper = functools.partial(
process_batch_data,
settings=settings,
mode=mode,
color_jitter=color_jitter,
rotate=rotate,
crop_size=crop_size)
reader = paddle.reader.xmap_readers(
image_mapper, reader, THREAD, BUF_SIZE, order=False)
return reader
rotate=rotate)
return paddle.reader.xmap_readers(
mapper, data_reader, THREAD, BUF_SIZE, order=False)
def train(settings, data_dir=DATA_DIR, pass_id_as_seed=0):
def train(settings, batch_size, data_dir=DATA_DIR, shuffle_seed=0):
file_list = os.path.join(data_dir, 'train_list.txt')
reader = _reader_creator(
settings,
file_list,
batch_size,
'train',
shuffle=True,
color_jitter=False,
rotate=False,
data_dir=data_dir,
pass_id_as_seed=pass_id_as_seed, )
shuffle_seed=shuffle_seed)
if settings.use_mixup == True:
reader = create_mixup_reader(settings, reader)
return reader
def val(settings, data_dir=DATA_DIR):
def val(settings, batch_size, data_dir=DATA_DIR):
file_list = os.path.join(data_dir, 'val_list.txt')
return _reader_creator(
settings, file_list, 'val', shuffle=False, data_dir=data_dir)
settings,
file_list,
batch_size,
'val',
shuffle=False,
data_dir=data_dir)
def test(settings, data_dir=DATA_DIR):
def test(settings, batch_size, data_dir=DATA_DIR):
file_list = os.path.join(data_dir, 'val_list.txt')
return _reader_creator(
settings, file_list, 'test', shuffle=False, data_dir=data_dir)
settings,
file_list,
batch_size,
'test',
shuffle=False,
data_dir=data_dir)
......@@ -49,8 +49,10 @@ import models
from utils.fp16_utils import create_master_params_grads, master_param_to_train_param
from utils.utility import add_arguments, print_arguments
from utils.learning_rate import cosine_decay_with_warmup
from dist_train import dist_utils
IMAGENET1000 = 1281167
num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......@@ -335,7 +337,9 @@ def build_program(is_train, main_prog, startup_prog, args):
return build_program_out
def get_device_num():
visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
# NOTE(zcd): for multi-processe training, each process use one GPU card.
if num_trainers > 1 : return 1
visible_device = os.environ.get('CUDA_VISIBLE_DEVICES', None)
if visible_device:
device_num = len(visible_device.split(','))
else:
......@@ -365,11 +369,19 @@ def train(args):
args=args)
if use_mixup:
train_py_reader, train_cost, global_lr = b_out[0], b_out[1], b_out[2]
train_fetch_list = [train_cost.name, global_lr.name]
train_fetch_vars = [train_cost, global_lr]
train_fetch_list = []
for var in train_fetch_vars:
var.persistable=True
train_fetch_list.append(var.name)
else:
train_py_reader, train_cost, train_acc1, train_acc5, global_lr = b_out[0],b_out[1],b_out[2],b_out[3],b_out[4]
train_fetch_list = [train_cost.name, train_acc1.name, train_acc5.name, global_lr.name]
train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr]
train_fetch_list = []
for var in train_fetch_vars:
var.persistable=True
train_fetch_list.append(var.name)
b_out_test = build_program(
is_train=False,
......@@ -383,7 +395,8 @@ def train(args):
fluid.memory_optimize(train_prog)
fluid.memory_optimize(test_prog)
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_prog)
......@@ -406,9 +419,11 @@ def train(args):
test_batch_size = 16
if not args.enable_ce:
train_reader = paddle.batch(
reader.train(settings=args), batch_size=train_batch_size, drop_last=True)
test_reader = paddle.batch(reader.val(settings=args), batch_size=test_batch_size)
# NOTE: the order of batch data generated by batch_reader
# must be the same in the respective processes.
shuffle_seed = 1 if num_trainers > 1 else None
train_reader = reader.train(settings=args, batch_size=train_batch_size, shuffle_seed=shuffle_seed)
test_reader = reader.val(settings=args, batch_size=test_batch_size)
else:
# use flowers dataset for CE and set use_xmap False to avoid disorder data
# but it is time consuming. For faster speed, need another dataset.
......@@ -419,12 +434,21 @@ def train(args):
flowers.train(use_xmap=False),
batch_size=train_batch_size,
drop_last=True)
if num_trainers > 1:
train_reader = fluid.contrib.reader.distributed_batch_reader(train_reader)
test_reader = paddle.batch(
flowers.test(use_xmap=False), batch_size=test_batch_size)
train_py_reader.decorate_paddle_reader(train_reader)
test_py_reader.decorate_paddle_reader(test_reader)
test_fetch_vars = [test_cost, test_acc1, test_acc5]
test_fetch_list = []
for var in test_fetch_vars:
var.persistable=True
test_fetch_list.append(var.name)
# use_ngraph is for CPU only, please refer to README_ngraph.md for details
use_ngraph = os.getenv('FLAGS_use_ngraph')
if not use_ngraph:
......@@ -435,7 +459,13 @@ def train(args):
#build_strategy.fuse_all_reduce_ops=1
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = device_num
exec_strategy.num_iteration_per_drop_scope = 10
if num_trainers > 1 and args.use_gpu:
dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog)
# NOTE: the process is fast when num_threads is 1
# for multi-process training.
exec_strategy.num_threads = 1
train_exe = fluid.ParallelExecutor(
main_program=train_prog,
......@@ -446,8 +476,6 @@ def train(args):
else:
train_exe = exe
test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]
params = models.__dict__[args.model]().params
for pass_id in range(params["num_epochs"]):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册