From 20fb78d38abd1d4afa2b9ea79628fe4886edf4de Mon Sep 17 00:00:00 2001 From: liuyuhui Date: Wed, 2 Sep 2020 19:22:21 +0800 Subject: [PATCH] fix bugs for files partition running in collective mode --- core/engine/local_cluster.py | 3 ++- core/utils/dataloader_instance.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/core/engine/local_cluster.py b/core/engine/local_cluster.py index cf9b6032..88f21ef8 100755 --- a/core/engine/local_cluster.py +++ b/core/engine/local_cluster.py @@ -119,7 +119,8 @@ class LocalClusterEngine(Engine): "PADDLE_TRAINERS_NUM": str(worker_num), "TRAINING_ROLE": "TRAINER", "PADDLE_TRAINER_ID": str(i), - "FLAGS_selected_gpus": str(selected_gpus[i]) + "FLAGS_selected_gpus": str(selected_gpus[i]), + "PADDLEREC_GPU_NUMS": str(selected_gpus_num) }) os.system("mkdir -p {}".format(logs_dir)) diff --git a/core/utils/dataloader_instance.py b/core/utils/dataloader_instance.py index 03e6f0a6..561f91d9 100755 --- a/core/utils/dataloader_instance.py +++ b/core/utils/dataloader_instance.py @@ -47,6 +47,16 @@ def dataloader_by_name(readerclass, files.sort() + # for local cluster: discard some files if files cannot be divided equally between GPUs + if (context["device"] == "GPU"): + selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) + discard_file_nums = len(files) % selected_gpu_nums + if (discard_file_nums != 0): + print( + "Warning: because files cannot be divided equally between GPUs, discard these files:{}".
+ format(files[-discard_file_nums:])) + files = files[:len(files) - discard_file_nums] + need_split_files = False if context["engine"] == EngineMode.LOCAL_CLUSTER: # for local cluster: split files for multi process @@ -109,6 +119,16 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context): files.sort() + # for local cluster: discard some files if files cannot be divided equally between GPUs + if (context["device"] == "GPU"): + selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) + discard_file_nums = len(files) % selected_gpu_nums + if (discard_file_nums != 0): + print( + "Warning: because files cannot be divided equally between GPUs, discard these files:{}". + format(files[-discard_file_nums:])) + files = files[:len(files) - discard_file_nums] + need_split_files = False if context["engine"] == EngineMode.LOCAL_CLUSTER: # for local cluster: split files for multi process @@ -179,6 +199,16 @@ def slotdataloader(readerclass, train, yaml_file, context): files.sort() + # for local cluster: discard some files if files cannot be divided equally between GPUs + if (context["device"] == "GPU"): + selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) + discard_file_nums = len(files) % selected_gpu_nums + if (discard_file_nums != 0): + print( + "Warning: because files cannot be divided equally between GPUs, discard these files:{}". + format(files[-discard_file_nums:])) + files = files[:len(files) - discard_file_nums] + need_split_files = False if context["engine"] == EngineMode.LOCAL_CLUSTER: # for local cluster: split files for multi process -- GitLab