diff --git a/core/engine/local_cluster.py b/core/engine/local_cluster.py index cf9b6032162a61b16e4f01552c23cff7312b3965..88f21ef8bf7218a4b83db265ad534ad2266561a9 100755 --- a/core/engine/local_cluster.py +++ b/core/engine/local_cluster.py @@ -119,7 +119,8 @@ class LocalClusterEngine(Engine): "PADDLE_TRAINERS_NUM": str(worker_num), "TRAINING_ROLE": "TRAINER", "PADDLE_TRAINER_ID": str(i), - "FLAGS_selected_gpus": str(selected_gpus[i]) + "FLAGS_selected_gpus": str(selected_gpus[i]), + "PADDLEREC_GPU_NUMS": str(selected_gpus_num) }) os.system("mkdir -p {}".format(logs_dir)) diff --git a/core/utils/dataloader_instance.py b/core/utils/dataloader_instance.py index 03e6f0a67884917e9af2d02d13eb86576620ceef..561f91d9dbc73dc8491e14f72e890f6b26110bf1 100755 --- a/core/utils/dataloader_instance.py +++ b/core/utils/dataloader_instance.py @@ -47,6 +47,16 @@ def dataloader_by_name(readerclass, files.sort() + # for local cluster: discard some files if files cannot be divided equally between GPUs + if (context["device"] == "GPU"): + selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) + discard_file_nums = len(files) % selected_gpu_nums + if (discard_file_nums != 0): + print( + "Warning: because files cannot be divided equally between GPUs, discard these files:{}". + format(files[-discard_file_nums:])) + files = files[:len(files) - discard_file_nums] + need_split_files = False if context["engine"] == EngineMode.LOCAL_CLUSTER: # for local cluster: split files for multi process @@ -109,6 +119,16 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context): files.sort() + # for local cluster: discard some files if files cannot be divided equally between GPUs + if (context["device"] == "GPU"): + selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) + discard_file_nums = len(files) % selected_gpu_nums + if (discard_file_nums != 0): + print( + "Warning: because files cannot be divided equally between GPUs, discard these files:{}".
+ format(files[-discard_file_nums:])) + files = files[:len(files) - discard_file_nums] + need_split_files = False if context["engine"] == EngineMode.LOCAL_CLUSTER: # for local cluster: split files for multi process @@ -179,6 +199,16 @@ def slotdataloader(readerclass, train, yaml_file, context): files.sort() + # for local cluster: discard some files if files cannot be divided equally between GPUs + if (context["device"] == "GPU"): + selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) + discard_file_nums = len(files) % selected_gpu_nums + if (discard_file_nums != 0): + print( + "Warning: because files cannot be divided equally between GPUs, discard these files:{}". + format(files[-discard_file_nums:])) + files = files[:len(files) - discard_file_nums] + need_split_files = False if context["engine"] == EngineMode.LOCAL_CLUSTER: # for local cluster: split files for multi process