Commit 20fb78d3 authored by liuyuhui

fix bugs for file partitioning when running in collective mode

Parent 8c7d113e
@@ -119,7 +119,8 @@ class LocalClusterEngine(Engine):
"PADDLE_TRAINERS_NUM": str(worker_num),
"TRAINING_ROLE": "TRAINER",
"PADDLE_TRAINER_ID": str(i),
"FLAGS_selected_gpus": str(selected_gpus[i])
"FLAGS_selected_gpus": str(selected_gpus[i]),
"PADDLEREC_GPU_NUMS": str(selected_gpus_num)
})
os.system("mkdir -p {}".format(logs_dir))
......
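The launcher-side hunk above passes the number of selected GPUs to every trainer process through a new `PADDLEREC_GPU_NUMS` environment variable, next to the existing `PADDLE_*` variables. A minimal sketch of that plumbing, assuming `selected_gpus` is the parsed GPU id list and `selected_gpus_num` is simply its length (the surrounding loop and any names not shown in the diff are illustrative, not the exact LocalClusterEngine code):

```python
import os

# Illustrative launcher-side sketch: derive the GPU count once and export it
# to every trainer's environment alongside the usual PADDLE_* variables.
selected_gpus = os.getenv("CUDA_VISIBLE_DEVICES", "0,1").split(",")
selected_gpus_num = len(selected_gpus)
worker_num = selected_gpus_num  # assumption: one trainer process per GPU

trainer_envs = []
for i in range(selected_gpus_num):
    env = dict(os.environ)
    env.update({
        "PADDLE_TRAINERS_NUM": str(worker_num),
        "TRAINING_ROLE": "TRAINER",
        "PADDLE_TRAINER_ID": str(i),
        "FLAGS_selected_gpus": str(selected_gpus[i]),
        # New in this commit: tells the dataloader how many GPUs share the files.
        "PADDLEREC_GPU_NUMS": str(selected_gpus_num),
    })
    trainer_envs.append(env)  # each env would be handed to one trainer subprocess
```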
@@ -47,6 +47,16 @@ def dataloader_by_name(readerclass,
files.sort()
# for local cluster: discard some files if files cannot be divided equally between GPUs
if (context["device"] == "GPU"):
selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
discard_file_nums = len(files) % selected_gpu_nums
if (discard_file_nums != 0):
print(
"Warning: beacause files cannot be divided equally between GPUs,discard these files:{}".
format(files[-discard_file_nums:]))
files = files[:len(files) - discard_file_nums]
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
@@ -109,6 +119,16 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context):
files.sort()
# for local cluster: discard some files if files cannot be divided equally between GPUs
if (context["device"] == "GPU"):
selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
discard_file_nums = len(files) % selected_gpu_nums
if (discard_file_nums != 0):
print(
"Warning: beacause files cannot be divided equally between GPUs, discard these files:{}".
format(files[-discard_file_nums:]))
files = files[:len(files) - discard_file_nums]
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
@@ -179,6 +199,16 @@ def slotdataloader(readerclass, train, yaml_file, context):
files.sort()
# for local cluster: discard some files if files cannot be divided equally between GPUs
if (context["device"] == "GPU"):
selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS"))
discard_file_nums = len(files) % selected_gpu_nums
if (discard_file_nums != 0):
print(
"Warning: beacause files cannot be divided equally between GPUs,discard these files:{}".
format(files[-discard_file_nums:]))
files = files[:len(files) - discard_file_nums]
need_split_files = False
if context["engine"] == EngineMode.LOCAL_CLUSTER:
# for local cluster: split files for multi process
......
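All three dataloader hunks add the same guard: read `PADDLEREC_GPU_NUMS` back from the environment and drop the trailing `len(files) % gpu_nums` files so every GPU rank ends up with the same number of files. A self-contained sketch of that discard logic (the helper name and the sample file list are illustrative, not part of the diff):

```python
import os


def discard_uneven_files(files, gpu_nums=None):
    """Drop trailing files so len(files) is divisible by the number of GPUs."""
    if gpu_nums is None:
        # Same environment variable the launcher exports in the first hunk.
        gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS", "1"))
    discard_file_nums = len(files) % gpu_nums
    if discard_file_nums != 0:
        print("Warning: because files cannot be divided equally between GPUs, "
              "discard these files: {}".format(files[-discard_file_nums:]))
        files = files[:len(files) - discard_file_nums]
    return files


# Example: 7 part files on 2 GPUs -> the last file is dropped, 3 files per GPU.
print(discard_uneven_files(["part-{}".format(i) for i in range(7)], gpu_nums=2))
```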