diff --git a/core/utils/dataloader_instance.py b/core/utils/dataloader_instance.py index d0cbc63e378a241331a22f6e475b135344a9ea03..eb241204c89331d11bcbb4f4a2dc0dd35c52b04c 100755 --- a/core/utils/dataloader_instance.py +++ b/core/utils/dataloader_instance.py @@ -49,8 +49,7 @@ def dataloader_by_name(readerclass, files.sort() # for local cluster: discard some files if files cannot be divided equally between GPUs - if (context["device"] == "GPU" - ) and os.getenv("PADDLEREC_GPU_NUMS") is not None: + if (context["device"] == "GPU") and "PADDLEREC_GPU_NUMS" in os.environ: selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) discard_file_nums = len(files) % selected_gpu_nums if (discard_file_nums != 0): @@ -122,8 +121,7 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context): files.sort() # for local cluster: discard some files if files cannot be divided equally between GPUs - if (context["device"] == "GPU" - ) and os.getenv("PADDLEREC_GPU_NUMS") is not None: + if (context["device"] == "GPU") and "PADDLEREC_GPU_NUMS" in os.environ: selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) discard_file_nums = len(files) % selected_gpu_nums if (discard_file_nums != 0): @@ -176,84 +174,3 @@ def slotdataloader_by_name(readerclass, dataset_name, yaml_file, context): if hasattr(reader, 'generate_batch_from_trainfiles'): return gen_batch_reader() return gen_reader - - -def slotdataloader(readerclass, train, yaml_file, context): - if train == "TRAIN": - reader_name = "SlotReader" - namespace = "train.reader" - data_path = get_global_env("train_data_path", None, namespace) - else: - reader_name = "SlotReader" - namespace = "evaluate.reader" - data_path = get_global_env("test_data_path", None, namespace) - - if data_path.startswith("paddlerec::"): - package_base = get_runtime_environ("PACKAGE_BASE") - assert package_base is not None - data_path = os.path.join(package_base, data_path.split("::")[1]) - - hidden_file_list, files = check_filelist( - hidden_file_list=[], data_file_list=[], train_data_path=data_path) - if (hidden_file_list is not None): - print( - "Warning:please make sure there are no hidden files in the dataset folder and check these hidden files:{}". - format(hidden_file_list)) - - files.sort() - - # for local cluster: discard some files if files cannot be divided equally between GPUs - if (context["device"] == "GPU" - ) and os.getenv("PADDLEREC_GPU_NUMS") is not None: - selected_gpu_nums = int(os.getenv("PADDLEREC_GPU_NUMS")) - discard_file_nums = len(files) % selected_gpu_nums - if (discard_file_nums != 0): - warnings.warn( - "Because files cannot be divided equally between GPUs,discard these files:{}". - format(files[-discard_file_nums:])) - files = files[:len(files) - discard_file_nums] - - need_split_files = False - if context["engine"] == EngineMode.LOCAL_CLUSTER: - # for local cluster: split files for multi process - need_split_files = True - elif context["engine"] == EngineMode.CLUSTER and context[ - "cluster_type"] == "K8S": - # for k8s mount mode, split files for every node - need_split_files = True - - if need_split_files: - files = split_files(files, context["fleet"].worker_index(), - context["fleet"].worker_num()) - - sparse = get_global_env("sparse_slots", "#", namespace) - if sparse == "": - sparse = "#" - dense = get_global_env("dense_slots", "#", namespace) - if dense == "": - dense = "#" - padding = get_global_env("padding", 0, namespace) - reader = SlotReader(yaml_file) - reader.init(sparse, dense, int(padding)) - - def gen_reader(): - for file in files: - with open(file, 'r') as f: - for line in f: - line = line.rstrip('\n') - iter = reader.generate_sample(line) - for parsed_line in iter(): - if parsed_line is None: - continue - else: - values = [] - for pased in parsed_line: - values.append(pased[1]) - yield values - - def gen_batch_reader(): - return reader.generate_batch_from_trainfiles(files) - - if hasattr(reader, 'generate_batch_from_trainfiles'): - return gen_batch_reader() - return gen_reader