From 8a521c0b4da5118098d57e34c1c4150e276f140a Mon Sep 17 00:00:00 2001
From: fengjiayi
Date: Mon, 16 Jul 2018 17:44:14 +0800
Subject: [PATCH] Remove buggy get_test_program and refine c++ reader demo

---
Notes (review commentary placed after the "---" separator, i.e. outside the
commit message):
  * This copy of the patch had its line structure collapsed. Line breaks,
    indentation and blank context lines below were re-derived from the hunk
    line counts and the diffstat totals, which they match; the exact
    indentation is a best guess -- TODO confirm against commit 8a521c0b
    before applying.
  * The author address on "From:" and the key in word_dict[""] look like
    they lost angle-bracketed text during extraction -- left as found,
    TODO confirm.
  * train.py: the new train loop catches fluid.core.EOFException, while the
    new test loop still ends on a bare "except:".

 python/paddle/fluid/io.py                     |  98 ---------------
 .../convert_data_to_recordio.py               |   8 +-
 .../tests/demo/text_classification/train.py   | 115 +++++++++---------
 3 files changed, 62 insertions(+), 159 deletions(-)

diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py
index 0eb1194e2..32368d3c0 100644
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -789,101 +789,3 @@ def get_parameter_value_by_name(name, executor, program=None):
         program = default_main_program()
     var = program.global_block().var(name)
     return get_parameter_value(var, executor)
-
-
-def get_test_program(filelist, program=None, startup_program=None):
-    """
-    Transpile current train program to a program to read test dataset
-    if the program is using reader ops like "open_files_op".
-    """
-
-    def _copy_reader_var_(block, var, new_name=None):
-        if new_name == None:
-            new_name = var.name
-        new_var = block.create_var(
-            name=str(new_name), type=core.VarDesc.VarType.READER)
-        new_var.desc.set_shapes(var.desc.shapes())
-        new_var.desc.set_dtypes(var.desc.dtypes())
-        new_var.persistable = True
-        return new_var
-
-    def _get_test_reader_name(train_reader_name):
-        return train_reader_name + "_test"
-
-    def _is_reader_op(op):
-        block = op.block
-        if "Out" in op.output_names:
-            reader_out = block.vars[op.output("Out")[0]]
-            if reader_out.type == core.VarDesc.VarType.READER:
-                return True
-        return False
-
-    if program == None:
-        program = default_main_program()
-    if startup_program == None:
-        startup_program = default_startup_program()
-    startup_block = startup_program.global_block()
-
-    # 1. find out the orignal reader var name
-    startup_reader_op_list = []
-
-    for op in startup_block.ops:
-        if _is_reader_op(op):
-            startup_reader_op_list.append(op)
-
-    if len(startup_reader_op_list) == 0:
-        return program
-
-    root_reader_op = startup_reader_op_list[0]
-    train_test_reader_map = {}
-    # 2. add operators to startup to read open and read test data files
-    for op in startup_reader_op_list:
-        assert (len(op.output("Out")) == 1)
-        train_reader_name = op.output("Out")[0]
-        train_reader = startup_block.vars[train_reader_name]
-        test_reader = _copy_reader_var_(
-            startup_block,
-            train_reader,
-            new_name=_get_test_reader_name(train_reader_name))
-        train_test_reader_map[train_reader.name] = test_reader
-
-        test_op_inputs = {}
-        for name in op.input_names:
-            train_arg_names = op.input(name)
-            test_arg_vars = []
-            for arg_name in train_arg_names:
-                arg_var = train_test_reader_map[
-                    arg_name] if name == "UnderlyingReader" else startup_block.vars[
-                        arg_name]
-                test_arg_vars.append(arg_var)
-            test_op_inputs[name] = test_arg_vars
-
-        test_op = startup_block.append_op(
-            type=op.type,
-            inputs=test_op_inputs,
-            outputs={'Out': [test_reader]},
-            attrs=op.attrs)
-        # root reader op's filelist attr for read test files
-        if op.type == root_reader_op.type:
-            test_op.set_attr("file_names", filelist)
-        if op.type == "create_multi_pass_reader":
-            test_op.set_attr("pass_num", 1)
-
-    # 3. rename reader vars in inference program to different name
-    # to avoid read from train data.
-    main_block = program.global_block()
-    for var in main_block.vars.values():
-        if var.type == core.VarDesc.VarType.READER:
-            main_block.rename_var(
-                str(var.name), str(_get_test_reader_name(var.name)))
-
-    for op in main_block.ops:
-        if op.type == root_reader_op.type:
-            test_op.set_attr("file_names", filelist)
-        if op.type == "create_multi_pass_reader":
-            test_op.set_attr("pass_num", 1)
-
-    startup_program.sync_with_cpp()
-    program.sync_with_cpp()
-
-    return program
diff --git a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
index 9425d472a..2dd8f352f 100644
--- a/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/text_classification/convert_data_to_recordio.py
@@ -31,8 +31,12 @@ def load_vocab(filename):
 
 
 # load word dict with paddle inner function
-word_dict = load_vocab(sys.argv[1])
-word_dict[""] = len(word_dict)
+if len(sys.argv) > 1:
+    word_dict = load_vocab(sys.argv[1])
+    word_dict[""] = len(word_dict)
+else:
+    word_dict = paddle.dataset.imdb.word_dict()
+
 print "Dict dim = ", len(word_dict)
 
 # input text data
diff --git a/python/paddle/fluid/tests/demo/text_classification/train.py b/python/paddle/fluid/tests/demo/text_classification/train.py
index e408684c6..9e930b67a 100644
--- a/python/paddle/fluid/tests/demo/text_classification/train.py
+++ b/python/paddle/fluid/tests/demo/text_classification/train.py
@@ -19,7 +19,7 @@ import sys
 
 TRAIN_FILES = ['train.recordio']
 TEST_FILES = ['test.recordio']
-DICT_DIM = 89528
+DICT_DIM = 5147
 
 # embedding dim
 emb_dim = 128
@@ -33,33 +33,24 @@ hid_dim2 = 96
 
 # class num
 class_dim = 2
+# epoch num
+epoch_num = 10
 
 
-def network_cfg(is_train, pass_num=100):
-    with fluid.unique_name.guard():
-        train_file_obj = fluid.layers.open_files(
-            filenames=TRAIN_FILES,
-            pass_num=pass_num,
-            shapes=[[-1, 1], [-1, 1]],
-            lod_levels=[1, 0],
-            dtypes=['int64', 'int64'],
-            thread_num=1)
-
-        test_file_obj = fluid.layers.open_files(
-            filenames=TEST_FILES,
-            pass_num=1,
-            shapes=[[-1, 1], [-1, 1]],
-            lod_levels=[1, 0],
-            dtypes=['int64', 'int64'],
-            thread_num=1)
-        if is_train:
-            file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000)
-        else:
-            file_obj = test_file_obj
+def build_program(is_train):
+    file_obj_handle = fluid.layers.io.open_files(
+        filenames=TRAIN_FILES if is_train else TEST_FILES,
+        shapes=[[-1, 1], [-1, 1]],
+        lod_levels=[1, 0],
+        dtypes=['int64', 'int64'],
+        thread_num=1)
+    if is_train:
+        file_obj = fluid.layers.io.shuffle(file_obj_handle, buffer_size=1000)
+    else:
+        file_obj = file_obj_handle
+    file_obj = fluid.layers.io.double_buffer(file_obj)
 
-        file_obj = fluid.layers.double_buffer(
-            file_obj,
-            name="train_double_buffer" if is_train else 'test_double_buffer')
+    with fluid.unique_name.guard():
 
         data, label = fluid.layers.read_file(file_obj)
 
@@ -90,58 +81,64 @@ def network_cfg(is_train, pass_num=100):
 
         if is_train:
             # SGD optimizer
-            sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.01)
+            sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.001)
             sgd_optimizer.minimize(avg_cost)
 
-        return {
-            'loss': avg_cost,
-            'log': [avg_cost, acc],
-            'file': train_file_obj if is_train else test_file_obj
-        }
+    return {'loss': avg_cost, 'log': [avg_cost, acc], 'file': file_obj_handle}
 
 
 def main():
     train = fluid.Program()
     startup = fluid.Program()
+    test = fluid.Program()
 
     with fluid.program_guard(train, startup):
-        train_args = network_cfg(is_train=True)
-
-    test = fluid.Program()
+        train_args = build_program(is_train=True)
 
-    with fluid.program_guard(test, fluid.Program()):
-        test_args = network_cfg(is_train=False)
+    with fluid.program_guard(test, startup):
+        test_args = build_program(is_train=False)
 
+    use_cuda = fluid.core.is_compiled_with_cuda()
     # startup
-    place = fluid.CUDAPlace(0)
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place=place)
     exe.run(startup)
 
     train_exe = fluid.ParallelExecutor(
-        use_cuda=True, loss_name=train_args['loss'].name, main_program=train)
+        use_cuda=use_cuda,
+        loss_name=train_args['loss'].name,
+        main_program=train)
+    test_exe = fluid.ParallelExecutor(
+        use_cuda=use_cuda, main_program=test, share_vars_from=train_exe)
 
     fetch_var_list = [var.name for var in train_args['log']]
-    for i in xrange(sys.maxint):
-        result = map(numpy.array,
-                     train_exe.run(fetch_list=fetch_var_list
-                                   if i % 1000 == 0 else []))
-        if len(result) != 0:
-            print 'Train: ', result
-
-        if i % 1000 == 0:
-            test_exe = fluid.ParallelExecutor(
-                use_cuda=True, main_program=test, share_vars_from=train_exe)
-            loss = []
-            acc = []
-            try:
-                while True:
-                    loss_np, acc_np = map(
-                        numpy.array, test_exe.run(fetch_list=fetch_var_list))
-                    loss.append(loss_np[0])
-                    acc.append(acc_np[0])
-            except:
-                test_args['file'].reset()
-                print 'TEST: ', numpy.mean(loss), numpy.mean(acc)
+    for epoch_id in range(epoch_num):
+        # train
+        try:
+            batch_id = 0
+            while True:
+                result = map(numpy.array,
+                             train_exe.run(fetch_list=fetch_var_list
+                                           if batch_id % 10 == 0 else []))
+                if len(result) != 0:
+                    print 'Train loss: ', result
+                batch_id += 1
+        except fluid.core.EOFException:
+            print 'End of epoch', epoch_id
+            train_args['file'].reset()
+
+        # test
+        loss = []
+        acc = []
+        try:
+            while True:
+                loss_np, acc_np = map(numpy.array,
+                                      test_exe.run(fetch_list=fetch_var_list))
+                loss.append(loss_np[0])
+                acc.append(acc_np[0])
+        except:
+            test_args['file'].reset()
+            print 'TEST: ', numpy.mean(loss), numpy.mean(acc)
 
 
 if __name__ == '__main__':
-- 
GitLab