From ac9d8a57dc344da7f1329fdba499d0055c9150c8 Mon Sep 17 00:00:00 2001
From: JepsonWong <2013000149@qq.com>
Date: Thu, 5 Mar 2020 18:30:28 +0000
Subject: [PATCH] add dataloader for ptb_lm, test=develop

---
 dygraph/ptb_lm/ptb_dy.py | 48 ++++++++++++-------------
 dygraph/ptb_lm/reader.py | 76 ++++++++++++++++++++++++++++++++++------
 2 files changed, 89 insertions(+), 35 deletions(-)

diff --git a/dygraph/ptb_lm/ptb_dy.py b/dygraph/ptb_lm/ptb_dy.py
index 86411a02..0f8a36cf 100644
--- a/dygraph/ptb_lm/ptb_dy.py
+++ b/dygraph/ptb_lm/ptb_dy.py
@@ -310,12 +310,11 @@ def train_ptb_lm():
         last_cell = None
 
         data_path = args.data_path
-        print("begin to load data")
-        ptb_data = reader.get_ptb_data(data_path)
-        print("finished load data")
-        train_data, valid_data, test_data = ptb_data
+        print("begin to load vocab dict")
+        vocab_dict = reader.get_ptb_vocab_dict(data_path)
+        print("finished load vocab dict")
 
-        batch_len = len(train_data) // batch_size
+        batch_len = reader.get_size_of_ptb_train_data(vocab_dict, data_path) // batch_size
         total_batch_size = (batch_len - 1) // num_steps
         log_interval = 200
 
@@ -330,7 +329,7 @@ def train_ptb_lm():
         sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
             boundaries=bd, values=lr_arr),
             parameter_list=ptb_model.parameters())
-        def eval(model, data):
+        def eval(model, data_iter):
             print("begin to eval")
             total_loss = 0.0
             iters = 0.0
@@ -340,13 +339,8 @@ def train_ptb_lm():
                 (num_layers, batch_size, hidden_size), dtype='float32')
 
             model.eval()
-            train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
-            for batch_id, batch in enumerate(train_data_iter):
-                x_data, y_data = batch
-                x_data = x_data.reshape((-1, num_steps, 1))
-                y_data = y_data.reshape((-1, num_steps, 1))
-                x = to_variable(x_data)
-                y = to_variable(y_data)
+            for batch_id, batch in enumerate(data_iter):
+                x, y = batch
                 init_hidden = to_variable(init_hidden_data)
                 init_cell = to_variable(init_cell_data)
                 dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
@@ -367,6 +361,20 @@ def train_ptb_lm():
             print("kpis\ttest_ppl\t%0.3f" % ppl[0])
 
         grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
+
+        train_data_iter = fluid.io.DataLoader.from_generator(capacity=32, use_double_buffer=True, iterable=True, return_list=True, use_multiprocess=True)
+        valid_data_iter = fluid.io.DataLoader.from_generator(capacity=32, use_double_buffer=True, iterable=True, return_list=True, use_multiprocess=True)
+        test_data_iter = fluid.io.DataLoader.from_generator(capacity=32, use_double_buffer=True, iterable=True, return_list=True, use_multiprocess=True)
+
+        train_reader = reader.get_reader("train", batch_size, num_steps, data_path, vocab_dict)
+        train_data_iter.set_batch_generator(train_reader, place)
+
+        valid_reader = reader.get_reader("valid", batch_size, num_steps, data_path, vocab_dict)
+        valid_data_iter.set_batch_generator(valid_reader, place)
+
+        test_reader = reader.get_reader("test", batch_size, num_steps, data_path, vocab_dict)
+        test_data_iter.set_batch_generator(test_reader, place)
+
         for epoch_id in range(max_epoch):
             ptb_model.train()
             total_loss = 0.0
@@ -376,19 +384,11 @@ def train_ptb_lm():
             init_cell_data = np.zeros(
                 (num_layers, batch_size, hidden_size), dtype='float32')
 
-            train_data_iter = reader.get_data_iter(train_data, batch_size,
-                                                   num_steps)
             init_hidden = to_variable(init_hidden_data)
             init_cell = to_variable(init_cell_data)
             start_time = time.time()
             for batch_id, batch in enumerate(train_data_iter):
-                x_data, y_data = batch
-
-                x_data = x_data.reshape((-1, num_steps, 1))
-                y_data = y_data.reshape((-1, num_steps, 1))
-
-                x = to_variable(x_data)
-                y = to_variable(y_data)
+                x, y = batch
 
                 dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                             init_cell)
@@ -428,8 +428,8 @@ def train_ptb_lm():
                 fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
                 print("Saved model to: %s.\n" % save_model_dir)
 
-            eval(ptb_model, valid_data)
+            eval(ptb_model, valid_data_iter)
 
-        eval(ptb_model, test_data)
+        eval(ptb_model, test_data_iter)
 
 train_ptb_lm()
diff --git a/dygraph/ptb_lm/reader.py b/dygraph/ptb_lm/reader.py
index 6673fec6..0d6feeca 100644
--- a/dygraph/ptb_lm/reader.py
+++ b/dygraph/ptb_lm/reader.py
@@ -19,6 +19,7 @@ import collections
 import os
 import sys
 import numpy as np
+import paddle.fluid as fluid
 
 EOS = "</eos>"
 
@@ -67,19 +68,72 @@ def get_ptb_data(data_path=None):
     return train_ids, valid_ids, test_ids
 
+def get_ptb_vocab_dict(data_path=None):
+    train_file = os.path.join(data_path, "ptb.train.txt")
+    vocab_dict = build_vocab(train_file)
+    return vocab_dict
+
+def get_size_of_ptb_train_data(vocab_dict, data_path=None):
+    train_file = os.path.join(data_path, "ptb.train.txt")
+    train_ids = file_to_ids(train_file, vocab_dict)
+    return len(train_ids)
+
+def get_ptb_train_data(vocab_dict, data_path=None):
+    train_file = os.path.join(data_path, "ptb.train.txt")
+    train_ids = file_to_ids(train_file, vocab_dict)
+    return train_ids
+
+def get_ptb_valid_data(vocab_dict, data_path=None):
+    valid_file = os.path.join(data_path, "ptb.valid.txt")
+    valid_ids = file_to_ids(valid_file, vocab_dict)
+    return valid_ids
+
+def get_ptb_test_data(vocab_dict, data_path=None):
+    test_file = os.path.join(data_path, "ptb.test.txt")
+    test_ids = file_to_ids(test_file, vocab_dict)
+    return test_ids
+
+def mapper(sample):
+    return sample
+
+def get_reader(data_type, batch_size, num_steps, data_path, vocab_dict):
+    def get_data_reader():
+        def get_data_iter():
+            if data_type == "train":
+                raw_data = get_ptb_train_data(vocab_dict, data_path)
+            elif data_type == "valid":
+                raw_data = get_ptb_valid_data(vocab_dict, data_path)
+            else:
+                raw_data = get_ptb_test_data(vocab_dict, data_path)
+
+            data_len = len(raw_data)
+            raw_data = np.asarray(raw_data, dtype="int64")
+
+            batch_len = data_len // batch_size
+
+            data = raw_data[0:batch_size * batch_len].reshape((batch_size, batch_len))
+
+            epoch_size = (batch_len - 1) // num_steps
+            for i in range(epoch_size):
+                start = i * num_steps
+                x = np.copy(data[:, i * num_steps:(i + 1) * num_steps])
+                y = np.copy(data[:, i * num_steps + 1:(i + 1) * num_steps + 1])
+
+                x = x.reshape((-1, num_steps, 1))
+                y = y.reshape((-1, num_steps, 1))
 
-def get_data_iter(raw_data, batch_size, num_steps):
-    data_len = len(raw_data)
-    raw_data = np.asarray(raw_data, dtype="int64")
+                yield (x, y)
 
-    batch_len = data_len // batch_size
+        return get_data_iter
 
-    data = raw_data[0:batch_size * batch_len].reshape((batch_size, batch_len))
+    data_reader = get_data_reader()
+
+    ret = fluid.io.xmap_readers(
+        mapper,
+        data_reader,
+        8,
+        64,
+        order=False)
+    return ret
 
-    epoch_size = (batch_len - 1) // num_steps
-    for i in range(epoch_size):
-        start = i * num_steps
-        x = np.copy(data[:, i * num_steps:(i + 1) * num_steps])
-        y = np.copy(data[:, i * num_steps + 1:(i + 1) * num_steps + 1])
-        yield (x, y)
--
GitLab
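
Note (reviewer sketch, not part of the patch): the change moves batching into
reader.get_reader, which already reshapes each sample to [batch_size, num_steps, 1],
and feeds it through fluid.io.DataLoader, so eval() and the training loop shrink to
"x, y = batch" with no per-batch reshape or to_variable calls. Below is a minimal,
self-contained sketch of the same pattern against the Paddle 1.x fluid API used
above; toy_batch_generator and its sizes are made-up stand-ins for
reader.get_reader(...), and use_multiprocess is left at its default here rather
than True as in the patch:

import numpy as np
import paddle.fluid as fluid

batch_size, num_steps = 4, 20

def toy_batch_generator():
    # Hypothetical stand-in for reader.get_reader(...): yields (x, y) numpy
    # batches already shaped [batch_size, num_steps, 1] with dtype int64,
    # matching what the patched reader.py produces.
    for _ in range(3):
        x = np.random.randint(0, 100, size=(batch_size, num_steps, 1)).astype("int64")
        y = np.random.randint(0, 100, size=(batch_size, num_steps, 1)).astype("int64")
        yield x, y

place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
    # return_list=True is required when iterating a DataLoader in dygraph mode.
    loader = fluid.io.DataLoader.from_generator(
        capacity=32, use_double_buffer=True, iterable=True, return_list=True)
    loader.set_batch_generator(toy_batch_generator, places=place)
    for batch_id, (x, y) in enumerate(loader):
        # Batches arrive as dygraph variables on the given place; no manual
        # to_variable is needed, mirroring the patched training loop.
        print(batch_id, x.shape, y.shape)

One design note: the fluid.io.xmap_readers(mapper, data_reader, 8, 64, order=False)
wrapper in reader.py uses an identity mapper, so its 8 worker threads and 64-slot
buffer effectively act as an extra prefetch stage in front of the DataLoader's own
capacity=32 queue.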