提交 ac9d8a57 编写于 作者: J JepsonWong

add dataloader for ptb_lm, test=develop

上级 52ca7b75
...@@ -310,12 +310,11 @@ def train_ptb_lm(): ...@@ -310,12 +310,11 @@ def train_ptb_lm():
last_cell = None last_cell = None
data_path = args.data_path data_path = args.data_path
print("begin to load data") print("begin to load vocab dict")
ptb_data = reader.get_ptb_data(data_path) vocab_dict = reader.get_ptb_vocab_dict(data_path)
print("finished load data") print("finished load vocab dict")
train_data, valid_data, test_data = ptb_data
batch_len = len(train_data) // batch_size batch_len = reader.get_size_of_ptb_train_data(vocab_dict, data_path) // batch_size
total_batch_size = (batch_len - 1) // num_steps total_batch_size = (batch_len - 1) // num_steps
log_interval = 200 log_interval = 200
...@@ -330,7 +329,7 @@ def train_ptb_lm(): ...@@ -330,7 +329,7 @@ def train_ptb_lm():
sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr_arr), parameter_list=ptb_model.parameters()) boundaries=bd, values=lr_arr), parameter_list=ptb_model.parameters())
def eval(model, data): def eval(model, data_iter):
print("begin to eval") print("begin to eval")
total_loss = 0.0 total_loss = 0.0
iters = 0.0 iters = 0.0
...@@ -340,13 +339,8 @@ def train_ptb_lm(): ...@@ -340,13 +339,8 @@ def train_ptb_lm():
(num_layers, batch_size, hidden_size), dtype='float32') (num_layers, batch_size, hidden_size), dtype='float32')
model.eval() model.eval()
train_data_iter = reader.get_data_iter(data, batch_size, num_steps) for batch_id, batch in enumerate(data_iter):
for batch_id, batch in enumerate(train_data_iter): x, y = batch
x_data, y_data = batch
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, num_steps, 1))
x = to_variable(x_data)
y = to_variable(y_data)
init_hidden = to_variable(init_hidden_data) init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data) init_cell = to_variable(init_cell_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
...@@ -367,6 +361,20 @@ def train_ptb_lm(): ...@@ -367,6 +361,20 @@ def train_ptb_lm():
print("kpis\ttest_ppl\t%0.3f" % ppl[0]) print("kpis\ttest_ppl\t%0.3f" % ppl[0])
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm) grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
train_data_iter = fluid.io.DataLoader.from_generator(capacity=32, use_double_buffer=True, iterable=True, return_list=True, use_multiprocess=True)
valid_data_iter = fluid.io.DataLoader.from_generator(capacity=32, use_double_buffer=True, iterable=True, return_list=True, use_multiprocess=True)
test_data_iter = fluid.io.DataLoader.from_generator(capacity=32, use_double_buffer=True, iterable=True, return_list=True, use_multiprocess=True)
train_reader = reader.get_reader("train", batch_size, num_steps, data_path, vocab_dict)
train_data_iter.set_batch_generator(train_reader, place)
valid_reader = reader.get_reader("valid", batch_size, num_steps, data_path, vocab_dict)
valid_data_iter.set_batch_generator(valid_reader, place)
test_reader = reader.get_reader("test", batch_size, num_steps, data_path, vocab_dict)
test_data_iter.set_batch_generator(test_reader, place)
for epoch_id in range(max_epoch): for epoch_id in range(max_epoch):
ptb_model.train() ptb_model.train()
total_loss = 0.0 total_loss = 0.0
...@@ -376,19 +384,11 @@ def train_ptb_lm(): ...@@ -376,19 +384,11 @@ def train_ptb_lm():
init_cell_data = np.zeros( init_cell_data = np.zeros(
(num_layers, batch_size, hidden_size), dtype='float32') (num_layers, batch_size, hidden_size), dtype='float32')
train_data_iter = reader.get_data_iter(train_data, batch_size,
num_steps)
init_hidden = to_variable(init_hidden_data) init_hidden = to_variable(init_hidden_data)
init_cell = to_variable(init_cell_data) init_cell = to_variable(init_cell_data)
start_time = time.time() start_time = time.time()
for batch_id, batch in enumerate(train_data_iter): for batch_id, batch in enumerate(train_data_iter):
x_data, y_data = batch x, y = batch
x_data = x_data.reshape((-1, num_steps, 1))
y_data = y_data.reshape((-1, num_steps, 1))
x = to_variable(x_data)
y = to_variable(y_data)
dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
init_cell) init_cell)
...@@ -428,8 +428,8 @@ def train_ptb_lm(): ...@@ -428,8 +428,8 @@ def train_ptb_lm():
fluid.save_dygraph(ptb_model.state_dict(), save_model_dir) fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
print("Saved model to: %s.\n" % save_model_dir) print("Saved model to: %s.\n" % save_model_dir)
eval(ptb_model, valid_data) eval(ptb_model, valid_data_iter)
eval(ptb_model, test_data) eval(ptb_model, test_data_iter)
train_ptb_lm() train_ptb_lm()
...@@ -19,6 +19,7 @@ import collections ...@@ -19,6 +19,7 @@ import collections
import os import os
import sys import sys
import numpy as np import numpy as np
import paddle.fluid as fluid
EOS = "</eos>" EOS = "</eos>"
...@@ -67,19 +68,72 @@ def get_ptb_data(data_path=None): ...@@ -67,19 +68,72 @@ def get_ptb_data(data_path=None):
return train_ids, valid_ids, test_ids return train_ids, valid_ids, test_ids
def get_ptb_vocab_dict(data_path=None):
    """Build the word->id vocabulary dict from the PTB training file.

    Args:
        data_path: directory containing ``ptb.train.txt``.

    Returns:
        The vocabulary mapping produced by ``build_vocab``.
    """
    train_path = os.path.join(data_path, "ptb.train.txt")
    return build_vocab(train_path)
def get_size_of_ptb_train_data(vocab_dict, data_path=None):
    """Return the number of token ids in the PTB training set.

    Args:
        vocab_dict: word->id mapping from ``get_ptb_vocab_dict``.
        data_path: directory containing ``ptb.train.txt``.
    """
    train_path = os.path.join(data_path, "ptb.train.txt")
    return len(file_to_ids(train_path, vocab_dict))
def get_ptb_train_data(vocab_dict, data_path=None):
    """Load the PTB training split as a flat list of token ids.

    Args:
        vocab_dict: word->id mapping from ``get_ptb_vocab_dict``.
        data_path: directory containing ``ptb.train.txt``.
    """
    return file_to_ids(os.path.join(data_path, "ptb.train.txt"), vocab_dict)
def get_ptb_valid_data(vocab_dict, data_path=None):
    """Load the PTB validation split as a flat list of token ids.

    Args:
        vocab_dict: word->id mapping from ``get_ptb_vocab_dict``.
        data_path: directory containing ``ptb.valid.txt``.
    """
    return file_to_ids(os.path.join(data_path, "ptb.valid.txt"), vocab_dict)
def get_ptb_test_data(vocab_dict, data_path=None):
    """Load the PTB test split as a flat list of token ids.

    Args:
        vocab_dict: word->id mapping from ``get_ptb_vocab_dict``.
        data_path: directory containing ``ptb.test.txt``.
    """
    return file_to_ids(os.path.join(data_path, "ptb.test.txt"), vocab_dict)
def mapper(sample):
    """Identity pass-through handed to ``fluid.io.xmap_readers``.

    No per-sample transformation is needed; the reader already yields
    fully-shaped (x, y) batches, so this simply returns its argument.
    """
    return sample
def get_reader(data_type, batch_size, num_steps, data_path, vocab_dict):
    """Build a multi-threaded batch reader over one PTB split.

    Args:
        data_type: one of "train" / "valid"; anything else selects test.
        batch_size: number of sequences per batch.
        num_steps: unrolled time steps per batch.
        data_path: directory holding the ptb.*.txt files.
        vocab_dict: word->id mapping from ``get_ptb_vocab_dict``.

    Returns:
        A reader (wrapped with ``fluid.io.xmap_readers``) that yields
        ``(x, y)`` pairs of int64 arrays shaped (batch_size, num_steps, 1),
        where y is x shifted one token ahead.
    """

    def get_data_reader():
        def get_data_iter():
            # Select the split lazily so file I/O happens per-pass,
            # inside the reader thread, not at construction time.
            if data_type == "train":
                raw_data = get_ptb_train_data(vocab_dict, data_path)
            elif data_type == "valid":
                raw_data = get_ptb_valid_data(vocab_dict, data_path)
            else:
                raw_data = get_ptb_test_data(vocab_dict, data_path)
            data_len = len(raw_data)
            raw_data = np.asarray(raw_data, dtype="int64")
            # Reshape the token stream into batch_size parallel streams,
            # truncating any tail that doesn't fill a full column.
            batch_len = data_len // batch_size
            data = raw_data[0:batch_size * batch_len].reshape(
                (batch_size, batch_len))
            # -1 leaves room for the one-step-ahead target slice.
            epoch_size = (batch_len - 1) // num_steps
            for i in range(epoch_size):
                x = np.copy(data[:, i * num_steps:(i + 1) * num_steps])
                y = np.copy(data[:, i * num_steps + 1:(i + 1) * num_steps + 1])
                x = x.reshape((-1, num_steps, 1))
                y = y.reshape((-1, num_steps, 1))
                yield (x, y)

        return get_data_iter

    data_reader = get_data_reader()
    # mapper is an identity; xmap_readers is used for its 8-thread,
    # 64-slot buffered prefetching. order=False allows out-of-order
    # delivery between worker threads.
    ret = fluid.io.xmap_readers(
        mapper,
        data_reader,
        8,
        64,
        order=False)
    return ret
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册