Commit 6d189911 authored by F frankwhzhang

modify gru4rec format2

Parent 14139f8f
"""
imikolov's simple dataset.
This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators.
"""
from __future__ import print_function
import paddle.dataset.common
import collections
import tarfile
import six
__all__ = ['train', 'test', 'build_dict']
class DataType(object):
SEQ = 2
def word_count(f, word_freq=None):
if word_freq is None:
word_freq = collections.defaultdict(int)
for l in f:
for w in l.strip().split():
word_freq[w] += 1
return word_freq
def build_dict(min_word_freq=50, train_filename="", test_filename=""):
    """
    Build a word dictionary from the corpus. Keys of the dictionary are words,
    and values are zero-based IDs of these words.
    """
with open(train_filename) as trainf:
with open(test_filename) as testf:
word_freq = word_count(testf, word_count(trainf))
if '<unk>' in word_freq:
# remove <unk> for now, since we will set it as last index
del word_freq['<unk>']
word_freq = [
x for x in six.iteritems(word_freq) if x[1] > min_word_freq
]
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(list(zip(words, six.moves.range(len(words)))))
return word_idx
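A small usage sketch (illustrative; the file names are the sample files used later in this commit):

vocab = build_dict(min_word_freq=0,
                   train_filename="small_train.txt",
                   test_filename="small_test.txt")
# IDs are assigned by descending frequency, ties broken alphabetically.
print(len(vocab))  # vocabulary size, used later as vocab_size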
def reader_creator(filename, word_idx, n, data_type):
def reader():
with open(filename) as f:
for l in f:
if DataType.SEQ == data_type:
l = l.strip().split()
l = [word_idx.get(w) for w in l]
src_seq = l[:len(l)-1]
trg_seq = l[1:]
if n > 0 and len(src_seq) > n: continue
yield src_seq, trg_seq
else:
assert False, 'error data type'
return reader
def train(filename, word_idx, n, data_type=DataType.SEQ):
    return reader_creator(filename, word_idx, n, data_type)
def test(filename, word_idx, n, data_type=DataType.SEQ):
    return reader_creator(filename, word_idx, n, data_type)
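A minimal sketch of how these reader creators are consumed (vocab as built above; n is the maximum source length, with 0 disabling the cutoff):

train_reader = train("small_train.txt", vocab, n=0, data_type=DataType.SEQ)
for src_seq, trg_seq in train_reader():
    # trg_seq is src_seq shifted by one: the model predicts the next item
    # at every position of the session.
    assert len(src_seq) == len(trg_seq)
    break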
import sys
import time
import math
import unittest
import contextlib
import numpy as np
import six
import paddle.fluid as fluid
import paddle
import utils
def infer(test_reader, use_cuda, model_path):
""" inference function """
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
with fluid.scope_guard(fluid.core.Scope()):
infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(model_path, exe)
accum_num_recall = 0.0
accum_num_sum = 0.0
t0 = time.time()
step_id = 0
for data in test_reader():
step_id += 1
src_wordseq = utils.to_lodtensor([dat[0] for dat in data], place)
label_data = [dat[1] for dat in data]
dst_wordseq = utils.to_lodtensor(label_data, place)
para = exe.run(
infer_program,
feed={"src_wordseq": src_wordseq,
"dst_wordseq": dst_wordseq},
fetch_list=fetch_vars,
return_numpy=False)
acc_ = para[1]._get_float_element(0)
data_length = len(np.concatenate(label_data, axis=0).astype("int64"))
accum_num_sum += (data_length)
accum_num_recall += (data_length*acc_)
if step_id % 100 == 0:
print("step:%d " % (step_id), accum_num_recall/accum_num_sum)
t1 = time.time()
print("model:%s recall@20:%.3f time_cost(s):%.2f" %
(model_path, accum_num_recall/accum_num_sum, t1 - t0))
if __name__ == "__main__":
    # Expected invocation: python infer.py <model_dir> <start_epoch> <last_epoch>
    if len(sys.argv) != 4:
        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" % sys.argv[0])
        exit(0)
    model_dir = sys.argv[1]
    try:
        start_index = int(sys.argv[2])
        last_index = int(sys.argv[3])
    except ValueError:
        print("Usage: %s model_dir start_epoch last_epoch(inclusive)" % sys.argv[0])
        exit(-1)
train_file = "small_train.txt"
test_file = "small_test.txt"
vocab, train_reader, test_reader = utils.prepare_data(train_file, test_file,
batch_size=5, buffer_size=1000, word_freq_threshold=0)
    for epoch in six.moves.range(start_index, last_index + 1):
epoch_path = model_dir + "/epoch_" + str(epoch)
infer(test_reader=test_reader, use_cuda=True, model_path=epoch_path)
def batch(reader, batch_size, sort_group_size, drop_last=False):
"""
Create a batched reader.
:param reader: the data reader to read from.
:type reader: callable
:param batch_size: size of each mini-batch
:type batch_size: int
:param sort_group_size: size of partial sorted batch
:type sort_group_size: int
    :param drop_last: whether to drop the last batch if its size is smaller than batch_size.
    :type drop_last: bool
:return: the batched reader.
:rtype: callable
"""
    # Validate batch_size up front, before building the reader.
    batch_size = int(batch_size)
    if batch_size <= 0:
        raise ValueError("batch_size should be a positive integer value, "
                         "but got batch_size={}".format(batch_size))

    def batch_reader():
        r = reader()
        b = []
        for instance in r:
            b.append(instance)
            if len(b) == sort_group_size:
                # Sort the collected group by source-sequence length (longest
                # first) and emit fixed-size mini-batches from it. Callers use
                # sort_group_size = batch_size * 20, i.e. a multiple of
                # batch_size, so no instance is left over here.
                sortl = sorted(b, key=lambda x: len(x[0]), reverse=True)
                b = []
                c = []
                for sort_i in sortl:
                    c.append(sort_i)
                    if len(c) == batch_size:
                        yield c
                        c = []
        if not drop_last and len(b) != 0:
            # Emit whatever is left, including a final partial batch.
            sortl = sorted(b, key=lambda x: len(x[0]), reverse=True)
            c = []
            for sort_i in sortl:
                c.append(sort_i)
                if len(c) == batch_size:
                    yield c
                    c = []
            if len(c) != 0:
                yield c

    return batch_reader
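A toy sketch (made-up sequences) of what this batcher does: instances are collected into groups of sort_group_size, each group is sorted by source length, and fixed-size mini-batches come out longest-first.

def _toy_reader():
    for pair in [([1], [2]), ([3, 4, 5], [4, 5, 6]), ([7, 8], [8, 9]), ([1], [2])]:
        yield pair

for mini_batch in batch(_toy_reader, batch_size=2, sort_group_size=4)():
    print([len(src) for src, _ in mini_batch])  # -> [3, 2] then [1, 1]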
import os
import sys
import time
import six
import numpy as np
import math
import argparse
import paddle.fluid as fluid
import paddle
import utils
SEED = 102
def parse_args():
parser = argparse.ArgumentParser("gru4rec benchmark.")
    parser.add_argument(
        '--enable_ce',
        action='store_true',
        help='If set, run the task with continuous evaluation logs.')
parser.add_argument(
'--num_devices',
type=int,
default=1,
help='Number of GPU devices')
args = parser.parse_args()
return args
def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
""" network definition """
emb_lr_x = 10.0
gru_lr_x = 1.0
fc_lr_x = 1.0
emb = fluid.layers.embedding(
input=src,
size=[vocab_size, hid_size],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=emb_lr_x),
is_sparse=True)
fc0 = fluid.layers.fc(input=emb,
size=hid_size * 3,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=gru_lr_x))
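    # fluid.layers.dynamic_gru expects its input to be 3 * hid_size wide,
    # carrying the pre-computed projections for the update gate, reset gate
    # and candidate state, which is why fc0 above projects to hid_size * 3.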
gru_h0 = fluid.layers.dynamic_gru(
input=fc0,
size=hid_size,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=gru_lr_x))
fc = fluid.layers.fc(input=gru_h0,
size=vocab_size,
act='softmax',
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Uniform(
low=init_low_bound, high=init_high_bound),
learning_rate=fc_lr_x))
cost = fluid.layers.cross_entropy(input=fc, label=dst)
acc = fluid.layers.accuracy(input=fc, label=dst, k=20)
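    # accuracy with k=20 checks whether the true next item is in the top-20
    # predictions; infer.py aggregates this as recall@20.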
return cost, acc
def train(train_reader,
vocab,
network,
hid_size,
base_lr,
batch_size,
pass_num,
use_cuda,
parallel,
model_dir,
init_low_bound=-0.04,
init_high_bound=0.04):
""" train network """
args = parse_args()
if args.enable_ce:
        # The random seed must be set before configuring the network.
fluid.default_startup_program().random_seed = SEED
vocab_size = len(vocab)
# Input data
src_wordseq = fluid.layers.data(
name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
dst_wordseq = fluid.layers.data(
name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
    # Train program
    cost, acc = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
                        init_low_bound, init_high_bound)
    avg_cost = fluid.layers.mean(x=cost)
    # Optimizer to minimize the loss
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=base_lr)
sgd_optimizer.minimize(avg_cost)
# Initialize executor
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
    if parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=avg_cost.name)
    else:
        train_exe = exe
total_time = 0.0
fetch_list = [avg_cost.name]
for pass_idx in six.moves.xrange(pass_num):
epoch_idx = pass_idx + 1
print "epoch_%d start" % epoch_idx
t0 = time.time()
i = 0
newest_ppl = 0
for data in train_reader():
i += 1
lod_src_wordseq = utils.to_lodtensor(
[dat[0] for dat in data], place)
lod_dst_wordseq = utils.to_lodtensor(
[dat[1] for dat in data], place)
ret_avg_cost = train_exe.run(feed={
"src_wordseq": lod_src_wordseq,
"dst_wordseq": lod_dst_wordseq
},
fetch_list=fetch_list)
avg_ppl = np.exp(ret_avg_cost[0])
newest_ppl = np.mean(avg_ppl)
if i % 10 == 0:
print "step:%d ppl:%.3f" % (i, newest_ppl)
t1 = time.time()
total_time += t1 - t0
print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
total_time / epoch_idx)
if pass_idx == pass_num - 1 and args.enable_ce:
#Note: The following logs are special for CE monitoring.
#Other situations do not need to care about these logs.
            gpu_num = get_cards(args)
if gpu_num == 1:
print("kpis rsc15_pass_duration %s" %
(total_time / epoch_idx))
print("kpis rsc15_avg_ppl %s" % newest_ppl)
else:
print("kpis rsc15_pass_duration_card%s %s" % \
(gpu_num, total_time / epoch_idx))
print("kpis rsc15_avg_ppl_card%s %s" %
(gpu_num, newest_ppl))
save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
feed_var_names = ["src_wordseq", "dst_wordseq"]
fetch_vars = [avg_cost, acc]
fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
print("model saved in %s" % save_dir)
print("finish training")
def get_cards(args):
if args.enable_ce:
cards = os.environ.get('CUDA_VISIBLE_DEVICES')
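        # e.g. CUDA_VISIBLE_DEVICES="0,1,2,3" gives num = 4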
num = len(cards.split(","))
return num
else:
return args.num_devices
def train_net():
""" do training """
train_file = "small_train.txt"
test_file = "small_test.txt"
batch_size = 50
args = parse_args()
vocab, train_reader, test_reader = utils.prepare_data(
train_file, test_file,batch_size=batch_size * get_cards(args),\
buffer_size=1000, word_freq_threshold=0)
train(
train_reader=train_reader,
vocab=vocab,
network=network,
hid_size=100,
base_lr=0.01,
batch_size=batch_size,
pass_num=10,
use_cuda=True,
parallel=False,
model_dir="model_recall20",
init_low_bound=-0.1,
init_high_bound=0.1)
if __name__ == "__main__":
train_net()
import sys
import time
import numpy as np
import paddle.fluid as fluid
import paddle
import data_preprocess as dp
import sort_batch as sortb
def to_lodtensor(data, place):
""" convert to LODtensor """
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
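A small illustration of the LoD layout this helper builds (toy data on CPU):

import paddle.fluid as fluid

# Two sequences of lengths 2 and 3 are flattened into one [5, 1] int64 tensor.
t = to_lodtensor([[1, 2], [3, 4, 5]], fluid.CPUPlace())
print(t.lod())  # [[0, 2, 5]] -- cumulative offsets of each sequence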
def prepare_data(train_filename, test_filename, batch_size,
buffer_size=1000, word_freq_threshold=0, enable_ce=False):
""" prepare the English Pann Treebank (PTB) data """
print("start constuct word dict")
vocab = dp.build_dict(word_freq_threshold,train_filename,test_filename)
print("construct word dict done\n")
if enable_ce:
train_reader = paddle.batch(
dp.train(train_filename,
vocab,
buffer_size,
data_type=dp.DataType.SEQ),
batch_size)
else:
train_reader = sortb.batch(
paddle.reader.shuffle(
dp.train(train_filename,
vocab,
buffer_size,
data_type=dp.DataType.SEQ),
buf_size=buffer_size),
batch_size,batch_size*20)
test_reader = sortb.batch(
dp.test(test_filename,
vocab, buffer_size, data_type=dp.DataType.SEQ),
batch_size,batch_size*20)
return vocab, train_reader, test_reader
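A short sketch of consuming the prepared readers (same sample files as train.py and infer.py):

vocab, train_reader, test_reader = prepare_data(
    "small_train.txt", "small_test.txt", batch_size=5,
    buffer_size=1000, word_freq_threshold=0)
for mini_batch in train_reader():
    # mini_batch is a list of (src_seq, trg_seq) word-id lists, sorted by
    # source length (longest first) by sort_batch.batch.
    print(len(vocab), [len(src) for src, _ in mini_batch])
    break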
import collections
import six
class DataType(object):
SEQ = 2
def word_count(input_file, word_freq=None):
"""
compute word count from corpus
"""
if word_freq is None:
word_freq = collections.defaultdict(int)
for l in input_file:
for w in l.strip().split():
word_freq[w] += 1
return word_freq
def build_dict(min_word_freq=50, train_filename="", test_filename=""):
"""
    Build a word dictionary from the corpus. Keys of the dictionary are words,
and values are zero-based IDs of these words.
"""
with open(train_filename) as trainf:
with open(test_filename) as testf:
word_freq = word_count(testf, word_count(trainf))
word_freq = [
x for x in six.iteritems(word_freq) if x[1] > min_word_freq
]
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(list(zip(words, six.moves.range(len(words)))))
return word_idx
def reader_creator(filename, word_idx, n, data_type):
def reader():
with open(filename) as f:
for l in f:
if DataType.SEQ == data_type:
l = l.strip().split()
l = [word_idx.get(w) for w in l]
src_seq = l[:len(l)-1]
trg_seq = l[1:]
if n > 0 and len(src_seq) > n: continue
yield src_seq, trg_seq
else:
assert False, 'error data type'
return reader
def train(filename,word_idx, n, data_type=DataType.SEQ):
return reader_creator(filename, word_idx, n, data_type)
def test(filename,word_idx, n, data_type=DataType.SEQ):
return reader_creator(filename, word_idx, n, data_type)