未验证 提交 1cb6a643 编写于 作者: L liu zhengxi 提交者: GitHub

update api to 1.8 and update readme (#4609)

* update api to 1.8 for transformer and similarity_net, test=develop

* update readme, test=develop
上级 35db3d17
......@@ -32,7 +32,7 @@
1. paddle安装
本项目依赖于 PaddlePaddle 1.6及以上版本或适当的develop版本,请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装
本项目依赖于 PaddlePaddle 1.8及以上版本或适当的develop版本,请参考 [安装指南](https://www.paddlepaddle.org.cn/install/quick) 进行安装
2. 下载代码
......@@ -44,7 +44,7 @@
3. 环境依赖
请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)部分的内容
请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html)部分的内容
### 数据准备
......
......@@ -752,18 +752,17 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
# caches contains states of history steps in decoder self-attention
# and static encoder output projections in encoder-decoder attention
# to reduce redundant computation.
batch_size = layers.shape(start_tokens)[0]
caches = [
{
"k": # for self attention
layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, n_head, 0, d_key],
layers.fill_constant(
shape=[batch_size, n_head, 0, d_key],
dtype=enc_output.dtype,
value=0),
"v": # for self attention
layers.fill_constant_batch_size_like(
input=start_tokens,
shape=[-1, n_head, 0, d_value],
layers.fill_constant(
shape=[batch_size, n_head, 0, d_value],
dtype=enc_output.dtype,
value=0),
"static_k": # for encoder-decoder attention
......@@ -792,12 +791,10 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
lambda x: layers.gather(x, index=gather_idx), caches)
pre_src_attn_bias = layers.gather(
trg_src_attn_bias, index=gather_idx)
bias_batch_size = layers.shape(pre_src_attn_bias)[0]
pre_pos = layers.elementwise_mul(
x=layers.fill_constant_batch_size_like(
input=pre_src_attn_bias, # cann't use lod tensor here
value=1,
shape=[-1, 1],
dtype=pre_ids.dtype),
x=layers.fill_constant(
value=1, shape=[bias_batch_size, 1], dtype=pre_ids.dtype),
y=step_idx,
axis=0)
logits = wrap_decoder(
......
......@@ -210,7 +210,7 @@ class DataLayer(object):
"""
operation
"""
data = fluid.layers.data(
data = fluid.data(
name=name, shape=shape, dtype=dtype, lod_level=lod_level)
return data
......@@ -383,8 +383,10 @@ class ConstantLayer(object):
"""
operation
"""
constant = fluid.layers.fill_constant_batch_size_like(input, shape,
dtype, value)
shape = list(shape)
input_shape = fluid.layers.shape(input)
shape[0] = input_shape[0]
constant = fluid.layers.fill_constant(shape, dtype, value)
return constant
......
......@@ -22,7 +22,7 @@
|UNICOM|联通客服|客服|
## 快速开始
#### 版本依赖
本项目依赖于 Paddlepaddle Fluid 1.6,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。
本项目依赖于 PaddlePaddle Fluid 1.8,请参考[安装指南](https://www.paddlepaddle.org.cn/install/quick)进行安装。
python版本依赖python 2.7
#### 安装代码
......
......@@ -47,46 +47,51 @@ from models.model_check import check_version
from models.model_check import check_cuda
def create_model(args, is_inference=False, is_pointwise=False):
    """
    Build the SimNet input layer and data loader.

    Creates `fluid.data` input variables and a matching (non-iterable)
    `fluid.io.DataLoader` for one of three modes:

    Args:
        args: parsed command-line arguments (unused here beyond signature
            compatibility with callers).
        is_inference (bool): if True, build the two-input inference graph
            (left, pos_right).
        is_pointwise (bool): if True (and not inference), build the
            pointwise training graph (left, right, label); otherwise the
            pairwise graph (left, pos_right, neg_right).

    Returns:
        tuple: (loader, *input_vars) — the DataLoader followed by the
        input variables, in feed order.
    """
    if is_inference:
        # Inference consumes (query, candidate) pairs only — no label.
        left = fluid.data(name='left', shape=[None], dtype='int64', lod_level=1)
        pos_right = fluid.data(
            name='pos_right', shape=[None], dtype='int64', lod_level=1)
        inf_loader = fluid.io.DataLoader.from_generator(
            capacity=16,
            feed_list=[left, pos_right],
            iterable=False,  # started/reset explicitly by the caller
            use_double_buffer=False)

        return inf_loader, left, pos_right
    else:
        if is_pointwise:
            # Pointwise training: (left, right) plus a dense int64 label.
            left = fluid.data(
                name='left', shape=[None], dtype='int64', lod_level=1)
            right = fluid.data(
                name='right', shape=[None], dtype='int64', lod_level=1)
            label = fluid.data(name='label', shape=[None], dtype='int64')
            pointwise_loader = fluid.io.DataLoader.from_generator(
                capacity=16,
                feed_list=[left, right, label],
                iterable=False,
                use_double_buffer=False)

            return pointwise_loader, left, right, label
        else:
            # Pairwise training: one query with a positive and a negative
            # candidate; the loss compares their scores.
            left = fluid.data(
                name='left', shape=[None], dtype='int64', lod_level=1)
            pos_right = fluid.data(
                name='pos_right', shape=[None], dtype='int64', lod_level=1)
            neg_right = fluid.data(
                name='neg_right', shape=[None], dtype='int64', lod_level=1)
            pairwise_loader = fluid.io.DataLoader.from_generator(
                capacity=16,
                feed_list=[left, pos_right, neg_right],
                iterable=False,
                use_double_buffer=False)

            return pairwise_loader, left, pos_right, neg_right
def train(conf_dict, args):
......@@ -131,8 +136,7 @@ def train(conf_dict, args):
# Build network
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, left, pos_right, neg_right = create_model(
args, pyreader_name='train_reader')
train_loader, left, pos_right, neg_right = create_model(args)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
_, neg_score = net.predict(left, neg_right)
......@@ -147,8 +151,8 @@ def train(conf_dict, args):
test_prog = fluid.Program()
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, pos_right = create_model(
args, pyreader_name='test_reader', is_inference=True)
test_loader, left, pos_right = create_model(
args, is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
......@@ -157,8 +161,8 @@ def train(conf_dict, args):
# Build network
with fluid.program_guard(train_program, startup_prog):
with fluid.unique_name.guard():
train_pyreader, left, right, label = create_model(
args, pyreader_name='train_reader', is_pointwise=True)
train_loader, left, right, label = create_model(
args, is_pointwise=True)
left_feat, pred = net.predict(left, right)
avg_cost = loss.compute(pred, label)
avg_cost.persistable = True
......@@ -171,15 +175,15 @@ def train(conf_dict, args):
test_prog = fluid.Program()
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, right = create_model(
args, pyreader_name='test_reader', is_inference=True)
test_loader, left, right = create_model(
args, is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
if args.init_checkpoint is not "":
utils.init_checkpoint(exe, args.init_checkpoint, startup_prog)
def valid_and_test(test_program, test_pyreader, get_valid_examples, process,
def valid_and_test(test_program, test_loader, get_valid_examples, process,
mode, exe, fetch_list):
"""
return auc and acc
......@@ -187,15 +191,15 @@ def train(conf_dict, args):
# Get Batch Data
batch_data = fluid.io.batch(
get_valid_examples, args.batch_size, drop_last=False)
test_pyreader.decorate_paddle_reader(batch_data)
test_pyreader.start()
test_loader.set_sample_list_generator(batch_data)
test_loader.start()
pred_list = []
while True:
try:
_pred = exe.run(program=test_program, fetch_list=[pred.name])
pred_list += list(_pred)
except fluid.core.EOFException:
test_pyreader.reset()
test_loader.reset()
break
pred_list = np.vstack(pred_list)
if mode == "test":
......@@ -233,8 +237,8 @@ def train(conf_dict, args):
get_train_examples, buf_size=10000),
args.batch_size,
drop_last=False)
train_pyreader.decorate_paddle_reader(train_batch_data)
train_pyreader.start()
train_loader.set_sample_list_generator(train_batch_data)
train_loader.start()
exe.run(startup_prog)
losses = []
start_time = time.time()
......@@ -248,8 +252,8 @@ def train(conf_dict, args):
if args.do_valid and global_step % args.validation_steps == 0:
get_valid_examples = simnet_process.get_reader("valid")
valid_result = valid_and_test(
test_prog, test_pyreader, get_valid_examples,
simnet_process, "valid", exe, [pred.name])
test_prog, test_loader, get_valid_examples, simnet_process,
"valid", exe, [pred.name])
if args.compute_accuracy:
valid_auc, valid_acc = valid_result
logging.info(
......@@ -281,7 +285,7 @@ def train(conf_dict, args):
logging.info("saving infer model in %s" % model_path)
except fluid.core.EOFException:
train_pyreader.reset()
train_loader.reset()
break
end_time = time.time()
#logging.info("epoch: %d, loss: %f, used time: %d sec" %
......@@ -327,9 +331,8 @@ def train(conf_dict, args):
else:
# Get Feeder and Reader
get_test_examples = simnet_process.get_reader("test")
test_result = valid_and_test(test_prog, test_pyreader,
get_test_examples, simnet_process, "test",
exe, [pred.name])
test_result = valid_and_test(test_prog, test_loader, get_test_examples,
simnet_process, "test", exe, [pred.name])
if args.compute_accuracy:
test_auc, test_acc = test_result
logging.info("AUC of test is %f, Accuracy of test is %f" %
......@@ -371,8 +374,8 @@ def test(conf_dict, args):
if args.task_mode == "pairwise":
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, pos_right = create_model(
args, pyreader_name='test_reader', is_inference=True)
test_loader, left, pos_right = create_model(
args, is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
......@@ -380,8 +383,8 @@ def test(conf_dict, args):
else:
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
test_pyreader, left, right = create_model(
args, pyreader_name='test_reader', is_inference=True)
test_loader, left, right = create_model(
args, is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
......@@ -390,10 +393,10 @@ def test(conf_dict, args):
utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
test_exe = exe
test_pyreader.decorate_paddle_reader(batch_data)
test_loader.set_sample_list_generator(batch_data)
logging.info("start test process ...")
test_pyreader.start()
test_loader.start()
pred_list = []
fetch_list = [pred.name]
output = []
......@@ -412,7 +415,7 @@ def test(conf_dict, args):
map(lambda item: str(np.argmax(item)), output[0])) +
"\n")
except fluid.core.EOFException:
test_pyreader.reset()
test_loader.reset()
break
if args.task_mode == "pairwise":
pred_list = np.array(pred_list).reshape((-1, 1))
......@@ -468,16 +471,16 @@ def infer(conf_dict, args):
if args.task_mode == "pairwise":
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
infer_pyreader, left, pos_right = create_model(
args, pyreader_name='infer_reader', is_inference=True)
infer_loader, left, pos_right = create_model(
args, is_inference=True)
left_feat, pos_score = net.predict(left, pos_right)
pred = pos_score
test_prog = test_prog.clone(for_test=True)
else:
with fluid.program_guard(test_prog, startup_prog):
with fluid.unique_name.guard():
infer_pyreader, left, right = create_model(
args, pyreader_name='infer_reader', is_inference=True)
infer_loader, left, right = create_model(
args, is_inference=True)
left_feat, pred = net.predict(left, right)
test_prog = test_prog.clone(for_test=True)
......@@ -486,13 +489,13 @@ def infer(conf_dict, args):
utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
test_exe = exe
infer_pyreader.decorate_sample_list_generator(batch_data)
infer_loader.set_sample_list_generator(batch_data)
logging.info("start test process ...")
preds_list = []
fetch_list = [pred.name]
output = []
infer_pyreader.start()
infer_loader.start()
while True:
try:
output = test_exe.run(program=test_prog, fetch_list=fetch_list)
......@@ -502,7 +505,7 @@ def infer(conf_dict, args):
else:
preds_list += map(lambda item: str(np.argmax(item)), output[0])
except fluid.core.EOFException:
infer_pyreader.reset()
infer_loader.reset()
break
with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file:
for _data, _pred in zip(simnet_process.get_infer_data(), preds_list):
......
......@@ -23,7 +23,7 @@
## 快速开始
#### 版本依赖
本项目依赖于 Paddlepaddle Fluid 1.7,请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。
本项目依赖于 PaddlePaddle Fluid 1.8,请参考[安装指南](https://www.paddlepaddle.org.cn/install/quick)进行安装。
#### 安装代码
......
......@@ -33,20 +33,21 @@ def check_cuda(use_cuda, err = \
except Exception as e:
pass
def check_version():
    """
    Log an error and exit when the installed PaddlePaddle version does
    not satisfy the requirement enforced below (>= 1.8.0, or a suitable
    develop build).
    """
    # Keep the message in sync with the version passed to require_version:
    # the code requires 1.8.0, so the message must say 1.8 (the previous
    # text incorrectly said 1.6).
    err = "PaddlePaddle version 1.8 or higher is required, " \
          "or a suitable develop version is satisfied as well. \n" \
          "Please make sure the version is good with your code."

    try:
        # Raises when the installed paddle is older than 1.8.0.
        fluid.require_version('1.8.0')
    except Exception:
        print(err)
        sys.exit(1)
def check_version():
......@@ -59,7 +60,7 @@ def check_version():
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.6.0')
fluid.require_version('1.8.0')
except Exception as e:
print(err)
sys.exit(1)
......
......@@ -30,6 +30,7 @@ import paddle.fluid.layers.utils as utils
from paddle.fluid.dygraph import Embedding, Conv2D, GRUUnit, Layer, to_variable
from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
class EmbeddingLayer(object):
"""
Embedding Layer class
......@@ -52,12 +53,12 @@ class EmbeddingLayer(object):
size=[self.dict_size, self.emb_dim],
is_sparse=True,
padding_idx=self.padding_idx,
param_attr=attr.ParamAttr(name=self.name, initializer=fluid.initializer.Xavier()))
param_attr=attr.ParamAttr(
name=self.name, initializer=fluid.initializer.Xavier()))
return emb
class FCLayer(object):
"""
Fully Connect Layer class
......@@ -76,9 +77,9 @@ class FCLayer(object):
operation
"""
fc = FC(size=self.fc_dim,
param_attr=attr.ParamAttr(name="%s.w" % self.name),
bias_attr=attr.ParamAttr(name="%s.b" % self.name),
act=self.act)
param_attr=attr.ParamAttr(name="%s.w" % self.name),
bias_attr=attr.ParamAttr(name="%s.b" % self.name),
act=self.act)
return fc
......@@ -93,7 +94,7 @@ class DynamicGRULayer(object):
"""
self.gru_dim = gru_dim
self.name = name
def ops(self):
"""
operation
......@@ -117,11 +118,13 @@ class DynamicLSTMLayer(object):
self.lstm_dim = lstm_dim
self.name = name
self.is_reverse = is_reverse
def ops(self):
"""
operation
"""
lstm_cell = BasicLSTMUnit(hidden_size=self.lstm_dim, input_size=self.lstm_dim*4)
lstm_cell = BasicLSTMUnit(
hidden_size=self.lstm_dim, input_size=self.lstm_dim * 4)
lstm = RNN(cell=lstm_cell, time_major=True, is_reverse=self.is_reverse)
return lstm
......@@ -141,7 +144,7 @@ class DataLayer(object):
"""
operation
"""
data = fluid.layers.data(
data = fluid.data(
name=name, shape=shape, dtype=dtype, lod_level=lod_level)
return data
......@@ -314,8 +317,10 @@ class ConstantLayer(object):
"""
operation
"""
constant = fluid.layers.fill_constant_batch_size_like(input, shape,
dtype, value)
shape = list(shape)
input_shape = fluid.layers.shape(input)
shape[0] = input_shape[0]
constant = fluid.layers.fill_constant(shape, dtype, value)
return constant
......@@ -358,26 +363,23 @@ class SoftsignLayer(object):
class SimpleConvPool(Layer):
    """
    Conv2D followed by a global max-pool over the last dimension.

    Applies a ReLU-activated Conv2D with [1, 1] padding, reduces the last
    axis with `reduce_max`, then flattens everything after the batch
    dimension.
    """

    def __init__(self, num_channels, num_filters, filter_size, use_cudnn=False):
        """
        Args:
            num_channels: number of input channels.
            num_filters: number of convolution filters.
            filter_size: convolution kernel size.
            use_cudnn (bool): whether to use the cuDNN convolution kernel.
        """
        super(SimpleConvPool, self).__init__()
        self._conv2d = Conv2D(
            num_channels=num_channels,
            num_filters=num_filters,
            filter_size=filter_size,
            padding=[1, 1],
            use_cudnn=use_cudnn,
            act='relu')

    def forward(self, inputs):
        """Run conv -> max over last dim -> flatten; returns a 2-D tensor."""
        x = self._conv2d(inputs)
        x = fluid.layers.reduce_max(x, dim=-1)
        # Flatten all remaining dims after the batch dimension.
        x = fluid.layers.reshape(x, shape=[x.shape[0], -1])
        return x
class FC(Layer):
"""
This interface is used to construct a callable object of the ``FC`` class.
......@@ -580,7 +582,7 @@ class DynamicGRU(Layer):
gate_activation='sigmoid',
candidate_activation='tanh',
origin_mode=False,
init_size = None):
init_size=None):
super(DynamicGRU, self).__init__()
self.gru_unit = GRUUnit(
size * 3,
......@@ -591,16 +593,19 @@ class DynamicGRU(Layer):
origin_mode=origin_mode)
self.size = size
self.is_reverse = is_reverse
def forward(self, inputs, h_0):
hidden = h_0
res = []
for i in range(inputs.shape[1]):
if self.is_reverse:
i = inputs.shape[1] - 1 - i
input_ = inputs[ :, i:i+1, :]
input_ = fluid.layers.reshape(input_, [-1, input_.shape[2]], inplace=False)
input_ = inputs[:, i:i + 1, :]
input_ = fluid.layers.reshape(
input_, [-1, input_.shape[2]], inplace=False)
hidden, reset, gate = self.gru_unit(input_, hidden)
hidden_ = fluid.layers.reshape(hidden, [-1, 1, hidden.shape[1]], inplace=False)
hidden_ = fluid.layers.reshape(
hidden, [-1, 1, hidden.shape[1]], inplace=False)
res.append(hidden_)
if self.is_reverse:
res = res[::-1]
......@@ -786,18 +791,21 @@ class BasicLSTMUnit(RNNUnit):
self._weight = self.create_parameter(
attr=self._param_attr,
shape=[self._input_size + self._hidden_size, 4 * self._hidden_size],
shape=[
self._input_size + self._hidden_size, 4 * self._hidden_size
],
dtype=self._dtype)
self._bias = self.create_parameter(attr=self._bias_attr,
shape=[4 * self._hidden_size],
dtype=self._dtype,
is_bias=True)
self._bias = self.create_parameter(
attr=self._bias_attr,
shape=[4 * self._hidden_size],
dtype=self._dtype,
is_bias=True)
def forward(self, input, state):
pre_hidden, pre_cell = state
concat_input_hidden = layers.concat([input, pre_hidden], axis=1)
gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
gate_input = layers.elementwise_add(gate_input, self._bias)
......@@ -817,11 +825,7 @@ class BasicLSTMUnit(RNNUnit):
class RNN(Layer):
def __init__(self,
cell,
is_reverse=False,
time_major=False,
**kwargs):
def __init__(self, cell, is_reverse=False, time_major=False, **kwargs):
super(RNN, self).__init__()
self.cell = cell
if not hasattr(self.cell, "call"):
......@@ -831,12 +835,17 @@ class RNN(Layer):
self.batch_index, self.time_step_index = (1, 0) if time_major else (0,
1)
def forward(self, inputs, initial_states=None, sequence_length=None, **kwargs):
def forward(self,
inputs,
initial_states=None,
sequence_length=None,
**kwargs):
if fluid.in_dygraph_mode():
class OutputArray(object):
def __init__(self, x):
self.array = [x]
def append(self, x):
self.array.append(x)
......@@ -844,9 +853,8 @@ class RNN(Layer):
# TODO: use where_op
new_state = fluid.layers.elementwise_mul(
new_state, step_mask,
axis=0) - fluid.layers.elementwise_mul(state,
(step_mask - 1),
axis=0)
axis=0) - fluid.layers.elementwise_mul(
state, (step_mask - 1), axis=0)
return new_state
flat_inputs = flatten(inputs)
......@@ -872,16 +880,20 @@ class RNN(Layer):
if self.is_reverse:
inputs = map_structure(lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
mask = fluid.layers.reverse(mask, axis=[0]) if sequence_length is not None else None
mask = fluid.layers.reverse(
mask, axis=[0]) if sequence_length is not None else None
states = initial_states
outputs = []
for i in range(time_steps):
step_inputs = map_structure(lambda x:x[i], inputs)
step_outputs, new_states = self.cell(step_inputs, states, **kwargs)
step_inputs = map_structure(lambda x: x[i], inputs)
step_outputs, new_states = self.cell(step_inputs, states,
**kwargs)
if sequence_length is not None:
new_states = map_structure(
partial(_maybe_copy, step_mask=mask[i]), states,
partial(
_maybe_copy, step_mask=mask[i]),
states,
new_states)
states = new_states
if i == 0:
......@@ -922,10 +934,9 @@ class EncoderCell(RNNUnit):
self.lstm_cells = list()
for i in range(self.num_layers):
self.lstm_cells.append(
self.add_sublayer(
"layer_%d" % i,
BasicLSTMUnit(input_size if i == 0 else hidden_size,
hidden_size)))
self.add_sublayer("layer_%d" % i,
BasicLSTMUnit(input_size if i == 0 else
hidden_size, hidden_size)))
def forward(self, step_input, states):
new_states = []
......@@ -1040,4 +1051,3 @@ class BasicGRUUnit(Layer):
new_hidden = u * pre_hidden + (1 - u) * c
return new_hidden
......@@ -47,18 +47,18 @@ from utils import load_dygraph
from model_check import check_version
from model_check import check_cuda
def train(conf_dict, args):
"""
train process
"""
# Get device
if args.use_cuda:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
# run train
logging.info("start train process ...")
......@@ -84,7 +84,6 @@ def train(conf_dict, args):
return auc, acc
else:
return auc
with fluid.dygraph.guard(place):
# used for continuous evaluation
......@@ -100,35 +99,35 @@ def train(conf_dict, args):
conf_dict['seq_len'] = args.seq_len
# Load network structure dynamically
net = utils.import_class("./nets",
conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
net = utils.import_class("./nets", conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
if args.init_checkpoint is not "":
model, _ = load_dygraph(args.init_checkpoint)
net.set_dict(model)
# Load loss function dynamically
loss = utils.import_class("./nets/losses",
conf_dict["loss"]["module_name"],
conf_dict["loss"]["class_name"])(conf_dict)
conf_dict["loss"]["module_name"],
conf_dict["loss"]["class_name"])(conf_dict)
# Load Optimization method
learning_rate = conf_dict["optimizer"]["learning_rate"]
optimizer_name = conf_dict["optimizer"]["class_name"]
if optimizer_name=='SGDOptimizer':
optimizer = fluid.optimizer.SGDOptimizer(learning_rate,parameter_list=net.parameters())
elif optimizer_name=='AdamOptimizer':
if optimizer_name == 'SGDOptimizer':
optimizer = fluid.optimizer.SGDOptimizer(
learning_rate, parameter_list=net.parameters())
elif optimizer_name == 'AdamOptimizer':
beta1 = conf_dict["optimizer"]["beta1"]
beta2 = conf_dict["optimizer"]["beta2"]
epsilon = conf_dict["optimizer"]["epsilon"]
optimizer = fluid.optimizer.AdamOptimizer(
learning_rate,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
parameter_list=net.parameters())
learning_rate,
beta1=beta1,
beta2=beta2,
epsilon=epsilon,
parameter_list=net.parameters())
# load auc method
metric = fluid.metrics.Auc(name="auc")
simnet_process = reader.SimNetProcessor(args, vocab)
simnet_process = reader.SimNetProcessor(args, vocab)
# set global step
global_step = 0
......@@ -136,23 +135,33 @@ def train(conf_dict, args):
losses = []
start_time = time.time()
train_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
get_train_examples = simnet_process.get_reader("train",epoch=args.epoch)
train_pyreader.decorate_sample_list_generator(
paddle.batch(get_train_examples, batch_size=args.batch_size),
place)
train_loader = fluid.io.DataLoader.from_generator(
capacity=16,
return_list=True,
iterable=True,
use_double_buffer=True)
get_train_examples = simnet_process.get_reader(
"train", epoch=args.epoch)
train_loader.set_sample_list_generator(
paddle.batch(
get_train_examples, batch_size=args.batch_size), place)
if args.do_valid:
valid_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
get_valid_examples = simnet_process.get_reader("valid")
valid_pyreader.decorate_sample_list_generator(
paddle.batch(get_valid_examples, batch_size=args.batch_size),
valid_loader = fluid.io.DataLoader.from_generator(
capacity=16,
return_list=True,
iterable=True,
use_double_buffer=True)
get_valid_examples = simnet_process.get_reader("valid")
valid_loader.set_sample_list_generator(
paddle.batch(
get_valid_examples, batch_size=args.batch_size),
place)
pred_list = []
if args.task_mode == "pairwise":
for left, pos_right, neg_right in train_pyreader():
for left, pos_right, neg_right in train_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1])
......@@ -162,92 +171,98 @@ def train(conf_dict, args):
pred = pos_score
_, neg_score = net(left, neg_right)
avg_cost = loss.compute(pos_score, neg_score)
losses.append(np.mean(avg_cost.numpy()))
losses.append(np.mean(avg_cost.numpy()))
avg_cost.backward()
optimizer.minimize(avg_cost)
net.clear_gradients()
if args.do_valid and global_step % args.validation_steps == 0:
for left, pos_right in valid_pyreader():
for left, pos_right in valid_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
pos_right = fluid.layers.reshape(
pos_right, shape=[-1, 1])
net.eval()
left_feat, pos_score = net(left, pos_right)
pred = pos_score
pred_list += list(pred.numpy())
valid_result = valid_and_test(pred_list, simnet_process, "valid")
pred_list += list(pred.numpy())
valid_result = valid_and_test(pred_list, simnet_process,
"valid")
if args.compute_accuracy:
valid_auc, valid_acc = valid_result
logging.info(
"global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" %
(global_step, valid_auc, valid_acc, np.mean(losses)))
"global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f"
% (global_step, valid_auc, valid_acc,
np.mean(losses)))
else:
valid_auc = valid_result
logging.info("global_steps: %d, valid_auc: %f, valid_loss: %f" %
(global_step, valid_auc, np.mean(losses)))
logging.info(
"global_steps: %d, valid_auc: %f, valid_loss: %f" %
(global_step, valid_auc, np.mean(losses)))
if global_step % args.save_steps == 0:
model_save_dir = os.path.join(args.output_dir,
conf_dict["model_path"])
conf_dict["model_path"])
model_path = os.path.join(model_save_dir, str(global_step))
if not os.path.exists(model_save_dir):
os.makedirs(model_save_dir)
fluid.dygraph.save_dygraph(net.state_dict(), model_path)
logging.info("saving infer model in %s" % model_path)
else:
for left, right, label in train_pyreader():
for left, right, label in train_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
right = fluid.layers.reshape(right, shape=[-1, 1])
label = fluid.layers.reshape(label, shape=[-1, 1])
net.train()
global_step += 1
global_step += 1
left_feat, pred = net(left, right)
avg_cost = loss.compute(pred, label)
losses.append(np.mean(avg_cost.numpy()))
losses.append(np.mean(avg_cost.numpy()))
avg_cost.backward()
optimizer.minimize(avg_cost)
net.clear_gradients()
if args.do_valid and global_step % args.validation_steps == 0:
for left, right in valid_pyreader():
for left, right in valid_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
right = fluid.layers.reshape(right, shape=[-1, 1])
net.eval()
left_feat, pred = net(left, right)
pred_list += list(pred.numpy())
valid_result = valid_and_test(pred_list, simnet_process, "valid")
valid_result = valid_and_test(pred_list, simnet_process,
"valid")
if args.compute_accuracy:
valid_auc, valid_acc = valid_result
logging.info(
"global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" %
(global_step, valid_auc, valid_acc, np.mean(losses)))
"global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f"
% (global_step, valid_auc, valid_acc,
np.mean(losses)))
else:
valid_auc = valid_result
logging.info("global_steps: %d, valid_auc: %f, valid_loss: %f" %
(global_step, valid_auc, np.mean(losses)))
logging.info(
"global_steps: %d, valid_auc: %f, valid_loss: %f" %
(global_step, valid_auc, np.mean(losses)))
if global_step % args.save_steps == 0:
model_save_dir = os.path.join(args.output_dir,
conf_dict["model_path"])
conf_dict["model_path"])
model_path = os.path.join(model_save_dir, str(global_step))
if not os.path.exists(model_save_dir):
os.makedirs(model_save_dir)
fluid.dygraph.save_dygraph(net.state_dict(), model_path)
logging.info("saving infer model in %s" % model_path)
end_time = time.time()
end_time = time.time()
ce_info.append([np.mean(losses), end_time - start_time])
# final save
logging.info("the final step is %s" % global_step)
model_save_dir = os.path.join(args.output_dir,
conf_dict["model_path"])
logging.info("the final step is %s" % global_step)
model_save_dir = os.path.join(args.output_dir, conf_dict["model_path"])
model_path = os.path.join(model_save_dir, str(global_step))
if not os.path.exists(model_save_dir):
os.makedirs(model_save_dir)
fluid.dygraph.save_dygraph(net.state_dict(), model_path)
......@@ -263,19 +278,24 @@ def train(conf_dict, args):
except:
logging.info("ce info err!")
print("kpis\teach_step_duration_%s_card%s\t%s" %
(args.task_name, card_num, ce_time))
(args.task_name, card_num, ce_time))
print("kpis\ttrain_loss_%s_card%s\t%f" %
(args.task_name, card_num, ce_loss))
(args.task_name, card_num, ce_loss))
if args.do_test:
# Get Feeder and Reader
test_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
test_loader = fluid.io.DataLoader.from_generator(
capacity=16,
return_list=True,
iterable=True,
use_double_buffer=True)
get_test_examples = simnet_process.get_reader("test")
test_pyreader.decorate_sample_list_generator(
paddle.batch(get_test_examples, batch_size=args.batch_size),
test_loader.set_sample_list_generator(
paddle.batch(
get_test_examples, batch_size=args.batch_size),
place)
pred_list = []
for left, pos_right in test_pyreader():
for left, pos_right in test_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
net.eval()
......@@ -284,15 +304,15 @@ def train(conf_dict, args):
left_feat, pos_score = net(left, pos_right)
pred = pos_score
pred_list += list(pred.numpy())
test_result = valid_and_test(pred_list, simnet_process, "test")
if args.compute_accuracy:
test_result = valid_and_test(pred_list, simnet_process, "test")
if args.compute_accuracy:
test_auc, test_acc = test_result
logging.info("AUC of test is %f, Accuracy of test is %f" %
(test_auc, test_acc))
(test_auc, test_acc))
else:
test_auc = test_result
logging.info("AUC of test is %f" % test_auc)
def test(conf_dict, args):
"""
......@@ -307,47 +327,53 @@ def test(conf_dict, args):
vocab = utils.load_vocab(args.vocab_path)
simnet_process = reader.SimNetProcessor(args, vocab)
test_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
test_loader = fluid.io.DataLoader.from_generator(
capacity=16,
return_list=True,
iterable=True,
use_double_buffer=True)
get_test_examples = simnet_process.get_reader("test")
test_pyreader.decorate_sample_list_generator(
paddle.batch(get_test_examples, batch_size=args.batch_size),
place)
test_loader.set_sample_list_generator(
paddle.batch(
get_test_examples, batch_size=args.batch_size), place)
conf_dict['dict_size'] = len(vocab)
conf_dict['dict_size'] = len(vocab)
conf_dict['seq_len'] = args.seq_len
net = utils.import_class("./nets",
conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
net = utils.import_class("./nets", conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
model, _ = load_dygraph(args.init_checkpoint)
net.set_dict(model)
metric = fluid.metrics.Auc(name="auc")
pred_list = []
with io.open("predictions.txt", "w", encoding="utf8") as predictions_file:
with io.open(
"predictions.txt", "w", encoding="utf8") as predictions_file:
if args.task_mode == "pairwise":
for left, pos_right in test_pyreader():
for left, pos_right in test_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
left_feat, pos_score = net(left, pos_right)
pred = pos_score
pred_list += list(map(lambda item: float(item[0]), pred.numpy()))
pred_list += list(
map(lambda item: float(item[0]), pred.numpy()))
predictions_file.write(u"\n".join(
map(lambda item: str((item[0] + 1) / 2), pred.numpy())) + "\n")
map(lambda item: str((item[0] + 1) / 2), pred.numpy()))
+ "\n")
else:
for left, right in test_pyreader():
for left, right in test_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
right = fluid.layers.reshape(right, shape=[-1, 1])
left_feat, pred = net(left, right)
pred_list += list(map(lambda item: float(item[0]), pred.numpy()))
pred_list += list(
map(lambda item: float(item[0]), pred.numpy()))
predictions_file.write(u"\n".join(
map(lambda item: str(np.argmax(item)), pred.numpy())) + "\n")
map(lambda item: str(np.argmax(item)), pred.numpy())) +
"\n")
if args.task_mode == "pairwise":
pred_list = np.array(pred_list).reshape((-1, 1))
......@@ -361,16 +387,16 @@ def test(conf_dict, args):
metric.update(pred_list, labels)
if args.compute_accuracy:
acc = utils.get_accuracy(pred_list, labels, args.task_mode,
args.lamda)
args.lamda)
logging.info("AUC of test is %f, Accuracy of test is %f" %
(metric.eval(), acc))
(metric.eval(), acc))
else:
logging.info("AUC of test is %f" % metric.eval())
if args.verbose_result:
utils.get_result_file(args)
logging.info("test result saved in %s" %
os.path.join(os.getcwd(), args.test_result_path))
os.path.join(os.getcwd(), args.test_result_path))
def infer(conf_dict, args):
......@@ -382,50 +408,53 @@ def infer(conf_dict, args):
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
with fluid.dygraph.guard(place):
vocab = utils.load_vocab(args.vocab_path)
simnet_process = reader.SimNetProcessor(args, vocab)
get_infer_examples = simnet_process.get_infer_reader
infer_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
infer_pyreader.decorate_sample_list_generator(
paddle.batch(get_infer_examples, batch_size=args.batch_size),
place)
conf_dict['dict_size'] = len(vocab)
infer_loader = fluid.io.DataLoader.from_generator(
capacity=16,
return_list=True,
iterable=True,
use_double_buffer=True)
infer_loader.set_sample_list_generator(
paddle.batch(
get_infer_examples, batch_size=args.batch_size), place)
conf_dict['dict_size'] = len(vocab)
conf_dict['seq_len'] = args.seq_len
net = utils.import_class("./nets",
conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
net = utils.import_class("./nets", conf_dict["net"]["module_name"],
conf_dict["net"]["class_name"])(conf_dict)
model, _ = load_dygraph(args.init_checkpoint)
net.set_dict(model)
pred_list = []
if args.task_mode == "pairwise":
for left, pos_right in infer_pyreader():
for left, pos_right in infer_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
left_feat, pos_score = net(left, pos_right)
pred = pos_score
pred_list += list(
map(lambda item: str((item[0] + 1) / 2), pred.numpy()))
map(lambda item: str((item[0] + 1) / 2), pred.numpy()))
else:
for left, right in infer_pyreader():
for left, right in infer_loader():
left = fluid.layers.reshape(left, shape=[-1, 1])
pos_right = fluid.layers.reshape(right, shape=[-1, 1])
left_feat, pred = net(left, right)
pred_list += map(lambda item: str(np.argmax(item)), pred.numpy())
with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file:
pred_list += map(lambda item: str(np.argmax(item)),
pred.numpy())
with io.open(
args.infer_result_path, "w", encoding="utf8") as infer_file:
for _data, _pred in zip(simnet_process.get_infer_data(), pred_list):
infer_file.write(_data + "\t" + _pred + "\n")
logging.info("infer result saved in %s" %
os.path.join(os.getcwd(), args.infer_result_path))
os.path.join(os.getcwd(), args.infer_result_path))
def get_cards():
......@@ -435,6 +464,7 @@ def get_cards():
num = len(cards.split(","))
return num
if __name__ == "__main__":
args = ArgConfig()
......
......@@ -28,7 +28,7 @@
1. paddle安装
本项目依赖于 PaddlePaddle 1.7及以上版本或适当的develop版本,请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装
本项目依赖于 PaddlePaddle 1.8及以上版本或适当的develop版本,请参考 [安装指南](https://www.paddlepaddle.org.cn/install/quick) 进行安装
2. 下载代码
......@@ -40,7 +40,7 @@
3. 环境依赖
请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.6/beginners_guide/install/index_cn.html)部分的内容
请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html)部分的内容
### 数据准备
......
......@@ -42,12 +42,11 @@ class InferTaskConfig(object):
batch_size = 4
# the parameters for beam search.
beam_size = 4
alpha=0.6
alpha = 0.6
# max decoded length, should be less than ModelHyperParams.max_length
max_out_len = 30
class ModelHyperParams(object):
"""
ModelHyperParams
......@@ -156,38 +155,32 @@ input_descs = {
# Names of word embedding table which might be reused for weight sharing.
word_emb_param_names = (
"src_word_emb_table",
"trg_word_emb_table",
)
"trg_word_emb_table", )
# Names of position encoding table which will be initialized externally.
pos_enc_param_names = (
"src_pos_enc_table",
"trg_pos_enc_table",
)
"trg_pos_enc_table", )
# separated inputs for different usages.
encoder_data_input_fields = (
"src_word",
"src_pos",
"src_slf_attn_bias",
)
"src_slf_attn_bias", )
decoder_data_input_fields = (
"trg_word",
"trg_pos",
"trg_slf_attn_bias",
"trg_src_attn_bias",
"enc_output",
)
"enc_output", )
label_data_input_fields = (
"lbl_word",
"lbl_weight",
)
"lbl_weight", )
# In fast decoder, trg_pos (only containing the current time step) is generated
# by ops and trg_slf_attn_bias is not needed.
fast_decoder_data_input_fields = (
"trg_word",
# "init_score",
# "init_idx",
"trg_src_attn_bias",
)
"trg_src_attn_bias", )
def merge_cfg_from_list(cfg_list, g_cfgs):
......
......@@ -34,10 +34,10 @@ def position_encoding_init(n_position, d_pos_vec):
num_timescales = channels // 2
log_timescale_increment = (np.log(float(1e4) / float(1)) /
(num_timescales - 1))
inv_timescales = np.exp(
np.arange(num_timescales)) * -log_timescale_increment
scaled_time = np.expand_dims(position, 1) * np.expand_dims(
inv_timescales, 0)
inv_timescales = np.exp(np.arange(
num_timescales)) * -log_timescale_increment
scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
0)
signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
position_enc = signal
......@@ -48,6 +48,7 @@ class NoamDecay(LearningRateDecay):
"""
learning rate scheduler
"""
def __init__(self,
d_model,
warmup_steps,
......@@ -72,6 +73,7 @@ class PrePostProcessLayer(Layer):
"""
PrePostProcessLayer
"""
def __init__(self, process_cmd, d_model, dropout_rate):
super(PrePostProcessLayer, self).__init__()
self.process_cmd = process_cmd
......@@ -82,8 +84,8 @@ class PrePostProcessLayer(Layer):
elif cmd == "n": # add layer normalization
self.functors.append(
self.add_sublayer(
"layer_norm_%d" %
len(self.sublayers(include_sublayers=False)),
"layer_norm_%d" % len(
self.sublayers(include_sublayers=False)),
LayerNorm(
normalized_shape=d_model,
param_attr=fluid.ParamAttr(
......@@ -108,6 +110,7 @@ class MultiHeadAttention(Layer):
"""
Multi-Head Attention
"""
def __init__(self, d_key, d_value, d_model, n_head=1, dropout_rate=0.):
super(MultiHeadAttention, self).__init__()
self.n_head = n_head
......@@ -115,18 +118,14 @@ class MultiHeadAttention(Layer):
self.d_value = d_value
self.d_model = d_model
self.dropout_rate = dropout_rate
self.q_fc = Linear(input_dim=d_model,
output_dim=d_key * n_head,
bias_attr=False)
self.k_fc = Linear(input_dim=d_model,
output_dim=d_key * n_head,
bias_attr=False)
self.v_fc = Linear(input_dim=d_model,
output_dim=d_value * n_head,
bias_attr=False)
self.proj_fc = Linear(input_dim=d_value * n_head,
output_dim=d_model,
bias_attr=False)
self.q_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.k_fc = Linear(
input_dim=d_model, output_dim=d_key * n_head, bias_attr=False)
self.v_fc = Linear(
input_dim=d_model, output_dim=d_value * n_head, bias_attr=False)
self.proj_fc = Linear(
input_dim=d_value * n_head, output_dim=d_model, bias_attr=False)
def forward(self, queries, keys, values, attn_bias, cache=None):
# compute q ,k ,v
......@@ -152,17 +151,14 @@ class MultiHeadAttention(Layer):
cache["k"], cache["v"] = k, v
# scale dot product attention
product = layers.matmul(x=q,
y=k,
transpose_y=True,
alpha=self.d_model**-0.5)
product = layers.matmul(
x=q, y=k, transpose_y=True, alpha=self.d_model**-0.5)
if attn_bias is not None:
product += attn_bias
weights = layers.softmax(product)
if self.dropout_rate:
weights = layers.dropout(weights,
dropout_prob=self.dropout_rate,
is_test=False)
weights = layers.dropout(
weights, dropout_prob=self.dropout_rate, is_test=False)
out = layers.matmul(weights, v)
......@@ -179,6 +175,7 @@ class FFN(Layer):
"""
Feed-Forward Network
"""
def __init__(self, d_inner_hid, d_model, dropout_rate):
super(FFN, self).__init__()
self.dropout_rate = dropout_rate
......@@ -188,9 +185,8 @@ class FFN(Layer):
def forward(self, x):
hidden = self.fc1(x)
if self.dropout_rate:
hidden = layers.dropout(hidden,
dropout_prob=self.dropout_rate,
is_test=False)
hidden = layers.dropout(
hidden, dropout_prob=self.dropout_rate, is_test=False)
out = self.fc2(hidden)
return out
......@@ -199,6 +195,7 @@ class EncoderLayer(Layer):
"""
EncoderLayer
"""
def __init__(self,
n_head,
d_key,
......@@ -227,8 +224,8 @@ class EncoderLayer(Layer):
prepostprocess_dropout)
def forward(self, enc_input, attn_bias):
attn_output = self.self_attn(self.preprocesser1(enc_input), None, None,
attn_bias)
attn_output = self.self_attn(
self.preprocesser1(enc_input), None, None, attn_bias)
attn_output = self.postprocesser1(attn_output, enc_input)
ffn_output = self.ffn(self.preprocesser2(attn_output))
......@@ -240,6 +237,7 @@ class Encoder(Layer):
"""
encoder
"""
def __init__(self,
n_layer,
n_head,
......@@ -279,6 +277,7 @@ class Embedder(Layer):
"""
Word Embedding + Position Encoding
"""
def __init__(self, vocab_size, emb_dim, bos_idx=0):
super(Embedder, self).__init__()
......@@ -297,6 +296,7 @@ class WrapEncoder(Layer):
"""
embedder + encoder
"""
def __init__(self, src_vocab_size, max_length, n_layer, n_head, d_key,
d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
......@@ -324,9 +324,9 @@ class WrapEncoder(Layer):
pos_enc = self.pos_encoder(src_pos)
pos_enc.stop_gradient = True
emb = word_emb + pos_enc
enc_input = layers.dropout(emb,
dropout_prob=self.emb_dropout,
is_test=False) if self.emb_dropout else emb
enc_input = layers.dropout(
emb, dropout_prob=self.emb_dropout,
is_test=False) if self.emb_dropout else emb
enc_output = self.encoder(enc_input, src_slf_attn_bias)
return enc_output
......@@ -336,6 +336,7 @@ class DecoderLayer(Layer):
"""
decoder
"""
def __init__(self,
n_head,
d_key,
......@@ -375,8 +376,8 @@ class DecoderLayer(Layer):
self_attn_bias,
cross_attn_bias,
cache=None):
self_attn_output = self.self_attn(self.preprocesser1(dec_input), None,
None, self_attn_bias, cache)
self_attn_output = self.self_attn(
self.preprocesser1(dec_input), None, None, self_attn_bias, cache)
self_attn_output = self.postprocesser1(self_attn_output, dec_input)
cross_attn_output = self.cross_attn(
......@@ -395,6 +396,7 @@ class Decoder(Layer):
"""
decoder
"""
def __init__(self, n_layer, n_head, d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout, relu_dropout,
preprocess_cmd, postprocess_cmd):
......@@ -420,8 +422,8 @@ class Decoder(Layer):
caches=None):
for i, decoder_layer in enumerate(self.decoder_layers):
dec_output = decoder_layer(dec_input, enc_output, self_attn_bias,
cross_attn_bias,
None if caches is None else caches[i])
cross_attn_bias, None
if caches is None else caches[i])
dec_input = dec_output
return self.processer(dec_output)
......@@ -431,6 +433,7 @@ class WrapDecoder(Layer):
"""
embedder + decoder
"""
def __init__(self, trg_vocab_size, max_length, n_layer, n_head, d_key,
d_value, d_model, d_inner_hid, prepostprocess_dropout,
attention_dropout, relu_dropout, preprocess_cmd,
......@@ -458,9 +461,8 @@ class WrapDecoder(Layer):
word_embedder.weight,
transpose_y=True)
else:
self.linear = Linear(input_dim=d_model,
output_dim=trg_vocab_size,
bias_attr=False)
self.linear = Linear(
input_dim=d_model, output_dim=trg_vocab_size, bias_attr=False)
def forward(self,
trg_word,
......@@ -474,15 +476,14 @@ class WrapDecoder(Layer):
pos_enc = self.pos_encoder(trg_pos)
pos_enc.stop_gradient = True
emb = word_emb + pos_enc
dec_input = layers.dropout(emb,
dropout_prob=self.emb_dropout,
is_test=False) if self.emb_dropout else emb
dec_input = layers.dropout(
emb, dropout_prob=self.emb_dropout,
is_test=False) if self.emb_dropout else emb
dec_output = self.decoder(dec_input, enc_output, trg_slf_attn_bias,
trg_src_attn_bias, caches)
dec_output = layers.reshape(
dec_output,
shape=[-1, dec_output.shape[-1]],
)
shape=[-1, dec_output.shape[-1]], )
logits = self.linear(dec_output)
return logits
......@@ -493,9 +494,10 @@ class CrossEntropyCriterion(object):
def __call__(self, predict, label, weights):
if self.label_smooth_eps:
label_out = layers.label_smooth(label=layers.one_hot(
input=label, depth=predict.shape[-1]),
epsilon=self.label_smooth_eps)
label_out = layers.label_smooth(
label=layers.one_hot(
input=label, depth=predict.shape[-1]),
epsilon=self.label_smooth_eps)
cost = layers.softmax_with_cross_entropy(
logits=predict,
......@@ -513,6 +515,7 @@ class Transformer(Layer):
"""
model
"""
def __init__(self,
src_vocab_size,
trg_vocab_size,
......@@ -532,29 +535,25 @@ class Transformer(Layer):
bos_id=0,
eos_id=1):
super(Transformer, self).__init__()
src_word_embedder = Embedder(vocab_size=src_vocab_size,
emb_dim=d_model,
bos_idx=bos_id)
self.encoder = WrapEncoder(src_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd,
postprocess_cmd, src_word_embedder)
src_word_embedder = Embedder(
vocab_size=src_vocab_size, emb_dim=d_model, bos_idx=bos_id)
self.encoder = WrapEncoder(
src_vocab_size, max_length, n_layer, n_head, d_key, d_value,
d_model, d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd, src_word_embedder)
if weight_sharing:
assert src_vocab_size == trg_vocab_size, (
"Vocabularies in source and target should be same for weight sharing."
)
trg_word_embedder = src_word_embedder
else:
trg_word_embedder = Embedder(vocab_size=trg_vocab_size,
emb_dim=d_model,
bos_idx=bos_id)
self.decoder = WrapDecoder(trg_vocab_size, max_length, n_layer, n_head,
d_key, d_value, d_model, d_inner_hid,
prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd,
postprocess_cmd, weight_sharing,
trg_word_embedder)
trg_word_embedder = Embedder(
vocab_size=trg_vocab_size, emb_dim=d_model, bos_idx=bos_id)
self.decoder = WrapDecoder(
trg_vocab_size, max_length, n_layer, n_head, d_key, d_value,
d_model, d_inner_hid, prepostprocess_dropout, attention_dropout,
relu_dropout, preprocess_cmd, postprocess_cmd, weight_sharing,
trg_word_embedder)
self.trg_vocab_size = trg_vocab_size
self.n_layer = n_layer
......@@ -595,6 +594,7 @@ class Transformer(Layer):
and newly added finished candidates from `grow_topk`, and selects the top
`beam_size` finished candidates.
"""
def expand_to_beam_size(tensor, beam_size):
tensor = layers.reshape(tensor,
[tensor.shape[0], 1] + tensor.shape[1:])
......@@ -616,19 +616,23 @@ class Transformer(Layer):
### initialize states of beam search ###
## init for the alive ##
initial_log_probs = to_variable(
np.array([[0.] + [-inf] * (beam_size - 1)], dtype="float32"))
np.array(
[[0.] + [-inf] * (beam_size - 1)], dtype="float32"))
alive_log_probs = layers.expand(initial_log_probs, [batch_size, 1])
alive_seq = to_variable(
np.tile(np.array([[[bos_id]]], dtype="int64"),
(batch_size, beam_size, 1)))
np.tile(
np.array(
[[[bos_id]]], dtype="int64"), (batch_size, beam_size, 1)))
## init for the finished ##
finished_scores = to_variable(
np.array([[-inf] * beam_size], dtype="float32"))
np.array(
[[-inf] * beam_size], dtype="float32"))
finished_scores = layers.expand(finished_scores, [batch_size, 1])
finished_seq = to_variable(
np.tile(np.array([[[bos_id]]], dtype="int64"),
(batch_size, beam_size, 1)))
np.tile(
np.array(
[[[bos_id]]], dtype="int64"), (batch_size, beam_size, 1)))
finished_flags = layers.zeros_like(finished_scores)
### initialize inputs and states of transformer decoder ###
......@@ -640,13 +644,11 @@ class Transformer(Layer):
enc_output = merge_beam_dim(expand_to_beam_size(enc_output, beam_size))
## init states (caches) for transformer, need to be updated according to selected beam
caches = [{
"k":
layers.fill_constant(
"k": layers.fill_constant(
shape=[batch_size * beam_size, self.n_head, 0, self.d_key],
dtype=enc_output.dtype,
value=0),
"v":
layers.fill_constant(
"v": layers.fill_constant(
shape=[batch_size * beam_size, self.n_head, 0, self.d_value],
dtype=enc_output.dtype,
value=0),
......@@ -665,11 +667,11 @@ class Transformer(Layer):
beam_size,
batch_size,
need_flat=True):
batch_idx = layers.range(0, batch_size, 1,
dtype="int64") * beam_size
batch_idx = layers.range(
0, batch_size, 1, dtype="int64") * beam_size
flat_tensor = merge_beam_dim(tensor_nd) if need_flat else tensor_nd
idx = layers.reshape(layers.elementwise_add(beam_idx, batch_idx, 0),
[-1])
idx = layers.reshape(
layers.elementwise_add(beam_idx, batch_idx, 0), [-1])
new_flat_tensor = layers.gather(flat_tensor, idx)
new_tensor_nd = layers.reshape(
new_flat_tensor,
......@@ -681,7 +683,8 @@ class Transformer(Layer):
finished_in_finished):
max_length_penalty = np.power(((5. + max_len) / 6.), alpha)
# The best possible score of the most likely alive sequence
lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty
lower_bound_alive_scores = alive_log_probs[:,
0] / max_length_penalty
# Now to compute the lowest score of a finished sequence in finished
# If the sequence isn't finished, we multiply it's score by 0. since
......@@ -711,8 +714,8 @@ class Transformer(Layer):
curr_scores = log_probs / length_penalty
flat_curr_scores = layers.reshape(curr_scores, [batch_size, -1])
topk_scores, topk_ids = layers.topk(flat_curr_scores,
k=beam_size * 2)
topk_scores, topk_ids = layers.topk(
flat_curr_scores, k=beam_size * 2)
topk_log_probs = topk_scores * length_penalty
......@@ -723,13 +726,11 @@ class Transformer(Layer):
topk_seq = gather_2d_by_gather(alive_seq, topk_beam_index,
beam_size, batch_size)
topk_seq = layers.concat(
[topk_seq,
layers.reshape(topk_ids, topk_ids.shape + [1])],
[topk_seq, layers.reshape(topk_ids, topk_ids.shape + [1])],
axis=2)
states = update_states(states, topk_beam_index, beam_size)
eos = layers.fill_constant(shape=topk_ids.shape,
dtype="int64",
value=eos_id)
eos = layers.fill_constant(
shape=topk_ids.shape, dtype="int64", value=eos_id)
topk_finished = layers.cast(layers.equal(topk_ids, eos), "float32")
#topk_seq: [batch_size, 2*beam_size, i+1]
......@@ -751,37 +752,35 @@ class Transformer(Layer):
def grow_finished(finished_seq, finished_scores, finished_flags,
curr_seq, curr_scores, curr_finished):
# finished scores
finished_seq = layers.concat([
finished_seq,
layers.fill_constant(shape=[batch_size, beam_size, 1],
dtype="int64",
value=eos_id)
],
axis=2)
finished_seq = layers.concat(
[
finished_seq, layers.fill_constant(
shape=[batch_size, beam_size, 1],
dtype="int64",
value=eos_id)
],
axis=2)
# Set the scores of the unfinished seq in curr_seq to large negative
# values
curr_scores += (1. - curr_finished) * -inf
# concatenating the sequences and scores along beam axis
curr_finished_seq = layers.concat([finished_seq, curr_seq], axis=1)
curr_finished_scores = layers.concat([finished_scores, curr_scores],
axis=1)
curr_finished_flags = layers.concat([finished_flags, curr_finished],
axis=1)
curr_finished_scores = layers.concat(
[finished_scores, curr_scores], axis=1)
curr_finished_flags = layers.concat(
[finished_flags, curr_finished], axis=1)
_, topk_indexes = layers.topk(curr_finished_scores, k=beam_size)
finished_seq = gather_2d_by_gather(curr_finished_seq, topk_indexes,
beam_size * 3, batch_size)
finished_scores = gather_2d_by_gather(curr_finished_scores,
topk_indexes, beam_size * 3,
batch_size)
finished_flags = gather_2d_by_gather(curr_finished_flags,
topk_indexes, beam_size * 3,
batch_size)
finished_scores = gather_2d_by_gather(
curr_finished_scores, topk_indexes, beam_size * 3, batch_size)
finished_flags = gather_2d_by_gather(
curr_finished_flags, topk_indexes, beam_size * 3, batch_size)
return finished_seq, finished_scores, finished_flags
for i in range(max_len):
trg_pos = layers.fill_constant(shape=trg_word.shape,
dtype="int64",
value=i)
trg_pos = layers.fill_constant(
shape=trg_word.shape, dtype="int64", value=i)
logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
enc_output, caches)
topk_seq, topk_log_probs, topk_scores, topk_finished, states = grow_topk(
......@@ -818,20 +817,23 @@ class Transformer(Layer):
return layers.expand(tensor, tile_dims)
def merge_batch_beams(tensor):
return layers.reshape(tensor, [tensor.shape[0] * tensor.shape[1]] +
tensor.shape[2:])
return layers.reshape(
tensor, [tensor.shape[0] * tensor.shape[1]] + tensor.shape[2:])
def split_batch_beams(tensor):
return fluid.layers.reshape(tensor,
shape=[-1, beam_size] +
list(tensor.shape[1:]))
return fluid.layers.reshape(
tensor, shape=[-1, beam_size] + list(tensor.shape[1:]))
def mask_probs(probs, finished, noend_mask_tensor):
# TODO: use where_op
finished = layers.cast(finished, dtype=probs.dtype)
probs = layers.elementwise_mul(
layers.expand(layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
noend_mask_tensor, axis=-1) - layers.elementwise_mul(probs, (finished - 1), axis=0)
layers.expand(
layers.unsqueeze(finished, [2]),
[1, 1, self.trg_vocab_size]),
noend_mask_tensor,
axis=-1) - layers.elementwise_mul(
probs, (finished - 1), axis=0)
return probs
def gather(x, indices, batch_pos):
......@@ -845,54 +847,53 @@ class Transformer(Layer):
inf = float(1. * 1e7)
batch_size = enc_output.shape[0]
max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
vocab_size_tensor = layers.fill_constant(shape=[1],
dtype="int64",
value=self.trg_vocab_size)
vocab_size_tensor = layers.fill_constant(
shape=[1], dtype="int64", value=self.trg_vocab_size)
end_token_tensor = to_variable(
np.full([batch_size, beam_size], eos_id, dtype="int64"))
np.full(
[batch_size, beam_size], eos_id, dtype="int64"))
noend_array = [-inf] * self.trg_vocab_size
noend_array[eos_id] = 0
noend_mask_tensor = to_variable(np.array(noend_array,dtype="float32"))
noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
batch_pos = layers.expand(
layers.unsqueeze(
to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
[1, beam_size])
to_variable(np.arange(
0, batch_size, 1, dtype="int64")), [1]), [1, beam_size])
predict_ids = []
parent_ids = []
### initialize states of beam search ###
log_probs = to_variable(
np.array([[0.] + [-inf] * (beam_size - 1)] * batch_size,
dtype="float32"))
finished = to_variable(np.full([batch_size, beam_size], 0,
dtype="bool"))
np.array(
[[0.] + [-inf] * (beam_size - 1)] * batch_size,
dtype="float32"))
finished = to_variable(
np.full(
[batch_size, beam_size], 0, dtype="bool"))
### initialize inputs and states of transformer decoder ###
## init inputs for decoder, shaped `[batch_size*beam_size, ...]`
trg_word = layers.fill_constant(shape=[batch_size * beam_size, 1],
dtype="int64",
value=bos_id)
trg_word = layers.fill_constant(
shape=[batch_size * beam_size, 1], dtype="int64", value=bos_id)
trg_pos = layers.zeros_like(trg_word)
trg_src_attn_bias = merge_batch_beams(
expand_to_beam_size(trg_src_attn_bias, beam_size))
enc_output = merge_batch_beams(expand_to_beam_size(enc_output, beam_size))
enc_output = merge_batch_beams(
expand_to_beam_size(enc_output, beam_size))
## init states (caches) for transformer, need to be updated according to selected beam
caches = [{
"k":
layers.fill_constant(
"k": layers.fill_constant(
shape=[batch_size * beam_size, self.n_head, 0, self.d_key],
dtype=enc_output.dtype,
value=0),
"v":
layers.fill_constant(
"v": layers.fill_constant(
shape=[batch_size * beam_size, self.n_head, 0, self.d_value],
dtype=enc_output.dtype,
value=0),
} for i in range(self.n_layer)]
for i in range(max_len):
trg_pos = layers.fill_constant(shape=trg_word.shape,
dtype="int64",
value=i)
trg_pos = layers.fill_constant(
shape=trg_word.shape, dtype="int64", value=i)
caches = map_structure( # can not be reshaped since the 0 size
lambda x: x if i == 0 else merge_batch_beams(x), caches)
logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
......@@ -902,18 +903,17 @@ class Transformer(Layer):
fluid.layers.log(fluid.layers.softmax(logits)))
step_log_probs = mask_probs(step_log_probs, finished,
noend_mask_tensor)
log_probs = layers.elementwise_add(x=step_log_probs,
y=log_probs,
axis=0)
log_probs = layers.elementwise_add(
x=step_log_probs, y=log_probs, axis=0)
log_probs = layers.reshape(log_probs,
[-1, beam_size * self.trg_vocab_size])
scores = log_probs
topk_scores, topk_indices = fluid.layers.topk(input=scores,
k=beam_size)
beam_indices = fluid.layers.elementwise_floordiv(
topk_indices, vocab_size_tensor)
token_indices = fluid.layers.elementwise_mod(
topk_indices, vocab_size_tensor)
topk_scores, topk_indices = fluid.layers.topk(
input=scores, k=beam_size)
beam_indices = fluid.layers.elementwise_floordiv(topk_indices,
vocab_size_tensor)
token_indices = fluid.layers.elementwise_mod(topk_indices,
vocab_size_tensor)
# update states
caches = map_structure(lambda x: gather(x, beam_indices, batch_pos),
......
......@@ -306,6 +306,7 @@ class DataProcessor(object):
:param seed: The seed for random.
:type seed: int
"""
def __init__(self,
src_vocab_fpath,
trg_vocab_fpath,
......@@ -360,21 +361,23 @@ class DataProcessor(object):
def load_src_trg_ids(self, fpattern, tar_fname):
converters = [
Converter(vocab=self._src_vocab,
beg=self._bos_idx,
end=self._eos_idx,
unk=self._unk_idx,
delimiter=self._token_delimiter,
add_beg=False)
Converter(
vocab=self._src_vocab,
beg=self._bos_idx,
end=self._eos_idx,
unk=self._unk_idx,
delimiter=self._token_delimiter,
add_beg=False)
]
if not self._only_src:
converters.append(
Converter(vocab=self._trg_vocab,
beg=self._bos_idx,
end=self._eos_idx,
unk=self._unk_idx,
delimiter=self._token_delimiter,
add_beg=True))
Converter(
vocab=self._trg_vocab,
beg=self._bos_idx,
end=self._eos_idx,
unk=self._unk_idx,
delimiter=self._token_delimiter,
add_beg=True))
converters = ComposedConverter(converters)
......@@ -402,9 +405,8 @@ class DataProcessor(object):
f = tarfile.open(fpaths[0], "rb")
for line in f.extractfile(tar_fname):
fields = line.strip(b"\n").split(self._field_delimiter)
if (not self._only_src
and len(fields) == 2) or (self._only_src
and len(fields) == 1):
if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1):
yield fields
else:
for fpath in fpaths:
......@@ -414,9 +416,8 @@ class DataProcessor(object):
with open(fpath, "rb") as f:
for line in f:
fields = line.strip(b"\n").split(self._field_delimiter)
if (not self._only_src
and len(fields) == 2) or (self._only_src
and len(fields) == 1):
if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1):
yield fields
@staticmethod
......@@ -512,8 +513,8 @@ class DataProcessor(object):
for item in data_reader():
inst_num_per_part = len(item) // count
for i in range(count):
yield item[inst_num_per_part * i:inst_num_per_part *
(i + 1)]
yield item[inst_num_per_part * i:inst_num_per_part * (i + 1
)]
return __impl__
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册