Commit af0ec473 authored by: Nicky Chan, committed by: daminglu

Rewrite book chapter8 machine translation documentation and train.py (#552)

Parent 42d37ac3
This diff is collapsed.
This diff is collapsed.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Machine translation (seq2seq with beam search) on WMT-14, Fluid version.

Builds an LSTM encoder, a DynamicRNN training decoder, and a While-op
beam-search decoder, then trains with fluid.Trainer and decodes a test batch.
"""
import contextlib
import os
from functools import partial

import numpy as np

import paddle
import paddle.fluid as fluid
import paddle.fluid.framework as framework
import paddle.fluid.layers as pd
from paddle.fluid.executor import Executor
# Model / vocabulary hyperparameters shared by the encoder, the training
# decoder, and the beam-search decoder below.
dict_size = 30000  # vocabulary size for both source and target languages
source_dict_dim = target_dict_dim = dict_size
hidden_dim = 32  # hidden size of the encoder LSTM
word_dim = 16  # word-embedding dimension
batch_size = 2
max_length = 8  # maximum number of beam-search decoding steps
topk_size = 50  # candidate words considered per beam-search step
beam_size = 2

decoder_size = hidden_dim  # decoder hidden size matches the encoder's
def encoder(is_sparse):
    """Encode the source sentence into a single context vector.

    Args:
        is_sparse (bool): use sparse updates for the embedding table.

    Returns:
        The last hidden state of the encoder LSTM (one vector per sentence).
    """
    # Source words arrive as an int64 LoD sequence (variable length).
    src_word_id = pd.data(
        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
    # The 'vemb' embedding parameter is shared with the training decoder.
    src_embedding = pd.embedding(
        input=src_word_id,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr=fluid.ParamAttr(name='vemb'))

    # dynamic_lstm expects its input pre-projected to 4x the hidden size
    # (input, forget, cell and output gates).
    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
    # Use the final time step as the sentence representation.
    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
    return encoder_out
def train_decoder(context, is_sparse):
    """Teacher-forced decoder used at training time.

    Runs a DynamicRNN over the gold target words: each step combines the
    current target-word embedding with the previous hidden state and emits
    a softmax distribution over the target vocabulary.

    Args:
        context: encoder output used to initialise the decoder state.
        is_sparse (bool): use sparse updates for the embedding table.

    Returns:
        The per-step softmax outputs of the RNN.
    """
    # Gold target words as an int64 LoD sequence.
    trg_word = pd.data(
        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
    # Shares the 'vemb' embedding parameter with the encoder.
    trg_emb = pd.embedding(
        input=trg_word,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr=fluid.ParamAttr(name='vemb'))

    decoder = pd.DynamicRNN()
    with decoder.block():
        word_step = decoder.step_input(trg_emb)
        prev_hidden = decoder.memory(init=context)
        hidden = pd.fc(
            input=[word_step, prev_hidden], size=decoder_size, act='tanh')
        word_probs = pd.fc(
            input=hidden, size=target_dict_dim, act='softmax')
        decoder.update_memory(prev_hidden, hidden)
        decoder.output(word_probs)

    return decoder()
def decode(context, is_sparse):
    """Beam-search decoder built from a While op.

    Keeps three LoD tensor arrays as step memories (decoder states, selected
    ids, selected scores) and expands/prunes candidates with pd.beam_search
    for at most ``max_length`` steps.

    Args:
        context: encoder output used as the initial decoder state.
        is_sparse (bool): use sparse updates for the embedding lookup.

    Returns:
        (translation_ids, translation_scores) LoD tensors from
        pd.beam_search_decode.
    """
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    # Loop counter lives on CPU so the While condition can read it.
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    # Initial beam contents are fed by the caller (see decode_main).
    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)

    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the lod of pre_state to be the same with pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # use rnn unit to update rnn
        current_state = pd.fc(
            input=[pre_state_expanded, pre_ids_emb],
            size=decoder_size,
            act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # use score to do beam search
        current_score = pd.fc(
            input=current_state_with_lod, size=target_dict_dim, act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=topk_size)
        selected_ids, selected_scores = pd.beam_search(
            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # update the memories
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # Refresh the While condition in place.
        pd.less_than(x=counter, y=array_len, cond=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array)

    return translation_ids, translation_scores
def train_program(is_sparse):
    """Training graph: encoder + teacher-forced decoder + cross-entropy loss.

    Args:
        is_sparse (bool): use sparse updates for the embedding tables.

    Returns:
        The mean cross-entropy cost over the batch.
    """
    context = encoder(is_sparse)
    rnn_out = train_decoder(context, is_sparse)
    # Gold next words, shifted by one step relative to the decoder input.
    label = pd.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
    avg_cost = pd.mean(cost)
    return avg_cost
def optimizer_func():
    """Return the Adagrad optimizer (lr 1e-4, L2 weight decay 0.1)."""
    return fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.1))
def train(use_cuda, is_sparse, is_local=True):
    """Train the translation model on the WMT-14 dataset.

    Args:
        use_cuda (bool): run on GPU; silently returns if CUDA is unavailable.
        is_sparse (bool): use sparse updates for the embedding tables.
        is_local (bool): kept for interface compatibility; not used here.
    """
    EPOCH_NUM = 1

    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

    # Names must match the pd.data layers created by train_program.
    feed_order = [
        'src_word_id', 'target_language_word', 'target_language_next_word'
    ]

    def event_handler(event):
        # Log every 10 steps and stop early at step 20 (demo-sized run).
        if isinstance(event, fluid.EndStepEvent):
            if event.step % 10 == 0:
                print('pass_id=' + str(event.epoch) + ' batch=' + str(
                    event.step))

            if event.step == 20:
                trainer.stop()

    trainer = fluid.Trainer(
        train_func=partial(train_program, is_sparse),
        place=place,
        optimizer_func=optimizer_func)

    trainer.train(
        reader=train_reader,
        num_epochs=EPOCH_NUM,
        event_handler=event_handler,
        feed_order=feed_order)
def decode_main(use_cuda, is_sparse):
    """Run beam-search decoding on one batch of the WMT-14 test set.

    Builds the inference graph, seeds every beam with id 1 / score 1.0,
    and prints the source sentence, the translation and its score.

    Args:
        use_cuda (bool): run on GPU; silently returns if CUDA is unavailable.
        is_sparse (bool): use sparse updates for the embedding lookup.
    """
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    context = encoder(is_sparse)
    translation_ids, translation_scores = decode(context, is_sparse)

    exe = Executor(place)
    exe.run(framework.default_startup_program())

    # One start token (id 1) with score 1.0 per sentence in the batch.
    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
    init_scores_data = np.array(
        [1. for _ in range(batch_size)], dtype='float32')
    init_ids_data = init_ids_data.reshape((batch_size, 1))
    init_scores_data = init_scores_data.reshape((batch_size, 1))
    # Two-level LoD: one candidate per sentence at step 0.
    init_lod = [1] * batch_size
    init_lod = [init_lod, init_lod]

    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)

    test_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
        batch_size=batch_size)

    feed_order = ['src_word_id']
    feed_list = [
        framework.default_main_program().global_block().var(var_name)
        for var_name in feed_order
    ]
    feeder = fluid.DataFeeder(feed_list, place)

    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

    for data in test_data():
        # Keep only the source-word column of each sample; list(...) so the
        # result stays subscriptable under Python 3 (map returns an iterator).
        feed_data = list(map(lambda x: [x[0]], data))
        feed_dict = feeder.feed(feed_data)
        feed_dict['init_ids'] = init_ids
        feed_dict['init_scores'] = init_scores

        results = exe.run(
            framework.default_main_program(),
            feed=feed_dict,
            fetch_list=[translation_ids, translation_scores],
            return_numpy=False)

        result_ids = np.array(results[0])
        result_scores = np.array(results[1])

        print("Original sentence:")
        print(" ".join([src_dict[w] for w in feed_data[0][0]]))
        print("Translated sentence:")
        print(" ".join([trg_dict[w] for w in result_ids]))
        print("Corresponding score: ", result_scores)

        # Decode a single batch only (demo-sized run).
        break
def inference_program():
    """Build the inference graph: encode, then beam-search decode.

    Returns:
        The (translation_ids, translation_scores) pair produced by ``decode``.
        Dense (non-sparse) embedding updates are used at inference time.
    """
    use_sparse = False
    enc_context = encoder(use_sparse)
    return decode(enc_context, use_sparse)
def main(use_cuda):
    """Train the model, then decode a sample batch.

    Args:
        use_cuda (bool): train on GPU when available.
    """
    train(use_cuda, False)
    decode_main(False, False)  # Beam Search does not support CUDA
if __name__ == '__main__':
    # GPU use is opted into via the WITH_GPU environment variable.
    use_cuda = os.getenv('WITH_GPU', '0') != '0'
    main(use_cuda)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register