提交 af0ec473 编写于 作者: N Nicky Chan 提交者: daminglu

Rewrite book chapter8 machine translation documentation and train.py (#552)

上级 42d37ac3
此差异已折叠。
此差异已折叠。
import sys, os # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle
import paddle.fluid as fluid
with_gpu = os.getenv('WITH_GPU', '0') != '0' import paddle.fluid.framework as framework
import paddle.fluid.layers as pd
from paddle.fluid.executor import Executor
# Model and training hyper-parameters.
dict_size = 30000                          # vocabulary size for both languages
source_dict_dim = target_dict_dim = dict_size
hidden_dim = 32                            # LSTM hidden size
word_dim = 16                              # word embedding size
batch_size = 2
max_length = 8                             # max decoded sequence length
topk_size = 50                             # candidates kept before beam pruning
beam_size = 2                              # beam width for beam search

# The decoder state has the same width as the encoder hidden state.
decoder_size = hidden_dim
def encoder(is_sparse):
    """Build the source-sentence encoder.

    Embeds source word ids, projects them through an fc layer into a
    dynamic LSTM, and returns the LSTM's last hidden step as a fixed-size
    context vector for the decoder.

    Args:
        is_sparse (bool): whether the embedding table uses sparse updates.

    Returns:
        Variable: last LSTM hidden state of each source sequence.
    """
    # Source words arrive as an int64 LoD sequence, one id per time step.
    src_word_id = pd.data(
        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
    # NOTE: param_attr name 'vemb' shares this table with the decoder embedding.
    src_embedding = pd.embedding(
        input=src_word_id,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr=fluid.ParamAttr(name='vemb'))

    # dynamic_lstm requires its input width to be 4x the hidden size
    # (one slice per gate).
    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
    # Collapse the hidden sequence to its final step: the sentence context.
    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
    return encoder_out
def train_decoder(context, is_sparse):
    """Build the training-time decoder (teacher forcing).

    Consumes the ground-truth target words, updates a simple recurrent
    state initialized from the encoder context, and emits a softmax
    distribution over the target vocabulary at every step.

    Args:
        context (Variable): encoder output used to boot the decoder state.
        is_sparse (bool): whether the embedding table uses sparse updates.

    Returns:
        Variable: per-step softmax scores over the target vocabulary.
    """
    trg_language_word = pd.data(
        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
    # Shares the 'vemb' embedding table with the encoder.
    trg_embedding = pd.embedding(
        input=trg_language_word,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr=fluid.ParamAttr(name='vemb'))

    rnn = pd.DynamicRNN()
    with rnn.block():
        current_word = rnn.step_input(trg_embedding)
        # Decoder state memory, booted from the encoder context.
        pre_state = rnn.memory(init=context)
        current_state = pd.fc(
            input=[current_word, pre_state], size=decoder_size, act='tanh')

        current_score = pd.fc(
            input=current_state, size=target_dict_dim, act='softmax')
        rnn.update_memory(pre_state, current_state)
        rnn.output(current_score)

    return rnn()
def decode(context, is_sparse):
    """Build the inference-time beam-search decoder.

    Unrolls the same recurrent cell as ``train_decoder`` inside a
    ``While`` op for up to ``max_length`` steps, keeping ``beam_size``
    hypotheses alive via ``beam_search``.

    Args:
        context (Variable): encoder output used as the initial state.
        is_sparse (bool): whether the embedding lookup uses sparse updates.

    Returns:
        tuple(Variable, Variable): decoded token ids and their scores.
    """
    init_state = context
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    # Loop counter doubles as the write index into the tensor arrays.
    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)

    # Slot 0 of the state array holds the encoder context.
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids and scores act as the beam-search memories across steps.
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)

    while_op = pd.While(cond=cond)
    with while_op.block():
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # Expand the lod of pre_state to match pre_score so each live
        # hypothesis gets its own copy of the state.
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=is_sparse)

        # Same recurrence as the training decoder: fc over [state, word].
        current_state = pd.fc(
            input=[pre_state_expanded, pre_ids_emb],
            size=decoder_size,
            act='tanh')
        current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score)
        # Softmax scores drive the beam-search pruning below.
        current_score = pd.fc(
            input=current_state_with_lod, size=target_dict_dim, act='softmax')
        topk_scores, topk_indices = pd.topk(current_score, k=topk_size)
        # end_id=10 is the sentence-end token id in the wmt14 dictionary
        # -- TODO confirm against the dataset.
        selected_ids, selected_scores = pd.beam_search(
            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)

        pd.increment(x=counter, value=1, in_place=True)

        # Persist this step's state and survivors for the next iteration.
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # Re-evaluate the loop condition in place.
        pd.less_than(x=counter, y=array_len, cond=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array)

    return translation_ids, translation_scores
def train_program(is_sparse):
    """Assemble the full training network: encoder + decoder + loss.

    Args:
        is_sparse (bool): propagated to the embedding layers.

    Returns:
        Variable: mean cross-entropy cost over the batch.
    """
    context = encoder(is_sparse)
    rnn_out = train_decoder(context, is_sparse)
    # Labels are the target sentence shifted by one word.
    label = pd.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = pd.cross_entropy(input=rnn_out, label=label)
    avg_cost = pd.mean(cost)
    return avg_cost
def optimizer_func():
    """Return the Adagrad optimizer (with L2 decay) used for training."""
    return fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L2DecayRegularizer(
            regularization_coeff=0.1))
def train(use_cuda, is_sparse, is_local=True):
    """Train the seq2seq model on the WMT14 dataset.

    Args:
        use_cuda (bool): run on GPU when True; silently skipped when
            paddle was not compiled with CUDA support.
        is_sparse (bool): use sparse updates for the embedding tables.
        is_local (bool): kept for interface compatibility; unused here.
    """
    EPOCH_NUM = 1

    # Bail out when a GPU run is requested on a CPU-only build.
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

    # Must match the pd.data() variable names created by train_program.
    feed_order = [
        'src_word_id', 'target_language_word', 'target_language_next_word'
    ]

    def event_handler(event):
        # Log every 10 steps; stop early after 20 steps (demo-sized run).
        if isinstance(event, fluid.EndStepEvent):
            if event.step % 10 == 0:
                print('pass_id=' + str(event.epoch) + ' batch=' + str(
                    event.step))

            if event.step == 20:
                trainer.stop()

    trainer = fluid.Trainer(
        train_func=partial(train_program, is_sparse),
        place=place,
        optimizer_func=optimizer_func)

    trainer.train(
        reader=train_reader,
        num_epochs=EPOCH_NUM,
        event_handler=event_handler,
        feed_order=feed_order)
def decode_main(use_cuda, is_sparse):
    """Run beam-search decoding on one WMT14 test batch and print it.

    Builds the inference graph, seeds the beam with the start token,
    feeds one test batch, and prints the source sentence, its
    translation, and the corresponding scores.

    Args:
        use_cuda (bool): run on GPU when True; skipped on CPU-only builds.
        is_sparse (bool): propagated to the embedding layers.
    """
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    context = encoder(is_sparse)
    translation_ids, translation_scores = decode(context, is_sparse)

    exe = Executor(place)
    exe.run(framework.default_startup_program())

    # Seed every batch lane with token id 1 (presumably the <s> start
    # mark -- verify against the wmt14 dictionary) at score 1.0.
    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
    init_scores_data = np.array(
        [1. for _ in range(batch_size)], dtype='float32')
    init_ids_data = init_ids_data.reshape((batch_size, 1))
    init_scores_data = init_scores_data.reshape((batch_size, 1))
    # Two-level LoD: one sentence per lane, one candidate per sentence.
    init_lod = [1] * batch_size
    init_lod = [init_lod, init_lod]

    init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
    init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)

    test_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.test(dict_size), buf_size=1000),
        batch_size=batch_size)

    feed_order = ['src_word_id']
    feed_list = [
        framework.default_main_program().global_block().var(var_name)
        for var_name in feed_order
    ]
    feeder = fluid.DataFeeder(feed_list, place)

    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)

    for data in test_data():
        # list(...) keeps this working on Python 3, where map() returns a
        # lazy iterator and the feed_data[0][0] indexing below would fail.
        feed_data = list(map(lambda x: [x[0]], data))
        feed_dict = feeder.feed(feed_data)
        feed_dict['init_ids'] = init_ids
        feed_dict['init_scores'] = init_scores

        results = exe.run(
            framework.default_main_program(),
            feed=feed_dict,
            fetch_list=[translation_ids, translation_scores],
            return_numpy=False)

        result_ids = np.array(results[0])
        result_scores = np.array(results[1])

        print("Original sentence:")
        print(" ".join([src_dict[w] for w in feed_data[0][0]]))
        print("Translated sentence:")
        print(" ".join([trg_dict[w] for w in result_ids]))
        print("Corresponding score: ", result_scores)

        # Demo: only decode the first batch.
        break
def inference_program():
    """Build the inference graph (dense embeddings) and return its outputs.

    Returns:
        tuple(Variable, Variable): decoded token ids and their scores.
    """
    is_sparse = False
    context = encoder(is_sparse)
    translation_ids, translation_scores = decode(context, is_sparse)
    return translation_ids, translation_scores
def main(use_cuda):
    """Train the model, then run a decoding demo.

    Args:
        use_cuda (bool): GPU flag for training only; decoding always runs
            on CPU because beam search does not support CUDA.
    """
    train(use_cuda, False)
    decode_main(False, False)  # Beam Search does not support CUDA
if __name__ == '__main__':
    # GPU use is opt-in via the WITH_GPU environment variable.
    use_cuda = os.getenv('WITH_GPU', '0') != '0'
    main(use_cuda)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册