diff --git a/08.machine_translation/README.md b/08.machine_translation/README.md index 20c5f0acef0daca0a48b343e148ac0af7fc8ab4f..6ac82a74034d98b1b8212e4acc0df15f91ce8ffe 100644 --- a/08.machine_translation/README.md +++ b/08.machine_translation/README.md @@ -421,7 +421,7 @@ translation_ids, translation_scores = decode(context, is_sparse) ### Define DataSet -We initialize ids and scores and create tensors for input. This test we are using first record data from `wmt14.test` for inference. At the end we get src dict and target dict for printing out results later. +We initialize ids and scores and create tensors for input. In this test we are using first record data from `wmt14.test` for inference. At the end we get src dict and target dict for printing out results later. ```python init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') diff --git a/08.machine_translation/index.html b/08.machine_translation/index.html index 42e4f154d0ec31d64d2952eb2c7cb5554b155f71..0a04d7a65da627916b568e504bb2c0e59cbc0ec4 100644 --- a/08.machine_translation/index.html +++ b/08.machine_translation/index.html @@ -463,7 +463,7 @@ translation_ids, translation_scores = decode(context, is_sparse) ### Define DataSet -We initialize ids and scores and create tensors for input. This test we are using first record data from `wmt14.test` for inference. At the end we get src dict and target dict for printing out results later. +We initialize ids and scores and create tensors for input. In this test we are using first record data from `wmt14.test` for inference. At the end we get src dict and target dict for printing out results later. ```python init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') diff --git a/08.machine_translation/infer.py b/08.machine_translation/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..263bb730b0c6128c4378e7cdd779ae7765076414 --- /dev/null +++ b/08.machine_translation/infer.py @@ -0,0 +1,191 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as pd +from paddle.fluid.executor import Executor +import os + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +hidden_dim = 32 +word_dim = 32 +batch_size = 2 +max_length = 8 +topk_size = 50 +beam_size = 2 + +is_sparse = True +decoder_size = hidden_dim +model_save_dir = "machine_translation.inference.model" + + +def encoder(): + src_word_id = pd.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = pd.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = pd.sequence_last_step(input=lstm_hidden0) + return encoder_out + + +def decode(context): + init_state = context + array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) + counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) + + # fill the first element with init_state + state_array = pd.create_array('float32') + pd.array_write(init_state, array=state_array, i=counter) + + # ids, scores as memory + ids_array = pd.create_array('int64') + scores_array = pd.create_array('float32') + + init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = pd.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + pd.array_write(init_ids, array=ids_array, i=counter) + pd.array_write(init_scores, array=scores_array, i=counter) + + cond = pd.less_than(x=counter, y=array_len) + + while_op = pd.While(cond=cond) + with while_op.block(): + pre_ids = pd.array_read(array=ids_array, i=counter) + pre_state = pd.array_read(array=state_array, i=counter) + pre_score = pd.array_read(array=scores_array, i=counter) + + # expand the lod of pre_state to be the same with pre_score + pre_state_expanded = pd.sequence_expand(pre_state, pre_score) + + pre_ids_emb = pd.embedding( + input=pre_ids, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + # use rnn unit to update rnn + current_state = pd.fc( + input=[pre_state_expanded, pre_ids_emb], + size=decoder_size, + act='tanh') + current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) + # use score to do beam search + current_score = pd.fc( + input=current_state_with_lod, size=target_dict_dim, act='softmax') + topk_scores, topk_indices = pd.topk(current_score, k=topk_size) + selected_ids, selected_scores = pd.beam_search( + pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + + with pd.Switch() as switch: + with switch.case(pd.is_empty(selected_ids)): + pd.fill_constant( + shape=[1], value=0, dtype='bool', force_cpu=True, out=cond) + with switch.default(): + pd.increment(x=counter, value=1, in_place=True) + + # update the memories + pd.array_write(current_state, array=state_array, i=counter) + pd.array_write(selected_ids, array=ids_array, i=counter) + pd.array_write(selected_scores, array=scores_array, i=counter) + + pd.less_than(x=counter, y=array_len, cond=cond) + + translation_ids, translation_scores = pd.beam_search_decode( + ids=ids_array, scores=scores_array) + + return translation_ids, translation_scores + + +def decode_main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + exe = Executor(place) + exe.run(framework.default_startup_program()) + + context = encoder() + translation_ids, translation_scores = decode(context) + fluid.io.load_persistables(executor=exe, dirname=model_save_dir) + + init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_lod = [1] * batch_size + init_lod = [init_lod, init_lod] + + init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) + init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + + test_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.test(dict_size), buf_size=1000), + batch_size=batch_size) + + feed_order = ['src_word_id'] + feed_list = [ + framework.default_main_program().global_block().var(var_name) + for var_name in feed_order + ] + feeder = fluid.DataFeeder(feed_list, place) + + src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) + + for data in test_data(): + feed_data = map(lambda x: [x[0]], data) + feed_dict = feeder.feed(feed_data) + feed_dict['init_ids'] = init_ids + feed_dict['init_scores'] = init_scores + + results = exe.run( + framework.default_main_program(), + feed=feed_dict, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + + result_ids = np.array(results[0]) + result_scores = np.array(results[1]) + + print("Original sentence:") + print(" ".join([src_dict[w] for w in feed_data[0][0]])) + print("Translated sentence:") + print(" ".join([trg_dict[w] for w in result_ids])) + print("Corresponding score: ", result_scores) + + break + + +def main(use_cuda): + decode_main(False) # Beam Search does not support CUDA + + +if __name__ == '__main__': + use_cuda = os.getenv('WITH_GPU', '0') != '0' + main(use_cuda) diff --git a/08.machine_translation/train.py b/08.machine_translation/train.py index c417d9f773686ad5bc95287fe7cb3dec9a2e2ac4..e5d318d79a3bc720fbcfe00d10216d8fc98c2fe8 100644 --- a/08.machine_translation/train.py +++ b/08.machine_translation/train.py @@ -11,31 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import contextlib - -import numpy as np import paddle import paddle.fluid as fluid -import paddle.fluid.framework as framework import paddle.fluid.layers as pd -from paddle.fluid.executor import Executor -from functools import partial import os dict_size = 30000 source_dict_dim = target_dict_dim = dict_size hidden_dim = 32 -word_dim = 16 +word_dim = 32 batch_size = 2 max_length = 8 topk_size = 50 beam_size = 2 +is_sparse = True decoder_size = hidden_dim +model_save_dir = "machine_translation.inference.model" -def encoder(is_sparse): - # encoder +def encoder(): src_word_id = pd.data( name="src_word_id", shape=[1], dtype='int64', lod_level=1) src_embedding = pd.embedding( @@ -51,8 +46,7 @@ def encoder(is_sparse): return encoder_out -def train_decoder(context, is_sparse): - # decoder +def train_decoder(context): trg_language_word = pd.data( name="target_language_word", shape=[1], dtype='int64', lod_level=1) trg_embedding = pd.embedding( @@ -65,7 +59,7 @@ def train_decoder(context, is_sparse): rnn = pd.DynamicRNN() with rnn.block(): current_word = rnn.step_input(trg_embedding) - pre_state = rnn.memory(init=context) + pre_state = rnn.memory(init=context, need_reorder=True) current_state = pd.fc( input=[current_word, pre_state], size=decoder_size, act='tanh') @@ -77,74 +71,9 @@ def train_decoder(context, is_sparse): return rnn() -def decode(context, is_sparse): - init_state = context - array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) - counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) - - # fill the first element with init_state - state_array = pd.create_array('float32') - pd.array_write(init_state, array=state_array, i=counter) - - # ids, scores as memory - ids_array = pd.create_array('int64') - scores_array = pd.create_array('float32') - - init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) - init_scores = pd.data( - name="init_scores", shape=[1], dtype="float32", lod_level=2) - - pd.array_write(init_ids, array=ids_array, i=counter) - pd.array_write(init_scores, array=scores_array, i=counter) - - cond = pd.less_than(x=counter, y=array_len) - - while_op = pd.While(cond=cond) - with while_op.block(): - pre_ids = pd.array_read(array=ids_array, i=counter) - pre_state = pd.array_read(array=state_array, i=counter) - pre_score = pd.array_read(array=scores_array, i=counter) - - # expand the lod of pre_state to be the same with pre_score - pre_state_expanded = pd.sequence_expand(pre_state, pre_score) - - pre_ids_emb = pd.embedding( - input=pre_ids, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=is_sparse) - - # use rnn unit to update rnn - current_state = pd.fc( - input=[pre_state_expanded, pre_ids_emb], - size=decoder_size, - act='tanh') - current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) - # use score to do beam search - current_score = pd.fc( - input=current_state_with_lod, size=target_dict_dim, act='softmax') - topk_scores, topk_indices = pd.topk(current_score, k=topk_size) - selected_ids, selected_scores = pd.beam_search( - pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) - - pd.increment(x=counter, value=1, in_place=True) - - # update the memories - pd.array_write(current_state, array=state_array, i=counter) - pd.array_write(selected_ids, array=ids_array, i=counter) - pd.array_write(selected_scores, array=scores_array, i=counter) - - pd.less_than(x=counter, y=array_len, cond=cond) - - translation_ids, translation_scores = pd.beam_search_decode( - ids=ids_array, scores=scores_array) - - return translation_ids, translation_scores - - -def train_program(is_sparse): - context = encoder(is_sparse) - rnn_out = train_decoder(context, is_sparse) +def train_program(): + context = encoder() + rnn_out = train_decoder(context) label = pd.data( name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) cost = pd.cross_entropy(input=rnn_out, label=label) @@ -159,7 +88,7 @@ def optimizer_func(): regularization_coeff=0.1)) -def train(use_cuda, is_sparse, is_local=True): +def train(use_cuda): EPOCH_NUM = 1 if use_cuda and not fluid.core.is_compiled_with_cuda(): @@ -181,13 +110,11 @@ def train(use_cuda, is_sparse, is_local=True): print('pass_id=' + str(event.epoch) + ' batch=' + str( event.step)) - if event.step == 20: - trainer.stop() + if isinstance(event, fluid.EndEpochEvent): + trainer.save_params(model_save_dir) trainer = fluid.Trainer( - train_func=partial(train_program, is_sparse), - place=place, - optimizer_func=optimizer_func) + train_func=train_program, place=place, optimizer_func=optimizer_func) trainer.train( reader=train_reader, @@ -196,76 +123,8 @@ def train(use_cuda, is_sparse, is_local=True): feed_order=feed_order) -def decode_main(use_cuda, is_sparse): - if use_cuda and not fluid.core.is_compiled_with_cuda(): - return - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - - context = encoder(is_sparse) - translation_ids, translation_scores = decode(context, is_sparse) - - exe = Executor(place) - exe.run(framework.default_startup_program()) - - init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') - init_scores_data = np.array( - [1. for _ in range(batch_size)], dtype='float32') - init_ids_data = init_ids_data.reshape((batch_size, 1)) - init_scores_data = init_scores_data.reshape((batch_size, 1)) - init_lod = [1] * batch_size - init_lod = [init_lod, init_lod] - - init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) - init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) - - test_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.wmt14.test(dict_size), buf_size=1000), - batch_size=batch_size) - - feed_order = ['src_word_id'] - feed_list = [ - framework.default_main_program().global_block().var(var_name) - for var_name in feed_order - ] - feeder = fluid.DataFeeder(feed_list, place) - - src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) - - for data in test_data(): - feed_data = map(lambda x: [x[0]], data) - feed_dict = feeder.feed(feed_data) - feed_dict['init_ids'] = init_ids - feed_dict['init_scores'] = init_scores - - results = exe.run( - framework.default_main_program(), - feed=feed_dict, - fetch_list=[translation_ids, translation_scores], - return_numpy=False) - - result_ids = np.array(results[0]) - result_scores = np.array(results[1]) - - print("Original sentence:") - print(" ".join([src_dict[w] for w in feed_data[0][0]])) - print("Translated sentence:") - print(" ".join([trg_dict[w] for w in result_ids])) - print("Corresponding score: ", result_scores) - - break - - -def inference_program(): - is_sparse = False - context = encoder(is_sparse) - translation_ids, translation_scores = decode(context, is_sparse) - return translation_ids, translation_scores - - def main(use_cuda): - train(use_cuda, False) - decode_main(False, False) # Beam Search does not support CUDA + train(use_cuda) if __name__ == '__main__':