From b0868af5d719359a8590610fbcfee0419bd81910 Mon Sep 17 00:00:00 2001 From: Nicky Date: Wed, 23 May 2018 18:28:02 -0700 Subject: [PATCH] Simplify Machine Translation demo by using Trainer API --- .../tests/book/high-level-api/CMakeLists.txt | 1 + .../machine_translation/CMakeLists.txt | 7 + .../test_machine_translation.py | 319 ++++++++++++++++++ 3 files changed, 327 insertions(+) create mode 100644 python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt index c2a15bdb3..024b16cb2 100644 --- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt @@ -8,3 +8,4 @@ endforeach() add_subdirectory(fit_a_line) add_subdirectory(recognize_digits) +add_subdirectory(machine_translation) diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt new file mode 100644 index 000000000..673c965b6 --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt @@ -0,0 +1,7 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +# default test +foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) +endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py new file mode 100644 index 000000000..7204c7b3c --- /dev/null +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -0,0 +1,319 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import contextlib + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.fluid.layers as pd +from paddle.fluid.executor import Executor +from functools import partial +import unittest +import os + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +hidden_dim = 32 +word_dim = 16 +batch_size = 2 +max_length = 8 +topk_size = 50 +trg_dic_size = 10000 +beam_size = 2 + +decoder_size = hidden_dim + + +def encoder(is_sparse): + # encoder + src_word_id = pd.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = pd.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = pd.sequence_last_step(input=lstm_hidden0) + return encoder_out + + +def decoder_train(context, is_sparse): + # decoder + trg_language_word = pd.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = pd.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = pd.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + pre_state = rnn.memory(init=context) + current_state = pd.fc(input=[current_word, pre_state], + size=decoder_size, + act='tanh') + + current_score = pd.fc(input=current_state, + size=target_dict_dim, + act='softmax') + rnn.update_memory(pre_state, current_state) + rnn.output(current_score) + + return rnn() + + +def decoder_decode(context, is_sparse): + init_state = context + array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) + counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) + + # fill the first element with init_state + state_array = pd.create_array('float32') + pd.array_write(init_state, array=state_array, i=counter) + + # ids, scores as memory + ids_array = pd.create_array('int64') + scores_array = pd.create_array('float32') + + init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = pd.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + pd.array_write(init_ids, array=ids_array, i=counter) + pd.array_write(init_scores, array=scores_array, i=counter) + + cond = pd.less_than(x=counter, y=array_len) + + while_op = pd.While(cond=cond) + with while_op.block(): + pre_ids = pd.array_read(array=ids_array, i=counter) + pre_state = pd.array_read(array=state_array, i=counter) + pre_score = pd.array_read(array=scores_array, i=counter) + + # expand the lod of pre_state to be the same with pre_score + pre_state_expanded = pd.sequence_expand(pre_state, pre_score) + + pre_ids_emb = pd.embedding( + input=pre_ids, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=is_sparse) + + # use rnn unit to update rnn + current_state = pd.fc(input=[pre_state_expanded, pre_ids_emb], + size=decoder_size, + act='tanh') + current_state_with_lod = pd.lod_reset(x=current_state, y=pre_score) + # use score to do beam search + current_score = pd.fc(input=current_state_with_lod, + size=target_dict_dim, + act='softmax') + topk_scores, topk_indices = pd.topk(current_score, k=topk_size) + selected_ids, selected_scores = pd.beam_search( + pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + + pd.increment(x=counter, value=1, in_place=True) + + # update the memories + pd.array_write(current_state, array=state_array, i=counter) + pd.array_write(selected_ids, array=ids_array, i=counter) + pd.array_write(selected_scores, array=scores_array, i=counter) + + pd.less_than(x=counter, y=array_len, cond=cond) + + translation_ids, translation_scores = pd.beam_search_decode( + ids=ids_array, scores=scores_array) + + # return init_ids, init_scores + + return translation_ids, translation_scores + + +def set_init_lod(data, lod, place): + res = fluid.LoDTensor() + res.set(data, place) + res.set_lod(lod) + return res + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def train_program(is_sparse): + context = encoder(is_sparse) + rnn_out = decoder_train(context, is_sparse) + label = pd.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = pd.cross_entropy(input=rnn_out, label=label) + avg_cost = pd.mean(cost) + return avg_cost + + +def train(use_cuda, is_sparse, is_local=True): + EPOCH_NUM = 1 + + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + + feed_order = [ + 'src_word_id', 'target_language_word', 'target_language_next_word' + ] + + def event_handler(event): + if isinstance(event, fluid.EndStepEvent): + print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step)) + if event.step == 10: + trainer.stop() + + trainer = fluid.Trainer( + train_func=partial(train_program, is_sparse), + optimizer=fluid.optimizer.Adagrad( + learning_rate=1e-4, + regularization=fluid.regularizer.L2DecayRegularizer( + regularization_coeff=0.1)), + place=place) + + trainer.train( + reader=train_reader, + num_epochs=EPOCH_NUM, + event_handler=event_handler, + feed_order=feed_order) + + +def decode_main(use_cuda, is_sparse): + + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder(is_sparse) + translation_ids, translation_scores = decoder_decode(context, is_sparse) + + exe = Executor(place) + exe.run(framework.default_startup_program()) + + init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_lod = [i for i in range(batch_size)] + [batch_size] + init_lod = [init_lod, init_lod] + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + for _, data in enumerate(train_data()): + init_ids = set_init_lod(init_ids_data, init_lod, place) + init_scores = set_init_lod(init_scores_data, init_lod, place) + + src_word_data = to_lodtensor(map(lambda x: x[0], data), place) + + result_ids, result_scores = exe.run( + framework.default_main_program(), + feed={ + 'src_word_id': src_word_data, + 'init_ids': init_ids, + 'init_scores': init_scores + }, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + print result_ids.lod() + break + + +class TestMachineTranslation(unittest.TestCase): + pass + + +@contextlib.contextmanager +def scope_prog_guard(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +def inject_test_train(use_cuda, is_sparse): + f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse' + if is_sparse else 'dense') + + def f(*args): + with scope_prog_guard(): + train(use_cuda, is_sparse) + + setattr(TestMachineTranslation, f_name, f) + + +def inject_test_decode(use_cuda, is_sparse, decorator=None): + f_name = 'test_{0}_{1}_decode'.format('cuda' + if use_cuda else 'cpu', 'sparse' + if is_sparse else 'dense') + + def f(*args): + with scope_prog_guard(): + decode_main(use_cuda, is_sparse) + + if decorator is not None: + f = decorator(f) + + setattr(TestMachineTranslation, f_name, f) + + +for _use_cuda_ in (False, True): + for _is_sparse_ in (False, True): + inject_test_train(_use_cuda_, _is_sparse_) + +for _use_cuda_ in (False, True): + for _is_sparse_ in (False, True): + + _decorator_ = None + if _use_cuda_: + _decorator_ = unittest.skip( + reason='Beam Search does not support CUDA!') + + inject_test_decode( + is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_) + +if __name__ == '__main__': + unittest.main() -- GitLab