seqToseq_net.py 5.6 KB
Newer Older
L
Luo Tao 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
# edit-mode: -*- python -*-

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from paddle.trainer_config_helpers import *

### Data Definition
# Locations of the preprocessed corpus and of the two dictionary files.
data_dir = "./data/pre-wmt14"
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')

# Switches the whole config between training and generation.
is_generating = get_config_arg("is_generating", bool, False)

if is_generating:
    # Generation: no training list, and the test list is gen.list.
    train_list = None
    test_list = os.path.join(data_dir, 'gen.list')
else:
    # Training: both a train list and a test list are used.
    train_list = os.path.join(data_dir, 'train.list')
    test_list = os.path.join(data_dir, 'test.list')

# Register dataprovider.process as the data source; the dictionary paths and
# the mode flag are forwarded to it through `args`.
define_py_data_sources2(
    train_list,
    test_list,
    module="dataprovider",
    obj="process",
    args=dict(
        src_dict_path=src_lang_dict,
        trg_dict_path=trg_lang_dict,
        is_generating=is_generating))

### Algorithm Configuration
L
Luo Tao 已提交
45
# Optimizer and batch configuration: Adam, 50 samples per batch, lr 5e-4.
settings(
    batch_size=50,
    learning_rate=5e-4,
    learning_method=AdamOptimizer())
L
Luo Tao 已提交
46 47 48 49

### Network Architecture
# The dictionary dimensions are the number of lines in each dictionary file.
# Use `with` so the file handles are closed right after counting instead of
# being left open until garbage collection.
with open(src_lang_dict, "r") as src_dict_file:
    source_dict_dim = len(src_dict_file.readlines())
with open(trg_lang_dict, "r") as trg_dict_file:
    target_dict_dim = len(trg_dict_file.readlines())
50 51 52
# dimension of word vector
word_vector_dim = 512
# dimension of hidden unit in GRU Decoder network
decoder_size = 512
# dimension of hidden unit in GRU Encoder network
encoder_size = 512
L
Luo Tao 已提交
53 54

# Settings that are only needed (and only defined) in generation mode.
if is_generating:
    beam_size = 3  # expand width in beam search
    max_length = 250  # a stop condition of sequence generation
    # Passed below as result_file of seqtext_printer_evaluator; defaults to
    # None when the config argument is not supplied.
    gen_trans_file = get_config_arg("gen_trans_file", str, None)

#### Encoder
# Source word ids, sized by the source dictionary.
src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
# Embed the source words; the embedding parameter is explicitly named
# '_source_language_embedding'.
src_embedding = embedding_layer(
    input=src_word_id,
    size=word_vector_dim,
    param_attr=ParamAttr(name='_source_language_embedding'))
# Two GRUs over the same embedding: one in the default direction and one
# with reverse=True.
src_forward = simple_gru(input=src_embedding, size=encoder_size)
src_backward = simple_gru(input=src_embedding, size=encoder_size, reverse=True)
# The encoder output is the concatenation of both GRU outputs.
encoded_vector = concat_layer(input=[src_forward, src_backward])

69
#### Decoder
L
Luo Tao 已提交
70 71 72 73 74 75 76 77 78
# Project the encoder output to decoder_size. This projection is handed to
# the decoder group as a static input and consumed there by simple_attention.
with mixed_layer(size=decoder_size) as encoded_proj:
    encoded_proj += full_matrix_projection(input=encoded_vector)

# Build the boot (initial) state of the decoder memory from the reversed GRU:
# first_seq is applied to src_backward (presumably selecting its first
# element -- confirm against the first_seq docs), then projected to
# decoder_size with a tanh activation.
backward_first = first_seq(input=src_backward)
with mixed_layer(
        size=decoder_size,
        act=TanhActivation(), ) as decoder_boot:
    decoder_boot += full_matrix_projection(input=backward_first)

79

L
Luo Tao 已提交
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
    """Build one decoder step: attention context + current word -> GRU -> softmax.

    This function is used as the ``step`` of ``recurrent_group`` (training)
    and ``beam_search`` (generation) in this config.

    Args:
        enc_vec: encoded source sequence, passed to ``simple_attention`` as
            ``encoded_sequence``.
        enc_proj: projection of the encoded sequence, passed to
            ``simple_attention`` as ``encoded_proj``.
        current_word: input for the current target word; projected into the
            GRU step input.

    Returns:
        The softmax mixed layer of size ``target_dict_dim`` built on top of
        the GRU step output.
    """
    # The memory and the gru_step_layer below share the name 'gru_decoder';
    # the memory is booted from the module-level decoder_boot layer.
    prev_state = memory(
        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

    attention_context = simple_attention(
        encoded_sequence=enc_vec,
        encoded_proj=enc_proj,
        decoder_state=prev_state)

    # The GRU step input is 3 * decoder_size wide and sums the projections
    # of the attention context and of the current word.
    with mixed_layer(size=decoder_size * 3) as step_input:
        step_input += full_matrix_projection(input=attention_context)
        step_input += full_matrix_projection(input=current_word)

    next_state = gru_step_layer(
        name='gru_decoder',
        input=step_input,
        output_mem=prev_state,
        size=decoder_size)

    # Output layer over the target dictionary, with bias and softmax.
    with mixed_layer(
            size=target_dict_dim, bias_attr=True,
            act=SoftmaxActivation()) as word_distribution:
        word_distribution += full_matrix_projection(input=next_state)
    return word_distribution

105

L
Luo Tao 已提交
106
# Name shared by the training recurrent_group and the generation beam_search.
decoder_group_name = "decoder_group"
# The encoder output and its projection enter the decoder group as static
# sequence inputs. The list order follows the parameter order of
# gru_decoder_with_attention (enc_vec, enc_proj); the target-word input is
# appended later, depending on the mode.
group_input1 = StaticInput(input=encoded_vector, is_seq=True)
group_input2 = StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
L
Luo Tao 已提交
110 111 112

# Finish the network: a cost layer for training, or a beam-search generator
# plus a text printer for generation.
if not is_generating:
    # Embedding of the target words; its parameter is named
    # '_target_language_embedding', the same name the generation branch
    # passes as embedding_name.
    trg_embedding = embedding_layer(
        input=data_layer(name='target_language_word', size=target_dict_dim),
        size=word_vector_dim,
        param_attr=ParamAttr(name='_target_language_embedding'))
    group_inputs.append(trg_embedding)

    # For decoder equipped with attention mechanism, in training,
    # target embedding (the ground truth) is the data input,
    # while encoded source sequence is accessed as an unbounded memory.
    # Here, the StaticInput defines a read-only memory
    # for the recurrent_group.
    decoder = recurrent_group(
        name=decoder_group_name,
        step=gru_decoder_with_attention,
        input=group_inputs)

    # Classification cost of the decoder output against the next-word labels.
    lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
    cost = classification_cost(input=decoder, label=lbl)
    outputs(cost)
else:
    # In generation, the decoder predicts a next target word based on
    # the encoded source sequence and the last generated target word.

    # The encoded source sequence (encoder's output) must be specified by
    # StaticInput, which is a read-only memory.
    # Embedding of the last generated word is automatically obtained by
    # GeneratedInput, which is initialized by a start mark, such as <s>,
    # and must be included in generation.

    trg_embedding = GeneratedInput(
        size=target_dict_dim,
        embedding_name='_target_language_embedding',
        embedding_size=word_vector_dim)
    group_inputs.append(trg_embedding)

    # Beam search over the same step function; ids 0 and 1 are used as the
    # begin-of-sequence and end-of-sequence markers.
    beam_gen = beam_search(
        name=decoder_group_name,
        step=gru_decoder_with_attention,
        input=group_inputs,
        bos_id=0,
        eos_id=1,
        beam_size=beam_size,
        max_length=max_length)

    # Print the generated ids as text using the target dictionary; output
    # goes to gen_trans_file.
    seqtext_printer_evaluator(
        input=beam_gen,
        id_input=data_layer(name="sent_id", size=1),
        dict_file=trg_lang_dict,
        result_file=gen_trans_file)
    outputs(beam_gen)