api_train_v2.py 6.3 KB
Newer Older
1
import sys
Q
qiaolongfei 已提交
2

3 4 5
import paddle.v2 as paddle


Q
qiaolongfei 已提交
6
def seqToseq_net(source_dict_dim,
                 target_dict_dim,
                 is_generating=False,
                 beam_size=3,
                 max_length=250):
    """Build an attention-based GRU encoder-decoder network.

    The encoder runs a forward and a reversed ``simple_gru`` over the source
    embedding and concatenates their outputs. The decoder is a GRU step with
    ``simple_attention`` over the encoded sequence.

    Args:
        source_dict_dim: Dictionary size used for the integer-sequence data
            layer ``'source_language_word'``.
        target_dict_dim: Dictionary size used for the target-side data layers
            and for the width of the decoder's softmax output.
        is_generating: If False (default), build the training topology.
            If True, build the beam-search generation topology.
        beam_size: Beam width passed to ``paddle.layer.beam_search``.
            Only used when ``is_generating`` is True.
        max_length: Maximum generated length passed to
            ``paddle.layer.beam_search``. Only used when ``is_generating``
            is True.

    Returns:
        The ``classification_cost`` layer when ``is_generating`` is False,
        otherwise the ``beam_search`` layer.
    """
    ### Network Architecture
    word_vector_dim = 512  # dimension of word vector
    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
    encoder_size = 512  # dimension of hidden unit in GRU Encoder network

    #### Encoder
    src_word_id = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    src_embedding = paddle.layer.embedding(
        input=src_word_id,
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
    src_forward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size)
    src_backward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size, reverse=True)
    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])

    #### Decoder
    # Projection of the encoded sequence to decoder_size; handed to the
    # attention module as `encoded_proj`.
    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
        encoded_proj += paddle.layer.full_matrix_projection(
            input=encoded_vector)

    # The decoder's initial state is a tanh projection of the first output
    # of the reversed encoder GRU.
    backward_first = paddle.layer.first_seq(input=src_backward)

    with paddle.layer.mixed(
            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
        decoder_boot += paddle.layer.full_matrix_projection(
            input=backward_first)

    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        """One decoder step: attend, update the GRU state, predict a word.

        Args:
            enc_vec: Encoded source sequence (static input).
            enc_proj: Projection of the encoded source sequence
                (static input).
            current_word: Embedding of the current target-side input word.

        Returns:
            Softmax layer of width ``target_dict_dim``.
        """
        # The memory and the gru_step below share the name 'gru_decoder';
        # the memory is booted from decoder_boot and passed back to gru_step
        # as output_mem.
        decoder_mem = paddle.layer.memory(
            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

        context = paddle.networks.simple_attention(
            encoded_sequence=enc_vec,
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

        # NOTE(review): the input is 3 * decoder_size wide, presumably
        # because gru_step expects the gate and candidate inputs packed
        # together -- confirm against the gru_step documentation.
        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
            decoder_inputs += paddle.layer.full_matrix_projection(
                input=current_word)

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
            input=decoder_inputs,
            output_mem=decoder_mem,
            size=decoder_size)

        with paddle.layer.mixed(
                size=target_dict_dim,
                bias_attr=True,
                act=paddle.activation.Softmax()) as out:
            out += paddle.layer.full_matrix_projection(input=gru_step)
        return out

    decoder_group_name = "decoder_group"
    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]

    if not is_generating:
        trg_embedding = paddle.layer.embedding(
            input=paddle.layer.data(
                name='target_language_word',
                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
            size=word_vector_dim,
            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
        group_inputs.append(trg_embedding)

        # For a decoder equipped with an attention mechanism, in training,
        # the target embedding (the ground truth) is the data input,
        # while the encoded source sequence is accessed as an unbounded
        # memory. Here, the StaticInput defines a read-only memory
        # for the recurrent_group.
        decoder = paddle.layer.recurrent_group(
            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs)

        lbl = paddle.layer.data(
            name='target_language_next_word',
            type=paddle.data_type.integer_value_sequence(target_dict_dim))
        cost = paddle.layer.classification_cost(input=decoder, label=lbl)

        return cost
    else:
        # In generation, the decoder predicts the next target word based on
        # the encoded source sequence and the last generated target word.

        # The encoded source sequence (encoder's output) must be specified by
        # StaticInput, which is a read-only memory.
        # The embedding of the last generated word is obtained automatically
        # by GeneratedInput, which is initialized by a start mark, such as
        # <s>, and must be included in generation.
        trg_embedding = paddle.layer.GeneratedInputV2(
            size=target_dict_dim,
            embedding_name='_target_language_embedding',
            embedding_size=word_vector_dim)
        group_inputs.append(trg_embedding)

        beam_gen = paddle.layer.beam_search(
            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs,
            bos_id=0,
            eos_id=1,
            beam_size=beam_size,
            max_length=max_length)

        return beam_gen
125 126 127 128 129


def main():
    """Train the seq2seq network on the WMT14 dataset using the CPU.

    Builds the training topology with ``seqToseq_net``, creates its
    parameters, and runs ``paddle.trainer.SGD.train`` with an Adam optimizer.
    Progress goes to stdout: the cost and metrics every 10th batch, and a
    dot for every other batch.
    """
    paddle.init(use_gpu=False, trainer_count=1)

    # source and target dict dim.
    dict_size = 30000
    source_dict_dim = target_dict_dim = dict_size

    # define network topology
    cost = seqToseq_net(source_dict_dim, target_dict_dim)
    parameters = paddle.parameters.create(cost)

    # define optimize method and trainer
    optimizer = paddle.optimizer.Adam(
        learning_rate=5e-5,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3))
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer)

    # map each data layer name to its position in a reader sample
    feeding = {
        'source_language_word': 0,
        'target_language_word': 1,
        'target_language_next_word': 2
    }

    # define data reader: shuffled WMT14 training set in batches of 5
    wmt14_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
        batch_size=5)

    # define event_handler callback
    def event_handler(event):
        """Report progress at the end of each training iteration."""
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                # A single parenthesized argument prints the same text under
                # Python 2's print statement and Python 3's print function.
                print("\nPass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))
            else:
                sys.stdout.write('.')
                sys.stdout.flush()

    # start to train
    trainer.train(
        reader=wmt14_reader,
        event_handler=event_handler,
        num_passes=10000,
        feeding=feeding)
174 175 176 177


# Start training only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()