# api_train_v2.py
1
import sys
2
import paddle.v2 as paddle
3
import paddle.v2.layers.beam_search as beam_search
4 5


Q
qiaolongfei 已提交
6
def seqToseq_net(source_dict_dim, target_dict_dim, is_generating):
    """Build a GRU encoder-decoder network with attention.

    The encoder embeds the source word ids, runs one forward and one
    reversed GRU over the embedding and concatenates both outputs.  The
    decoder is a recurrent step function that attends over the encoded
    sequence and emits a softmax over the target dictionary.

    Args:
        source_dict_dim: size of the source dictionary; used as the range of
            the 'source_language_word' integer sequence input.
        target_dict_dim: size of the target dictionary; used as the range of
            the target-side inputs and as the width of the decoder softmax.
        is_generating: when false, build the training topology; when true,
            build the beam search (generation) topology.

    Returns:
        The classification cost layer when ``is_generating`` is false,
        otherwise the layer returned by ``beam_search.beam_search``.
    """
    ### Network Architecture
    word_vector_dim = 512  # dimension of word vector
    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
    encoder_size = 512  # dimension of hidden unit in GRU Encoder network

    # Only used by the generation branch below.
    beam_size = 3
    max_length = 250

    #### Encoder
    src_word_id = paddle.layer.data(
        name='source_language_word',
        type=paddle.data_type.integer_value_sequence(source_dict_dim))
    src_embedding = paddle.layer.embedding(
        input=src_word_id,
        size=word_vector_dim,
        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
    src_forward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size)
    src_backward = paddle.networks.simple_gru(
        input=src_embedding, size=encoder_size, reverse=True)
    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])

    #### Decoder
    # Projection of the encoded sequence, handed to the attention below.
    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
        encoded_proj += paddle.layer.full_matrix_projection(
            input=encoded_vector)

    backward_first = paddle.layer.first_seq(input=src_backward)

    # Initial decoder state: tanh projection of the first element of the
    # reversed GRU output.
    with paddle.layer.mixed(
            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
        decoder_boot += paddle.layer.full_matrix_projection(
            input=backward_first)

    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        """One decoder step: attend, update the GRU state, predict a word."""
        # The memory shares its name with the gru_step layer below, so it
        # reads that layer's previous output; decoder_boot seeds step 0.
        decoder_mem = paddle.layer.memory(
            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

        context = paddle.networks.simple_attention(
            encoded_sequence=enc_vec,
            encoded_proj=enc_proj,
            decoder_state=decoder_mem)

        # The projection is three times the GRU hidden size, presumably one
        # slice per gate/candidate as gru_step expects -- confirm against
        # the Paddle gru_step documentation.
        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
            decoder_inputs += paddle.layer.full_matrix_projection(
                input=current_word)

        gru_step = paddle.layer.gru_step(
            name='gru_decoder',
            input=decoder_inputs,
            output_mem=decoder_mem,
            size=decoder_size)

        with paddle.layer.mixed(
                size=target_dict_dim,
                bias_attr=True,
                act=paddle.activation.Softmax()) as out:
            out += paddle.layer.full_matrix_projection(input=gru_step)
        return out

    decoder_group_name = "decoder_group"
    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
    group_inputs = [group_input1, group_input2]

    if not is_generating:
        trg_embedding = paddle.layer.embedding(
            input=paddle.layer.data(
                name='target_language_word',
                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
            size=word_vector_dim,
            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
        group_inputs.append(trg_embedding)

        # For a decoder equipped with an attention mechanism, in training,
        # the target embedding (the ground truth) is the data input,
        # while the encoded source sequence is accessed as an unbounded
        # memory.  Here, the StaticInput defines a read-only memory
        # for the recurrent_group.
        decoder = paddle.layer.recurrent_group(
            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs)

        lbl = paddle.layer.data(
            name='target_language_next_word',
            type=paddle.data_type.integer_value_sequence(target_dict_dim))
        cost = paddle.layer.classification_cost(input=decoder, label=lbl)

        return cost
    else:
        # In generation, the decoder predicts the next target word based on
        # the encoded source sequence and the last generated target word.

        # The encoded source sequence (encoder's output) must be specified by
        # StaticInput, which is a read-only memory.
        # The embedding of the last generated word is obtained automatically
        # by GeneratedInput, which is initialized by a start mark, such as
        # <s>, and must be included in generation.

        trg_embedding = beam_search.GeneratedInputV2(
            size=target_dict_dim,
            embedding_name='_target_language_embedding',
            embedding_size=word_vector_dim)
        group_inputs.append(trg_embedding)

        beam_gen = beam_search.beam_search(
            name=decoder_group_name,
            step=gru_decoder_with_attention,
            input=group_inputs,
            bos_id=0,
            eos_id=1,
            beam_size=beam_size,
            max_length=max_length)
        # TODO: attach a seqtext printer evaluator to beam_gen to dump the
        # generated translations (needs a sentence-id input, the target
        # dictionary file and a result file).
        return beam_gen
131 132 133 134 135


def main():
    """Train the seqToseq network on the WMT14 training set on CPU.

    Initializes Paddle with a single CPU trainer, builds the training
    topology, creates its parameters and an Adam optimizer with L2
    regularization, then runs the trainer over shuffled WMT14 batches while
    reporting progress on stdout.
    """
    paddle.init(use_gpu=False, trainer_count=1)

    # source and target dict dim.
    dict_size = 30000
    source_dict_dim = target_dict_dim = dict_size

    # define network topology
    cost = seqToseq_net(source_dict_dim, target_dict_dim, False)
    parameters = paddle.parameters.create(cost)

    # define optimize method and trainer
    optimizer = paddle.optimizer.Adam(
        learning_rate=5e-5,
        regularization=paddle.optimizer.L2Regularization(rate=1e-3))
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer)

    # define data reader
    # Keys are the data layer names declared in seqToseq_net; the values are
    # presumably the position of each field in a reader sample -- confirm
    # against the trainer's `feeding` documentation.
    feeding = {
        'source_language_word': 0,
        'target_language_word': 1,
        'target_language_next_word': 2
    }

    wmt14_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
        batch_size=5)

    # define event_handler callback
    def event_handler(event):
        """Print a summary every 10th batch and a dot for the others."""
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 10 == 0:
                # Single pre-formatted argument: prints the same text as the
                # Python 2 print statement and also parses under Python 3.
                print("\nPass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))
            else:
                sys.stdout.write('.')
                sys.stdout.flush()

    # start to train
    trainer.train(
        reader=wmt14_reader,
        event_handler=event_handler,
        num_passes=10000,
        feeding=feeding)
180 181 182 183


# Start training only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()