# edit-mode: -*- python -*-

# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from paddle.trainer_config_helpers import *

### Data Definition
data_dir = "./data/pre-wmt14"
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')

is_generating = get_config_arg("is_generating", bool, False)
if not is_generating:
    train_list = os.path.join(data_dir, 'train.list')
    test_list = os.path.join(data_dir, 'test.list')
else:
    train_list = None
    test_list = os.path.join(data_dir, 'gen.list')

define_py_data_sources2(
    train_list,
    test_list,
    module="dataprovider",
    obj="process",
    args={
        "src_dict_path": src_lang_dict,
        "trg_dict_path": trg_lang_dict,
        "is_generating": is_generating
    })

### Algorithm Configuration
settings(learning_method=AdamOptimizer(), batch_size=50, learning_rate=5e-4)

### Network Architecture
source_dict_dim = len(open(src_lang_dict, "r").readlines())
target_dict_dim = len(open(trg_lang_dict, "r").readlines())
word_vector_dim = 512  # dimension of word vector
decoder_size = 512  # dimension of hidden unit in GRU Decoder network
encoder_size = 512  # dimension of hidden unit in GRU Encoder network

if is_generating:
    beam_size = 3  # expand width in beam search
    max_length = 250  # a stop condition of sequence generation
    gen_trans_file = get_config_arg("gen_trans_file", str, None)

#### Encoder
src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
src_embedding = embedding_layer(
    input=src_word_id,
    size=word_vector_dim,
    param_attr=ParamAttr(name='_source_language_embedding'))
src_forward = simple_gru(input=src_embedding, size=encoder_size)
src_backward = simple_gru(input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = concat_layer(input=[src_forward, src_backward])

#### Decoder
with mixed_layer(size=decoder_size) as encoded_proj:
    encoded_proj += full_matrix_projection(input=encoded_vector)

backward_first = first_seq(input=src_backward)
with mixed_layer(size=decoder_size, act=TanhActivation()) as decoder_boot:
    decoder_boot += full_matrix_projection(input=backward_first)


def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
    decoder_mem = memory(
        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)

    context = simple_attention(
        encoded_sequence=enc_vec,
        encoded_proj=enc_proj,
        decoder_state=decoder_mem)

    with mixed_layer(size=decoder_size * 3) as decoder_inputs:
        decoder_inputs += full_matrix_projection(input=context)
        decoder_inputs += full_matrix_projection(input=current_word)

    gru_step = gru_step_layer(
        name='gru_decoder',
        input=decoder_inputs,
        output_mem=decoder_mem,
        size=decoder_size)

    with mixed_layer(
            size=target_dict_dim, bias_attr=True,
            act=SoftmaxActivation()) as out:
        out += full_matrix_projection(input=gru_step)
    return out


decoder_group_name = "decoder_group"
group_input1 = StaticInput(input=encoded_vector, is_seq=True)
group_input2 = StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
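
# The same step function, gru_decoder_with_attention, drives both branches
# below: recurrent_group unrolls it over the ground-truth target words during
# training, while beam_search invokes it word by word during generation.
# In both cases group_inputs exposes the encoder output (and its projection)
# as read-only memories via StaticInput.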
if not is_generating:
    trg_embedding = embedding_layer(
        input=data_layer(name='target_language_word', size=target_dict_dim),
        size=word_vector_dim,
        param_attr=ParamAttr(name='_target_language_embedding'))
    group_inputs.append(trg_embedding)

    # For a decoder equipped with an attention mechanism, in training,
    # the target embedding (the ground truth) is the data input,
    # while the encoded source sequence is accessed as an unbounded memory.
    # Here, StaticInput defines a read-only memory
    # for the recurrent_group.
    decoder = recurrent_group(
        name=decoder_group_name,
        step=gru_decoder_with_attention,
        input=group_inputs)

    lbl = data_layer(name='target_language_next_word', size=target_dict_dim)
    cost = classification_cost(input=decoder, label=lbl)
    outputs(cost)
else:
    # In generation, the decoder predicts the next target word based on
    # the encoded source sequence and the last generated target word.
    # The encoded source sequence (the encoder's output) must be specified
    # by StaticInput, which is a read-only memory.
    # The embedding of the last generated word is automatically retrieved
    # by GeneratedInput, which is initialized with a start mark, such as
    # <s>, and must be included in generation.
    trg_embedding = GeneratedInput(
        size=target_dict_dim,
        embedding_name='_target_language_embedding',
        embedding_size=word_vector_dim)
    group_inputs.append(trg_embedding)

    beam_gen = beam_search(
        name=decoder_group_name,
        step=gru_decoder_with_attention,
        input=group_inputs,
        bos_id=0,
        eos_id=1,
        beam_size=beam_size,
        max_length=max_length)

    seqtext_printer_evaluator(
        input=beam_gen,
        id_input=data_layer(name="sent_id", size=1),
        dict_file=trg_lang_dict,
        result_file=gen_trans_file)
    outputs(beam_gen)
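
# A minimal usage sketch (assumed invocation: the commands below follow the
# old PaddlePaddle v1 CLI convention; the config filename, save_dir, and
# model path are hypothetical and depend on your local setup):
#
#   training:
#     paddle train --config=seqToseq_net.py --save_dir=model
#
#   generation (get_config_arg values are passed through --config_args):
#     paddle train --job=test --config=seqToseq_net.py \
#         --config_args=is_generating=1,gen_trans_file=gen_result \
#         --init_model_path=model/pass-00000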