diff --git a/scheduled_sampling/README.md b/scheduled_sampling/README.md
index 016f040e8986bccab0043dfc4579fedf17bb2bba..4691c1f8be868bb9c4af837307c60cf3c9443b7b 100644
--- a/scheduled_sampling/README.md
+++ b/scheduled_sampling/README.md
@@ -37,7 +37,7 @@ Scheduled Sampling is mainly applied in the training stage of sequence-to-sequence models, while gen
 
 ## Model Implementation
 
-Since Scheduled Sampling is an improvement to the sequence-to-sequence model, its overall implementation framework is quite similar to that of the sequence-to-sequence model. To keep the focus of this article, only the parts related to Scheduled Sampling are described here; see `scheduled_sampling.py` for the complete code.
+Since Scheduled Sampling is an improvement to the sequence-to-sequence model, its overall implementation framework is quite similar to that of the sequence-to-sequence model. To keep the focus of this article, only the parts related to Scheduled Sampling are described here; see `network_conf.py` for the complete code.
 
 First, import the required packages and define the class `RandomScheduleGenerator`, which controls the decay of the sampling probability, as follows:
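To get a feel for what `RandomScheduleGenerator` computes, here is a rough standalone sketch of the four decay schedules named in `train.py` (`constant`, `linear`, `exponential`, `inverse_sigmoid`). The authoritative parameterization is the one in `utils.py`; treat `a` and `b` below as stand-ins for `decay_a` and `decay_b`, following the schedule family from Bengio et al. (2015):

```python
import math


def sampling_rate(schedule_type, a, b, n):
    """Probability of feeding the ground-truth token after n examples."""
    if schedule_type == "constant":
        return a
    if schedule_type == "linear":
        # stays near 1.0 early in training, then decays, never below a
        return max(a, 1. - float(n) / b)
    if schedule_type == "exponential":
        # a ** (n / b), with 0 < a < 1
        return math.pow(a, float(n) / b)
    if schedule_type == "inverse_sigmoid":
        return b / (b + math.exp(float(n) * a / b))
    raise ValueError("unknown schedule type: %s" % schedule_type)


for n in (0, 250000, 500000, 1000000, 2000000):
    print "linear, n=%7d: %.3f" % (n, sampling_rate("linear", 0.75, 1000000., n))
```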
@@ -119,9 +119,10 @@ true_token_flags = paddle.layer.data(
 The original reader also needs to be wrapped with a data generator for the `true_token_flag` field. The following takes linear decay as an example to show how to call the `RandomScheduleGenerator` defined above to produce input data for `true_token_flag`.
 
 ```python
-schedule_generator = RandomScheduleGenerator("linear", 0.75, 1000000)
-
-def gen_schedule_data(reader):
+def gen_schedule_data(reader,
+                      schedule_type="linear",
+                      decay_a=0.75,
+                      decay_b=1000000):
     """
     Creates a data reader for scheduled sampling.
 
@@ -130,10 +131,17 @@
 
     :param reader: the original reader.
     :type reader: callable
+    :param schedule_type: the type of sampling rate decay.
+    :type schedule_type: str
+    :param decay_a: the decay parameter a.
+    :type decay_a: float
+    :param decay_b: the decay parameter b.
+    :type decay_b: float
 
     :return: the new reader with the field "true_token_flag".
     :rtype: callable
     """
+    schedule_generator = RandomScheduleGenerator(schedule_type, decay_a, decay_b)
 
     def data_reader():
         for src_ids, trg_ids, trg_ids_next in reader():
@@ -149,61 +157,60 @@ def gen_schedule_data(reader):
 ```python
 def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
-                                     true_token_flag):
-    """
-    The decoder step for training.
-    :param enc_vec: the encoder vector for attention
-    :type enc_vec: LayerOutput
-    :param enc_proj: the encoder projection for attention
-    :type enc_proj: LayerOutput
-    :param true_word: the ground-truth target word
-    :type true_word: LayerOutput
-    :param true_token_flag: the flag of using the ground-truth target word
-    :type true_token_flag: LayerOutput
-    :return: the softmax output layer
-    :rtype: LayerOutput
-    """
-
-    decoder_mem = paddle.layer.memory(
-        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-    context = paddle.networks.simple_attention(
-        encoded_sequence=enc_vec,
-        encoded_proj=enc_proj,
-        decoder_state=decoder_mem)
-
-    gru_out_memory = paddle.layer.memory(
-        name='gru_out', size=target_dict_dim)
-
-    generated_word = paddle.layer.max_id(input=gru_out_memory)
-
-    generated_word_emb = paddle.layer.embedding(
-        input=generated_word,
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-
-    current_word = paddle.layer.multiplex(
-        input=[true_token_flag, true_word, generated_word_emb])
-
-    with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-        decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-        decoder_inputs += paddle.layer.full_matrix_projection(
-            input=current_word)
-
-    gru_step = paddle.layer.gru_step(
-        name='gru_decoder',
-        input=decoder_inputs,
-        output_mem=decoder_mem,
-        size=decoder_size)
-
-    with paddle.layer.mixed(
-            name='gru_out',
-            size=target_dict_dim,
-            bias_attr=True,
-            act=paddle.activation.Softmax()) as out:
-        out += paddle.layer.full_matrix_projection(input=gru_step)
-
-    return out
+                                     true_token_flag):
+    """
+    The decoder step for training.
+    :param enc_vec: the encoder vector for attention
+    :type enc_vec: LayerOutput
+    :param enc_proj: the encoder projection for attention
+    :type enc_proj: LayerOutput
+    :param true_word: the ground-truth target word
+    :type true_word: LayerOutput
+    :param true_token_flag: the flag of using the ground-truth target word
+    :type true_token_flag: LayerOutput
+    :return: the softmax output layer
+    :rtype: LayerOutput
+    """
+
+    decoder_mem = paddle.layer.memory(
+        name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+    context = paddle.networks.simple_attention(
+        encoded_sequence=enc_vec,
+        encoded_proj=enc_proj,
+        decoder_state=decoder_mem)
+
+    gru_out_memory = paddle.layer.memory(
+        name='gru_out', size=target_dict_dim)
+
+    generated_word = paddle.layer.max_id(input=gru_out_memory)
+
+    generated_word_emb = paddle.layer.embedding(
+        input=generated_word,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+
+    current_word = paddle.layer.multiplex(
+        input=[true_token_flag, true_word, generated_word_emb])
+
+    decoder_inputs = paddle.layer.fc(
+        input=[context, current_word],
+        size=decoder_size * 3,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+
+    gru_step = paddle.layer.gru_step(
+        name='gru_decoder',
+        input=decoder_inputs,
+        output_mem=decoder_mem,
+        size=decoder_size)
+
+    out = paddle.layer.fc(
+        name='gru_out',
+        input=gru_step,
+        size=target_dict_dim,
+        act=paddle.activation.Softmax())
+    return out
 ```
 
 This function uses the `memory` layer `gru_out_memory` to remember the element generated at the previous time step, and selects the word with the highest probability in `gru_out_memory` as the generated word `generated_word`. The `multiplex` layer then chooses between the true element `true_word` and the generated element `generated_word`, and the chosen result becomes the decoder input. The `multiplex` layer takes three inputs: `true_token_flag`, `true_word`, and `generated_word_emb`. For each element of these inputs, if the value in `true_token_flag` is `0`, the `multiplex` layer outputs the corresponding element in `true_word`; if the value in `true_token_flag` is `1`, it outputs the corresponding element in `generated_word_emb`.
 
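To make that selection rule concrete, here is a toy numpy rendering of what `multiplex` computes per position (illustrative only — the real layer operates on PaddlePaddle sequence tensors, with input `0` holding the integer indices):

```python
import numpy as np

# Candidate rows, one per decoding position.
true_word = np.array([[1., 1.], [2., 2.], [3., 3.]])           # ground-truth embeddings
generated_word_emb = np.array([[9., 9.], [8., 8.], [7., 7.]])  # model's own embeddings
true_token_flag = np.array([0, 1, 0])  # 0 -> true word, 1 -> generated word

# multiplex: output row i comes from input[flag[i] + 1], i.e. the first
# non-index input when the flag is 0, the second when it is 1.
candidates = (true_word, generated_word_emb)
current_word = np.array(
    [candidates[flag][i] for i, flag in enumerate(true_token_flag)])
print current_word  # rows taken from: true, generated, true
```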
diff --git a/scheduled_sampling/generate.py b/scheduled_sampling/generate.py
new file mode 100644
index 0000000000000000000000000000000000000000..adde133744329636baabc8a409fc737f3b9106a3
--- /dev/null
+++ b/scheduled_sampling/generate.py
@@ -0,0 +1,91 @@
+import gzip
+import argparse
+import distutils.util
+import paddle.v2 as paddle
+
+from network_conf import seqToseq_net
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Scheduled Sampling")
+    parser.add_argument(
+        '--model_path',
+        type=str,
+        required=True,
+        help="The path for trained model to load.")
+    parser.add_argument(
+        '--beam_size',
+        type=int,
+        default=3,
+        help='The width of beam expansion. (default: %(default)s)')
+    parser.add_argument(
+        "--use_gpu",
+        type=distutils.util.strtobool,
+        default=False,
+        help="Use gpu or not. (default: %(default)s)")
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        default=1,
+        help="Trainer number. (default: %(default)s)")
+
+    return parser.parse_args()
+
+
+def generate(gen_data, dict_size, model_path, beam_size):
+    beam_gen = seqToseq_net(dict_size, dict_size, beam_size, is_generating=True)
+
+    with gzip.open(model_path, 'r') as f:
+        parameters = paddle.parameters.Parameters.from_tar(f)
+
+    # prob is the prediction probabilities, and id is the prediction word.
+    beam_result = paddle.infer(
+        output_layer=beam_gen,
+        parameters=parameters,
+        input=gen_data,
+        field=['prob', 'id'])
+
+    # get the dictionary
+    src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+    # the delimiter element of generated sequences is -1,
+    # the first element of each generated sequence is the sequence length
+    seq_list = []
+    seq = []
+    for w in beam_result[1]:
+        if w != -1:
+            seq.append(w)
+        else:
+            seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
+            seq = []
+
+    prob = beam_result[0]
+    for i in xrange(len(gen_data)):
+        print "\n*******************************************************\n"
+        print "src:", ' '.join([src_dict.get(w) for w in gen_data[i][0]]), "\n"
+        for j in xrange(beam_size):
+            print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    dict_size = 30000
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+
+    # use the first 3 samples for generation
+    gen_creator = paddle.dataset.wmt14.gen(dict_size)
+    gen_data = []
+    gen_num = 3
+    for item in gen_creator():
+        gen_data.append((item[0], ))
+        if len(gen_data) == gen_num:
+            break
+
+    generate(
+        gen_data,
+        dict_size=dict_size,
+        model_path=args.model_path,
+        beam_size=args.beam_size)
diff --git a/scheduled_sampling/network_conf.py b/scheduled_sampling/network_conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..295603313b53170e4b6e574a8ca096e7968d2e13
--- /dev/null
+++ b/scheduled_sampling/network_conf.py
@@ -0,0 +1,202 @@
+import paddle.v2 as paddle
+
+__all__ = ["seqToseq_net"]
+
+### Network Architecture
+word_vector_dim = 512  # dimension of word vector
+decoder_size = 512  # dimension of hidden unit in GRU Decoder network
+encoder_size = 512  # dimension of hidden unit in GRU Encoder network
+
+max_length = 250
+
+
+def seqToseq_net(source_dict_dim,
+                 target_dict_dim,
+                 beam_size,
+                 is_generating=False):
+    """
+    The definition of the sequence to sequence model
+    :param source_dict_dim: the dictionary size of the source language
+    :type source_dict_dim: int
+    :param target_dict_dim: the dictionary size of the target language
+    :type target_dict_dim: int
+    :param beam_size: the width of beam expansion
+    :type beam_size: int
+    :param is_generating: whether in generating mode
+    :type is_generating: bool
+    :return: the last layer of the network
+    :rtype: LayerOutput
+    """
+
+    #### Encoder
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id, size=word_vector_dim)
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
+    src_reverse = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_reverse])
+
+    #### Decoder
+    encoded_proj = paddle.layer.fc(
+        input=encoded_vector,
+        size=decoder_size,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+
+    reverse_first = paddle.layer.first_seq(input=src_reverse)
+
+    decoder_boot = paddle.layer.fc(
+        input=reverse_first,
+        size=decoder_size,
+        act=paddle.activation.Tanh(),
+        bias_attr=False)
+
+    def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word,
+                                         true_token_flag):
+        """
+        The decoder step for training.
+        :param enc_vec: the encoder vector for attention
+        :type enc_vec: LayerOutput
+        :param enc_proj: the encoder projection for attention
+        :type enc_proj: LayerOutput
+        :param true_word: the ground-truth target word
+        :type true_word: LayerOutput
+        :param true_token_flag: the flag of using the ground-truth target word
+        :type true_token_flag: LayerOutput
+        :return: the softmax output layer
+        :rtype: LayerOutput
+        """
+
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+
+        gru_out_memory = paddle.layer.memory(
+            name='gru_out', size=target_dict_dim)
+
+        generated_word = paddle.layer.max_id(input=gru_out_memory)
+
+        generated_word_emb = paddle.layer.embedding(
+            input=generated_word,
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+
+        current_word = paddle.layer.multiplex(
+            input=[true_token_flag, true_word, generated_word_emb])
+
+        decoder_inputs = paddle.layer.fc(
+            input=[context, current_word],
+            size=decoder_size * 3,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+
+        out = paddle.layer.fc(
+            name='gru_out',
+            input=gru_step,
+            size=target_dict_dim,
+            act=paddle.activation.Softmax())
+        return out
+
+    def gru_decoder_with_attention_gen(enc_vec, enc_proj, current_word):
+        """
+        The decoder step for generating.
+        :param enc_vec: the encoder vector for attention
+        :type enc_vec: LayerOutput
+        :param enc_proj: the encoder projection for attention
+        :type enc_proj: LayerOutput
+        :param current_word: the previously generated word
+        :type current_word: LayerOutput
+        :return: the softmax output layer
+        :rtype: LayerOutput
+        """
+
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+
+        decoder_inputs = paddle.layer.fc(
+            input=[context, current_word],
+            size=decoder_size * 3,
+            act=paddle.activation.Linear(),
+            bias_attr=False)
+
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+
+        out = paddle.layer.fc(
+            name='gru_out',
+            input=gru_step,
+            size=target_dict_dim,
+            act=paddle.activation.Softmax())
+        return out
+
+    decoder_group_name = "decoder_group"
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+
+    if not is_generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+
+        true_token_flags = paddle.layer.data(
+            name='true_token_flag',
+            type=paddle.data_type.integer_value_sequence(2))
+
+        group_inputs = [
+            group_input1, group_input2, trg_embedding, true_token_flags
+        ]
+
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention_train,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        trg_embedding = paddle.layer.GeneratedInput(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+
+        group_inputs = [group_input1, group_input2, trg_embedding]
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention_gen,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+        return beam_gen
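With both step functions defined, a quick smoke test (hypothetical, CPU-only) is to build the two variants of the network the same way `train.py` and `generate.py` do and confirm that the configuration parses:

```python
import paddle.v2 as paddle
from network_conf import seqToseq_net

paddle.init(use_gpu=False, trainer_count=1)

# Training graph: returns the classification cost over next-word labels.
cost = seqToseq_net(30000, 30000, beam_size=3)

# Generation graph: returns a beam-search layer over the same parameters,
# shared with the training graph through the layer and embedding names.
beam_gen = seqToseq_net(30000, 30000, beam_size=3, is_generating=True)
```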
diff --git a/scheduled_sampling/reader.py b/scheduled_sampling/reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..c751aa91a01902c95cb8968367ff00e1afed7c96
--- /dev/null
+++ b/scheduled_sampling/reader.py
@@ -0,0 +1,42 @@
+from utils import RandomScheduleGenerator
+
+
+def gen_schedule_data(reader,
+                      schedule_type="linear",
+                      decay_a=0.75,
+                      decay_b=1000000):
+    """
+    Creates a data reader for scheduled sampling.
+
+    Output from the iterator created by the original reader will be
+    appended with "true_token_flag" to indicate whether to use the true token.
+
+    :param reader: the original reader.
+    :type reader: callable
+    :param schedule_type: the type of sampling rate decay.
+    :type schedule_type: str
+    :param decay_a: the decay parameter a.
+    :type decay_a: float
+    :param decay_b: the decay parameter b.
+    :type decay_b: float
+
+    :return: the new reader with the field "true_token_flag".
+    :rtype: callable
+    """
+    schedule_generator = RandomScheduleGenerator(schedule_type, decay_a,
+                                                 decay_b)
+
+    def data_reader():
+        for src_ids, trg_ids, trg_ids_next in reader():
+            yield src_ids, trg_ids, trg_ids_next, \
+                [0] + schedule_generator.processBatch(len(trg_ids) - 1)
+
+    return data_reader
+
+
+feeding = {
+    'source_language_word': 0,
+    'target_language_word': 1,
+    'target_language_next_word': 2,
+    'true_token_flag': 3
+}
diff --git a/scheduled_sampling/scheduled_sampling.py b/scheduled_sampling/scheduled_sampling.py
deleted file mode 100644
index 9e7017806d6007a1c77539d7c0bbba7194bd3f79..0000000000000000000000000000000000000000
--- a/scheduled_sampling/scheduled_sampling.py
+++ /dev/null
@@ -1,323 +0,0 @@
-import sys
-import paddle.v2 as paddle
-from random_schedule_generator import RandomScheduleGenerator
-
-schedule_generator = RandomScheduleGenerator("linear", 0.75, 1000000)
-
-
-def gen_schedule_data(reader):
-    """
-    Creates a data reader for scheduled sampling.
-
-    Output from the iterator that created by original reader will be
-    appended with "true_token_flag" to indicate whether to use true token.
-
-    :param reader: the original reader.
-    :type reader: callable
-
-    :return: the new reader with the field "true_token_flag".
- :rtype: callable - """ - - def data_reader(): - for src_ids, trg_ids, trg_ids_next in reader(): - yield src_ids, trg_ids, trg_ids_next, \ - [0] + schedule_generator.processBatch(len(trg_ids) - 1) - - return data_reader - - -def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): - """ - The definition of the sequence to sequence model - :param source_dict_dim: the dictionary size of the source language - :type source_dict_dim: int - :param target_dict_dim: the dictionary size of the target language - :type target_dict_dim: int - :param is_generating: whether in generating mode - :type is_generating: Bool - :return: the last layer of the network - :rtype: LayerOutput - """ - ### Network Architecture - word_vector_dim = 512 # dimension of word vector - decoder_size = 512 # dimension of hidden unit in GRU Decoder network - encoder_size = 512 # dimension of hidden unit in GRU Encoder network - - beam_size = 3 - max_length = 250 - - #### Encoder - src_word_id = paddle.layer.data( - name='source_language_word', - type=paddle.data_type.integer_value_sequence(source_dict_dim)) - src_embedding = paddle.layer.embedding( - input=src_word_id, size=word_vector_dim) - src_forward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size) - src_backward = paddle.networks.simple_gru( - input=src_embedding, size=encoder_size, reverse=True) - encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) - - #### Decoder - with paddle.layer.mixed(size=decoder_size) as encoded_proj: - encoded_proj += paddle.layer.full_matrix_projection( - input=encoded_vector) - - backward_first = paddle.layer.first_seq(input=src_backward) - - with paddle.layer.mixed( - size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot: - decoder_boot += paddle.layer.full_matrix_projection( - input=backward_first) - - def gru_decoder_with_attention_train(enc_vec, enc_proj, true_word, - true_token_flag): - """ - The decoder step for training. 
-        :param enc_vec: the encoder vector for attention
-        :type enc_vec: LayerOutput
-        :param enc_proj: the encoder projection for attention
-        :type enc_proj: LayerOutput
-        :param true_word: the ground-truth target word
-        :type true_word: LayerOutput
-        :param true_token_flag: the flag of using the ground-truth target word
-        :type true_token_flag: LayerOutput
-        :return: the softmax output layer
-        :rtype: LayerOutput
-        """
-
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-        context = paddle.networks.simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-
-        gru_out_memory = paddle.layer.memory(
-            name='gru_out', size=target_dict_dim)
-
-        generated_word = paddle.layer.max_id(input=gru_out_memory)
-
-        generated_word_emb = paddle.layer.embedding(
-            input=generated_word,
-            size=word_vector_dim,
-            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-
-        current_word = paddle.layer.multiplex(
-            input=[true_token_flag, true_word, generated_word_emb])
-
-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
-
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-
-        with paddle.layer.mixed(
-                name='gru_out',
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
-
-        return out
-
-    def gru_decoder_with_attention_test(enc_vec, enc_proj, current_word):
-        """
-        The decoder step for generating.
-        :param enc_vec: the encoder vector for attention
-        :type enc_vec: LayerOutput
-        :param enc_proj: the encoder projection for attention
-        :type enc_proj: LayerOutput
-        :param current_word: the previously generated word
-        :type current_word: LayerOutput
-        :return: the softmax output layer
-        :rtype: LayerOutput
-        """
-
-        decoder_mem = paddle.layer.memory(
-            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
-
-        context = paddle.networks.simple_attention(
-            encoded_sequence=enc_vec,
-            encoded_proj=enc_proj,
-            decoder_state=decoder_mem)
-
-        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
-            decoder_inputs += paddle.layer.full_matrix_projection(
-                input=current_word)
-
-        gru_step = paddle.layer.gru_step(
-            name='gru_decoder',
-            input=decoder_inputs,
-            output_mem=decoder_mem,
-            size=decoder_size)
-
-        with paddle.layer.mixed(
-                size=target_dict_dim,
-                bias_attr=True,
-                act=paddle.activation.Softmax()) as out:
-            out += paddle.layer.full_matrix_projection(input=gru_step)
-        return out
-
-    decoder_group_name = "decoder_group"
-    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
-    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
-    group_inputs = [group_input1, group_input2]
-
-    if not is_generating:
-        trg_embedding = paddle.layer.embedding(
-            input=paddle.layer.data(
-                name='target_language_word',
-                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-            size=word_vector_dim,
-            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-        group_inputs.append(trg_embedding)
-
-        true_token_flags = paddle.layer.data(
-            name='true_token_flag',
-            type=paddle.data_type.integer_value_sequence(2))
-        group_inputs.append(true_token_flags)
-
-        decoder = paddle.layer.recurrent_group(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention_train,
-            input=group_inputs)
-
-        lbl = paddle.layer.data(
-            name='target_language_next_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim))
-        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-        return cost
-    else:
-        trg_embedding = paddle.layer.GeneratedInput(
-            size=target_dict_dim,
-            embedding_name='_target_language_embedding',
-            embedding_size=word_vector_dim)
-        group_inputs.append(trg_embedding)
-
-        beam_gen = paddle.layer.beam_search(
-            name=decoder_group_name,
-            step=gru_decoder_with_attention_test,
-            input=group_inputs,
-            bos_id=0,
-            eos_id=1,
-            beam_size=beam_size,
-            max_length=max_length)
-
-        return beam_gen
-
-
-def main():
-    paddle.init(use_gpu=False, trainer_count=1)
-    is_generating = False
-    model_path_for_generating = 'params_pass_1.tar.gz'
-
-    # source and target dict dim.
-    dict_size = 30000
-    source_dict_dim = target_dict_dim = dict_size
-
-    # train the network
-    if not is_generating:
-        cost = seqToseq_net(source_dict_dim, target_dict_dim)
-        parameters = paddle.parameters.create(cost)
-
-        # define optimize method and trainer
-        optimizer = paddle.optimizer.Adam(
-            learning_rate=5e-5,
-            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
-        trainer = paddle.trainer.SGD(
-            cost=cost, parameters=parameters, update_equation=optimizer)
-        # define data reader
-        wmt14_reader = paddle.batch(
-            gen_schedule_data(
-                paddle.reader.shuffle(
-                    paddle.dataset.wmt14.train(dict_size), buf_size=8192)),
-            batch_size=5)
-
-        feeding = {
-            'source_language_word': 0,
-            'target_language_word': 1,
-            'target_language_next_word': 2,
-            'true_token_flag': 3
-        }
-
-        # define event_handler callback
-        def event_handler(event):
-            if isinstance(event, paddle.event.EndIteration):
-                if event.batch_id % 10 == 0:
-                    print "\nPass %d, Batch %d, Cost %f, %s" % (
-                        event.pass_id, event.batch_id, event.cost,
-                        event.metrics)
-                else:
-                    sys.stdout.write('.')
-                    sys.stdout.flush()
-            if isinstance(event, paddle.event.EndPass):
-                # save parameters
-                with gzip.open('params_pass_%d.tar.gz' % event.pass_id,
-                               'w') as f:
-                    trainer.save_parameter_to_tar(f)
-
-        # start to train
-        trainer.train(
-            reader=wmt14_reader,
-            event_handler=event_handler,
-            feeding=feeding,
-            num_passes=2)
-
-    # generate a english sequence to french
-    else:
-        # use the first 3 samples for generation
-        gen_creator = paddle.dataset.wmt14.gen(dict_size)
-        gen_data = []
-        gen_num = 3
-        for item in gen_creator():
-            gen_data.append((item[0], ))
-            if len(gen_data) == gen_num:
-                break
-
-        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
-        # get the trained model
-        with gzip.open(model_path_for_generating, 'r') as f:
-            parameters = Parameters.from_tar(f)
-        # prob is the prediction probabilities, and id is the prediction word.
-        beam_result = paddle.infer(
-            output_layer=beam_gen,
-            parameters=parameters,
-            input=gen_data,
-            field=['prob', 'id'])
-
-        # get the dictionary
-        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-
-        # the delimited element of generated sequences is -1,
-        # the first element of each generated sequence is the sequence length
-        seq_list = []
-        seq = []
-        for w in beam_result[1]:
-            if w != -1:
-                seq.append(w)
-            else:
-                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
-                seq = []
-
-        prob = beam_result[0]
-        beam_size = 3
-        for i in xrange(gen_num):
-            print "\n*******************************************************\n"
-            print "src:", ' '.join(
-                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
-            for j in xrange(beam_size):
-                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
-
-
-if __name__ == '__main__':
-    main()
diff --git a/scheduled_sampling/train.py b/scheduled_sampling/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..34fc20d2039115fa6c5af8087574028801311a79
--- /dev/null
+++ b/scheduled_sampling/train.py
@@ -0,0 +1,123 @@
+import os
+import sys
+import gzip
+import argparse
+import distutils.util
+import paddle.v2 as paddle
+
+import reader
+from network_conf import seqToseq_net
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Scheduled Sampling")
+    parser.add_argument(
+        '--schedule_type',
+        type=str,
+        default="linear",
+        help='The type of sampling rate decay. Supported types: constant, linear, exponential, inverse_sigmoid. (default: %(default)s)'
+    )
+    parser.add_argument(
+        '--decay_a',
+        type=float,
+        default=0.75,
+        help='The sampling rate decay parameter a. (default: %(default)s)')
+    parser.add_argument(
+        '--decay_b',
+        type=float,
+        default=1000000,
+        help='The sampling rate decay parameter b. (default: %(default)s)')
+    parser.add_argument(
+        '--beam_size',
+        type=int,
+        default=3,
+        help='The width of beam expansion. (default: %(default)s)')
+    parser.add_argument(
+        "--use_gpu",
+        type=distutils.util.strtobool,
+        default=False,
+        help="Use gpu or not. (default: %(default)s)")
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        default=1,
+        help="Trainer number. (default: %(default)s)")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help="Size of a mini-batch. (default: %(default)s)")
+    parser.add_argument(
+        '--num_passes',
+        type=int,
+        default=10,
+        help="Number of passes to train. (default: %(default)s)")
+    parser.add_argument(
+        '--model_output_dir',
+        type=str,
+        default='models',
+        help="The path for model to store. (default: %(default)s)")
+
+    return parser.parse_args()
+
+
+def train(dict_size, batch_size, num_passes, beam_size, schedule_type, decay_a,
+          decay_b, model_dir):
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-4,
+        regularization=paddle.optimizer.L2Regularization(rate=1e-5))
+
+    cost = seqToseq_net(dict_size, dict_size, beam_size)
+
+    parameters = paddle.parameters.create(cost)
+
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+
+    wmt14_reader = reader.gen_schedule_data(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=8192),
+        schedule_type, decay_a, decay_b)
+
+    # define event_handler callback
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 10 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            # save parameters
+            with gzip.open(
+                    os.path.join(model_dir, 'params_pass_%d.tar.gz' %
+                                 event.pass_id), 'w') as f:
+                trainer.save_parameter_to_tar(f)
+
+    # start to train
+    trainer.train(
+        reader=paddle.batch(wmt14_reader, batch_size=batch_size),
+        event_handler=event_handler,
+        feeding=reader.feeding,
+        num_passes=num_passes)
+
+
+if __name__ == '__main__':
+    args = parse_args()
+
+    if not os.path.isdir(args.model_output_dir):
+        os.mkdir(args.model_output_dir)
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+
+    train(
+        dict_size=30000,
+        batch_size=args.batch_size,
+        num_passes=args.num_passes,
+        beam_size=args.beam_size,
+        schedule_type=args.schedule_type,
+        decay_a=args.decay_a,
+        decay_b=args.decay_b,
+        model_dir=args.model_output_dir)
diff --git a/scheduled_sampling/random_schedule_generator.py b/scheduled_sampling/utils.py
similarity index 98%
rename from scheduled_sampling/random_schedule_generator.py
rename to scheduled_sampling/utils.py
index 7af99685140993f0c40779808cc0b3200e1b45b8..80a56f21298174459737c94ea3aa3902fbc348c5 100644
--- a/scheduled_sampling/random_schedule_generator.py
+++ b/scheduled_sampling/utils.py
@@ -1,10 +1,10 @@
-import numpy as np
 import math
+import numpy as np


 class RandomScheduleGenerator:
     """
-    The random sampling rate for scheduled sampling algoithm, which uses devcayed
+    The random sampling rate for the scheduled sampling algorithm, which uses a decayed
     sampling rate.
     """
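Finally, a small sanity check of the reader wrapper using toy ids in place of real WMT-14 data (the three lists mimic what `paddle.dataset.wmt14.train()` yields). The flag sequence carries one entry per target token, with a hard-coded `0` up front so the first decoder step always sees the ground-truth start token:

```python
from reader import gen_schedule_data


def toy_reader():
    # (source ids, target ids, next-word target ids)
    yield [2, 3, 4], [0, 10, 11, 12], [10, 11, 12, 1]


for src_ids, trg_ids, trg_ids_next, flags in gen_schedule_data(toy_reader)():
    assert flags[0] == 0               # ground truth is forced at step 0
    assert len(flags) == len(trg_ids)  # one flag per decoder step
    print flags                        # e.g. [0, 0, 1, 0], depending on the draw
```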