diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py
index 5d138a8c4f91976d90b19441781248f7b67c854a..c53714cefd08fd3b2b738786f51e7dcc7a0eedfa 100644
--- a/demo/seqToseq/api_train_v2.py
+++ b/demo/seqToseq/api_train_v2.py
@@ -1,13 +1,17 @@
 import sys
 import paddle.v2 as paddle
+import paddle.v2.layers.beam_search as beam_search


-def seqToseq_net(source_dict_dim, target_dict_dim):
+def seqToseq_net(source_dict_dim, target_dict_dim, is_generating):
     ### Network Architecture
     word_vector_dim = 512  # dimension of word vector
     decoder_size = 512  # dimension of hidden unit in GRU Decoder network
     encoder_size = 512  # dimension of hidden unit in GRU Encoder network
+    beam_size = 3
+    max_length = 250
+
     #### Encoder
     src_word_id = paddle.layer.data(
         name='source_language_word',
@@ -67,30 +71,63 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
     group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
     group_inputs = [group_input1, group_input2]

-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For decoder equipped with attention mechanism, in training,
-    # target embeding (the groudtruth) is the data input,
-    # while encoded source sequence is accessed to as an unbounded memory.
-    # Here, the StaticInput defines a read-only memory
-    # for the recurrent_group.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
+    if not is_generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
+
+        # For a decoder equipped with an attention mechanism, in training,
+        # the target embedding (the ground truth) is the data input,
+        # while the encoded source sequence is accessed as an unbounded memory.
+        # Here, the StaticInput defines a read-only memory
+        # for the recurrent_group.
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        # In generation, the decoder predicts the next target word based on
+        # the encoded source sequence and the last generated target word.
+
+        # The encoded source sequence (encoder's output) must be specified by
+        # StaticInput, which is a read-only memory.
+        # The embedding of the last generated word is automatically obtained by
+        # GeneratedInputV2, which is initialized by a start mark, such as <s>,
+        # and must be included in generation.
+
+        trg_embedding = beam_search.GeneratedInputV2(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = beam_search.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+        #
+        # seqtext_printer_evaluator(
+        #     input=beam_gen,
+        #     id_input=data_layer(
+        #         name="sent_id", size=1),
+        #     dict_file=trg_dict_path,
+        #     result_file=gen_trans_file)
+        return beam_gen


 def main():
diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index e523a34d5a95120d1f0a583be8bbdbff5678d1ab..a85928f822d86eb44a32467865248416354a1a16 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -81,8 +81,10 @@ def gru_encoder_decoder(data_conf,
     """
     for k, v in data_conf.iteritems():
         globals()[k] = v
-    source_dict_dim = len(open(src_dict_path, "r").readlines())
-    target_dict_dim = len(open(trg_dict_path, "r").readlines())
+    #source_dict_dim = len(open(src_dict_path, "r").readlines())
+    #target_dict_dim = len(open(trg_dict_path, "r").readlines())
+    source_dict_dim = 1000
+    target_dict_dim = 2000
     gen_trans_file = gen_result

     src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
@@ -131,9 +133,8 @@

     decoder_group_name = "decoder_group"
     group_inputs = [
-        StaticInput(
-            input=encoded_vector, is_seq=True), StaticInput(
-            input=encoded_proj, is_seq=True)
+        StaticInput(input=encoded_vector, is_seq=True),
+        StaticInput(input=encoded_proj, is_seq=True)
     ]

     if not is_generating:
diff --git a/demo/seqToseq/translation/train.conf b/demo/seqToseq/translation/train.conf
index 72b7ccdbb95dbda8f06674079db9a3257bb31622..c0c9ebc6b1d66f46491cf816ac63bf9a80c0b64b 100644
--- a/demo/seqToseq/translation/train.conf
+++ b/demo/seqToseq/translation/train.conf
@@ -19,7 +19,8 @@ sys.path.append("..")
 from seqToseq_net import *

 # whether this config is used for generating
-is_generating = False
+#is_generating = False
+is_generating = True

 ### Data Definiation
 data_dir = "./data/pre-wmt14"
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
index cb98866d874e35b9fda8e170004d687329b7d3e3..a116cf0033fb76f5cd44c30c3d8b6a60bde8e493 100644
--- a/python/paddle/v2/config_base.py
+++ b/python/paddle/v2/config_base.py
@@ -76,6 +76,10 @@ class Layer(object):
         """
         function to set proto attribute
         """
+        print "======"
+        # print self.name
+        print self.__parent_layers__
+        # print self.__context__
         self.__context__ = context

         # short cut if myself is parsed before.
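
For context, a rough sketch (not part of the patch) of how the reworked seqToseq_net entry point above might be driven once these changes land; the dictionary sizes are illustrative placeholders rather than values read from the demo's dictionary files, and the call assumes seqToseq_net from api_train_v2.py is in scope:

    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)

    # Training mode: the network returns a classification cost over the next word.
    cost = seqToseq_net(
        source_dict_dim=30000, target_dict_dim=30000, is_generating=False)

    # Generation mode: the same entry point returns the beam-search output layer.
    beam_gen = seqToseq_net(
        source_dict_dim=30000, target_dict_dim=30000, is_generating=True)
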
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 1e4efedde363f20fde168941adcb6e8a594b533a..d46d3bc343911ea0bd9df541e20fd306407cab37 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -135,6 +135,10 @@ class WithExtraParent(Layer):
         """
         function to set proto attribute
         """
+        print "*************"
+        # print context
+        print self.name
+        print self.__extra_parent__
         kwargs = dict()
         for p in self.__extra_parent__:
             p.to_proto(context=context)
@@ -162,11 +166,12 @@ class WithExtraParent(Layer):


 class MemoryV2(WithExtraParent):
-    def __init__(self, name, **kwargs):
+    def __init__(self, name, extra_input=None, **kwargs):
         self.name = name
         super(MemoryV2, self).__init__(name=name, parent_layers=dict())
         self.__kwargs__ = kwargs
         self.__boot_layer_name__ = None
+
         if 'boot_layer' in kwargs:
             begin_of_current_rnn = []
             # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
@@ -223,22 +228,6 @@ class MemoryV2(WithExtraParent):
         return True


-class LayerOutputV2(Layer):
-    """
-    LayerOutputV2 is used to store the result of LayerOutput in v1 api.
-    It will not store it's parents because layer_output has been parsed already.
-    """
-
-    def __init__(self, layer_output):
-        assert isinstance(layer_output, conf_helps.LayerOutput)
-        self.layer_output = layer_output
-        super(LayerOutputV2, self).__init__(
-            name=layer_output.name, parent_layers=dict())
-
-    def to_proto_impl(self):
-        return self.layer_output
-
-
 class StaticInputV2(object):
     def __init__(self, input, is_seq=False, size=None):
         assert isinstance(input, LayerV2)
@@ -330,10 +319,15 @@ def mixed(size=0,

 class RecurrentLayerInput(WithExtraParent):
     def __init__(self, recurrent_name, index, parent_layers):
-        assert len(parent_layers) == 1
-        self.__parents__ = parent_layers.values()[0]
+        parents_len = len(parent_layers)
+        assert parents_len <= 1
+        if parents_len == 0:
+            self.__parents__ = []
+        else:
+            self.__parents__ = parent_layers.values()[0]
+        name = self.__parents__[index].name if index >= 0 else None
         super(RecurrentLayerInput, self).__init__(
-            name=self.__parents__[index].name, parent_layers=parent_layers)
+            name=name, parent_layers=parent_layers)
         self.__recurrent_name__ = recurrent_name

     def context_name(self):
@@ -346,6 +340,10 @@ class RecurrentLayerInput(WithExtraParent):
             in_links=map(lambda x: x.name, self.__parents__))
         return self

+    def use_context_name(self):
+        return True
+
+
 class RecurrentLayerOutput(Layer):
     def __init__(self, recurrent_name, index, parent_layers):
@@ -428,6 +426,9 @@ def recurrent_group(step, input, name=None):

     non_static_inputs = filter(lambda x: not isinstance(x, StaticInputV2),
                                input)
+    static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
+    static_inputs = [static_input.input for static_input in static_inputs]
+
     actual_input = [
         RecurrentLayerInput(
             recurrent_name=name,
@@ -436,6 +437,13 @@ def recurrent_group(step, input, name=None):
         for i in xrange(len(non_static_inputs))
     ]

+    extra_input = None
+    if len(non_static_inputs) == 0:
+        extra_input = RecurrentLayerInput(
+            recurrent_name=name,
+            index=-1,
+            parent_layers={})
+
     def __real_step__(*args):
         rnn_input = list(args)
         static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
@@ -443,6 +451,7 @@ def recurrent_group(step, input, name=None):
             mem_name = "__%s_memory__" % static_input.input.name
             mem = memory(
                 name=mem_name,
+                extra_input=extra_input,
                 is_seq=static_input.is_seq,
                 size=static_input.input.calculate_size,
                 boot_layer=static_input.input)
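
The layer.py changes above cover the generation case in which a recurrent_group is built from read-only memories only: with zero non-static inputs, the group now synthesizes its own RecurrentLayerInput (index=-1) and threads it into each memory through the new extra_input argument. A minimal sketch of the call that the beam_search helper added below ends up making after it strips out the generated input; encoded_vector and encoded_proj are the encoder outputs from the demo above, and __real_step__ stands for beam_search's internal wrapper around the user step:

    # Every remaining input is a StaticInputV2, so non_static_inputs is empty
    # and the new extra_input path is what anchors the read-only memories.
    real_input = [
        paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True),
        paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True),
    ]
    decoder_out = paddle.layer.recurrent_group(
        name='decoder_group', step=__real_step__, input=real_input)
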
diff --git a/python/paddle/v2/layers/__init__.py b/python/paddle/v2/layers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4b5679397f46fe05d2d7cada97b1eae527b79f8
--- /dev/null
+++ b/python/paddle/v2/layers/__init__.py
@@ -0,0 +1 @@
+import beam_search
\ No newline at end of file
diff --git a/python/paddle/v2/layers/beam_search.py b/python/paddle/v2/layers/beam_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..56beae7e5e17faae1e41515fae3cd8ab4cf5f1af
--- /dev/null
+++ b/python/paddle/v2/layers/beam_search.py
@@ -0,0 +1,132 @@
+import paddle.v2 as paddle
+from paddle.v2.config_base import Layer
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+from paddle.trainer_config_helpers.layers import RecurrentLayerGroupSetGenerator, Generator
+
+
+class BaseGeneratedInputV2(object):
+    def __init__(self):
+        self.bos_id = None
+        self.eos_id = None
+
+    def before_real_step(self):
+        raise NotImplementedError()
+
+    def after_real_step(self, *args):
+        raise NotImplementedError()
+
+
+class GeneratedInputV2(BaseGeneratedInputV2):
+    def __init__(self, size, embedding_name, embedding_size):
+        super(GeneratedInputV2, self).__init__()
+        self.size = size
+        self.embedding_name = embedding_name
+        self.embedding_size = embedding_size
+
+    def after_real_step(self, input):
+        return paddle.layer.max_id(input=input, name='__beam_search_predict__')
+
+    def before_real_step(self):
+        predict_id = paddle.layer.memory(
+            name='__beam_search_predict__',
+            size=self.size,
+            boot_with_const_id=self.bos_id)
+
+        trg_emb = paddle.layer.embedding(
+            input=predict_id,
+            size=self.embedding_size,
+            param_attr=paddle.attr.ParamAttr(name=self.embedding_name))
+        return trg_emb
+
+
+class RecurrentLayerGroupSetGeneratorV2(Layer):
+    def __init__(self, eos_name, max_length, beam_size, num_results_per_sample):
+        self.eos_name = eos_name
+        self.max_length = max_length
+        self.beam_size = beam_size
+        self.num_results_per_sample = num_results_per_sample
+        super(RecurrentLayerGroupSetGeneratorV2, self).__init__(
+            name=eos_name, parent_layers={})
+
+    def to_proto_impl(self, **kwargs):
+        RecurrentLayerGroupSetGenerator(
+            Generator(
+                eos_layer_name=self.eos_name,
+                max_num_frames=self.max_length,
+                beam_size=self.beam_size,
+                num_results_per_sample=self.num_results_per_sample))
+        return self
+
+    def context_name(self):
+        return self.eos_name + ".fake"
+
+    def use_context_name(self):
+        return True
+
+
+@wrap_name_default()
+def beam_search(step,
+                input,
+                bos_id,
+                eos_id,
+                beam_size,
+                max_length=500,
+                name=None,
+                num_results_per_sample=None):
+    if num_results_per_sample is None:
+        num_results_per_sample = beam_size
+    assert num_results_per_sample <= beam_size
+    # logger.warning("num_results_per_sample should be less than beam_size")
+
+    if isinstance(input, paddle.layer.StaticInputV2) or isinstance(input, BaseGeneratedInputV2):
+        input = [input]
+
+    generated_input_index = -1
+
+    real_input = []
+    for i, each_input in enumerate(input):
+        assert isinstance(each_input, paddle.layer.StaticInputV2) or isinstance(
+            each_input, BaseGeneratedInputV2)
+        if isinstance(each_input, BaseGeneratedInputV2):
+            assert generated_input_index == -1
+            generated_input_index = i
+        else:
+            real_input.append(each_input)
+
+    assert generated_input_index != -1
+
+    gipt = input[generated_input_index]
+    assert isinstance(gipt, BaseGeneratedInputV2)
+
+    gipt.bos_id = bos_id
+    gipt.eos_id = eos_id
+
+    def __real_step__(*args):
+        eos_name = "__%s_eos_layer__" % name
+        generator = RecurrentLayerGroupSetGeneratorV2(
+            eos_name, max_length, beam_size, num_results_per_sample)
+
+        args = list(args)
+        before_step_layer = gipt.before_real_step()
+        before_step_layer.append_child(layer=generator,
+                                       parent_names=[before_step_layer.name])
+        args.insert(generated_input_index, before_step_layer)
+
+        predict = gipt.after_real_step(step(*args))
+
+        eos = paddle.layer.eos(input=predict, eos_id=eos_id, name=eos_name)
+        predict.append_child(layer=eos, parent_names=[predict.name])
+
+        return predict
+
+    # tmp = paddle.layer.recurrent_group(
+    #     step=__real_step__,
+    #     input=real_input,
+    #     reverse=False,
+    #     name=name,
+    #     is_generating=True)
+    tmp = paddle.layer.recurrent_group(
+        step=__real_step__,
+        input=real_input,
+        name=name)
+
+    return tmp
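
Putting the pieces together, the intended call pattern for the new module mirrors the generation branch of api_train_v2.py above; everything below is an illustrative sketch rather than part of the patch, and bos_id=0 / eos_id=1 follow the demo's <s> / <e> dictionary convention:

    import paddle.v2.layers.beam_search as beam_search

    trg_embedding = beam_search.GeneratedInputV2(
        size=target_dict_dim,  # target vocabulary size
        embedding_name='_target_language_embedding',
        embedding_size=word_vector_dim)
    group_inputs.append(trg_embedding)

    beam_gen = beam_search.beam_search(
        name=decoder_group_name,
        step=gru_decoder_with_attention,  # receives the last generated word's embedding
        input=group_inputs,
        bos_id=0,  # <s>
        eos_id=1,  # <e>
        beam_size=3,
        max_length=250)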