diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py
index 5d138a8c4f91976d90b19441781248f7b67c854a..2809054e7d3a367f441188fe7f91037cfa5f1579 100644
--- a/demo/seqToseq/api_train_v2.py
+++ b/demo/seqToseq/api_train_v2.py
@@ -1,13 +1,17 @@
 import sys
+
 import paddle.v2 as paddle
 
 
-def seqToseq_net(source_dict_dim, target_dict_dim):
+def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
     ### Network Architecture
     word_vector_dim = 512  # dimension of word vector
     decoder_size = 512  # dimension of hidden unit in GRU Decoder network
     encoder_size = 512  # dimension of hidden unit in GRU Encoder network
+    beam_size = 3
+    max_length = 250
+
     #### Encoder
     src_word_id = paddle.layer.data(
         name='source_language_word',
@@ -67,30 +71,57 @@ def seqToseq_net(source_dict_dim, target_dict_dim):
     group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
     group_inputs = [group_input1, group_input2]
 
-    trg_embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name='target_language_word',
-            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
-        size=word_vector_dim,
-        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
-    group_inputs.append(trg_embedding)
-
-    # For decoder equipped with attention mechanism, in training,
-    # target embeding (the groudtruth) is the data input,
-    # while encoded source sequence is accessed to as an unbounded memory.
-    # Here, the StaticInput defines a read-only memory
-    # for the recurrent_group.
-    decoder = paddle.layer.recurrent_group(
-        name=decoder_group_name,
-        step=gru_decoder_with_attention,
-        input=group_inputs)
-
-    lbl = paddle.layer.data(
-        name='target_language_next_word',
-        type=paddle.data_type.integer_value_sequence(target_dict_dim))
-    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
-
-    return cost
+    if not is_generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
+
+        # For a decoder equipped with an attention mechanism, in training,
+        # the target embedding (the ground truth) is the data input,
+        # while the encoded source sequence is accessed as an unbounded memory.
+        # Here, the StaticInput defines a read-only memory
+        # for the recurrent_group.
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        # In generation, the decoder predicts the next target word based on
+        # the encoded source sequence and the last generated target word.
+
+        # The encoded source sequence (the encoder's output) must be specified
+        # by StaticInput, which is a read-only memory.
+        # The embedding of the last generated word is automatically retrieved
+        # by GeneratedInput, which is initialized by a start mark, such as <s>,
+        # and must be included in generation.
+
+        trg_embedding = paddle.layer.GeneratedInputV2(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+
+        return beam_gen
 
 
 def main():
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
index cb98866d874e35b9fda8e170004d687329b7d3e3..b0e8da563e0d65d534d3f224fe5f1c39a67eeb54 100644
--- a/python/paddle/v2/config_base.py
+++ b/python/paddle/v2/config_base.py
@@ -67,7 +67,16 @@ class Layer(object):
         self.name = name
         self.__context__ = {}
         self.__parent_layers__ = parent_layers
-        self.__children_layers__ = []  # used for evaluator.
+        # some layers may have extra parent layers
+        self.__extra_parent__ = []
+        # used for evaluator.
+        self.__children_layers__ = []
+
+    def extra_parent(self):
+        return self.__extra_parent__
+
+    def append_extra_parent(self, parent):
+        self.__extra_parent__.append(parent)
 
     def append_child(self, layer, parent_names):
         self.__children_layers__.append((layer, parent_names))
@@ -78,14 +87,20 @@ class Layer(object):
         """
         self.__context__ = context
 
-        # short cut if myself is parsed before.
+        # STEP: short cut if this layer has been parsed before.
         if self.context_name() in context:
             if self.use_context_name():
                 return context[self.context_name()]
             else:
                 return context[self.name]
 
-        # parse parent before myself
+        # STEP: parse the extra parents that are not used by this layer but
+        # must be parsed before it.
+        for p in self.__extra_parent__:
+            p.to_proto(context=context)
+
+        # STEP: parse the parents that are used by this layer, and insert the
+        # results into the kwargs of this layer's to_proto_impl method.
         kwargs = dict()
         for layer_name in self.__parent_layers__:
             if not isinstance(self.__parent_layers__[layer_name],
@@ -97,14 +112,13 @@ class Layer(object):
                                self.__parent_layers__[layer_name])
             kwargs[layer_name] = v1_layer
 
-        # parse myself.
+        # STEP: parse myself and add the result into context.
         ret_val = self.to_proto_impl(**kwargs)
-
-        if self.context_name() is not None and \
-                self.context_name() not in context:
+        if self.context_name() is not None \
+                and self.context_name() not in context:
             context[self.context_name()] = ret_val
 
-        # parse children.
+        # STEP: parse children that should be parsed after this layer.
         for layer, pnames in self.__children_layers__:
             drop = False
 
@@ -117,6 +131,7 @@ class Layer(object):
                 continue
             layer.to_proto(context=context)
 
+        # STEP: return the v1 layer result
         if self.context_name() is None:
             return ret_val
         elif self.use_context_name():
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 141f7b359196499ee61e6151320fd75dc8b78c31..384de9b9d57f88e84ab6067846174bb037502dc0 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -33,22 +33,25 @@ The primary usage shows below.
 import collections
 import inspect
-from config_base import Layer, __convert_to_v2__
+import re
+
 import paddle.trainer_config_helpers as conf_helps
+from paddle.trainer.config_parser import \
+    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
+    RecurrentLayerGroupEnd, model_type
 from paddle.trainer_config_helpers.config_parser_utils import \
     parse_network_config as __parse__
 from paddle.trainer_config_helpers.default_decorators import wrap_act_default
 from paddle.trainer_config_helpers.default_decorators import \
     wrap_bias_attr_default
 from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+from paddle.trainer_config_helpers.layers import RecurrentLayerGroupSetGenerator, Generator
 from paddle.trainer_config_helpers.layers import layer_support
-from paddle.trainer.config_parser import \
-    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
-    RecurrentLayerGroupEnd, model_type
 
 import activation
-import re
+import attr
 import data_type
+from config_base import Layer, __convert_to_v2__
 
 __all__ = ['parse_network', 'data']
 
@@ -132,54 +135,23 @@ class DataLayerV2(Layer):
         return doc
 
 
-class WithExtraParent(Layer):
-    def extra_parent(self):
-        return self.__extra_parent__
-
-    def __init__(self, name=None, parent_layers=None):
-        self.__extra_parent__ = []
-        super(WithExtraParent, self).__init__(
-            name=name, parent_layers=parent_layers)
-
-    def append_extra_parent(self, parent):
-        self.__extra_parent__.append(parent)
-
-    def to_proto(self, context):
+class MemoryV2(Layer):
+    def __init__(self, name, extra_input=None, **kwargs):
         """
-        function to set proto attribute
+        Init memory object. If the memory is initialized inside a
+        recurrent_group step function, it may depend on a boot_layer that
+        should be initialized outside recurrent_group, so we:
+        1. add RecurrentLayerInput to the extra_parent of self.
+        2. add boot_layer to the extra_parent of RecurrentLayerInput.
+
+        :param extra_input: list of RecurrentLayerInput
+        :type extra_input: [RecurrentLayerInput]
         """
-        kwargs = dict()
-        for p in self.__extra_parent__:
-            p.to_proto(context=context)
-
-        for layer_name in self.__parent_layers__:
-            if not isinstance(self.__parent_layers__[layer_name],
-                              collections.Sequence):
-                v1_layer = self.__parent_layers__[layer_name].to_proto(
-                    context=context)
-            else:
-                v1_layer = map(lambda x: x.to_proto(context=context),
-                               self.__parent_layers__[layer_name])
-            kwargs[layer_name] = v1_layer
-
-        if self.context_name() is None:
-            return self.to_proto_impl(context=context, **kwargs)
-        elif self.context_name() not in context:
-            context[self.context_name()] = self.to_proto_impl(
-                context=context, **kwargs)
-
-        if self.use_context_name():
-            return context[self.context_name()]
-        else:
-            return context[self.name]
-
-
-class MemoryV2(WithExtraParent):
-    def __init__(self, name, **kwargs):
         self.name = name
         super(MemoryV2, self).__init__(name=name, parent_layers=dict())
         self.__kwargs__ = kwargs
         self.__boot_layer_name__ = None
+
         if 'boot_layer' in kwargs:
             begin_of_current_rnn = []
             # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
@@ -202,11 +174,10 @@ class MemoryV2(WithExtraParent):
             assert begin_of_current_rnn is not None
             for extra in begin_of_current_rnn:
                 self.append_extra_parent(extra)
-                assert isinstance(extra, WithExtraParent)
                 extra.append_extra_parent(kwargs['boot_layer'])
                 self.__boot_layer_name__ = kwargs['boot_layer'].name
 
-    def to_proto_impl(self, context, **kwargs):
+    def to_proto_impl(self, **kwargs):
         args = dict()
         for each in kwargs:
             args[each] = kwargs[each]
@@ -214,7 +185,7 @@ class MemoryV2(WithExtraParent):
                 args[each] = self.__kwargs__[each]
 
         if self.__boot_layer_name__ is not None:
-            args['boot_layer'] = context[self.__boot_layer_name__]
+            args['boot_layer'] = self.__context__[self.__boot_layer_name__]
 
         size = args.get('size', None)
         if size is not None:
@@ -236,22 +207,6 @@ class MemoryV2(WithExtraParent):
         return True
 
 
-class LayerOutputV2(Layer):
-    """
-    LayerOutputV2 is used to store the result of LayerOutput in v1 api.
-    It will not store it's parents because layer_output has been parsed already.
- """ - - def __init__(self, layer_output): - assert isinstance(layer_output, conf_helps.LayerOutput) - self.layer_output = layer_output - super(LayerOutputV2, self).__init__( - name=layer_output.name, parent_layers=dict()) - - def to_proto_impl(self): - return self.layer_output - - class StaticInputV2(object): def __init__(self, input, is_seq=False, size=None): assert isinstance(input, LayerV2) @@ -263,6 +218,66 @@ class StaticInputV2(object): # assert input.size is not None or size is not None +class BaseGeneratedInputV2(object): + def __init__(self): + self.bos_id = None + self.eos_id = None + + def before_real_step(self): + raise NotImplementedError() + + def after_real_step(self, *args): + raise NotImplementedError() + + +class GeneratedInputV2(BaseGeneratedInputV2): + def __init__(self, size, embedding_name, embedding_size): + super(GeneratedInputV2, self).__init__() + self.size = size + self.embedding_name = embedding_name + self.embedding_size = embedding_size + + def after_real_step(self, input): + return max_id(input=input, name='__beam_search_predict__') + + def before_real_step(self): + predict_id = memory( + name='__beam_search_predict__', + size=self.size, + boot_with_const_id=self.bos_id) + + trg_emb = embedding( + input=predict_id, + size=self.embedding_size, + param_attr=attr.ParamAttr(name=self.embedding_name)) + return trg_emb + + +class RecurrentLayerGroupSetGeneratorV2(Layer): + def __init__(self, eos_name, max_length, beam_size, num_results_per_sample): + self.eos_name = eos_name + self.max_length = max_length + self.beam_size = beam_size + self.num_results_per_sample = num_results_per_sample + super(RecurrentLayerGroupSetGeneratorV2, self).__init__( + name=eos_name, parent_layers={}) + + def to_proto_impl(self, **kwargs): + RecurrentLayerGroupSetGenerator( + Generator( + eos_layer_name=self.eos_name, + max_num_frames=self.max_length, + beam_size=self.beam_size, + num_results_per_sample=self.num_results_per_sample)) + return self + + def context_name(self): + return self.eos_name + ".fake" + + def use_context_name(self): + return True + + class MixedLayerV2(Layer): """ This class is use to support `with` grammar. 
     This class is use to support `with` grammar. If not, the following code
@@ -341,18 +356,24 @@ def mixed(size=0,
     return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
 
 
-class RecurrentLayerInput(WithExtraParent):
+class RecurrentLayerInput(Layer):
     def __init__(self, recurrent_name, index, parent_layers):
-        assert len(parent_layers) == 1
-        self.__parents__ = parent_layers.values()[0]
-        super(RecurrentLayerInput, self).__init__(
-            name=self.__parents__[index].name, parent_layers=parent_layers)
+        parents_len = len(parent_layers)
+        assert parents_len <= 1
+        if parents_len == 0:
+            self.__parents__ = []
+        else:
+            self.__parents__ = parent_layers.values()[0]
         self.__recurrent_name__ = recurrent_name
+        name = self.__parents__[
+            index].name if index >= 0 else self.context_name()
+        super(RecurrentLayerInput, self).__init__(
+            name=name, parent_layers=parent_layers)
 
     def context_name(self):
         return self.__recurrent_name__ + ".begin"
 
-    def to_proto_impl(self, context, **kwargs):
+    def to_proto_impl(self, **kwargs):
         model_type('recurrent_nn')
         RecurrentLayerGroupWithoutOutLinksBegin(
             name=self.__recurrent_name__,
@@ -449,6 +470,11 @@ def recurrent_group(step, input, name=None):
         for i in xrange(len(non_static_inputs))
     ]
 
+    extra_input = None
+    if len(non_static_inputs) == 0:
+        extra_input = RecurrentLayerInput(
+            recurrent_name=name, index=-1, parent_layers={})
+
     def __real_step__(*args):
         rnn_input = list(args)
         static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
@@ -456,6 +482,7 @@ def recurrent_group(step, input, name=None):
             mem_name = "__%s_memory__" % static_input.input.name
             mem = memory(
                 name=mem_name,
+                extra_input=extra_input,
                 is_seq=static_input.is_seq,
                 size=static_input.input.calculate_size,
                 boot_layer=static_input.input)
@@ -485,6 +512,73 @@ def recurrent_group(step, input, name=None):
     return retv
 
 
+@wrap_name_default()
+def beam_search(step,
+                input,
+                bos_id,
+                eos_id,
+                beam_size,
+                max_length=500,
+                name=None,
+                num_results_per_sample=None):
+    if num_results_per_sample is None:
+        num_results_per_sample = beam_size
+    assert num_results_per_sample <= beam_size
+    # logger.warning("num_results_per_sample should be less than beam_size")
+
+    if isinstance(input, StaticInputV2) or isinstance(input,
+                                                      BaseGeneratedInputV2):
+        input = [input]
+
+    generated_input_index = -1
+
+    real_input = []
+    for i, each_input in enumerate(input):
+        assert isinstance(each_input, StaticInputV2) or isinstance(
+            each_input, BaseGeneratedInputV2)
+        if isinstance(each_input, BaseGeneratedInputV2):
+            assert generated_input_index == -1
+            generated_input_index = i
+        else:
+            real_input.append(each_input)
+
+    assert generated_input_index != -1
+
+    gipt = input[generated_input_index]
+    assert isinstance(gipt, BaseGeneratedInputV2)
+
+    gipt.bos_id = bos_id
+    gipt.eos_id = eos_id
+
+    def __real_step__(*args):
+        eos_name = "__%s_eos_layer__" % name
+        generator = RecurrentLayerGroupSetGeneratorV2(
+            eos_name, max_length, beam_size, num_results_per_sample)
+
+        args = list(args)
+        before_step_layer = gipt.before_real_step()
+        before_step_layer.append_child(
+            layer=generator, parent_names=[before_step_layer.name])
+        args.insert(generated_input_index, before_step_layer)
+
+        predict = gipt.after_real_step(step(*args))
+
+        eos_layer = eos(input=predict, eos_id=eos_id, name=eos_name)
+        predict.append_child(layer=eos_layer, parent_names=[predict.name])
+
+        return predict
+
+    # tmp = paddle.layer.recurrent_group(
+    #     step=__real_step__,
+    #     input=real_input,
+    #     reverse=False,
+    #     name=name,
+    #     is_generating=True)
+    tmp = recurrent_group(step=__real_step__, input=real_input, name=name)
+
+    return tmp
+
+
 __projection_names__ = filter(lambda x: x.endswith('_projection'),
                               dir(conf_helps))
 
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index 07e0fb74289b649653b42488f0e1c65af8430f72..737b6bf1e2eb60281d4d6e92667d9fe91e243704 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -17,7 +17,6 @@ import collections
 
 from paddle.proto.ModelConfig_pb2 import ModelConfig
 
 import layer as v2_layer
-from layer import WithExtraParent
 
 __all__ = ['Topology']
 
@@ -41,9 +40,8 @@ def __bfs_travel__(callback, *layers):
         __break__ = callback(each_layer)
         if __break__:
             return
-        __layers__ = each_layer.__parent_layers__.values()
-        if isinstance(each_layer, WithExtraParent):
-            __layers__ = __layers__ + each_layer.extra_parent()
+        __layers__ = each_layer.__parent_layers__.values() + \
+            each_layer.extra_parent()
         __bfs_travel__(callback, *__layers__)
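
Usage sketch (illustrative, not part of the patch): with the changes above, the same seqToseq_net builds either the training cost or the beam-search generator, and either result can be lowered to a protobuf config through paddle.layer.parse_network. The dictionary size and the "from api_train_v2 import seqToseq_net" wiring below are assumptions made for the example, not something this diff prescribes.

    import paddle.v2 as paddle

    # seqToseq_net as patched in demo/seqToseq/api_train_v2.py above; importing
    # it this way assumes the demo directory is on PYTHONPATH (illustrative).
    from api_train_v2 import seqToseq_net

    paddle.init(use_gpu=False, trainer_count=1)

    dict_size = 30000  # illustrative dictionary size

    # Training graph: returns a classification cost over the gold target words.
    cost = seqToseq_net(dict_size, dict_size)

    # Generation graph: returns the output of paddle.layer.beam_search instead.
    beam_gen = seqToseq_net(dict_size, dict_size, is_generating=True)

    # Both are ordinary v2 layers, so they can be parsed into a ModelConfig.
    print paddle.layer.parse_network(beam_gen)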