diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index fdb6f83f2ba510232714fb8a9c7c1af837a753ff..21eba71527e60833e0c69b344ecc639626faa529 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -3173,11 +3173,11 @@ def memory(name, @wrap_bias_attr_default() -@wrap_act_default( - param_names=['gate_act', 'state_act'], act=SigmoidActivation()) +@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) +@wrap_act_default(param_names=['state_act'], act=TanhActivation()) @wrap_act_default(act=TanhActivation()) @wrap_name_default('lstm_step') -@layer_support() +@layer_support(ERROR_CLIPPING, DROPOUT) def lstm_step_layer(input, state, size=None, @@ -3531,12 +3531,7 @@ def SubsequenceInput(input): @wrap_name_default("recurrent_group") -def recurrent_group(step, - input, - reverse=False, - name=None, - targetInlink=None, - is_generating=False): +def recurrent_group(step, input, reverse=False, name=None, targetInlink=None): """ Recurrent layer group is an extremely flexible recurrent unit in PaddlePaddle. As long as the user defines the calculation done within a @@ -3602,21 +3597,12 @@ def recurrent_group(step, :type targetInlink: LayerOutput|SubsequenceInput - :param is_generating: If is generating, none of input type should be LayerOutput; - else, for training or testing, one of the input type must - be LayerOutput. - - :type is_generating: bool - :return: LayerOutput object. :rtype: LayerOutput """ model_type('recurrent_nn') - def is_single_input(x): - return isinstance(x, LayerOutput) or isinstance(x, StaticInput) - - if is_single_input(input): + if isinstance(input, LayerOutput) or isinstance(input, StaticInput): input = [input] assert isinstance(input, collections.Sequence) @@ -3630,13 +3616,8 @@ def recurrent_group(step, in_links=map(lambda x: x.name, in_links), seq_reversed=reverse) in_args = [] - has_LayerOutput = False for each_input in input: - assert is_single_input(each_input) - if isinstance(each_input, LayerOutput): - in_args.append(each_input) - has_LayerOutput = True - else: # StaticInput + if isinstance(each_input, StaticInput): # StaticInput mem_name = "__%s_memory__" % each_input.input.name mem = memory( name=None, @@ -3644,24 +3625,26 @@ def recurrent_group(step, boot_layer=each_input.input) mem.set_input(mem) in_args.append(mem) - - assert (is_generating != has_LayerOutput) + else: + in_args.append(each_input) layer_outs = step(*in_args) if isinstance(layer_outs, LayerOutput): layer_outs = [layer_outs] - for ot in layer_outs: - assert isinstance(ot, LayerOutput) - ot.reverse = reverse - RecurrentLayerGroupSetOutLink(ot.name) + for layer_out in layer_outs: + assert isinstance( + layer_out, LayerOutput + ), "Type of step function's return value must be LayerOutput." + layer_out.reverse = reverse + RecurrentLayerGroupSetOutLink(layer_out.name) RecurrentLayerGroupEnd(name=name) for layer_out in layer_outs: - # Thee previous full_name is the name is the rnn group - # We need a full_name outside the rnn group + # The previous full_name is the name inside the recurrent group. + # We need a full_name outside the recurrent group. layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name) if len(layer_outs) == 1: @@ -3684,7 +3667,20 @@ class BaseGeneratedInput(object): class GeneratedInput(BaseGeneratedInput): def after_real_step(self, input): - return maxid_layer(input=input, name='__beam_search_predict__') + if isinstance(input, LayerOutput): + input = [input] + elif isinstance(input, collections.Sequence): + input = list(input) + if len(input) > 1: + logger.info( + ("More than one layers inside the recurrent_group " + "are returned as outputs of the entire recurrent_group " + "PLEASE garantee the first output is probability of " + "the predicted next word.")) + + return [maxid_layer( + input=input[0], name='__beam_search_predict__')] + ( + input[1:] if len(input) > 1 else []) def before_real_step(self): predict_id = memory( @@ -3871,6 +3867,7 @@ def beam_search(step, :type step: callable :param input: Input data for the recurrent unit, which should include the previously generated words as a GeneratedInput object. + In beam_search, none of the input's type should be LayerOutput. :type input: list :param bos_id: Index of the start symbol in the dictionary. The start symbol is a special token for NLP task, which indicates the @@ -3912,15 +3909,18 @@ def beam_search(step, real_input = [] for i, each_input in enumerate(input): - assert isinstance(each_input, StaticInput) or isinstance( - each_input, BaseGeneratedInput) + assert not isinstance(each_input, LayerOutput), ( + "in beam_search, " + "none of the input should has a type of LayerOutput.") if isinstance(each_input, BaseGeneratedInput): - assert generated_input_index == -1 + assert generated_input_index == -1, ("recurrent_group accepts " + "only one GeneratedInput.") generated_input_index = i + else: real_input.append(each_input) - assert generated_input_index != -1 + assert generated_input_index != -1, "No GeneratedInput is given." gipt = input[generated_input_index] @@ -3941,17 +3941,11 @@ def beam_search(step, predict = gipt.after_real_step(step(*args)) - eos_layer(input=predict, eos_id=eos_id, name=eos_name) + eos_layer(input=predict[0], eos_id=eos_id, name=eos_name) return predict - tmp = recurrent_group( - step=__real_step__, - input=real_input, - reverse=False, - name=name, - is_generating=True) - - return tmp + return recurrent_group( + step=__real_step__, input=real_input, reverse=False, name=name) def __cost_input__(input, label, weight=None): diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 810bea913ec79b2df0eb63ed5a4fd411549ff2e9..dcc4fec4f3313f2ad10073dcecbc015be4021abd 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -614,18 +614,17 @@ def simple_lstm(input, @wrap_name_default('lstm_unit') def lstmemory_unit(input, - memory_boot=None, + out_memory=None, name=None, size=None, param_attr=None, act=None, gate_act=None, state_act=None, - mixed_bias_attr=None, + input_proj_bias_attr=None, + input_proj_layer_attr=None, lstm_bias_attr=None, - mixed_layer_attr=None, - lstm_layer_attr=None, - get_output_layer_attr=None): + lstm_layer_attr=None): """ Define calculations that a LSTM unit performs during a single time step. This function itself is not a recurrent layer, so it can not be @@ -662,8 +661,8 @@ def lstmemory_unit(input, :param input: input layer name. :type input: LayerOutput - :param memory_boot: the initialization state of the LSTM cell. - :type memory_boot: LayerOutput | None + :param out_memory: output of previous time step + :type out_memory: LayerOutput | None :param name: lstmemory unit name. :type name: basestring :param size: lstmemory unit size. @@ -676,33 +675,35 @@ def lstmemory_unit(input, :type gate_act: BaseActivation :param state_act: lstm state activiation type. :type state_act: BaseActivation - :param mixed_bias_attr: bias parameter attribute of mixed layer. - False means no bias, None means default bias. - :type mixed_bias_attr: ParameterAttribute|False + :param input_proj_bias_attr: bias attribute for input-to-hidden projection. + False means no bias, None means default bias. + :type input_proj_bias_attr: ParameterAttribute|False|None + :param input_proj_layer_attr: extra layer attribute for input to hidden + projection of the LSTM unit, such as dropout, error clipping. + :type input_proj_layer_attr: ExtraLayerAttribute :param lstm_bias_attr: bias parameter attribute of lstm layer. - False means no bias, None means default bias. + False means no bias, None means default bias. :type lstm_bias_attr: ParameterAttribute|False - :param mixed_layer_attr: mixed layer's extra attribute. - :type mixed_layer_attr: ExtraLayerAttribute :param lstm_layer_attr: lstm layer's extra attribute. :type lstm_layer_attr: ExtraLayerAttribute - :param get_output_layer_attr: get output layer's extra attribute. - :type get_output_layer_attr: ExtraLayerAttribute :return: lstmemory unit name. :rtype: LayerOutput """ if size is None: assert input.size % 4 == 0 size = input.size / 4 - out_mem = memory(name=name, size=size) - state_mem = memory( - name="%s_state" % name, size=size, boot_layer=memory_boot) + if out_memory is None: + out_mem = memory(name=name, size=size) + else: + out_mem = out_memory + + state_mem = memory(name="%s_state" % name, size=size) with mixed_layer( name="%s_input_recurrent" % name, size=size * 4, - bias_attr=mixed_bias_attr, - layer_attr=mixed_layer_attr, + bias_attr=input_proj_bias_attr, + layer_attr=input_proj_layer_attr, act=IdentityActivation()) as m: m += identity_projection(input=input) m += full_matrix_projection(input=out_mem, param_attr=param_attr) @@ -717,11 +718,7 @@ def lstmemory_unit(input, gate_act=gate_act, state_act=state_act, layer_attr=lstm_layer_attr) - get_output_layer( - name='%s_state' % name, - input=lstm_out, - arg_name='state', - layer_attr=get_output_layer_attr) + get_output_layer(name='%s_state' % name, input=lstm_out, arg_name='state') return lstm_out @@ -730,17 +727,16 @@ def lstmemory_unit(input, def lstmemory_group(input, size=None, name=None, - memory_boot=None, + out_memory=None, reverse=False, param_attr=None, act=None, gate_act=None, state_act=None, - mixed_bias_attr=None, + input_proj_bias_attr=None, + input_proj_layer_attr=None, lstm_bias_attr=None, - mixed_layer_attr=None, - lstm_layer_attr=None, - get_output_layer_attr=None): + lstm_layer_attr=None): """ lstm_group is a recurrent_group version of Long Short Term Memory. It does exactly the same calculation as the lstmemory layer (see lstmemory in @@ -774,8 +770,8 @@ def lstmemory_group(input, :type size: int :param name: name of the lstmemory group. :type name: basestring - :param memory_boot: the initialization state of LSTM cell. - :type memory_boot: LayerOutput | None + :param out_memory: output of previous time step + :type out_memory: LayerOutput | None :param reverse: is lstm reversed :type reverse: bool :param param_attr: Parameter config, None if use default. @@ -786,18 +782,17 @@ def lstmemory_group(input, :type gate_act: BaseActivation :param state_act: lstm state activiation type. :type state_act: BaseActivation - :param mixed_bias_attr: bias parameter attribute of mixed layer. - False means no bias, None means default bias. - :type mixed_bias_attr: ParameterAttribute|False :param lstm_bias_attr: bias parameter attribute of lstm layer. False means no bias, None means default bias. :type lstm_bias_attr: ParameterAttribute|False - :param mixed_layer_attr: mixed layer's extra attribute. - :type mixed_layer_attr: ExtraLayerAttribute + :param input_proj_bias_attr: bias attribute for input-to-hidden projection. + False means no bias, None means default bias. + :type input_proj_bias_attr: ParameterAttribute|False|None + :param input_proj_layer_attr: extra layer attribute for input to hidden + projection of the LSTM unit, such as dropout, error clipping. + :type input_proj_layer_attr: ExtraLayerAttribute :param lstm_layer_attr: lstm layer's extra attribute. :type lstm_layer_attr: ExtraLayerAttribute - :param get_output_layer_attr: get output layer's extra attribute. - :type get_output_layer_attr: ExtraLayerAttribute :return: the lstmemory group. :rtype: LayerOutput """ @@ -805,18 +800,17 @@ def lstmemory_group(input, def __lstm_step__(ipt): return lstmemory_unit( input=ipt, - memory_boot=memory_boot, name=name, size=size, - mixed_bias_attr=mixed_bias_attr, - mixed_layer_attr=mixed_layer_attr, - param_attr=param_attr, - lstm_bias_attr=lstm_bias_attr, act=act, gate_act=gate_act, state_act=state_act, + out_memory=out_memory, + input_proj_bias_attr=input_proj_bias_attr, + input_proj_layer_attr=input_proj_layer_attr, + param_attr=param_attr, lstm_layer_attr=lstm_layer_attr, - get_output_layer_attr=get_output_layer_attr) + lstm_bias_attr=lstm_bias_attr) return recurrent_group( name='%s_recurrent_group' % name, diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr index 7f2aa5a0fea1f4628e4effca5ce9af896f6e6c2c..75cf2312032e187dafc66199e933d3ad0fa33050 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr @@ -104,7 +104,7 @@ layers { } bias_parameter_name: "lstm_bias" active_gate_type: "sigmoid" - active_state_type: "sigmoid" + active_state_type: "tanh" } layers { name: "__lstm_group_0___state@__lstm_group_0___recurrent_group" @@ -183,7 +183,7 @@ layers { } bias_parameter_name: "lstm_bias" active_gate_type: "sigmoid" - active_state_type: "sigmoid" + active_state_type: "tanh" } layers { name: "__lstm_group_1___state@__lstm_group_1___recurrent_group" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr index af1b63c5dfbf0984a20eda02d608f76a454613c6..711785be37dbe7f2decc161d1b8e1ead62927b20 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr @@ -258,7 +258,7 @@ layers { } bias_parameter_name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias" active_gate_type: "sigmoid" - active_state_type: "sigmoid" + active_state_type: "tanh" } layers { name: "__lstm_group_0___state@__lstm_group_0___recurrent_group" diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py index 05810597b3154c3b287441465db16ee6e24b0ca2..565e281a6e1deff18aa48f97eb2f0e39ca79752f 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py @@ -20,12 +20,13 @@ lstm1 = lstmemory_group( input=m1, param_attr=lstm_param, lstm_bias_attr=lstm_bias, - mixed_bias_attr=False) + input_proj_bias_attr=False) + lstm2 = lstmemory_group( input=m2, param_attr=lstm_param, lstm_bias_attr=lstm_bias, - mixed_bias_attr=False) + input_proj_bias_attr=False) softmax_param = ParamAttr(name='softmax_param')