From 3f1151a54cf06acb7176b0c59671585b8276e650 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 25 Apr 2017 19:04:22 +0800 Subject: [PATCH] Add error clipping to MT demo. * Compose GRU step naive layer in trainer config helpers. * It is uses mixed_layer for gate. * It supports ERROR_CLIPPING, DROPOUT * Add error clipping in MT demo. * Fix #1143 * Fix #1891 --- demo/seqToseq/seqToseq_net.py | 23 ++++-- demo/seqToseq/translation/train.conf | 1 - python/paddle/trainer_config_helpers/attrs.py | 15 ++-- .../paddle/trainer_config_helpers/layers.py | 75 ++++++++++++++++++- .../paddle/trainer_config_helpers/networks.py | 22 ++++-- 5 files changed, 117 insertions(+), 19 deletions(-) diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py index e523a34d5a9..3d1f86ec3b7 100644 --- a/demo/seqToseq/seqToseq_net.py +++ b/demo/seqToseq/seqToseq_net.py @@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf, encoder_size=512, decoder_size=512, beam_size=3, - max_length=250): + max_length=250, + error_clipping=50): """ A wrapper for an attention version of GRU Encoder-Decoder network is_generating: whether this config is used for generating @@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf, input=src_word_id, size=word_vector_dim, param_attr=ParamAttr(name='_source_language_embedding')) - src_forward = simple_gru(input=src_embedding, size=encoder_size) + src_forward = simple_gru( + input=src_embedding, + size=encoder_size, + naive=True, + gru_layer_attr=ExtraLayerAttribute( + error_clipping_threshold=error_clipping)) src_backward = simple_gru( - input=src_embedding, size=encoder_size, reverse=True) + input=src_embedding, + size=encoder_size, + reverse=True, + naive=True, + gru_layer_attr=ExtraLayerAttribute( + error_clipping_threshold=error_clipping)) encoded_vector = concat_layer(input=[src_forward, src_backward]) with mixed_layer(size=decoder_size) as encoded_proj: @@ -117,11 +128,13 @@ def gru_encoder_decoder(data_conf, decoder_inputs += full_matrix_projection(input=context) decoder_inputs += full_matrix_projection(input=current_word) - gru_step = gru_step_layer( + gru_step = gru_step_naive_layer( name='gru_decoder', input=decoder_inputs, output_mem=decoder_mem, - size=decoder_size) + size=decoder_size, + layer_attr=ExtraLayerAttribute( + error_clipping_threshold=error_clipping)) with mixed_layer( size=target_dict_dim, bias_attr=True, diff --git a/demo/seqToseq/translation/train.conf b/demo/seqToseq/translation/train.conf index 72b7ccdbb95..0718f00f681 100644 --- a/demo/seqToseq/translation/train.conf +++ b/demo/seqToseq/translation/train.conf @@ -28,7 +28,6 @@ train_conf = seq_to_seq_data(data_dir = data_dir, ### Algorithm Configuration settings( - learning_method = AdamOptimizer(), batch_size = 50, learning_rate = 5e-4) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index bf020883460..7b76e87f045 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -208,12 +208,15 @@ class ExtraLayerAttribute(object): drop_rate=None, device=None): self.attr = dict() - if isinstance(error_clipping_threshold, float): - assert error_clipping_threshold > 0 - self.attr["error_clipping_threshold"] = error_clipping_threshold - - if isinstance(drop_rate, float): - assert drop_rate > 0 + if error_clipping_threshold is not None: + error_clipping_threshold = float(error_clipping_threshold) + if error_clipping_threshold < 0: + raise ValueError("Error clipping must > 0") + self.attr['error_clipping_threshold'] = error_clipping_threshold + if drop_rate is not None: + drop_rate = float(drop_rate) + if drop_rate < 0: + raise ValueError("Dropout rate must > 0") self.attr["drop_rate"] = drop_rate if isinstance(device, int): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index f906126d879..635e280ca51 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -84,6 +84,7 @@ __all__ = [ 'GeneratedInput', 'SubsequenceInput', 'gru_step_layer', + 'gru_step_naive_layer', 'recurrent_layer', 'BaseGeneratedInput', 'conv_operator', @@ -2284,7 +2285,7 @@ def img_pool_layer(input, type_name = pool_type.name + '-projection' \ if ( - isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \ + isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \ else pool_type.name pool_size_y = pool_size if pool_size_y is None else pool_size_y @@ -3084,6 +3085,78 @@ def gru_step_layer(input, activation=act) +@wrap_bias_attr_default() +@wrap_param_attr_default() +@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) +@wrap_act_default(act=TanhActivation()) +@wrap_name_default('gru_step') +@layer_support(ERROR_CLIPPING, DROPOUT) +def gru_step_naive_layer(input, + output_mem, + size=None, + name=None, + act=None, + gate_act=None, + bias_attr=None, + param_attr=None, + layer_attr=None): + """ + GRU Step Layer, but using MixedLayer to generate. It support ERROR_CLIPPING + and DROPOUT. + + :param input: + :param output_mem: + :param size: + :param name: + :param act: + :param gate_act: + :param bias_attr: + :param param_attr: + :param layer_attr: + :return: + """ + if input.size % 3 != 0: + raise ValueError("GruStep input size must be divided by 3") + if size is None: + size = input.size / 3 + + def __gate__(gate_name, offset): + with mixed_layer( + name=name + "_" + gate_name, + size=size, + layer_attr=layer_attr, + bias_attr=bias_attr, + act=gate_act) as gate: + gate += identity_projection(input=input, offset=offset) + gate += full_matrix_projection( + input=output_mem, param_attr=param_attr) + return gate + + update_gate = __gate__("update", 0) + reset_gate = __gate__("reset", size) + + with mixed_layer( + name=name + "_reset_output", bias_attr=False) as reset_output: + reset_output += dotmul_operator(a=output_mem, b=reset_gate) + + with mixed_layer( + name=name + "_output_candidate", + size=size, + layer_attr=layer_attr, + bias_attr=bias_attr, + act=act) as output_candidate: + output_candidate += identity_projection(input=input, offset=2 * size) + output_candidate += full_matrix_projection( + input=reset_output, param_attr=param_attr) + + with mixed_layer(name=name) as output: + output += identity_projection(output_mem) + output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0) + output += dotmul_operator(a=output_candidate, b=update_gate) + + return output + + @wrap_name_default() @layer_support() def get_output_layer(input, arg_name, name=None, layer_attr=None): diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index cadde11ff81..fb533a47e0b 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -825,7 +825,8 @@ def gru_unit(input, gru_param_attr=None, act=None, gate_act=None, - gru_layer_attr=None): + gru_layer_attr=None, + naive=False): """ Define calculations that a gated recurrent unit performs in a single time step. This function itself is not a recurrent layer, so that it can not be @@ -857,7 +858,12 @@ def gru_unit(input, out_mem = memory(name=name, size=size) - gru_out = gru_step_layer( + if naive: + __step__ = gru_step_naive_layer + else: + __step__ = gru_step_layer + + gru_out = __step__( name=name, input=input, output_mem=out_mem, @@ -879,7 +885,8 @@ def gru_group(input, gru_param_attr=None, act=None, gate_act=None, - gru_layer_attr=None): + gru_layer_attr=None, + naive=False): """ gru_group is a recurrent layer group version of Gated Recurrent Unit. It does exactly the same calculation as the grumemory layer does. A promising @@ -928,7 +935,8 @@ def gru_group(input, gru_param_attr=gru_param_attr, act=act, gate_act=gate_act, - gru_layer_attr=gru_layer_attr) + gru_layer_attr=gru_layer_attr, + naive=naive) return recurrent_group( name='%s_recurrent_group' % name, @@ -949,7 +957,8 @@ def simple_gru(input, gru_param_attr=None, act=None, gate_act=None, - gru_layer_attr=None): + gru_layer_attr=None, + naive=False): """ You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group, simple_gru in network.py. The reason why there are so many interfaces is @@ -1018,7 +1027,8 @@ def simple_gru(input, gru_param_attr=gru_param_attr, act=act, gate_act=gate_act, - gru_layer_attr=gru_layer_attr) + gru_layer_attr=gru_layer_attr, + naive=naive) @wrap_name_default('simple_gru2') -- GitLab