From 6da7283475355537be2a0775bd9df670351eb1c5 Mon Sep 17 00:00:00 2001 From: wangyang59 Date: Fri, 27 Jan 2017 16:11:53 -0800 Subject: [PATCH] make gru_group parameters sharable --- paddle/gserver/layers/GruStepLayer.cpp | 4 +- python/paddle/trainer/config_parser.py | 2 +- python/paddle/trainer/recurrent_units.py | 3 + .../paddle/trainer_config_helpers/layers.py | 4 +- .../paddle/trainer_config_helpers/networks.py | 6 + .../tests/configs/file_list.sh | 2 +- .../configs/protostr/shared_gru.protostr | 295 ++++++++++++++++++ .../configs/protostr/test_rnn_group.protostr | 8 +- .../tests/configs/shared_gru.py | 40 +++ 9 files changed, 355 insertions(+), 9 deletions(-) create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr create mode 100644 python/paddle/trainer_config_helpers/tests/configs/shared_gru.py diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp index 4a1006aa941..ce692c49088 100644 --- a/paddle/gserver/layers/GruStepLayer.cpp +++ b/paddle/gserver/layers/GruStepLayer.cpp @@ -68,8 +68,8 @@ bool GruStepLayer::init(const LayerMap& layerMap, if (!Layer::init(layerMap, parameterMap)) return false; CHECK_EQ(2U, inputLayers_.size()); - CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize()); - weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0])); + CHECK_EQ(getSize() * getSize() * 3, parameters_[1]->getSize()); + weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[1])); if (biasParameter_.get() != NULL) { CHECK_EQ(getSize() * 3, biasParameter_->getSize()); diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 6701eced60d..4fbf076ae98 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2996,7 +2996,7 @@ class GruStepLayer(LayerBase): config_assert(input_layer1.size == size, 'input_layer1.size != layer.size') self.config.active_gate_type = active_gate_type - self.create_input_parameter(0, size * size * 3, [size, size * 3]) + self.create_input_parameter(1, size * size * 3, [size, size * 3]) self.create_bias_parameter(bias, size * 3) diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py index edca279dcad..ff7e8932dca 100644 --- a/python/paddle/trainer/recurrent_units.py +++ b/python/paddle/trainer/recurrent_units.py @@ -19,6 +19,9 @@ # to use these units, import this module in your config_file: # import trainer.recurrent_units # +# The modules in this file are DEPRECATED. +# If you would like to use lstm/gru +# please use the functions defined in paddle.trainer_config_helpers. from paddle.trainer.config_parser import * diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 85a28e14aeb..f0b5d7c3b4f 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2682,6 +2682,7 @@ def lstm_step_layer(input, @wrap_bias_attr_default() +@wrap_param_attr_default() @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(act=TanhActivation()) @wrap_name_default('gru_step') @@ -2693,6 +2694,7 @@ def gru_step_layer(input, name=None, gate_act=None, bias_attr=None, + param_attr=None, layer_attr=None): """ @@ -2714,7 +2716,7 @@ def gru_step_layer(input, Layer( name=name, type=LayerType.GRU_STEP_LAYER, - inputs=[input.name, output_mem.name], + inputs=[input.name, Input(output_mem.name, **param_attr.attr)], bias=ParamAttr.to_bias(bias_attr), size=size, active_type=act.name, diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 375bea34e8a..88e188cb2b1 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -822,6 +822,7 @@ def gru_unit(input, size=None, name=None, gru_bias_attr=None, + gru_param_attr=None, act=None, gate_act=None, gru_layer_attr=None): @@ -862,6 +863,7 @@ def gru_unit(input, output_mem=out_mem, size=size, bias_attr=gru_bias_attr, + param_attr=gru_param_attr, act=act, gate_act=gate_act, layer_attr=gru_layer_attr) @@ -874,6 +876,7 @@ def gru_group(input, name=None, reverse=False, gru_bias_attr=None, + gru_param_attr=None, act=None, gate_act=None, gru_layer_attr=None): @@ -922,6 +925,7 @@ def gru_group(input, name=name, size=size, gru_bias_attr=gru_bias_attr, + gru_param_attr=gru_param_attr, act=act, gate_act=gate_act, gru_layer_attr=gru_layer_attr) @@ -942,6 +946,7 @@ def simple_gru(input, mixed_bias_param_attr=None, mixed_layer_attr=None, gru_bias_attr=None, + gru_param_attr=None, act=None, gate_act=None, gru_layer_attr=None): @@ -1010,6 +1015,7 @@ def simple_gru(input, input=m, reverse=reverse, gru_bias_attr=gru_bias_attr, + gru_param_attr=gru_param_attr, act=act, gate_act=gate_act, gru_layer_attr=gru_layer_attr) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 3f1d99701af..ea46b557a26 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -3,7 +3,7 @@ export configs=(test_fc layer_activations projections test_print_layer test_sequence_pooling test_lstmemory_layer test_grumemory_layer last_first_seq test_expand_layer test_ntm_layers test_hsigmoid img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers -test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight +test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr new file mode 100644 index 00000000000..c0868713ebb --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr @@ -0,0 +1,295 @@ +type: "recurrent_nn" +layers { + name: "data_a" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "data_b" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__simple_gru_0___transform" + type: "mixed" + size: 600 + active_type: "" + inputs { + input_layer_name: "data_a" + input_parameter_name: "mixed_param" + proj_conf { + type: "fc" + name: "___simple_gru_0___transform.w0" + input_size: 100 + output_size: 600 + } + } +} +layers { + name: "__simple_gru_0___recurrent_group" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group" + type: "scatter_agent" + size: 600 + active_type: "" +} +layers { + name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group" + type: "agent" + size: 200 + active_type: "" +} +layers { + name: "__simple_gru_0__@__simple_gru_0___recurrent_group" + type: "gru_step" + size: 200 + active_type: "tanh" + inputs { + input_layer_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group" + } + inputs { + input_layer_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group" + input_parameter_name: "gru_param" + } + bias_parameter_name: "gru_bias" + active_gate_type: "sigmoid" +} +layers { + name: "__simple_gru_0__" + type: "gather_agent" + size: 200 + active_type: "" +} +layers { + name: "__simple_gru_1___transform" + type: "mixed" + size: 600 + active_type: "" + inputs { + input_layer_name: "data_b" + input_parameter_name: "mixed_param" + proj_conf { + type: "fc" + name: "___simple_gru_1___transform.w0" + input_size: 100 + output_size: 600 + } + } +} +layers { + name: "__simple_gru_1___recurrent_group" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group" + type: "scatter_agent" + size: 600 + active_type: "" +} +layers { + name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group" + type: "agent" + size: 200 + active_type: "" +} +layers { + name: "__simple_gru_1__@__simple_gru_1___recurrent_group" + type: "gru_step" + size: 200 + active_type: "tanh" + inputs { + input_layer_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group" + } + inputs { + input_layer_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group" + input_parameter_name: "gru_param" + } + bias_parameter_name: "gru_bias" + active_gate_type: "sigmoid" +} +layers { + name: "__simple_gru_1__" + type: "gather_agent" + size: 200 + active_type: "" +} +layers { + name: "__last_seq_0__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__simple_gru_0__" + } + trans_type: "non-seq" +} +layers { + name: "__last_seq_1__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__simple_gru_1__" + } + trans_type: "non-seq" +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 10 + active_type: "softmax" + inputs { + input_layer_name: "__last_seq_0__" + input_parameter_name: "softmax_param" + } + inputs { + input_layer_name: "__last_seq_1__" + input_parameter_name: "softmax_param" + } +} +layers { + name: "label" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "__cost_0__" + type: "multi-class-cross-entropy" + size: 1 + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + } + inputs { + input_layer_name: "label" + } + coeff: 1.0 +} +parameters { + name: "mixed_param" + size: 60000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 600 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "gru_param" + size: 120000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 600 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "gru_bias" + size: 600 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 600 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "softmax_param" + size: 2000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 10 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "data_a" +input_layer_names: "data_b" +input_layer_names: "label" +output_layer_names: "__cost_0__" +evaluators { + name: "classification_error_evaluator" + type: "classification_error" + input_layers: "__fc_layer_0__" + input_layers: "label" +} +sub_models { + name: "root" + layer_names: "data_a" + layer_names: "data_b" + layer_names: "__simple_gru_0___transform" + layer_names: "__simple_gru_0___recurrent_group" + layer_names: "__simple_gru_0__" + layer_names: "__simple_gru_1___transform" + layer_names: "__simple_gru_1___recurrent_group" + layer_names: "__simple_gru_1__" + layer_names: "__last_seq_0__" + layer_names: "__last_seq_1__" + layer_names: "__fc_layer_0__" + layer_names: "label" + layer_names: "__cost_0__" + input_layer_names: "data_a" + input_layer_names: "data_b" + input_layer_names: "label" + output_layer_names: "__cost_0__" + evaluator_names: "classification_error_evaluator" + is_recurrent_layer_group: false +} +sub_models { + name: "__simple_gru_0___recurrent_group" + layer_names: "__simple_gru_0___transform@__simple_gru_0___recurrent_group" + layer_names: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group" + layer_names: "__simple_gru_0__@__simple_gru_0___recurrent_group" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group" + link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group" + is_sequence: false + } + in_links { + layer_name: "__simple_gru_0___transform" + link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group" + has_subseq: false + } + out_links { + layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group" + link_name: "__simple_gru_0__" + has_subseq: false + } + target_inlinkid: -1 +} +sub_models { + name: "__simple_gru_1___recurrent_group" + layer_names: "__simple_gru_1___transform@__simple_gru_1___recurrent_group" + layer_names: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group" + layer_names: "__simple_gru_1__@__simple_gru_1___recurrent_group" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group" + link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group" + is_sequence: false + } + in_links { + layer_name: "__simple_gru_1___transform" + link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group" + has_subseq: false + } + out_links { + layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group" + link_name: "__simple_gru_1__" + has_subseq: false + } + target_inlinkid: -1 +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr index 41d2e2f2671..c1d39f77295 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr @@ -307,10 +307,10 @@ layers { active_type: "tanh" inputs { input_layer_name: "__mixed_1__@__gru_group_0___recurrent_group" - input_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.w0" } inputs { input_layer_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group" + input_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.w1" } bias_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias" active_gate_type: "sigmoid" @@ -462,14 +462,14 @@ parameters { initial_smart: false } parameters { - name: "___gru_group_0__@__gru_group_0___recurrent_group.w0" + name: "___gru_group_0__@__gru_group_0___recurrent_group.w1" size: 30000 initial_mean: 0.0 - initial_std: 0.01 + initial_std: 0.1 dims: 100 dims: 300 initial_strategy: 0 - initial_smart: false + initial_smart: true } parameters { name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias" diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py new file mode 100644 index 00000000000..c19bb9685aa --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py @@ -0,0 +1,40 @@ +from paddle.trainer_config_helpers import * + +settings(learning_rate=1e-4, batch_size=1000) + +data_1 = data_layer(name='data_a', size=100) +data_2 = data_layer(name='data_b', size=100) + +mixed_param = ParamAttr(name='mixed_param') + +gru_param = ParamAttr(name='gru_param') +gru_bias = ParamAttr(name='gru_bias', initial_mean=0., initial_std=0.) + +gru1 = simple_gru( + input=data_1, + size=200, + mixed_param_attr=mixed_param, + mixed_bias_param_attr=False, + gru_bias_attr=gru_bias, + gru_param_attr=gru_param) + +gru2 = simple_gru( + input=data_2, + size=200, + mixed_param_attr=mixed_param, + mixed_bias_param_attr=False, + gru_bias_attr=gru_bias, + gru_param_attr=gru_param) + +softmax_param = ParamAttr(name='softmax_param') + +predict = fc_layer( + input=[last_seq(input=gru1), last_seq(input=gru2)], + size=10, + param_attr=[softmax_param, softmax_param], + bias_attr=False, + act=SoftmaxActivation()) +outputs( + classification_cost( + input=predict, label=data_layer( + name='label', size=10))) -- GitLab