Commit 6da72834 authored by wangyang59

make gru_group parameters sharable

Parent ecbff689
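This commit moves the GRU step layer's weight from its first input (the already-projected input) to its second input (the recurrent memory), and threads a parameter attribute through gru_step_layer, gru_unit, gru_group, and simple_gru so that the weight can be named and therefore shared. A minimal sketch of the usage this enables, adapted from the new shared_gru test config included at the end of this commit (which additionally shares the input projection and the bias):

from paddle.trainer_config_helpers import *

data_a = data_layer(name='data_a', size=100)
data_b = data_layer(name='data_b', size=100)

# Same ParamAttr name => same underlying parameter.
gru_param = ParamAttr(name='gru_param')
gru1 = simple_gru(input=data_a, size=200, gru_param_attr=gru_param)
gru2 = simple_gru(input=data_b, size=200, gru_param_attr=gru_param)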
@@ -68,8 +68,8 @@ bool GruStepLayer::init(const LayerMap& layerMap,
   if (!Layer::init(layerMap, parameterMap)) return false;
   CHECK_EQ(2U, inputLayers_.size());
-  CHECK_EQ(getSize() * getSize() * 3, parameters_[0]->getSize());
-  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[0]));
+  CHECK_EQ(getSize() * getSize() * 3, parameters_[1]->getSize());
+  weight_.reset(new Weight(getSize(), getSize() * 3, parameters_[1]));
   if (biasParameter_.get() != NULL) {
     CHECK_EQ(getSize() * 3, biasParameter_->getSize());
...
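In GruStepLayer, parameters_[i] is the parameter attached to inputLayers_[i], so this hunk rebinds the packed GRU weight from input 0 to input 1; the Python-side hunks below make the matching move. An illustration of the resulting pairing (the names here are hypothetical, not Paddle API):

# inputLayers_[0] carries the projected input, inputLayers_[1] the memory.
input_layers = ['projected_input', 'output_mem']
input_params = [None, 'gru_param']   # the 3*size weight now sits on index 1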
@@ -2996,7 +2996,7 @@ class GruStepLayer(LayerBase):
         config_assert(input_layer1.size == size,
                       'input_layer1.size != layer.size')
         self.config.active_gate_type = active_gate_type
-        self.create_input_parameter(0, size * size * 3, [size, size * 3])
+        self.create_input_parameter(1, size * size * 3, [size, size * 3])
         self.create_bias_parameter(bias, size * 3)
...
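The config parser mirrors the C++ change: create_input_parameter(1, ...) now registers the weight on the second input. The single [size, size * 3] parameter presumably packs the two gate weights and the candidate weight side by side, which is why sharing this one parameter shares the whole GRU step. A quick shape check, using the sizes from the shared_gru test below:

size = 200
weight_shape = [size, size * 3]   # [200, 600], cf. gru_param (120000) below
assert weight_shape[0] * weight_shape[1] == size * size * 3 == 120000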
@@ -19,6 +19,9 @@
 # to use these units, import this module in your config_file:
 # import trainer.recurrent_units
 #
+# The modules in this file are DEPRECATED.
+# If you would like to use lstm/gru,
+# please use the functions defined in paddle.trainer_config_helpers.
 from paddle.trainer.config_parser import *
...
@@ -2682,6 +2682,7 @@ def lstm_step_layer(input,
 @wrap_bias_attr_default()
+@wrap_param_attr_default()
 @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
 @wrap_act_default(act=TanhActivation())
 @wrap_name_default('gru_step')
@@ -2693,6 +2694,7 @@ def gru_step_layer(input,
                    name=None,
                    gate_act=None,
                    bias_attr=None,
+                   param_attr=None,
                    layer_attr=None):
     """
@@ -2714,7 +2716,7 @@ def gru_step_layer(input,
     Layer(
         name=name,
         type=LayerType.GRU_STEP_LAYER,
-        inputs=[input.name, output_mem.name],
+        inputs=[input.name, Input(output_mem.name, **param_attr.attr)],
         bias=ParamAttr.to_bias(bias_attr),
         size=size,
         active_type=act.name,
...
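With the new param_attr, gru_step_layer attaches the weight to its output_mem input via Input(output_mem.name, **param_attr.attr), matching the C++ layer's move to parameters_[1]. A sketch of how the layer is driven inside a recurrent group, roughly what gru_unit does after this commit (layer names illustrative):

def step(projected_input):                  # already projected to 3 * size
    out_mem = memory(name='gru', size=200)  # previous step's output
    return gru_step_layer(
        name='gru',
        input=projected_input,
        output_mem=out_mem,
        size=200,
        param_attr=ParamAttr(name='gru_param'),  # the new pass-through
        bias_attr=ParamAttr(name='gru_bias'))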
@@ -822,6 +822,7 @@ def gru_unit(input,
              size=None,
              name=None,
              gru_bias_attr=None,
+             gru_param_attr=None,
              act=None,
              gate_act=None,
              gru_layer_attr=None):
@@ -862,6 +863,7 @@ def gru_unit(input,
         output_mem=out_mem,
         size=size,
         bias_attr=gru_bias_attr,
+        param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
         layer_attr=gru_layer_attr)
@@ -874,6 +876,7 @@ def gru_group(input,
              name=None,
              reverse=False,
              gru_bias_attr=None,
+             gru_param_attr=None,
              act=None,
              gate_act=None,
              gru_layer_attr=None):
@@ -922,6 +925,7 @@ def gru_group(input,
         name=name,
         size=size,
         gru_bias_attr=gru_bias_attr,
+        gru_param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
         gru_layer_attr=gru_layer_attr)
@@ -942,6 +946,7 @@ def simple_gru(input,
                mixed_bias_param_attr=None,
                mixed_layer_attr=None,
                gru_bias_attr=None,
+               gru_param_attr=None,
                act=None,
                gate_act=None,
                gru_layer_attr=None):
@@ -1010,6 +1015,7 @@ def simple_gru(input,
         input=m,
         reverse=reverse,
         gru_bias_attr=gru_bias_attr,
+        gru_param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
         gru_layer_attr=gru_layer_attr)
...
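Taken together, these hunks simply thread the new attribute down the helper call chain:

# simple_gru(gru_param_attr=...)                     # networks.py
#   -> gru_group(gru_param_attr=...)
#     -> gru_unit(gru_param_attr=...)
#       -> gru_step_layer(param_attr=...)            # layers.py
#         -> Input(output_mem.name, **param_attr.attr)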
@@ -3,7 +3,7 @@ export configs=(test_fc layer_activations projections test_print_layer
 test_sequence_pooling test_lstmemory_layer test_grumemory_layer
 last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
-test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
+test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
 export whole_configs=(test_split_datasource)
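The commit also adds a shared_gru test config (its source is shown last, below); the generated protostr comes first: two simple_gru towers over data_a and data_b that share mixed_param, gru_param, gru_bias, and softmax_param.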
type: "recurrent_nn"
layers {
name: "data_a"
type: "data"
size: 100
active_type: ""
}
layers {
name: "data_b"
type: "data"
size: 100
active_type: ""
}
layers {
name: "__simple_gru_0___transform"
type: "mixed"
size: 600
active_type: ""
inputs {
input_layer_name: "data_a"
input_parameter_name: "mixed_param"
proj_conf {
type: "fc"
name: "___simple_gru_0___transform.w0"
input_size: 100
output_size: 600
}
}
}
layers {
name: "__simple_gru_0___recurrent_group"
type: "recurrent_layer_group"
active_type: ""
}
layers {
name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
type: "scatter_agent"
size: 600
active_type: ""
}
layers {
name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
type: "agent"
size: 200
active_type: ""
}
layers {
name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
type: "gru_step"
size: 200
active_type: "tanh"
inputs {
input_layer_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
}
inputs {
input_layer_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
input_parameter_name: "gru_param"
}
bias_parameter_name: "gru_bias"
active_gate_type: "sigmoid"
}
layers {
name: "__simple_gru_0__"
type: "gather_agent"
size: 200
active_type: ""
}
layers {
name: "__simple_gru_1___transform"
type: "mixed"
size: 600
active_type: ""
inputs {
input_layer_name: "data_b"
input_parameter_name: "mixed_param"
proj_conf {
type: "fc"
name: "___simple_gru_1___transform.w0"
input_size: 100
output_size: 600
}
}
}
layers {
name: "__simple_gru_1___recurrent_group"
type: "recurrent_layer_group"
active_type: ""
}
layers {
name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
type: "scatter_agent"
size: 600
active_type: ""
}
layers {
name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
type: "agent"
size: 200
active_type: ""
}
layers {
name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
type: "gru_step"
size: 200
active_type: "tanh"
inputs {
input_layer_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
}
inputs {
input_layer_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
input_parameter_name: "gru_param"
}
bias_parameter_name: "gru_bias"
active_gate_type: "sigmoid"
}
layers {
name: "__simple_gru_1__"
type: "gather_agent"
size: 200
active_type: ""
}
layers {
name: "__last_seq_0__"
type: "seqlastins"
size: 200
active_type: "linear"
inputs {
input_layer_name: "__simple_gru_0__"
}
trans_type: "non-seq"
}
layers {
name: "__last_seq_1__"
type: "seqlastins"
size: 200
active_type: "linear"
inputs {
input_layer_name: "__simple_gru_1__"
}
trans_type: "non-seq"
}
layers {
name: "__fc_layer_0__"
type: "fc"
size: 10
active_type: "softmax"
inputs {
input_layer_name: "__last_seq_0__"
input_parameter_name: "softmax_param"
}
inputs {
input_layer_name: "__last_seq_1__"
input_parameter_name: "softmax_param"
}
}
layers {
name: "label"
type: "data"
size: 10
active_type: ""
}
layers {
name: "__cost_0__"
type: "multi-class-cross-entropy"
size: 1
active_type: ""
inputs {
input_layer_name: "__fc_layer_0__"
}
inputs {
input_layer_name: "label"
}
coeff: 1.0
}
parameters {
name: "mixed_param"
size: 60000
initial_mean: 0.0
initial_std: 0.1
dims: 100
dims: 600
initial_strategy: 0
initial_smart: true
}
parameters {
name: "gru_param"
size: 120000
initial_mean: 0.0
initial_std: 0.0707106781187
dims: 200
dims: 600
initial_strategy: 0
initial_smart: true
}
parameters {
name: "gru_bias"
size: 600
initial_mean: 0.0
initial_std: 0.0
dims: 1
dims: 600
initial_strategy: 0
initial_smart: false
}
parameters {
name: "softmax_param"
size: 2000
initial_mean: 0.0
initial_std: 0.0707106781187
dims: 200
dims: 10
initial_strategy: 0
initial_smart: true
}
input_layer_names: "data_a"
input_layer_names: "data_b"
input_layer_names: "label"
output_layer_names: "__cost_0__"
evaluators {
name: "classification_error_evaluator"
type: "classification_error"
input_layers: "__fc_layer_0__"
input_layers: "label"
}
sub_models {
name: "root"
layer_names: "data_a"
layer_names: "data_b"
layer_names: "__simple_gru_0___transform"
layer_names: "__simple_gru_0___recurrent_group"
layer_names: "__simple_gru_0__"
layer_names: "__simple_gru_1___transform"
layer_names: "__simple_gru_1___recurrent_group"
layer_names: "__simple_gru_1__"
layer_names: "__last_seq_0__"
layer_names: "__last_seq_1__"
layer_names: "__fc_layer_0__"
layer_names: "label"
layer_names: "__cost_0__"
input_layer_names: "data_a"
input_layer_names: "data_b"
input_layer_names: "label"
output_layer_names: "__cost_0__"
evaluator_names: "classification_error_evaluator"
is_recurrent_layer_group: false
}
sub_models {
name: "__simple_gru_0___recurrent_group"
layer_names: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
layer_names: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
layer_names: "__simple_gru_0__@__simple_gru_0___recurrent_group"
is_recurrent_layer_group: true
reversed: false
memories {
layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
is_sequence: false
}
in_links {
layer_name: "__simple_gru_0___transform"
link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
has_subseq: false
}
out_links {
layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
link_name: "__simple_gru_0__"
has_subseq: false
}
target_inlinkid: -1
}
sub_models {
name: "__simple_gru_1___recurrent_group"
layer_names: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
layer_names: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
layer_names: "__simple_gru_1__@__simple_gru_1___recurrent_group"
is_recurrent_layer_group: true
reversed: false
memories {
layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
is_sequence: false
}
in_links {
layer_name: "__simple_gru_1___transform"
link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
has_subseq: false
}
out_links {
layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
link_name: "__simple_gru_1__"
has_subseq: false
}
target_inlinkid: -1
}
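Both gru_step layers in the protostr reference the same gru_param and gru_bias, confirming that the sharing takes effect, and the reported parameter sizes check out:

size = 200
assert size * size * 3 == 120000   # gru_param: dims 200 x 600
assert size * 3 == 600             # gru_bias
assert 100 * size * 3 == 60000     # mixed_param: dims 100 x 600
assert size * 10 == 2000           # softmax_param: dims 200 x 10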
@@ -307,10 +307,10 @@ layers {
   active_type: "tanh"
   inputs {
     input_layer_name: "__mixed_1__@__gru_group_0___recurrent_group"
-    input_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
   }
   inputs {
     input_layer_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
+    input_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.w1"
   }
   bias_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
   active_gate_type: "sigmoid"
@@ -462,14 +462,14 @@ parameters {
   initial_smart: false
 }
 parameters {
-  name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
+  name: "___gru_group_0__@__gru_group_0___recurrent_group.w1"
   size: 30000
   initial_mean: 0.0
-  initial_std: 0.01
+  initial_std: 0.1
   dims: 100
   dims: 300
   initial_strategy: 0
-  initial_smart: false
+  initial_smart: true
 }
 parameters {
   name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
...
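In the existing gru_group test, the weight accordingly moves from the first input (.w0) to the second (.w1), and its initialization switches to the smart default. The new initial_std is consistent with std = 1/sqrt(fan_in) (an assumption about config_parser's initial_smart scheme, but one that also matches gru_param and softmax_param in the protostr above):

import math
assert abs(1 / math.sqrt(100) - 0.1) < 1e-12             # .w1: dims 100 x 300
assert abs(1 / math.sqrt(200) - 0.0707106781187) < 1e-9  # the 200-dim weights

Finally, the new shared_gru.py config that generated the protostr above: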
from paddle.trainer_config_helpers import *
settings(learning_rate=1e-4, batch_size=1000)
data_1 = data_layer(name='data_a', size=100)
data_2 = data_layer(name='data_b', size=100)
mixed_param = ParamAttr(name='mixed_param')
gru_param = ParamAttr(name='gru_param')
gru_bias = ParamAttr(name='gru_bias', initial_mean=0., initial_std=0.)
gru1 = simple_gru(
input=data_1,
size=200,
mixed_param_attr=mixed_param,
mixed_bias_param_attr=False,
gru_bias_attr=gru_bias,
gru_param_attr=gru_param)
gru2 = simple_gru(
input=data_2,
size=200,
mixed_param_attr=mixed_param,
mixed_bias_param_attr=False,
gru_bias_attr=gru_bias,
gru_param_attr=gru_param)
softmax_param = ParamAttr(name='softmax_param')
predict = fc_layer(
input=[last_seq(input=gru1), last_seq(input=gru2)],
size=10,
param_attr=[softmax_param, softmax_param],
bias_attr=False,
act=SoftmaxActivation())
outputs(
classification_cost(
input=predict, label=data_layer(
name='label', size=10)))