提交 ddc2c6ef 编写于 作者: Y Yu Yang 提交者: GitHub

Merge pull request #1893 from reyoung/Add_error_clipping_to_mt_demo

Add error clipping to MT demo.
...@@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf, ...@@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf,
encoder_size=512, encoder_size=512,
decoder_size=512, decoder_size=512,
beam_size=3, beam_size=3,
max_length=250): max_length=250,
error_clipping=50):
""" """
A wrapper for an attention version of GRU Encoder-Decoder network A wrapper for an attention version of GRU Encoder-Decoder network
is_generating: whether this config is used for generating is_generating: whether this config is used for generating
...@@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf, ...@@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf,
input=src_word_id, input=src_word_id,
size=word_vector_dim, size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding')) param_attr=ParamAttr(name='_source_language_embedding'))
src_forward = simple_gru(input=src_embedding, size=encoder_size) src_forward = simple_gru(
input=src_embedding,
size=encoder_size,
naive=True,
gru_layer_attr=ExtraLayerAttribute(
error_clipping_threshold=error_clipping))
src_backward = simple_gru( src_backward = simple_gru(
input=src_embedding, size=encoder_size, reverse=True) input=src_embedding,
size=encoder_size,
reverse=True,
naive=True,
gru_layer_attr=ExtraLayerAttribute(
error_clipping_threshold=error_clipping))
encoded_vector = concat_layer(input=[src_forward, src_backward]) encoded_vector = concat_layer(input=[src_forward, src_backward])
with mixed_layer(size=decoder_size) as encoded_proj: with mixed_layer(size=decoder_size) as encoded_proj:
...@@ -117,11 +128,13 @@ def gru_encoder_decoder(data_conf, ...@@ -117,11 +128,13 @@ def gru_encoder_decoder(data_conf,
decoder_inputs += full_matrix_projection(input=context) decoder_inputs += full_matrix_projection(input=context)
decoder_inputs += full_matrix_projection(input=current_word) decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer( gru_step = gru_step_naive_layer(
name='gru_decoder', name='gru_decoder',
input=decoder_inputs, input=decoder_inputs,
output_mem=decoder_mem, output_mem=decoder_mem,
size=decoder_size) size=decoder_size,
layer_attr=ExtraLayerAttribute(
error_clipping_threshold=error_clipping))
with mixed_layer( with mixed_layer(
size=target_dict_dim, bias_attr=True, size=target_dict_dim, bias_attr=True,
......
...@@ -48,8 +48,7 @@ lstm = lstmemory_group( ...@@ -48,8 +48,7 @@ lstm = lstmemory_group(
size=hidden_dim, size=hidden_dim,
act=TanhActivation(), act=TanhActivation(),
gate_act=SigmoidActivation(), gate_act=SigmoidActivation(),
state_act=TanhActivation(), state_act=TanhActivation())
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
lstm_last = last_seq(input=lstm) lstm_last = last_seq(input=lstm)
......
...@@ -51,8 +51,7 @@ def lstm_group(lstm_group_input): ...@@ -51,8 +51,7 @@ def lstm_group(lstm_group_input):
size=hidden_dim, size=hidden_dim,
act=TanhActivation(), act=TanhActivation(),
gate_act=SigmoidActivation(), gate_act=SigmoidActivation(),
state_act=TanhActivation(), state_act=TanhActivation())
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
return lstm_output return lstm_output
......
...@@ -208,12 +208,15 @@ class ExtraLayerAttribute(object): ...@@ -208,12 +208,15 @@ class ExtraLayerAttribute(object):
drop_rate=None, drop_rate=None,
device=None): device=None):
self.attr = dict() self.attr = dict()
if isinstance(error_clipping_threshold, float): if error_clipping_threshold is not None:
assert error_clipping_threshold > 0 error_clipping_threshold = float(error_clipping_threshold)
self.attr["error_clipping_threshold"] = error_clipping_threshold if error_clipping_threshold < 0:
raise ValueError("Error clipping must > 0")
if isinstance(drop_rate, float): self.attr['error_clipping_threshold'] = error_clipping_threshold
assert drop_rate > 0 if drop_rate is not None:
drop_rate = float(drop_rate)
if drop_rate < 0:
raise ValueError("Dropout rate must > 0")
self.attr["drop_rate"] = drop_rate self.attr["drop_rate"] = drop_rate
if isinstance(device, int): if isinstance(device, int):
......
...@@ -84,6 +84,7 @@ __all__ = [ ...@@ -84,6 +84,7 @@ __all__ = [
'GeneratedInput', 'GeneratedInput',
'SubsequenceInput', 'SubsequenceInput',
'gru_step_layer', 'gru_step_layer',
'gru_step_naive_layer',
'recurrent_layer', 'recurrent_layer',
'BaseGeneratedInput', 'BaseGeneratedInput',
'conv_operator', 'conv_operator',
...@@ -3084,6 +3085,78 @@ def gru_step_layer(input, ...@@ -3084,6 +3085,78 @@ def gru_step_layer(input,
activation=act) activation=act)
@wrap_bias_attr_default()
@wrap_param_attr_default()
@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
@wrap_act_default(act=TanhActivation())
@wrap_name_default('gru_step')
@layer_support(ERROR_CLIPPING, DROPOUT)
def gru_step_naive_layer(input,
                         output_mem,
                         size=None,
                         name=None,
                         act=None,
                         gate_act=None,
                         bias_attr=None,
                         param_attr=None,
                         layer_attr=None):
    """
    GRU step layer assembled from mixed layers instead of a single fused
    layer. It is declared with layer_support(ERROR_CLIPPING, DROPOUT), and
    layer_attr is forwarded to the gate layers and the candidate layer.

    The step is built from the following sub-layers, where input is read as
    three consecutive slices of width ``size``:

    - ``<name>_update``: gate_act(input[0:size] + W * output_mem + bias)
    - ``<name>_reset``: gate_act(input[size:2*size] + W * output_mem + bias)
    - ``<name>_reset_output``: dotmul_operator(output_mem, reset_gate)
    - ``<name>_output_candidate``:
      act(input[2*size:3*size] + W * reset_output + bias)
    - ``<name>``: output_mem - dotmul(output_mem, update_gate)
      + dotmul(output_candidate, update_gate)

    NOTE(review): when size is passed explicitly it is not checked against
    input.size / 3; callers are presumably expected to keep them consistent.

    :param input: Layer providing the concatenated gate inputs. Its size
                  must be a multiple of 3.
    :param output_mem: Layer holding the output of the previous step.
    :param size: Width of each gate and of the output. Defaults to
                 input.size // 3 when None.
    :param name: Name of the output layer; also used as the prefix of the
                 intermediate layer names.
    :param act: Activation of the output candidate layer.
    :param gate_act: Activation of the update and reset gate layers.
    :param bias_attr: Bias attribute passed to the gate and candidate layers.
    :param param_attr: Parameter attribute passed to every
                       full_matrix_projection in this step.
    :param layer_attr: Extra layer attribute passed to the gate and
                       candidate layers.
    :return: The mixed layer that produces the step output.
    :raises ValueError: If input.size is not a multiple of 3.
    """
    if input.size % 3 != 0:
        raise ValueError("GruStep input size must be divided by 3")
    if size is None:
        # Floor division keeps the size an integer even when true division
        # is in effect (Python 3 or `from __future__ import division`); the
        # value is used below as a layer size and as a projection offset.
        size = input.size // 3

    def __gate__(gate_name, offset):
        # Build one gate: the slice of `input` starting at `offset` plus a
        # full-matrix projection of the previous output.
        with mixed_layer(
                name=name + "_" + gate_name,
                size=size,
                layer_attr=layer_attr,
                bias_attr=bias_attr,
                act=gate_act) as gate:
            gate += identity_projection(input=input, offset=offset)
            gate += full_matrix_projection(
                input=output_mem, param_attr=param_attr)
        return gate

    update_gate = __gate__("update", 0)
    reset_gate = __gate__("reset", size)

    # Previous output modulated by the reset gate; feeds the candidate.
    with mixed_layer(
            name=name + "_reset_output", bias_attr=False) as reset_output:
        reset_output += dotmul_operator(a=output_mem, b=reset_gate)

    with mixed_layer(
            name=name + "_output_candidate",
            size=size,
            layer_attr=layer_attr,
            bias_attr=bias_attr,
            act=act) as output_candidate:
        output_candidate += identity_projection(input=input, offset=2 * size)
        output_candidate += full_matrix_projection(
            input=reset_output, param_attr=param_attr)

    # output = output_mem - output_mem * update_gate
    #          + output_candidate * update_gate
    with mixed_layer(name=name) as output:
        output += identity_projection(output_mem)
        output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
        output += dotmul_operator(a=output_candidate, b=update_gate)

    return output
@wrap_name_default() @wrap_name_default()
@layer_support() @layer_support()
def get_output_layer(input, arg_name, name=None, layer_attr=None): def get_output_layer(input, arg_name, name=None, layer_attr=None):
......
...@@ -825,7 +825,8 @@ def gru_unit(input, ...@@ -825,7 +825,8 @@ def gru_unit(input,
gru_param_attr=None, gru_param_attr=None,
act=None, act=None,
gate_act=None, gate_act=None,
gru_layer_attr=None): gru_layer_attr=None,
naive=False):
""" """
Define calculations that a gated recurrent unit performs in a single time Define calculations that a gated recurrent unit performs in a single time
step. This function itself is not a recurrent layer, so that it can not be step. This function itself is not a recurrent layer, so that it can not be
...@@ -857,7 +858,12 @@ def gru_unit(input, ...@@ -857,7 +858,12 @@ def gru_unit(input,
out_mem = memory(name=name, size=size) out_mem = memory(name=name, size=size)
gru_out = gru_step_layer( if naive:
__step__ = gru_step_naive_layer
else:
__step__ = gru_step_layer
gru_out = __step__(
name=name, name=name,
input=input, input=input,
output_mem=out_mem, output_mem=out_mem,
...@@ -879,7 +885,8 @@ def gru_group(input, ...@@ -879,7 +885,8 @@ def gru_group(input,
gru_param_attr=None, gru_param_attr=None,
act=None, act=None,
gate_act=None, gate_act=None,
gru_layer_attr=None): gru_layer_attr=None,
naive=False):
""" """
gru_group is a recurrent layer group version of Gated Recurrent Unit. It gru_group is a recurrent layer group version of Gated Recurrent Unit. It
does exactly the same calculation as the grumemory layer does. A promising does exactly the same calculation as the grumemory layer does. A promising
...@@ -928,7 +935,8 @@ def gru_group(input, ...@@ -928,7 +935,8 @@ def gru_group(input,
gru_param_attr=gru_param_attr, gru_param_attr=gru_param_attr,
act=act, act=act,
gate_act=gate_act, gate_act=gate_act,
gru_layer_attr=gru_layer_attr) gru_layer_attr=gru_layer_attr,
naive=naive)
return recurrent_group( return recurrent_group(
name='%s_recurrent_group' % name, name='%s_recurrent_group' % name,
...@@ -949,7 +957,8 @@ def simple_gru(input, ...@@ -949,7 +957,8 @@ def simple_gru(input,
gru_param_attr=None, gru_param_attr=None,
act=None, act=None,
gate_act=None, gate_act=None,
gru_layer_attr=None): gru_layer_attr=None,
naive=False):
""" """
You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group, You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
simple_gru in network.py. The reason why there are so many interfaces is simple_gru in network.py. The reason why there are so many interfaces is
...@@ -1018,7 +1027,8 @@ def simple_gru(input, ...@@ -1018,7 +1027,8 @@ def simple_gru(input,
gru_param_attr=gru_param_attr, gru_param_attr=gru_param_attr,
act=act, act=act,
gate_act=gate_act, gate_act=gate_act,
gru_layer_attr=gru_layer_attr) gru_layer_attr=gru_layer_attr,
naive=naive)
@wrap_name_default('simple_gru2') @wrap_name_default('simple_gru2')
......
...@@ -320,6 +320,7 @@ layers { ...@@ -320,6 +320,7 @@ layers {
} }
} }
drop_rate: 0.5 drop_rate: 0.5
error_clipping_threshold: 40.0
} }
parameters { parameters {
name: "___embedding_0__.w0" name: "___embedding_0__.w0"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册