提交 ddc2c6ef 编写于 作者: Y Yu Yang 提交者: GitHub

Merge pull request #1893 from reyoung/Add_error_clipping_to_mt_demo

Add error clipping to MT demo.
......@@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf,
encoder_size=512,
decoder_size=512,
beam_size=3,
max_length=250):
max_length=250,
error_clipping=50):
"""
A wrapper for an attention version of GRU Encoder-Decoder network
is_generating: whether this config is used for generating
......@@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf,
input=src_word_id,
size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding'))
src_forward = simple_gru(input=src_embedding, size=encoder_size)
src_forward = simple_gru(
input=src_embedding,
size=encoder_size,
naive=True,
gru_layer_attr=ExtraLayerAttribute(
error_clipping_threshold=error_clipping))
src_backward = simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
input=src_embedding,
size=encoder_size,
reverse=True,
naive=True,
gru_layer_attr=ExtraLayerAttribute(
error_clipping_threshold=error_clipping))
encoded_vector = concat_layer(input=[src_forward, src_backward])
with mixed_layer(size=decoder_size) as encoded_proj:
......@@ -117,11 +128,13 @@ def gru_encoder_decoder(data_conf,
decoder_inputs += full_matrix_projection(input=context)
decoder_inputs += full_matrix_projection(input=current_word)
gru_step = gru_step_layer(
gru_step = gru_step_naive_layer(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
size=decoder_size,
layer_attr=ExtraLayerAttribute(
error_clipping_threshold=error_clipping))
with mixed_layer(
size=target_dict_dim, bias_attr=True,
......
......@@ -48,8 +48,7 @@ lstm = lstmemory_group(
size=hidden_dim,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
state_act=TanhActivation())
lstm_last = last_seq(input=lstm)
......
......@@ -51,8 +51,7 @@ def lstm_group(lstm_group_input):
size=hidden_dim,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation(),
lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
state_act=TanhActivation())
return lstm_output
......
......@@ -208,12 +208,15 @@ class ExtraLayerAttribute(object):
drop_rate=None,
device=None):
self.attr = dict()
if isinstance(error_clipping_threshold, float):
assert error_clipping_threshold > 0
self.attr["error_clipping_threshold"] = error_clipping_threshold
if isinstance(drop_rate, float):
assert drop_rate > 0
if error_clipping_threshold is not None:
error_clipping_threshold = float(error_clipping_threshold)
if error_clipping_threshold < 0:
raise ValueError("Error clipping must > 0")
self.attr['error_clipping_threshold'] = error_clipping_threshold
if drop_rate is not None:
drop_rate = float(drop_rate)
if drop_rate < 0:
raise ValueError("Dropout rate must > 0")
self.attr["drop_rate"] = drop_rate
if isinstance(device, int):
......
......@@ -84,6 +84,7 @@ __all__ = [
'GeneratedInput',
'SubsequenceInput',
'gru_step_layer',
'gru_step_naive_layer',
'recurrent_layer',
'BaseGeneratedInput',
'conv_operator',
......@@ -2284,7 +2285,7 @@ def img_pool_layer(input,
type_name = pool_type.name + '-projection' \
if (
isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
else pool_type.name
pool_size_y = pool_size if pool_size_y is None else pool_size_y
......@@ -3084,6 +3085,78 @@ def gru_step_layer(input,
activation=act)
@wrap_bias_attr_default()
@wrap_param_attr_default()
@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
@wrap_act_default(act=TanhActivation())
@wrap_name_default('gru_step')
@layer_support(ERROR_CLIPPING, DROPOUT)
def gru_step_naive_layer(input,
                         output_mem,
                         size=None,
                         name=None,
                         act=None,
                         gate_act=None,
                         bias_attr=None,
                         param_attr=None,
                         layer_attr=None):
    """
    GRU step layer assembled from mixed layers instead of a dedicated GRU
    step layer. It is declared to support ERROR_CLIPPING and DROPOUT.

    The step is built from five mixed layers:

    - ``<name>_update`` and ``<name>_reset``: gates, each the sum of a slice
      of ``input`` and a full-matrix projection of ``output_mem``.
    - ``<name>_reset_output``: ``dotmul_operator(output_mem, reset_gate)``.
    - ``<name>_output_candidate``: sum of the third slice of ``input`` and a
      full-matrix projection of ``<name>_reset_output``.
    - ``<name>``: ``output_mem - output_mem * update_gate
      + output_candidate * update_gate`` (assuming ``dotmul_operator`` is an
      element-wise product multiplied by ``scale`` -- TODO confirm).

    :param input: Input layer. Its ``size`` must be a multiple of 3; it is
                  read in three slices at offsets ``0``, ``size`` and
                  ``2 * size`` (update gate, reset gate, output candidate).
    :param output_mem: Layer combined with the gates as described above;
                       presumably the memory holding this layer's output at
                       the previous time step -- verify against the caller.
    :param size: Size of each gate and of the candidate. Defaults to
                 ``input.size // 3`` when None.
    :param name: Name of the final output layer; also used as the prefix of
                 the intermediate layers' names.
    :param act: Activation of the output candidate layer.
    :param gate_act: Activation of the update and reset gate layers.
    :param bias_attr: Bias attribute passed to the two gate layers and the
                      output candidate layer. ``<name>_reset_output`` is
                      created with ``bias_attr=False``.
    :param param_attr: Parameter attribute passed to every
                       ``full_matrix_projection`` in this step (both gates
                       and the candidate).
                       NOTE(review): the same object is handed to all three
                       projections; whether that shares weights depends on
                       how the attribute is named -- confirm.
    :param layer_attr: Extra layer attribute passed to the two gate layers
                       and the output candidate layer only.
    :return: The final mixed layer named ``name``.
    :raises ValueError: If ``input.size`` is not a multiple of 3.
    """
    if input.size % 3 != 0:
        raise ValueError("GruStep input size must be divided by 3")
    if size is None:
        # Floor division keeps the size (and the projection offsets derived
        # from it) an integer even when true division is in effect.
        size = input.size // 3

    def _build_gate(gate_name, offset):
        # One gate = slice of `input` starting at `offset` + projection of
        # `output_mem`, passed through `gate_act`.
        with mixed_layer(
                name=name + "_" + gate_name,
                size=size,
                layer_attr=layer_attr,
                bias_attr=bias_attr,
                act=gate_act) as gate:
            gate += identity_projection(input=input, offset=offset)
            gate += full_matrix_projection(
                input=output_mem, param_attr=param_attr)
        return gate

    update_gate = _build_gate("update", 0)
    reset_gate = _build_gate("reset", size)

    # output_mem gated by the reset gate; feeds the candidate below.
    with mixed_layer(
            name=name + "_reset_output", bias_attr=False) as reset_output:
        reset_output += dotmul_operator(a=output_mem, b=reset_gate)

    with mixed_layer(
            name=name + "_output_candidate",
            size=size,
            layer_attr=layer_attr,
            bias_attr=bias_attr,
            act=act) as output_candidate:
        output_candidate += identity_projection(input=input, offset=2 * size)
        output_candidate += full_matrix_projection(
            input=reset_output, param_attr=param_attr)

    # output_mem - output_mem * update_gate + output_candidate * update_gate
    with mixed_layer(name=name) as output:
        output += identity_projection(output_mem)
        output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
        output += dotmul_operator(a=output_candidate, b=update_gate)

    return output
@wrap_name_default()
@layer_support()
def get_output_layer(input, arg_name, name=None, layer_attr=None):
......
......@@ -825,7 +825,8 @@ def gru_unit(input,
gru_param_attr=None,
act=None,
gate_act=None,
gru_layer_attr=None):
gru_layer_attr=None,
naive=False):
"""
Define calculations that a gated recurrent unit performs in a single time
step. This function itself is not a recurrent layer, so that it can not be
......@@ -857,7 +858,12 @@ def gru_unit(input,
out_mem = memory(name=name, size=size)
gru_out = gru_step_layer(
if naive:
__step__ = gru_step_naive_layer
else:
__step__ = gru_step_layer
gru_out = __step__(
name=name,
input=input,
output_mem=out_mem,
......@@ -879,7 +885,8 @@ def gru_group(input,
gru_param_attr=None,
act=None,
gate_act=None,
gru_layer_attr=None):
gru_layer_attr=None,
naive=False):
"""
gru_group is a recurrent layer group version of Gated Recurrent Unit. It
does exactly the same calculation as the grumemory layer does. A promising
......@@ -928,7 +935,8 @@ def gru_group(input,
gru_param_attr=gru_param_attr,
act=act,
gate_act=gate_act,
gru_layer_attr=gru_layer_attr)
gru_layer_attr=gru_layer_attr,
naive=naive)
return recurrent_group(
name='%s_recurrent_group' % name,
......@@ -949,7 +957,8 @@ def simple_gru(input,
gru_param_attr=None,
act=None,
gate_act=None,
gru_layer_attr=None):
gru_layer_attr=None,
naive=False):
"""
You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
simple_gru in network.py. The reason why there are so many interfaces is
......@@ -1018,7 +1027,8 @@ def simple_gru(input,
gru_param_attr=gru_param_attr,
act=act,
gate_act=gate_act,
gru_layer_attr=gru_layer_attr)
gru_layer_attr=gru_layer_attr,
naive=naive)
@wrap_name_default('simple_gru2')
......
......@@ -320,6 +320,7 @@ layers {
}
}
drop_rate: 0.5
error_clipping_threshold: 40.0
}
parameters {
name: "___embedding_0__.w0"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册