Commit 3f1151a5 authored by Yu Yang

Add error clipping to MT demo.

* Compose a naive GRU step layer (gru_step_naive_layer) in trainer config helpers.
  * It uses mixed_layer for the gates.
  * It supports ERROR_CLIPPING and DROPOUT.
* Add error clipping to the MT demo.
* Fix #1143
* Fix #1891
Parent aa39ca8d
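For context, error clipping bounds the error signal a layer propagates backward, which keeps exploding gradients in long recurrent unrolls under control. A minimal NumPy sketch of the idea (elementwise clamping to [-threshold, threshold]; illustrative only, not PaddlePaddle's actual kernel):

    import numpy as np

    def clip_error(output_grad, threshold):
        # Clamp the back-propagated error of a layer's output elementwise,
        # so a few huge values cannot blow up everything further upstream.
        return np.clip(output_grad, -threshold, threshold)

    grad = np.array([0.3, -120.0, 75.0])      # spiky gradient from a long unroll
    print(clip_error(grad, threshold=50.0))   # [  0.3 -50.   50. ]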
@@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf,
                         encoder_size=512,
                         decoder_size=512,
                         beam_size=3,
-                        max_length=250):
+                        max_length=250,
+                        error_clipping=50):
     """
     A wrapper for an attention version of GRU Encoder-Decoder network
     is_generating: whether this config is used for generating
@@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf,
         input=src_word_id,
         size=word_vector_dim,
         param_attr=ParamAttr(name='_source_language_embedding'))
-    src_forward = simple_gru(input=src_embedding, size=encoder_size)
-    src_backward = simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
+    src_forward = simple_gru(
+        input=src_embedding,
+        size=encoder_size,
+        naive=True,
+        gru_layer_attr=ExtraLayerAttribute(
+            error_clipping_threshold=error_clipping))
+    src_backward = simple_gru(
+        input=src_embedding,
+        size=encoder_size,
+        reverse=True,
+        naive=True,
+        gru_layer_attr=ExtraLayerAttribute(
+            error_clipping_threshold=error_clipping))
     encoded_vector = concat_layer(input=[src_forward, src_backward])

     with mixed_layer(size=decoder_size) as encoded_proj:
@@ -117,11 +128,13 @@ def gru_encoder_decoder(data_conf,
         decoder_inputs += full_matrix_projection(input=context)
         decoder_inputs += full_matrix_projection(input=current_word)

-        gru_step = gru_step_layer(
+        gru_step = gru_step_naive_layer(
             name='gru_decoder',
             input=decoder_inputs,
             output_mem=decoder_mem,
-            size=decoder_size)
+            size=decoder_size,
+            layer_attr=ExtraLayerAttribute(
+                error_clipping_threshold=error_clipping))

         with mixed_layer(
                 size=target_dict_dim, bias_attr=True,
...
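The ExtraLayerAttribute used above is not specific to the GRU pieces; it can be attached to any layer decorated with layer_support(ERROR_CLIPPING, DROPOUT). A usage sketch, not part of this commit (prev_layer is a hypothetical upstream layer in an ordinary trainer config, and it assumes fc_layer declares that support, as the fully connected and mixed layers in trainer_config_helpers do):

    clip_attr = ExtraLayerAttribute(error_clipping_threshold=50, drop_rate=0.1)

    hidden = fc_layer(
        input=prev_layer,        # hypothetical upstream layer
        size=512,
        act=ReluActivation(),
        layer_attr=clip_attr)    # clip the back-propagated error and apply dropout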
@@ -28,7 +28,6 @@ train_conf = seq_to_seq_data(data_dir = data_dir,

 ### Algorithm Configuration
 settings(
-    learning_method = AdamOptimizer(),
     batch_size = 50,
     learning_rate = 5e-4)
...
@@ -208,12 +208,15 @@ class ExtraLayerAttribute(object):
                  drop_rate=None,
                  device=None):
         self.attr = dict()
-        if isinstance(error_clipping_threshold, float):
-            assert error_clipping_threshold > 0
-            self.attr["error_clipping_threshold"] = error_clipping_threshold
+        if error_clipping_threshold is not None:
+            error_clipping_threshold = float(error_clipping_threshold)
+            if error_clipping_threshold < 0:
+                raise ValueError("Error clipping must > 0")
+            self.attr['error_clipping_threshold'] = error_clipping_threshold

-        if isinstance(drop_rate, float):
-            assert drop_rate > 0
+        if drop_rate is not None:
+            drop_rate = float(drop_rate)
+            if drop_rate < 0:
+                raise ValueError("Dropout rate must > 0")
             self.attr["drop_rate"] = drop_rate

         if isinstance(device, int):
...
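With this rewrite the constructor casts the thresholds instead of requiring an exact float, so the integer values used in the demo above are no longer silently dropped, and an invalid value fails loudly with a ValueError rather than an assert. A quick illustration (a sketch, assuming ExtraLayerAttribute is imported as usual in a trainer config):

    # An int is accepted now and cast to float; previously the
    # isinstance(..., float) check silently skipped it, so no clipping happened.
    attr = ExtraLayerAttribute(error_clipping_threshold=50)
    assert attr.attr['error_clipping_threshold'] == 50.0

    # Invalid values raise ValueError instead of tripping an assert.
    try:
        ExtraLayerAttribute(drop_rate=-0.1)
    except ValueError as e:
        print(e)   # "Dropout rate must > 0"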
@@ -84,6 +84,7 @@ __all__ = [
     'GeneratedInput',
     'SubsequenceInput',
     'gru_step_layer',
+    'gru_step_naive_layer',
     'recurrent_layer',
     'BaseGeneratedInput',
     'conv_operator',
@@ -2284,7 +2285,7 @@ def img_pool_layer(input,
     type_name = pool_type.name + '-projection' \
         if (
         isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
         else pool_type.name

     pool_size_y = pool_size if pool_size_y is None else pool_size_y
@@ -3084,6 +3085,78 @@ def gru_step_layer(input,
         activation=act)


+@wrap_bias_attr_default()
+@wrap_param_attr_default()
+@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
+@wrap_act_default(act=TanhActivation())
+@wrap_name_default('gru_step')
+@layer_support(ERROR_CLIPPING, DROPOUT)
+def gru_step_naive_layer(input,
+                         output_mem,
+                         size=None,
+                         name=None,
+                         act=None,
+                         gate_act=None,
+                         bias_attr=None,
+                         param_attr=None,
+                         layer_attr=None):
+    """
+    GRU Step Layer, but using MixedLayer to generate. It supports ERROR_CLIPPING
+    and DROPOUT.
+
+    :param input: the concatenated input projections, 3 * size wide
+                  (update gate, reset gate and output candidate parts).
+    :param output_mem: memory of the previous step's output.
+    :param size: hidden size of the step; defaults to input.size / 3.
+    :param name: layer name.
+    :param act: activation of the output candidate, tanh by default.
+    :param gate_act: activation of the gates, sigmoid by default.
+    :param bias_attr: bias attribute of the gate and candidate mixed layers.
+    :param param_attr: parameter attribute of the recurrent projections.
+    :param layer_attr: extra layer attribute, e.g. error clipping or dropout.
+    :return: LayerOutput of this GRU step.
+    """
+    if input.size % 3 != 0:
+        raise ValueError("GruStep input size must be divided by 3")
+    if size is None:
+        size = input.size / 3
+
+    def __gate__(gate_name, offset):
+        with mixed_layer(
+                name=name + "_" + gate_name,
+                size=size,
+                layer_attr=layer_attr,
+                bias_attr=bias_attr,
+                act=gate_act) as gate:
+            gate += identity_projection(input=input, offset=offset)
+            gate += full_matrix_projection(
+                input=output_mem, param_attr=param_attr)
+        return gate
+
+    update_gate = __gate__("update", 0)
+    reset_gate = __gate__("reset", size)
+
+    with mixed_layer(
+            name=name + "_reset_output", bias_attr=False) as reset_output:
+        reset_output += dotmul_operator(a=output_mem, b=reset_gate)
+
+    with mixed_layer(
+            name=name + "_output_candidate",
+            size=size,
+            layer_attr=layer_attr,
+            bias_attr=bias_attr,
+            act=act) as output_candidate:
+        output_candidate += identity_projection(input=input, offset=2 * size)
+        output_candidate += full_matrix_projection(
+            input=reset_output, param_attr=param_attr)
+
+    with mixed_layer(name=name) as output:
+        output += identity_projection(output_mem)
+        output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
+        output += dotmul_operator(a=output_candidate, b=update_gate)
+
+    return output
+
+
 @wrap_name_default()
 @layer_support()
 def get_output_layer(input, arg_name, name=None, layer_attr=None):
...
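For readers who prefer the equations, the composition above is the standard GRU step spelled out with projections and element-wise products. A NumPy sketch of the arithmetic (illustrative only; it assumes the step input already carries the three input projections concatenated, which is what simple_gru feeds in):

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def gru_step_naive(x_proj, h_prev, U_z, U_r, U_c, b_z, b_r, b_c):
        # x_proj is [W_z x, W_r x, W_c x] concatenated (width 3 * size), matching
        # the identity_projection offsets 0, size and 2 * size used above.
        size = h_prev.shape[-1]
        xz, xr, xc = x_proj[:size], x_proj[size:2 * size], x_proj[2 * size:]
        z = sigmoid(xz + U_z.dot(h_prev) + b_z)      # update gate
        r = sigmoid(xr + U_r.dot(h_prev) + b_r)      # reset gate
        c = np.tanh(xc + U_c.dot(r * h_prev) + b_c)  # output candidate
        # identity_projection(h_prev) + dotmul(h_prev, z, scale=-1) + dotmul(c, z)
        return h_prev - h_prev * z + c * z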
@@ -825,7 +825,8 @@ def gru_unit(input,
              gru_param_attr=None,
              act=None,
              gate_act=None,
-             gru_layer_attr=None):
+             gru_layer_attr=None,
+             naive=False):
     """
     Define calculations that a gated recurrent unit performs in a single time
     step. This function itself is not a recurrent layer, so that it can not be
@@ -857,7 +858,12 @@ def gru_unit(input,
     out_mem = memory(name=name, size=size)

-    gru_out = gru_step_layer(
+    if naive:
+        __step__ = gru_step_naive_layer
+    else:
+        __step__ = gru_step_layer
+
+    gru_out = __step__(
         name=name,
         input=input,
         output_mem=out_mem,
@@ -879,7 +885,8 @@ def gru_group(input,
               gru_param_attr=None,
               act=None,
               gate_act=None,
-              gru_layer_attr=None):
+              gru_layer_attr=None,
+              naive=False):
     """
     gru_group is a recurrent layer group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
@@ -928,7 +935,8 @@ def gru_group(input,
             gru_param_attr=gru_param_attr,
             act=act,
             gate_act=gate_act,
-            gru_layer_attr=gru_layer_attr)
+            gru_layer_attr=gru_layer_attr,
+            naive=naive)

     return recurrent_group(
         name='%s_recurrent_group' % name,
@@ -949,7 +957,8 @@ def simple_gru(input,
                gru_param_attr=None,
                act=None,
                gate_act=None,
-               gru_layer_attr=None):
+               gru_layer_attr=None,
+               naive=False):
     """
     You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
     simple_gru in network.py. The reason why there are so many interfaces is
@@ -1018,7 +1027,8 @@ def simple_gru(input,
         gru_param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
-        gru_layer_attr=gru_layer_attr)
+        gru_layer_attr=gru_layer_attr,
+        naive=naive)


 @wrap_name_default('simple_gru2')
...
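Putting the pieces together: the naive flag threaded through gru_unit / gru_group / simple_gru selects the new mixed_layer-based step, which is what lets ExtraLayerAttribute's error clipping (and dropout) take effect inside the recurrence. A usage sketch mirroring the demo's encoder (the data layer name and sizes are made up):

    emb = embedding_layer(
        input=data_layer(name='word', size=30000), size=512)

    rnn = simple_gru(
        input=emb,
        size=512,
        naive=True,   # use gru_step_naive_layer instead of the fused gru_step_layer
        gru_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))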