提交 a0be0ed6 作者: Cao Ying 提交者: GitHub

Merge pull request #2254 from kuke/error_clipping_dev

Modify the seq2seq demo to show gradient clipping and error clipping.
...@@ -21,9 +21,12 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): ...@@ -21,9 +21,12 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
size=word_vector_dim, size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
src_forward = paddle.networks.simple_gru( src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size) name='src_forward_gru', input=src_embedding, size=encoder_size)
src_backward = paddle.networks.simple_gru( src_backward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True) name='src_backward_gru',
input=src_embedding,
size=encoder_size,
reverse=True)
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
#### Decoder #### Decoder
...@@ -34,7 +37,9 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): ...@@ -34,7 +37,9 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
backward_first = paddle.layer.first_seq(input=src_backward) backward_first = paddle.layer.first_seq(input=src_backward)
with paddle.layer.mixed( with paddle.layer.mixed(
size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot: name="decoder_boot_mixed",
size=decoder_size,
act=paddle.activation.Tanh()) as decoder_boot:
decoder_boot += paddle.layer.full_matrix_projection( decoder_boot += paddle.layer.full_matrix_projection(
input=backward_first) input=backward_first)
...@@ -44,11 +49,17 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): ...@@ -44,11 +49,17 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = paddle.networks.simple_attention( context = paddle.networks.simple_attention(
name="simple_attention",
encoded_sequence=enc_vec, encoded_sequence=enc_vec,
encoded_proj=enc_proj, encoded_proj=enc_proj,
decoder_state=decoder_mem) decoder_state=decoder_mem)
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs: with paddle.layer.mixed(
name="input_recurrent",
size=decoder_size * 3,
# enable error clipping
layer_attr=paddle.attr.ExtraAttr(
error_clipping_threshold=100.0)) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context) decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection( decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word) input=current_word)
...@@ -57,9 +68,12 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): ...@@ -57,9 +68,12 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
name='gru_decoder', name='gru_decoder',
input=decoder_inputs, input=decoder_inputs,
output_mem=decoder_mem, output_mem=decoder_mem,
# uncomment to enable local threshold for gradient clipping
# param_attr=paddle.attr.ParamAttr(gradient_clipping_threshold=9.9),
size=decoder_size) size=decoder_size)
with paddle.layer.mixed( with paddle.layer.mixed(
name="gru_step_output",
size=target_dict_dim, size=target_dict_dim,
bias_attr=True, bias_attr=True,
act=paddle.activation.Softmax()) as out: act=paddle.activation.Softmax()) as out:
...@@ -125,7 +139,13 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False): ...@@ -125,7 +139,13 @@ def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
def main(): def main():
paddle.init(use_gpu=False, trainer_count=1) paddle.init(
use_gpu=False,
trainer_count=1,
# log gradient clipping info
log_clipping=True,
# log error clipping info
log_error_clipping=True)
is_generating = False is_generating = False
# source and target dict dim. # source and target dict dim.
...@@ -140,6 +160,8 @@ def main(): ...@@ -140,6 +160,8 @@ def main():
# define optimize method and trainer # define optimize method and trainer
optimizer = paddle.optimizer.Adam( optimizer = paddle.optimizer.Adam(
learning_rate=5e-5, learning_rate=5e-5,
# uncomment to enable global threshold for gradient clipping
# gradient_clipping_threshold=10.0,
regularization=paddle.optimizer.L2Regularization(rate=8e-4)) regularization=paddle.optimizer.L2Regularization(rate=8e-4))
trainer = paddle.trainer.SGD(cost=cost, trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters, parameters=parameters,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册