From d1a17cadd48be68a3abde522e6dc94a608db207d Mon Sep 17 00:00:00 2001 From: phlrain Date: Thu, 29 Nov 2018 17:17:52 +0800 Subject: [PATCH] fix cudnn rnn; test=develop --- paddle/fluid/operators/cudnn_lstm_op.cc | 38 +++++++---- paddle/fluid/operators/cudnn_lstm_op.cu.cc | 9 ++- paddle/fluid/operators/cudnn_lstm_op.h | 7 ++- python/paddle/fluid/layers/nn.py | 73 ++++++++++++---------- 4 files changed, 78 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc index cadc5b883..c73c64f4a 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -122,13 +122,11 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { "The will affect the shape of the Out, last_h, and last_c") .SetDefault(false); AddAttr("input_size", "input size ot the Input Tensor").SetDefault(10); - AddAttr("batch_size", "the instance number the batch").SetDefault(10); AddAttr("hidden_size", "hidden size of the LSTM").SetDefault(100); AddAttr("num_layers", "the total layer number of the LSTM") .SetDefault(1); AddAttr("is_test", "True if in test phase.").SetDefault(false); - AddAttr("fix_seed", "True if it fix dropout seed").SetDefault(false); - AddAttr("seed", "seed to used if fix_seed is True").SetDefault(0); + AddAttr("seed", "seed to used if fix_seed is True").SetDefault(-1); AddComment(R"DOC( CUDNN LSTM implementation @@ -136,16 +134,32 @@ A four-gate Long Short-Term Memory network with no peephole connections. 
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: -it = σ(Wi X xt + Ri X ht-1 + bWi + bRi) -ft = σ(Wf X xt + Rf X ht-1 + bWf + bRf) -ot = σ(Wo X xt + Ro X ht-1 + bWo + bRo) -c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc) -ct = ft * ct-1 + it * c't -ht = ot * tanh(ct) +$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ -Where σ is the sigmoid operator: σ(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, +$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + +$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + +$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + +$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + +$$ h_t = o_t \\odot tanh(c_t) $$ + +- The W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input to the input gate) +- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vectors). +- sigmoid is the logistic sigmoid function. +- $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. +- The $\odot$ is the element-wise product of the vectors. +- `tanh` is the hyperbolic tangent activation function. +- $\tilde{c_t}$ is also called the candidate hidden state, + which is computed based on the current input and the previous hidden state. + +Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, X represensts a matrix multiplication -and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively. 
)DOC"); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 9caf65b53..cadd3772a 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -273,7 +273,6 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { size_t max_len = ctx.Attr("max_len"); float dropout_prob = ctx.Attr("dropout_prob"); bool is_bidirec = ctx.Attr("is_bidirec"); - int batch_size = ctx.Attr("batch_size"); int input_size = ctx.Attr("input_size"); int hidden_size = ctx.Attr("hidden_size"); int num_layers = ctx.Attr("num_layers"); @@ -304,9 +303,13 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); std::random_device rnd; - int seed = ctx.Attr("fix_seed") ? ctx.Attr("seed") : rnd(); + int seed = ctx.Attr("seed"); + if (seed == -1) { + seed = rnd(); + } auto input_w_numel = w->numel(); + auto batch_size = x->dims()[1]; cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, hidden_size, num_layers, dropout_prob, is_bidirec, seed, input_w_numel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.h b/paddle/fluid/operators/cudnn_lstm_op.h index fb4b37e46..fc329cc23 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.h +++ b/paddle/fluid/operators/cudnn_lstm_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -29,7 +29,10 @@ using Tensor = framework::Tensor; template class CudnnLSTMKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override {} + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW( + "CPU is not support for this kernel now. Will be add in the future"); + } }; template diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 30d108bbe..fa2215f9f 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -169,7 +169,7 @@ __all__ = [ 'log_loss', 'add_position_encoding', 'bilinear_tensor_product', - 'cudnn_lstm', + 'lstm', ] @@ -467,39 +467,53 @@ def dynamic_lstm(input, return hidden, cell -def cudnn_lstm(input, - init_h, - init_c, - batch_size, - max_len, - dropout_prob, - input_size, - hidden_size, - num_layers, - is_bidirec=False, - dtype='float32', - is_test=False, - name=None, - default_initializer=None, - fix_seed=False, - seed=0): +def lstm(input, + init_h, + init_c, + max_len, + dropout_prob, + input_size, + hidden_size, + num_layers, + is_bidirec=False, + dtype='float32', + is_test=False, + name=None, + default_initializer=None, + seed=-1): """ - CUDNN LSTM implementation + If Device is GPU, This op will use cudnn LSTM implementation A four-gate Long Short-Term Memory network with no peephole connections. 
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: - it = sigmoid(Wi X xt + Ri X ht-1 + bWi + bRi) - ft = sigmoid(Wf X xt + Rf X ht-1 + bWf + bRf) - ot = sigmoid(Wo X xt + Ro X ht-1 + bWo + bRo) - c't = tanh(Wc X xt + Rc X ht-1 + bWc + bRc) - ct = ft * ct-1 + it * c't - ht = ot * tanh(ct) + $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + + $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + + $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + + $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + + $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + + $$ h_t = o_t \\odot tanh(c_t) $$ + + - The W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input to the input gate) + - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vectors). + - $\\sigma$ is the logistic sigmoid function. + - $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. + - The $\\odot$ is the element-wise product of the vectors. + - `tanh` is the hyperbolic tangent activation function. + - $\\tilde{c_t}$ is also called the candidate hidden state, + which is computed based on the current input and the previous hidden state. Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, X represensts a matrix multiplication - and tanh is the hyperbolic tangent function. it, ft, ot, c't represent the input, forget, output and new gates respectively. Args: @@ -510,7 +524,6 @@ def cudnn_lstm(input, init_c(Variable): The initial cell state of the LSTM. 
This is a tensor with shape ( num_layers x batch_size x hidden_size ) if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) - batch_size (int): total distance numer of the batch max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len dropout_prob(float): dropout prob, dropout ONLY work between rnn layers, NOT between time steps There is NO dropout work on rnn output of the last RNN layers @@ -524,9 +537,7 @@ def cudnn_lstm(input, will be named automatically. default_initializer(Initialize|None): Where use initializer to initialize the Weight If set None, defaule initializer will be used - - fix_seed(bool): If it's True, fix seed will used for dropout in LSTM - seed(int): If fix_seed is True, dropout seed in LSTM will use this seed + seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed Returns: @@ -553,7 +564,7 @@ def cudnn_lstm(input, init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) - rnn_out, last_h, last_c = layers.cudnn_lstm( input, init_h, init_c, batch_size, \ + rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \ max_len, dropout_prob, input_size, hidden_size, \ num_layers) """ @@ -610,12 +621,10 @@ def cudnn_lstm(input, 'max_len': max_len, 'is_bidirec': is_bidirec, 'input_size': input_size, - 'batch_size': batch_size, 'hidden_size': hidden_size, 'num_layers': num_layers, 'is_test': is_test, 'dropout_prob': dropout_prob, - 'fix_seed': fix_seed, 'seed': seed, }) return out, last_h, last_c -- GitLab