未验证 提交 81870723 编写于 作者: X xuezhong 提交者: GitHub

Merge pull request #15605 from xuezhong/fix_bug_for_lstmp

Fix bug for lstmp
......@@ -71,7 +71,7 @@ paddle.fluid.initializer.NumpyArrayInitializer.__init__ ArgSpec(args=['self', 'v
paddle.fluid.layers.fc ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None))
paddle.fluid.layers.embedding ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32'))
paddle.fluid.layers.dynamic_lstm ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None))
paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None))
paddle.fluid.layers.dynamic_lstmp ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None))
paddle.fluid.layers.dynamic_gru ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False))
paddle.fluid.layers.gru_unit ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False))
paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,))
......
......@@ -151,9 +151,10 @@ class LSTMKernel : public framework::OpKernel<T> {
lstm_value.output_value = out_t.data<T>();
lstm_value.state_value = cell_t.data<T>();
lstm_value.state_active_value = cell_pre_act_t.data<T>();
T cell_clip = 0.0;
math::LstmUnitFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, frame_size, cur_batch_size, gate_act,
cell_act, cand_act);
device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip,
gate_act, cell_act, cand_act);
lstm_value.prev_state_value = lstm_value.state_value;
}
......@@ -316,9 +317,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
lstm_value.output_value = nullptr;
lstm_grad.state_active_grad = nullptr;
int cur_batch_size = bend - bstart;
T cell_clip = 0.0;
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
cell_clip, gate_act, cell_act, cand_act);
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
......
......@@ -73,12 +73,6 @@ class LSTMPOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(C0) of LSTMP operator should not be null after "
"Input(H0) provided.");
auto h_dims = ctx->GetInputDim("H0");
auto c_dims = ctx->GetInputDim("C0");
PADDLE_ENFORCE(h_dims == c_dims,
"The dimension of Input(H0) and Input(C0) "
"should be the same.");
ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
}
auto b_dims = ctx->GetInputDim("Bias");
......@@ -180,11 +174,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
"This LoDTensor is obtained in the forward and used in the "
"backward.")
.AsIntermediate();
AddOutput("OrderedP0",
"(Tensor) the projection of the initial hidden state "
"H0. This is a tensor with shape (N x P), where N is the "
"batch size and P is the hidden size.")
.AsIntermediate();
AddAttr<bool>("use_peepholes",
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections.")
......@@ -193,6 +182,16 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, defalut: False) "
"whether to compute reversed LSTMP.")
.SetDefault(false);
AddAttr<float>("cell_clip",
"(float, defalut: 0.0) "
"Clip for Tensor for cell state tensor when clip value is "
"greater than 0.0")
.SetDefault(0.0);
AddAttr<float>("proj_clip",
"(float, defalut: 0.0) "
"Clip for Tensor for projection tensor when clip value is "
"greater than 0.0")
.SetDefault(0.0);
AddAttr<std::string>(
"gate_activation",
"(string, default: sigmoid)"
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
......@@ -21,17 +22,50 @@ limitations under the License. */
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
using platform::Transform;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
class _ClipFunctor {
public:
explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
HOSTDEVICE T operator()(const T& x) const {
if (x < min_)
return min_;
else if (x > max_)
return max_;
else
return x;
}
private:
T min_;
T max_;
};
template <typename T>
class _ClipGradFunctor {
public:
explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
HOSTDEVICE T operator()(const T& x, const T& y) const {
return (y > min_ && y < max_) ? x : 0;
}
private:
T min_;
T max_;
};
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
......@@ -67,9 +101,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden_t0 = ctx.Input<Tensor>("H0");
auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
auto* cell_t0 = ctx.Input<Tensor>("C0");
auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
batch_gate->mutable_data<T>(ctx.GetPlace());
auto* proj_out = ctx.Output<LoDTensor>("Projection");
......@@ -110,6 +146,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
}
lstmp_value.prev_state_value = nullptr;
Tensor ordered_c0;
Tensor ordered_h0;
framework::Vector<size_t> order(batch_gate->lod()[2]);
......@@ -169,18 +206,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
// Since the batch computing for LSTMP reorders the input sequence
// according to their length. The initialized hidden state also needs
// to reorder.
Tensor ordered_h0;
ordered_proj0->mutable_data<T>(ctx.GetPlace());
ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
&ordered_h0, true);
blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast<T>(1.0),
ordered_proj0, static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
ActCompute(cell_act, place, proj0_dev, proj0_dev);
}
blas.MatMul(*ordered_proj0, false, *weight, false, static_cast<T>(1.0),
blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
&gate_t, static_cast<T>(1.0));
}
......@@ -189,8 +217,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
lstmp_value.state_value = cell_t.data<T>();
lstmp_value.state_active_value = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
cell_act, cand_act);
device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip,
gate_act, cell_act, cand_act);
lstmp_value.prev_state_value = lstmp_value.state_value;
blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
&proj_t, static_cast<T>(0.0));
......@@ -198,6 +226,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto proj_t_dev = EigenMatrix<T>::From(proj_t);
ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
}
if (proj_clip && proj_clip > 0.0) {
T* x_data = proj_t.data<T>();
int64_t numel = proj_t.numel();
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), x_data,
x_data + numel, x_data,
_ClipFunctor<T>(-1.0 * proj_clip, proj_clip));
}
}
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
......@@ -239,6 +275,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto* proj_out = ctx.Input<LoDTensor>("Projection");
auto* cell_out = ctx.Input<LoDTensor>("Cell");
auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
......@@ -253,7 +292,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* h0 = ctx.Input<Tensor>("H0");
auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
......@@ -363,6 +401,17 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
Tensor cur_proj = batch_proj.Slice(bstart, bend);
Tensor proj_g = batch_proj_g.Slice(bstart, bend);
if (proj_clip && proj_clip > 0.0) {
T* dx_data = proj_g.data<T>();
T* x_data = cur_proj.data<T>();
int64_t numel = proj_g.numel();
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), dx_data,
dx_data + numel, x_data, dx_data,
_ClipGradFunctor<T>(-1.0 * proj_clip, proj_clip));
}
if (proj_act != math::detail::ActivationType::kIdentity) {
auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
auto proj_g_dev = EigenMatrix<T>::From(proj_g);
......@@ -412,7 +461,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
cell_clip, gate_act, cell_act, cand_act);
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
......@@ -431,32 +480,15 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
&ordered_h0, true);
if (weight_g) {
blas.MatMul(*ordered_proj0, true, gate_g, false,
static_cast<T>(1.0), weight_g, static_cast<T>(1.0));
blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
weight_g, static_cast<T>(1.0));
}
}
if (h0 && (h0_g || proj_weight_g)) {
ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
Tensor proj0_g;
proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
proj0_g.mutable_data<T>(ctx.GetPlace());
blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
&proj0_g, static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
proj0_g_dev);
}
if (h0_g) {
blas.MatMul(proj0_g, false, *proj_weight, true, static_cast<T>(1.0),
&ordered_h0_g, static_cast<T>(0.0));
}
if (proj_weight_g) {
blas.MatMul(ordered_h0, true, proj0_g, false, static_cast<T>(1.0),
proj_weight_g, static_cast<T>(1.0));
}
}
}
}
......
......@@ -32,7 +32,8 @@ namespace detail {
template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frame_size, ActivationType active_node,
int frame_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
T r_value_in;
......@@ -67,7 +68,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
&r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
value_in[i] = r_value_in;
value_ig[i] = r_value_ig;
......@@ -82,7 +83,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
ActivationType active_node,
T cell_clip, ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
T r_value_in;
......@@ -135,7 +136,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
&r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
&r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
&r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig;
......@@ -154,7 +155,8 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frame_size, ActivationType active_node,
int frame_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
#ifdef __AVX__
......@@ -194,7 +196,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
&r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
value_in[i] = r_value_in;
value_ig[i] = r_value_ig;
......@@ -210,7 +212,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
ActivationType active_node,
T cell_clip, ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
#ifdef __AVX__
......@@ -268,7 +270,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
&r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
&r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
&r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig;
......@@ -292,27 +294,27 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
T cell_clip, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state);
avx_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
active_node, active_gate, active_state);
} else {
naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state);
naive_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
active_node, active_gate, active_state);
}
}
template <class T, class Op>
void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, ActivationType active_node,
int frame_size, T cell_clip, ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
active_gate, active_state);
avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
active_node, active_gate, active_state);
} else {
naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
active_node, active_gate, active_state);
}
}
......
......@@ -31,7 +31,8 @@ namespace detail {
*/
template <class T, class Op, bool is_batch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node,
int batch_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
&r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
value.gate_value[frame_idx] = r_value_in;
value.gate_value[frame_idx + frame_size] = r_value_ig;
......@@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
template <class T, class Op, bool is_batch>
__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
int batch_size, ActivationType active_node,
int batch_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig,
&r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state,
&r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF,
&r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, active_node,
active_gate, active_state);
&r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip,
active_node, active_gate, active_state);
grad.gate_grad[frame_idx] = r_grad_in;
grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
......@@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, int frame_size, int batch_size,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
T cell_clip, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
dim3 threads;
dim3 grid;
if (batch_size == 1) {
......@@ -205,12 +207,12 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
if (batch_size == 1) {
KeLstmForward<T, Op,
/* is_batch= */ false><<<grid, threads, 0, stream>>>(
op, value, frame_size, batch_size, active_node, active_gate,
op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
active_state);
} else {
KeLstmForward<T, Op,
/* is_batch= */ true><<<grid, threads, 0, stream>>>(
op, value, frame_size, batch_size, active_node, active_gate,
op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
active_state);
}
}
......@@ -218,7 +220,7 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
template <class T, class Op>
void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
dim3 threads;
......@@ -239,13 +241,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
if (batch_size == 1) {
KeLstmBackward<T, Op,
/* is_batch= */ false><<<grid, threads, 0, stream>>>(
op, value, grad, frame_size, batch_size, active_node, active_gate,
active_state);
op, value, grad, frame_size, batch_size, cell_clip, active_node,
active_gate, active_state);
} else {
KeLstmBackward<T, Op,
/* is_batch= */ true><<<grid, threads, 0, stream>>>(
op, value, grad, frame_size, batch_size, active_node, active_gate,
active_state);
op, value, grad, frame_size, batch_size, cell_clip, active_node,
active_gate, active_state);
}
}
......
......@@ -29,7 +29,7 @@ class lstm {
public:
HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
T *prev_state, T *state, T *state_atv, T *output,
T *checkI, T *checkF, T *checkO,
T *checkI, T *checkF, T *checkO, T *cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
......@@ -37,6 +37,15 @@ class lstm {
*value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
*value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
*state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
if (*cell_clip > 0.0) {
if (*state < -1.0 * (*cell_clip)) {
*state = -1.0 * (*cell_clip);
}
if (*state > *cell_clip) {
*state = *cell_clip;
}
}
*value_og = activation(*value_og + (*state) * (*checkO), active_gate);
*state_atv = activation(*state, active_state);
*output = (*value_og) * (*state_atv);
......@@ -52,7 +61,7 @@ class lstm {
__m256 *value_fg, __m256 *value_og,
__m256 *prev_state, __m256 *state,
__m256 *state_atv, __m256 *output, __m256 *checkI,
__m256 *checkF, __m256 *checkO,
__m256 *checkF, __m256 *checkO, T *cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
......@@ -65,6 +74,13 @@ class lstm {
active_gate);
*state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
_mm256_mul_ps(*prev_state, *value_fg));
if (*cell_clip > 0.0f) {
__m256 min = _mm256_set1_ps(0.0f - *cell_clip);
__m256 max = _mm256_set1_ps(*cell_clip);
*state = _mm256_min_ps(max, *state);
*state = _mm256_max_ps(min, *state);
}
*value_og = activation(
_mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
*state_atv = activation(*state, active_state);
......@@ -86,15 +102,26 @@ class lstm {
T *prev_state, T *prev_state_grad, T *state,
T *state_grad, T *state_atv, T *output_grad,
T *checkI, T *checkF, T *checkO, T *checkIGrad,
T *checkFGrad, T *checkOGrad,
T *checkFGrad, T *checkOGrad, T *cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
*grad_og =
activation((*output_grad) * (*state_atv), *value_og, active_gate);
if (*cell_clip > 0.0f) {
if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) {
*state_grad = 0.0f;
} else {
*state_grad +=
activation((*output_grad) * (*value_og), *state_atv, active_state) +
(*grad_og) * (*checkO);
}
} else {
*state_grad +=
activation((*output_grad) * (*value_og), *state_atv, active_state) +
(*grad_og) * (*checkO);
}
*grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
*grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
*grad_fg =
......@@ -117,15 +144,24 @@ class lstm {
__m256 *prev_state, __m256 *prev_state_grad, __m256 *state,
__m256 *state_grad, __m256 *state_atv, __m256 *output_grad,
__m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad,
__m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
__m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
*grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og,
active_gate);
if (*cell_clip > 0.0f) {
T *state_ = reinterpret_cast<T *>(state);
if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) {
*state_grad = _mm256_set1_ps(0.0f);
} else {
*state_grad =
_mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
*state_atv, active_state),
*state_grad);
*state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
*state_grad =
_mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
}
}
*grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in,
active_node);
*grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig,
......
......@@ -24,12 +24,12 @@ template <class T>
struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const detail::ActivationType& gate_act,
T cell_clip, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
cand_act, gate_act, cell_act);
cell_clip, cand_act, gate_act, cell_act);
value.gate_value += frame_size * 4;
value.state_value += frame_size;
value.state_active_value += frame_size;
......@@ -45,13 +45,14 @@ template <class T>
struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
frame_size, cand_act, gate_act, cell_act);
frame_size, cell_clip, cand_act, gate_act,
cell_act);
value.gate_value += frame_size * 4;
value.state_value += frame_size;
......
......@@ -24,12 +24,12 @@ template <class T>
struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const detail::ActivationType& gate_act,
T cell_clip, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
frame_size, batch_size, cand_act, gate_act,
cell_act);
frame_size, batch_size, cell_clip, cand_act,
gate_act, cell_act);
}
};
......@@ -37,13 +37,13 @@ template <class T>
struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
frame_size, batch_size, cand_act, gate_act,
cell_act);
frame_size, batch_size, cell_clip, cand_act,
gate_act, cell_act);
}
};
......
......@@ -50,7 +50,7 @@ template <typename DeviceContext, typename T>
class LstmUnitFunctor {
public:
static void compute(const DeviceContext &context, LstmMetaValue<T> value,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act);
......@@ -61,7 +61,7 @@ class LstmUnitGradFunctor {
public:
static void compute(const DeviceContext &context, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size, int batch_size,
const detail::ActivationType &gate_act,
T cell_clip, const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act);
};
......
......@@ -668,7 +668,11 @@ def dynamic_lstmp(input,
candidate_activation='tanh',
proj_activation='tanh',
dtype='float32',
name=None):
name=None,
h_0=None,
c_0=None,
cell_clip=None,
proj_clip=None):
"""
**Dynamic LSTMP Layer**
......@@ -785,6 +789,17 @@ def dynamic_lstmp(input,
dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
h_0(Variable): The initial hidden state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the projection size.
c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size. `h_0` and `c_0` can be NULL but only at the same time.
cell_clip(float): If provided the cell state is clipped
by this value prior to the cell output activation.
proj_clip(float): If `num_proj > 0` and `proj_clip` is
provided, then the projected values are clipped elementwise to within
`[-proj_clip, proj_clip]`.
Returns:
tuple: A tuple of two output variable: the projection of hidden state, \
......@@ -831,25 +846,41 @@ def dynamic_lstmp(input,
batch_hidden = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
helper.append_op(
type='lstmp',
inputs={
inputs = {
'Input': input,
'Weight': weight,
'ProjWeight': proj_weight,
'Bias': bias
},
}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, proj_size), \
'The shape of h0 should be (batch_size, %d)' % proj_size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
if cell_clip:
assert cell_clip >= 0, "cell_clip should not be negtive."
if proj_clip:
assert proj_clip >= 0, "proj_clip should not be negtive."
helper.append_op(
type='lstmp',
inputs=inputs,
outputs={
'Projection': projection,
'Cell': cell,
'OrderedP0': ordered_proj0,
'BatchHidden': batch_hidden,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'cell_clip': cell_clip,
'proj_clip': proj_clip,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
......
......@@ -36,12 +36,14 @@ def lstmp(
w_b=None, # 1 x 4D
w_c=None, # 1 x 3D
is_reverse=False,
proj_clip=0.0,
cell_clip=0.0,
act_gate=None,
act_cell=None,
act_cand=None,
act_proj=None):
def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
act_proj):
def _step(x, w_r, w_rh, w_c, r_pre, c_pre, proj_clip, cell_clip, act_gate,
act_cell, act_cand, act_proj):
g = np.dot(r_pre, w_r) # 1 x 4D
g = g + x
g = np.reshape(g, (1, g.size))
......@@ -55,6 +57,17 @@ def lstmp(
g_f = act_gate(g_f + w_fc * c_pre) # 1 x D
c = g_f * c_pre + g_i * act_cand(c) # 1 x D
def array_clip(a, clip):
size = np.prod(a.shape)
new_a = np.reshape(a, (size))
for i in range(size):
new_a[i] = max(new_a[i], -1.0 * clip)
new_a[i] = min(new_a[i], clip)
new_a = np.reshape(new_a, a.shape)
return new_a
if cell_clip > 0.0:
c = array_clip(c, cell_clip)
if w_c is None:
g_o = act_gate(g_o) # 1 x D
else:
......@@ -64,6 +77,8 @@ def lstmp(
# projection
r = np.dot(h, w_rh)
r = act_proj(r)
if proj_clip > 0.0:
r = array_clip(r, proj_clip)
return r, c
def _reverse(x, offset):
......@@ -87,13 +102,13 @@ def lstmp(
# compute one sequence
seq_len = lod[0][i]
x = input[offset[i]:offset[i + 1], :]
r_pre = np.dot(h0[i], w_rh) # 1 x P
r_pre = act_proj(r_pre)
r_pre = h0[i]
c_pre = c0[i] # 1 x D
for j in range(seq_len):
# compute one step
r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
act_cell, act_cand, act_proj)
r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, proj_clip,
cell_clip, act_gate, act_cell, act_cand,
act_proj)
projection.append(r_pre.flatten())
cell.append(c_pre.flatten())
......@@ -123,13 +138,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
T = sum(self.lod[0])
N = len(self.lod[0])
x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
if self.has_initial_state:
h0 = np.random.normal(size=(N, self.D)).astype('float64')
h0 = np.random.normal(size=(N, self.P)).astype('float64')
c0 = np.random.normal(size=(N, self.D)).astype('float64')
else:
h0 = np.zeros((N, self.D)).astype('float64')
h0 = np.zeros((N, self.P)).astype('float64')
c0 = np.zeros((N, self.D)).astype('float64')
w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
if self.use_peepholes:
......@@ -140,9 +154,12 @@ class TestLstmpOp(LstmTest.TestLstmOp):
w_b = b[:, 0:4 * self.D]
w_c = b[:, 4 * self.D:] if self.use_peepholes else None
w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
proj_clip = 0.1
cell_clip = 0.1
r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])
proj_clip, cell_clip, ACTIVATION[self.act_gate],
ACTIVATION[self.act_cell], ACTIVATION[self.act_cand],
ACTIVATION[self.act_proj])
self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
......@@ -159,6 +176,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
self.attrs = {
'use_peepholes': self.use_peepholes,
'is_reverse': self.is_reverse,
'proj_clip': proj_clip,
'cell_clip': cell_clip,
'gate_activation': self.act_gate,
'cell_activation': self.act_cell,
'candidate_activation': self.act_cand,
......@@ -171,14 +190,14 @@ class TestLstmpOp(LstmTest.TestLstmOp):
def test_check_grad(self):
# TODO(qingqing) remove folowing lines after the check_grad is refined.
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
(N, self.D)).astype('float64')
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
max_relative_error=1e-2)
max_relative_error=1e-2,
numeric_grad_delta=0.0000005)
class TestLstmpOpHasInitial(TestLstmpOp):
......@@ -188,7 +207,6 @@ class TestLstmpOpHasInitial(TestLstmpOp):
def test_check_grad(self):
# TODO(qingqing) remove folowing lines after the check_grad is refined.
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -196,11 +214,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
['Projection'],
numeric_grad_delta=0.0000005,
max_relative_error=1e-2)
def test_check_grad_ingore_bias(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -208,11 +226,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'ProjWeight', 'Weight'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('Bias'))
def test_check_grad_ingore_weight(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -220,11 +238,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'ProjWeight', 'Bias'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('Weight'))
def test_check_grad_ingore_proj_weight(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -232,11 +250,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'Bias'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('ProjWeight'))
def test_check_grad_ingore_input(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -244,11 +262,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Weight', 'ProjWeight', 'Bias'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('Input'))
def test_check_grad_ingore_h0(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -256,11 +274,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('H0'))
def test_check_grad_ingore_c0(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -268,6 +286,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('C0'))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册