Commit 88083632 authored by xuezhong

add cell clip and proj clip, fix bug for h0

Parent 294d5944
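In short, the new `cell_clip` and `proj_clip` attributes clamp the cell state (before the output gate is applied) and the projection output (after its activation) to a symmetric range; the 0.0 default keeps the previous behaviour. A rough numpy sketch of the clipping semantics, with illustrative names only:

import numpy as np

def clip_by_value(x, clip):
    # clip <= 0.0 disables clipping, matching the new attributes' 0.0 default
    return np.clip(x, -clip, clip) if clip > 0.0 else x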
......@@ -151,9 +151,10 @@ class LSTMKernel : public framework::OpKernel<T> {
lstm_value.output_value = out_t.data<T>();
lstm_value.state_value = cell_t.data<T>();
lstm_value.state_active_value = cell_pre_act_t.data<T>();
T cell_clip = 0.0;
math::LstmUnitFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, frame_size, cur_batch_size, gate_act,
cell_act, cand_act);
device_ctx, lstm_value, frame_size, cur_batch_size, cell_clip,
gate_act, cell_act, cand_act);
lstm_value.prev_state_value = lstm_value.state_value;
}
......@@ -312,9 +313,10 @@ class LSTMGradKernel : public framework::OpKernel<T> {
}
int cur_batch_size = bend - bstart;
T cell_clip = 0.0;
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
cell_clip, gate_act, cell_act, cand_act);
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
......
......@@ -73,12 +73,6 @@ class LSTMPOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(C0) of LSTMP operator should not be null after "
"Input(H0) provided.");
auto h_dims = ctx->GetInputDim("H0");
auto c_dims = ctx->GetInputDim("C0");
PADDLE_ENFORCE(h_dims == c_dims,
"The dimension of Input(H0) and Input(C0) "
"should be the same.");
ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
}
auto b_dims = ctx->GetInputDim("Bias");
......@@ -180,11 +174,6 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
"This LoDTensor is obtained in the forward and used in the "
"backward.")
.AsIntermediate();
AddOutput("OrderedP0",
"(Tensor) the projection of the initial hidden state "
"H0. This is a tensor with shape (N x P), where N is the "
"batch size and P is the hidden size.")
.AsIntermediate();
AddAttr<bool>("use_peepholes",
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections.")
......@@ -193,6 +182,16 @@ class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
"(bool, defalut: False) "
"whether to compute reversed LSTMP.")
.SetDefault(false);
AddAttr<float>("cell_clip",
"(float, default: 0.0) "
"Clip applied to the cell state tensor when the clip value "
"is greater than 0.0.")
.SetDefault(0.0);
AddAttr<float>("proj_clip",
"(float, default: 0.0) "
"Clip applied to the projection tensor when the clip value "
"is greater than 0.0.")
.SetDefault(0.0);
AddAttr<std::string>(
"gate_activation",
"(string, default: sigmoid)"
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h"
......@@ -21,17 +22,50 @@ limitations under the License. */
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
using platform::Transform;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
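// Element-wise clamp to [min_, max_]; used below to clip the projection output when proj_clip > 0.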
template <typename T>
class _ClipFunctor {
public:
explicit _ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
HOSTDEVICE T operator()(const T& x) const {
if (x < min_)
return min_;
else if (x > max_)
return max_;
else
return x;
}
private:
T min_;
T max_;
};
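// Gradient counterpart of the clamp: keeps the incoming gradient x only where the clipped forward value y lies strictly inside (min_, max_), and zeroes it otherwise.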
template <typename T>
class _ClipGradFunctor {
public:
explicit _ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
HOSTDEVICE T operator()(const T& x, const T& y) const {
return (y > min_ && y < max_) ? x : 0;
}
private:
T min_;
T max_;
};
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src,
......@@ -60,6 +94,25 @@ class LSTMPKernel : public framework::OpKernel<T> {
PADDLE_THROW("unsupported activation type");
}
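// Debug helper: dumps a tensor's size and contents to VLOG(1), copying from GPU to host when necessary.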
void Print(const Tensor& t, std::string name) const {
VLOG(1) << name << " size = " << t.numel();
size_t size = t.numel();
const T* d = t.data<T>();
#ifdef PADDLE_WITH_CUDA
std::vector<T> vec;
platform::DeviceContextPool::Instance().Get(t.place())->Wait();
if (platform::is_gpu_place(t.place())) {
vec.resize(size);
cudaMemcpy(vec.data(), d, sizeof(T) * size, cudaMemcpyDeviceToHost);
d = vec.data();
}
#endif
VLOG(1) << name << " data_ptr = " << static_cast<const void*>(d);
for (size_t i = 0; i < size; i++) {
VLOG(1) << d[i] << ",";
}
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<LoDTensor>("Input");
auto* weight = ctx.Input<Tensor>("Weight");
......@@ -67,9 +120,11 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden_t0 = ctx.Input<Tensor>("H0");
auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
auto* cell_t0 = ctx.Input<Tensor>("C0");
auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
batch_gate->mutable_data<T>(ctx.GetPlace());
auto* proj_out = ctx.Output<LoDTensor>("Projection");
......@@ -110,6 +165,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
}
lstmp_value.prev_state_value = nullptr;
Tensor ordered_c0;
Tensor ordered_h0;
framework::Vector<size_t> order(batch_gate->lod()[2]);
......@@ -169,18 +225,10 @@ class LSTMPKernel : public framework::OpKernel<T> {
// Since the batch computing for LSTMP reorders the input sequence
// according to their length. The initialized hidden state also needs
// to reorder.
Tensor ordered_h0;
ordered_proj0->mutable_data<T>(ctx.GetPlace());
ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
&ordered_h0, true);
blas.MatMul(ordered_h0, false, *proj_weight, false, static_cast<T>(1.0),
ordered_proj0, static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
ActCompute(cell_act, place, proj0_dev, proj0_dev);
}
blas.MatMul(*ordered_proj0, false, *weight, false, static_cast<T>(1.0),
blas.MatMul(ordered_h0, false, *weight, false, static_cast<T>(1.0),
&gate_t, static_cast<T>(1.0));
}
......@@ -189,8 +237,8 @@ class LSTMPKernel : public framework::OpKernel<T> {
lstmp_value.state_value = cell_t.data<T>();
lstmp_value.state_active_value = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
cell_act, cand_act);
device_ctx, lstmp_value, frame_size, cur_batch_size, cell_clip,
gate_act, cell_act, cand_act);
lstmp_value.prev_state_value = lstmp_value.state_value;
blas.MatMul(hidden_t, false, *proj_weight, false, static_cast<T>(1.0),
&proj_t, static_cast<T>(0.0));
......@@ -198,6 +246,14 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto proj_t_dev = EigenMatrix<T>::From(proj_t);
ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
}
if (proj_clip > 0.0) {
T* x_data = proj_t.data<T>();
int64_t numel = proj_t.numel();
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), x_data,
x_data + numel, x_data,
_ClipFunctor<T>(-1.0 * proj_clip, proj_clip));
}
}
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
......@@ -239,6 +295,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto* proj_out = ctx.Input<LoDTensor>("Projection");
auto* cell_out = ctx.Input<LoDTensor>("Cell");
auto proj_clip = static_cast<T>(ctx.Attr<float>("proj_clip"));
auto cell_clip = static_cast<T>(ctx.Attr<float>("cell_clip"));
auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
......@@ -253,7 +312,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* h0 = ctx.Input<Tensor>("H0");
auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
......@@ -363,6 +421,17 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
Tensor cur_proj = batch_proj.Slice(bstart, bend);
Tensor proj_g = batch_proj_g.Slice(bstart, bend);
if (proj_clip > 0.0) {
T* dx_data = proj_g.data<T>();
T* x_data = cur_proj.data<T>();
int64_t numel = proj_g.numel();
Transform<DeviceContext> trans;
trans(ctx.template device_context<DeviceContext>(), dx_data,
dx_data + numel, x_data, dx_data,
_ClipGradFunctor<T>(-1.0 * proj_clip, proj_clip));
}
if (proj_act != math::detail::ActivationType::kIdentity) {
auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
auto proj_g_dev = EigenMatrix<T>::From(proj_g);
......@@ -407,7 +476,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
int cur_batch_size = bend - bstart;
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
cell_clip, gate_act, cell_act, cand_act);
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
......@@ -426,31 +495,14 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
&ordered_h0, true);
if (weight_g) {
blas.MatMul(*ordered_proj0, true, gate_g, false,
static_cast<T>(1.0), weight_g, static_cast<T>(1.0));
blas.MatMul(ordered_h0, true, gate_g, false, static_cast<T>(1.0),
weight_g, static_cast<T>(1.0));
}
}
if (h0 && (h0_g || proj_weight_g)) {
ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
Tensor proj0_g;
proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
proj0_g.mutable_data<T>(ctx.GetPlace());
blas.MatMul(gate_g, false, *weight, true, static_cast<T>(1.0),
&proj0_g, static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
proj0_g_dev);
}
if (h0_g) {
blas.MatMul(proj0_g, false, *proj_weight, true, static_cast<T>(1.0),
&ordered_h0_g, static_cast<T>(0.0));
}
if (proj_weight_g) {
blas.MatMul(ordered_h0, true, proj0_g, false, static_cast<T>(1.0),
proj_weight_g, static_cast<T>(1.0));
}
&ordered_h0_g, static_cast<T>(0.0));
}
}
}
......
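The h0 part of the fix: the intermediate `OrderedP0` output (the projection of `H0`) is removed, and `H0` now feeds the recurrent weight directly, i.e. it is expected to already live in the projection space with shape (N x P) rather than (N x D). A small numpy illustration of the new first recurrent step; the shapes and names here are illustrative only:

import numpy as np

N, D, P = 2, 4, 3                    # batch size, hidden size, projection size
h0 = np.random.randn(N, P)           # initial state, now given in projection space
weight = np.random.randn(P, 4 * D)   # recurrent weight ("Weight" input)
gates_step0 = h0.dot(weight)         # old code: act_proj(h0 @ proj_weight) @ weight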
......@@ -32,7 +32,8 @@ namespace detail {
template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frame_size, ActivationType active_node,
int frame_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
T r_value_in;
......@@ -67,7 +68,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
&r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
value_in[i] = r_value_in;
value_ig[i] = r_value_ig;
......@@ -82,7 +83,7 @@ void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
ActivationType active_node,
T cell_clip, ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
T r_value_in;
......@@ -135,7 +136,7 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
&r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
&r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
&r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig;
......@@ -154,7 +155,8 @@ void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frame_size, ActivationType active_node,
int frame_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
#ifdef __AVX__
......@@ -194,7 +196,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
&r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
value_in[i] = r_value_in;
value_ig[i] = r_value_ig;
......@@ -210,7 +212,7 @@ void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
ActivationType active_node,
T cell_clip, ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
#ifdef __AVX__
......@@ -268,7 +270,7 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
&r_grad_ig, &r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad,
&r_state, &r_state_grad, &r_state_atv, &r_output_grad, &r_checkI,
&r_checkF, &r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
grad_in[i] = r_grad_in;
grad_ig[i] = r_grad_ig;
......@@ -292,27 +294,27 @@ void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
T cell_clip, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state);
avx_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
active_node, active_gate, active_state);
} else {
naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
active_gate, active_state);
naive_lstm_forward_one_sequence<T>(op, value, frame_size, cell_clip,
active_node, active_gate, active_state);
}
}
template <class T, class Op>
void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, ActivationType active_node,
int frame_size, T cell_clip, ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
active_gate, active_state);
avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
active_node, active_gate, active_state);
} else {
naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size, cell_clip,
active_node, active_gate, active_state);
}
}
......
......@@ -31,7 +31,8 @@ namespace detail {
*/
template <class T, class Op, bool is_batch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node,
int batch_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -72,7 +73,7 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_prev_state,
&r_state, &r_state_atv, &r_out, &r_checkI, &r_checkF, &r_checkO,
active_node, active_gate, active_state);
&cell_clip, active_node, active_gate, active_state);
value.gate_value[frame_idx] = r_value_in;
value.gate_value[frame_idx + frame_size] = r_value_ig;
......@@ -91,7 +92,8 @@ __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
template <class T, class Op, bool is_batch>
__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size,
int batch_size, ActivationType active_node,
int batch_size, T cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -148,8 +150,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
op(&r_value_in, &r_value_ig, &r_value_fg, &r_value_og, &r_grad_in, &r_grad_ig,
&r_grad_fg, &r_grad_og, &r_prev_state, &r_prev_state_grad, &r_state,
&r_state_grad, &r_state_atv, &r_output_grad, &r_checkI, &r_checkF,
&r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, active_node,
active_gate, active_state);
&r_checkO, &r_checkIGrad, &r_checkFGrad, &r_checkOGrad, &cell_clip,
active_node, active_gate, active_state);
grad.gate_grad[frame_idx] = r_grad_in;
grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
......@@ -185,8 +187,8 @@ __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
template <class T, class Op>
void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, int frame_size, int batch_size,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
T cell_clip, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
dim3 threads;
dim3 grid;
if (batch_size == 1) {
......@@ -205,12 +207,12 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
if (batch_size == 1) {
KeLstmForward<T, Op,
/* is_batch= */ false><<<grid, threads, 0, stream>>>(
op, value, frame_size, batch_size, active_node, active_gate,
op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
active_state);
} else {
KeLstmForward<T, Op,
/* is_batch= */ true><<<grid, threads, 0, stream>>>(
op, value, frame_size, batch_size, active_node, active_gate,
op, value, frame_size, batch_size, cell_clip, active_node, active_gate,
active_state);
}
}
......@@ -218,7 +220,7 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
template <class T, class Op>
void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
dim3 threads;
......@@ -239,13 +241,13 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
if (batch_size == 1) {
KeLstmBackward<T, Op,
/* is_batch= */ false><<<grid, threads, 0, stream>>>(
op, value, grad, frame_size, batch_size, active_node, active_gate,
active_state);
op, value, grad, frame_size, batch_size, cell_clip, active_node,
active_gate, active_state);
} else {
KeLstmBackward<T, Op,
/* is_batch= */ true><<<grid, threads, 0, stream>>>(
op, value, grad, frame_size, batch_size, active_node, active_gate,
active_state);
op, value, grad, frame_size, batch_size, cell_clip, active_node,
active_gate, active_state);
}
}
......
......@@ -29,7 +29,7 @@ class lstm {
public:
HOSTDEVICE void operator()(T *value_in, T *value_ig, T *value_fg, T *value_og,
T *prev_state, T *state, T *state_atv, T *output,
T *checkI, T *checkF, T *checkO,
T *checkI, T *checkF, T *checkO, T *cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
......@@ -37,6 +37,14 @@ class lstm {
*value_ig = activation(*value_ig + (*prev_state) * (*checkI), active_gate);
*value_fg = activation(*value_fg + (*prev_state) * (*checkF), active_gate);
*state = (*value_in) * (*value_ig) + (*prev_state) * (*value_fg);
if (*cell_clip > 0.0) {
if (*state < -1.0 * (*cell_clip)) {
*state = -1.0 * (*cell_clip);
}
if (*state > *cell_clip) {
*state = *cell_clip;
}
}
*value_og = activation(*value_og + (*state) * (*checkO), active_gate);
*state_atv = activation(*state, active_state);
*output = (*value_og) * (*state_atv);
......@@ -52,7 +60,7 @@ class lstm {
__m256 *value_fg, __m256 *value_og,
__m256 *prev_state, __m256 *state,
__m256 *state_atv, __m256 *output, __m256 *checkI,
__m256 *checkF, __m256 *checkO,
__m256 *checkF, __m256 *checkO, T *cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
......@@ -65,6 +73,12 @@ class lstm {
active_gate);
*state = _mm256_add_ps(_mm256_mul_ps(*value_in, *value_ig),
_mm256_mul_ps(*prev_state, *value_fg));
if (*cell_clip > 0.0f) {
__m256 min = _mm256_set1_ps(0.0f - *cell_clip);
__m256 max = _mm256_set1_ps(*cell_clip);
*state = _mm256_min_ps(max, *state);
*state = _mm256_max_ps(min, *state);
}
*value_og = activation(
_mm256_add_ps(*value_og, _mm256_mul_ps(*state, *checkO)), active_gate);
*state_atv = activation(*state, active_state);
......@@ -86,15 +100,21 @@ class lstm {
T *prev_state, T *prev_state_grad, T *state,
T *state_grad, T *state_atv, T *output_grad,
T *checkI, T *checkF, T *checkO, T *checkIGrad,
T *checkFGrad, T *checkOGrad,
T *checkFGrad, T *checkOGrad, T *cell_clip,
ActivationType active_node,
ActivationType active_gate,
ActivationType active_state) {
*grad_og =
activation((*output_grad) * (*state_atv), *value_og, active_gate);
*state_grad +=
activation((*output_grad) * (*value_og), *state_atv, active_state) +
(*grad_og) * (*checkO);
if (*cell_clip > 0.0f) {
if (*state >= (*cell_clip) || *state <= (0.0f - (*cell_clip))) {
*state_grad = 0.0f;
} else {
*state_grad +=
activation((*output_grad) * (*value_og), *state_atv, active_state) +
(*grad_og) * (*checkO);
}
} else {
*state_grad +=
activation((*output_grad) * (*value_og), *state_atv, active_state) +
(*grad_og) * (*checkO);
}
*grad_in = activation((*state_grad) * (*value_ig), *value_in, active_node);
*grad_ig = activation((*state_grad) * (*value_in), *value_ig, active_gate);
*grad_fg =
......@@ -117,15 +137,24 @@ class lstm {
__m256 *prev_state, __m256 *prev_state_grad, __m256 *state,
__m256 *state_grad, __m256 *state_atv, __m256 *output_grad,
__m256 *checkI, __m256 *checkF, __m256 *checkO, __m256 *checkIGrad,
__m256 *checkFGrad, __m256 *checkOGrad, ActivationType active_node,
ActivationType active_gate, ActivationType active_state) {
__m256 *checkFGrad, __m256 *checkOGrad, T *cell_clip,
ActivationType active_node, ActivationType active_gate,
ActivationType active_state) {
*grad_og = activation(_mm256_mul_ps(*output_grad, *state_atv), *value_og,
active_gate);
*state_grad =
_mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
*state_atv, active_state),
*state_grad);
*state_grad = _mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
if (*cell_clip > 0.0f) {
T *state_ = reinterpret_cast<T *>(state);
if (*state_ >= (*cell_clip) || *state_ <= (0.0f - (*cell_clip))) {
*state_grad = _mm256_set1_ps(0.0f);
} else {
*state_grad =
_mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
*state_atv, active_state),
*state_grad);
*state_grad =
_mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
}
} else {
*state_grad =
_mm256_add_ps(activation(_mm256_mul_ps(*output_grad, *value_og),
*state_atv, active_state),
*state_grad);
*state_grad =
_mm256_add_ps(_mm256_mul_ps(*grad_og, *checkO), *state_grad);
}
*grad_in = activation(_mm256_mul_ps(*state_grad, *value_ig), *value_in,
active_node);
*grad_ig = activation(_mm256_mul_ps(*state_grad, *value_in), *value_ig,
......
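For reference, the backward kernels above zero the cell-state gradient wherever the forward cell state saturated at the clip boundary and leave it untouched strictly inside the interval. A numpy sketch of that rule, under the same assumption of a positive clip value (names are illustrative):

import numpy as np

def clip_state_grad(state_grad, state, cell_clip):
    # Pass the gradient through strictly inside (-cell_clip, cell_clip);
    # zero it where the forward cell state hit the boundary.
    if cell_clip <= 0.0:
        return state_grad
    inside = (state > -cell_clip) & (state < cell_clip)
    return np.where(inside, state_grad, 0.0)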
......@@ -24,12 +24,12 @@ template <class T>
struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const detail::ActivationType& gate_act,
T cell_clip, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
cand_act, gate_act, cell_act);
cell_clip, cand_act, gate_act, cell_act);
value.gate_value += frame_size * 4;
value.state_value += frame_size;
value.state_active_value += frame_size;
......@@ -45,13 +45,14 @@ template <class T>
struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
static void compute(const platform::CPUDeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
frame_size, cand_act, gate_act, cell_act);
frame_size, cell_clip, cand_act, gate_act,
cell_act);
value.gate_value += frame_size * 4;
value.state_value += frame_size;
......
......@@ -24,12 +24,12 @@ template <class T>
struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
const detail::ActivationType& gate_act,
T cell_clip, const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
frame_size, batch_size, cand_act, gate_act,
cell_act);
frame_size, batch_size, cell_clip, cand_act,
gate_act, cell_act);
}
};
......@@ -37,13 +37,13 @@ template <class T>
struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
static void compute(const platform::CUDADeviceContext& context,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
const detail::ActivationType& gate_act,
const detail::ActivationType& cell_act,
const detail::ActivationType& cand_act) {
detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
frame_size, batch_size, cand_act, gate_act,
cell_act);
frame_size, batch_size, cell_clip, cand_act,
gate_act, cell_act);
}
};
......
......@@ -50,7 +50,7 @@ template <typename DeviceContext, typename T>
class LstmUnitFunctor {
public:
static void compute(const DeviceContext &context, LstmMetaValue<T> value,
int frame_size, int batch_size,
int frame_size, int batch_size, T cell_clip,
const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act);
......@@ -61,7 +61,7 @@ class LstmUnitGradFunctor {
public:
static void compute(const DeviceContext &context, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frame_size, int batch_size,
const detail::ActivationType &gate_act,
T cell_clip, const detail::ActivationType &gate_act,
const detail::ActivationType &cell_act,
const detail::ActivationType &cand_act);
};
......
......@@ -659,14 +659,18 @@ def lstm(input,
def dynamic_lstmp(input,
size,
proj_size,
h_0=None,
c_0=None,
param_attr=None,
bias_attr=None,
use_peepholes=True,
cell_clip=None,
proj_clip=None,
is_reverse=False,
gate_activation='sigmoid',
cell_activation='tanh',
candidate_activation='tanh',
proj_activation='tanh',
proj_activation='identity',
dtype='float32',
name=None):
"""
......@@ -736,6 +740,12 @@ def dynamic_lstmp(input,
mini-batch, D is the hidden size.
size(int): 4 * hidden size.
proj_size(int): The size of projection output.
h_0(Variable): The initial hidden state is an optional input, default is zero.
This is a tensor with shape (N x P), where N is the
batch size and P is the projection size.
c_0(Variable): The initial cell state is an optional input, default is zero.
This is a tensor with shape (N x D), where N is the
batch size and D is the hidden size. `h_0` and `c_0` can both be None,
but only at the same time.
param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weight and projection weight.
......@@ -770,6 +780,11 @@ def dynamic_lstmp(input,
the bias is initialized zero. Default: None.
use_peepholes(bool): Whether to enable diagonal/peephole connections,
default `True`.
cell_clip(float): If provided, the cell state is clipped
by this value prior to the cell output activation.
proj_clip(float): If `proj_size > 0` and `proj_clip` is
provided, then the projected values are clipped elementwise to within
`[-proj_clip, proj_clip]`.
is_reverse(bool): Whether to compute reversed LSTM, default `False`.
gate_activation(str): The activation for input gate, forget gate and
output gate. Choices = ["sigmoid", "tanh", "relu",
......@@ -781,7 +796,7 @@ def dynamic_lstmp(input,
default "tanh".
proj_activation(str): The activation for projection output.
Choices = ["sigmoid", "tanh", "relu", "identity"],
default "tanh".
default "identity".
dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
......@@ -831,25 +846,36 @@ def dynamic_lstmp(input,
batch_hidden = helper.create_variable_for_type_inference(dtype)
batch_gate = helper.create_variable_for_type_inference(dtype)
batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
inputs = {
'Input': input,
'Weight': weight,
'ProjWeight': proj_weight,
'Bias': bias
}
batch_size = input.shape[0]
if h_0:
assert h_0.shape == (batch_size, proj_size), \
'The shape of h0 should be (batch_size, %d)' % proj_size
inputs['H0'] = h_0
if c_0:
assert c_0.shape == (batch_size, size), \
'The shape of c0 should be (batch_size, %d)' % size
inputs['C0'] = c_0
helper.append_op(
type='lstmp',
inputs={
'Input': input,
'Weight': weight,
'ProjWeight': proj_weight,
'Bias': bias
},
inputs=inputs,
outputs={
'Projection': projection,
'Cell': cell,
'BatchHidden': batch_hidden,
'BatchGate': batch_gate,
'BatchCellPreAct': batch_cell_pre_act
},
attrs={
'use_peepholes': use_peepholes,
'cell_clip': cell_clip,
'proj_clip': proj_clip,
'is_reverse': is_reverse,
'gate_activation': gate_activation,
'cell_activation': cell_activation,
......
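A usage sketch of the updated Python API; the layer and argument names come from this diff, while the surrounding fluid.layers calls and sizes are assumed for illustration:

import paddle.fluid as fluid

# Illustrative wiring of the new cell_clip / proj_clip arguments.
hidden_dim, proj_dim = 512, 256
data = fluid.layers.data(name='seq', shape=[32], dtype='float32', lod_level=1)
fc_out = fluid.layers.fc(input=data, size=hidden_dim * 4, bias_attr=False)
proj_out, cell = fluid.layers.dynamic_lstmp(
    input=fc_out,
    size=hidden_dim * 4,
    proj_size=proj_dim,
    cell_clip=3.0,              # clamp cell state to [-3.0, 3.0]
    proj_clip=3.0,              # clamp projection to [-3.0, 3.0]
    proj_activation='identity')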
......@@ -294,6 +294,7 @@ class OpTest(unittest.TestCase):
# fetch_list = map(block.var, fetch_list)
if not isinstance(fetch_list[0], fluid.framework.Variable):
fetch_list = list(map(block.var, fetch_list))
outs = executor.run(program,
feed=feed_map,
fetch_list=fetch_list,
......@@ -468,8 +469,10 @@ class OpTest(unittest.TestCase):
delta=numeric_grad_delta,
in_place=in_place) for input_to_check in inputs_to_check
]
analytic_grads = self._get_gradient(inputs_to_check, place,
output_names, no_grad_set)
self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
max_relative_error,
......
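The tightened numeric_grad_delta used by the tests below shrinks the finite-difference step of the numeric gradient check; conceptually, that check compares the analytic gradient against a central-difference estimate along these lines (an illustrative sketch, not OpTest's actual implementation):

import numpy as np

def numeric_grad(f, x, delta=5e-7):
    # Central-difference estimate of df/dx for a scalar-valued f(x).
    grad = np.zeros_like(x)
    flat_x = x.reshape(-1)   # view into x, so the writes below perturb x in place
    flat_g = grad.reshape(-1)
    for i in range(flat_x.size):
        orig = flat_x[i]
        flat_x[i] = orig + delta
        y_pos = f(x)
        flat_x[i] = orig - delta
        y_neg = f(x)
        flat_x[i] = orig
        flat_g[i] = (y_pos - y_neg) / (2.0 * delta)
    return grad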
......@@ -36,12 +36,15 @@ def lstmp(
w_b=None, # 1 x 4D
w_c=None, # 1 x 3D
is_reverse=False,
proj_clip=0.0,
cell_clip=0.0,
act_gate=None,
act_cell=None,
act_cand=None,
act_proj=None):
def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
act_proj):
def _step(x, w_r, w_rh, w_c, r_pre, c_pre, proj_clip, cell_clip, act_gate,
act_cell, act_cand, act_proj):
g = np.dot(r_pre, w_r) # 1 x 4D
g = g + x
g = np.reshape(g, (1, g.size))
......@@ -55,6 +58,21 @@ def lstmp(
g_f = act_gate(g_f + w_fc * c_pre) # 1 x D
c = g_f * c_pre + g_i * act_cand(c) # 1 x D
def array_clip(a, clip):
size = np.prod(a.shape)
new_a = np.reshape(a, (size))
for i in range(size):
new_a[i] = max(new_a[i], -1.0 * clip)
new_a[i] = min(new_a[i], clip)
new_a = np.reshape(new_a, a.shape)
return new_a
if cell_clip > 0.0:
c = array_clip(c, cell_clip)
if w_c is None:
g_o = act_gate(g_o) # 1 x D
else:
......@@ -64,6 +82,8 @@ def lstmp(
# projection
r = np.dot(h, w_rh)
r = act_proj(r)
if proj_clip > 0.0:
r = array_clip(r, proj_clip)
return r, c
def _reverse(x, offset):
......@@ -87,13 +107,15 @@ def lstmp(
# compute one sequence
seq_len = lod[0][i]
x = input[offset[i]:offset[i + 1], :]
r_pre = np.dot(h0[i], w_rh) # 1 x P
r_pre = act_proj(r_pre)
r_pre = h0[i]
c_pre = c0[i] # 1 x D
for j in range(seq_len):
# compute one step
r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
act_cell, act_cand, act_proj)
r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, proj_clip,
cell_clip, act_gate, act_cell, act_cand,
act_proj)
projection.append(r_pre.flatten())
cell.append(c_pre.flatten())
......@@ -112,24 +134,98 @@ class TestLstmpOp(LstmTest.TestLstmOp):
def reset_argument(self):
pass
def setUp2(self):
self.set_argument()
# projection size
self.P = 2
self.reset_argument()
self.op_type = 'lstmp'
self.act_proj = 'identity'
self.use_peepholes = False
self.has_initial_state = True
self.lod = [[5]]
T = sum(self.lod[0])
N = len(self.lod[0])
proj_clip = 0.5
cell_clip = 0.0
x=np.array([[-0.50806344, 0.50909436], \
[-0.50087136, 0.4904187 ], \
[-0.48933774, 0.50408053], \
[ 0.00896523, 0.00770854], \
[-0.00851139,-0.01005108]])
wx = np.array([[ 0.2932311, -0.8829277, 1.100133, 0.8197811, -0.8194872, -0.829262, 0.7708865, -0.62339246, -0.7656475, 0.4283645, -0.27164033, -0.3600223 ], \
[-0.609142, 0.25025278, 0.15731744, -0.66051376, -0.70994514, 0.8344964, -0.00551117, -0.7072167, -0.63929003, -0.52340907, -0.8842589, 0.9531688 ]])
x = np.dot(x, wx)
w = np.array([[ 0.7808204, -0.7412322, -0.9458036, -0.01664658, 0.7930616, 0.10208707, 0.20036687, -0.16743736, 1.0295134, -0.3118722, 0.02241168, 0.3154219 ], \
[-0.29026014, 0.24638331, -0.5435432, 0.87635124, -0.96091515, -0.1411362, 0.58606523, -0.38996056, -0.9003789, 0.8540163, -0.8831781, -0.28499633]])
w_rh = np.array([[0.15685119, 0.05694652], [-0.9641068, -1.5106804],
[0.3599193, 1.2540514]])
w_b = np.array([[
-0.49999997, 0.5, -0.49999997, -0.5, 0.5, 0.5, 0.49999997,
-0.49999997, 0.49999997, -0.5, 0.49999997, 0.5
]])
h0 = np.array([[-1.3392334e-04, -6.8468950e-04]])
c0 = np.array([[4.5552300e-04, 1.3302206e-03, -3.6721351e-04]])
w_c = None
self.lod = [[5]]
r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
proj_clip, cell_clip, ACTIVATION[self.act_gate],
ACTIVATION[self.act_cell], ACTIVATION[self.act_cand],
ACTIVATION[self.act_proj])
self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
self.inputs['Bias'] = w_b
if self.has_initial_state:
self.inputs['H0'] = h0
self.inputs['C0'] = c0
self.outputs = {
'Projection': (r, self.lod),
'Cell': (c, self.lod),
}
self.attrs = {
'use_peepholes': self.use_peepholes,
'is_reverse': self.is_reverse,
'proj_clip': proj_clip,
'cell_clip': cell_clip,
'gate_activation': self.act_gate,
'cell_activation': self.act_cell,
'candidate_activation': self.act_cand,
'proj_activation': self.act_proj
}
def setUp(self):
self.set_argument()
# projection size
self.P = 10
self.act_proj = self.act_cell
self.reset_argument()
self.op_type = 'lstmp'
T = sum(self.lod[0])
N = len(self.lod[0])
x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
if self.has_initial_state:
h0 = np.random.normal(size=(N, self.D)).astype('float64')
h0 = np.random.normal(size=(N, self.P)).astype('float64')
c0 = np.random.normal(size=(N, self.D)).astype('float64')
else:
h0 = np.zeros((N, self.D)).astype('float64')
h0 = np.zeros((N, self.P)).astype('float64')
c0 = np.zeros((N, self.D)).astype('float64')
w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
if self.use_peepholes:
......@@ -140,9 +236,13 @@ class TestLstmpOp(LstmTest.TestLstmOp):
w_b = b[:, 0:4 * self.D]
w_c = b[:, 4 * self.D:] if self.use_peepholes else None
w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
proj_clip = 0.1
cell_clip = 0.1
r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])
proj_clip, cell_clip, ACTIVATION[self.act_gate],
ACTIVATION[self.act_cell], ACTIVATION[self.act_cand],
ACTIVATION[self.act_proj])
self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
......@@ -159,6 +259,8 @@ class TestLstmpOp(LstmTest.TestLstmOp):
self.attrs = {
'use_peepholes': self.use_peepholes,
'is_reverse': self.is_reverse,
'proj_clip': proj_clip,
'cell_clip': cell_clip,
'gate_activation': self.act_gate,
'cell_activation': self.act_cell,
'candidate_activation': self.act_cand,
......@@ -171,14 +273,14 @@ class TestLstmpOp(LstmTest.TestLstmOp):
def test_check_grad(self):
# TODO(qingqing) remove following lines after the check_grad is refined.
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
(N, self.D)).astype('float64')
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
max_relative_error=1e-2)
max_relative_error=1e-2,
numeric_grad_delta=0.0000005)
class TestLstmpOpHasInitial(TestLstmpOp):
......@@ -188,7 +290,6 @@ class TestLstmpOpHasInitial(TestLstmpOp):
def test_check_grad(self):
# TODO(qingqing) remove following lines after the check_grad is refined.
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -196,11 +297,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
['Projection'],
numeric_grad_delta=0.0000005,
max_relative_error=1e-2)
def test_check_grad_ingore_bias(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -208,11 +309,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'ProjWeight', 'Weight'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('Bias'))
def test_check_grad_ingore_weight(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -220,11 +321,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'ProjWeight', 'Bias'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('Weight'))
def test_check_grad_ingore_proj_weight(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -232,11 +333,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'Bias'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('ProjWeight'))
def test_check_grad_ingore_input(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -244,11 +345,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Weight', 'ProjWeight', 'Bias'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('Input'))
def test_check_grad_ingore_h0(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -256,11 +357,11 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('H0'))
def test_check_grad_ingore_c0(self):
N = len(self.lod[0])
self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
self.outputs['BatchCellPreAct'] = np.zeros(
......@@ -268,6 +369,7 @@ class TestLstmpOpHasInitial(TestLstmpOp):
self.check_grad(
['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
max_relative_error=1e-2,
numeric_grad_delta=0.0000005,
no_grad_set=set('C0'))
......