提交 2a8dbd13 编写于 作者: D dangqingqing

LSTM Operator forward implementation.

上级 83627d3e
...@@ -46,9 +46,9 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope frame ...@@ -46,9 +46,9 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope frame
set(EXECUTOR_TEST_OP elementwise_add_op gaussian_random_op feed_op fetch_op set(EXECUTOR_TEST_OP elementwise_add_op gaussian_random_op feed_op fetch_op
mul_op sum_op squared_l2_distance_op fill_constant_op sgd_op mean_op) mul_op sum_op squared_l2_distance_op fill_constant_op sgd_op mean_op)
if(WITH_GPU) if(WITH_GPU)
nv_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP}) # nv_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP})
else() else()
cc_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP}) # cc_test(executor_test SRCS executor_test.cc DEPS executor ${EXECUTOR_TEST_OP})
endif() endif()
cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor) cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
......
...@@ -115,7 +115,8 @@ set(DEPS_OPS ...@@ -115,7 +115,8 @@ set(DEPS_OPS
softmax_with_cross_entropy_op softmax_with_cross_entropy_op
sum_op sum_op
pool_op pool_op
pool_with_index_op) pool_with_index_op
lstm_op)
op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
...@@ -126,6 +127,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) ...@@ -126,6 +127,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(sum_op DEPS net_op) op_library(sum_op DEPS net_op)
op_library(pool_op DEPS pooling) op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lstm_op DEPS sequence2batch)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
......
...@@ -22,12 +22,12 @@ class LSTMOp : public framework::OperatorWithKernel { ...@@ -22,12 +22,12 @@ class LSTMOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected: protected:
void InferShape(framework::InferShapeContextBase* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"), PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of LSTM should not be null."); "Input(Input) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"), PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of LSTM should not be null."); "Output(Hidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("H"), PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null."); "Output(Cell) of LSTM should not be null.");
auto x_dims = ctx->GetInputDim("Input"); auto x_dims = ctx->GetInputDim("Input");
...@@ -60,7 +60,7 @@ class LSTMOp : public framework::OperatorWithKernel { ...@@ -60,7 +60,7 @@ class LSTMOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1, PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1."); "The first dimension of Input(Bias) should be 1.");
if (ctx->Attrs().Get<bool>("use_peepholes")) { if (ctx->Attrs().Get<bool>("usePeepholes")) {
PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
"The second dimension of Input(Bias) should be " "The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection", "7 * %d if enable peepholes connection",
...@@ -73,7 +73,7 @@ class LSTMOp : public framework::OperatorWithKernel { ...@@ -73,7 +73,7 @@ class LSTMOp : public framework::OperatorWithKernel {
} }
ctx->SetOutputDim("Hidden", x_dims); ctx->SetOutputDim("Hidden", x_dims);
ctx->SetOutputDim("Cell", x_dims); ctx->SetOutputDim("Cell", x_dims);
ctx->SetOutputDim("Hidden", x_dims); ctx->SetOutputDim("Batch", x_dims);
ctx->ShareLoD("Input", "Hidden"); ctx->ShareLoD("Input", "Hidden");
ctx->ShareLoD("Input", "Cell"); ctx->ShareLoD("Input", "Cell");
} }
...@@ -86,7 +86,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -86,7 +86,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Input", AddInput("Input",
"(LoDTensor) the first input is a LodTensor, which support " "(LoDTensor) the first input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in " "variable-time length input sequence. The underlying tensor in "
"this LoDTenosr is a matrix with shape (T X D), where, T is the " "this LoDTenosr is a matrix with shape (T X 4D), where, T is the "
"total time steps in this mini-batch, D is the hidden size."); "total time steps in this mini-batch, D is the hidden size.");
AddInput("H0", AddInput("H0",
"(Tensor, optional) the initial hidden state is an optional " "(Tensor, optional) the initial hidden state is an optional "
...@@ -103,14 +103,21 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -103,14 +103,21 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Bias", AddInput("Bias",
"(Tensor) the learnable weights, which contains two parts: " "(Tensor) the learnable weights, which contains two parts: "
"input-hidden bias weight and peephole connections weight if " "input-hidden bias weight and peephole connections weight if "
"seting `use_peepholes` True. " "seting `usePeepholes` True. "
"1. `use_peepholes = False` " "1. `usePeepholes = False` "
" - The shape is (1 x 4*D). " " - The shape is (1 x 4*D). "
" - Bias = {b_i, b_f, b_c, b_o}." " - Bias = {b_i, b_f, b_c, b_o}."
"2. `use_peepholes = True` " "2. `usePeepholes = True` "
" - The shape is (1 x 7*D). " " - The shape is (1 x 7*D). "
" - Bias = {b_i, b_f, b_c, b_o, W_ic, W_fc, W_oc}."); " - Bias = {b_i, b_f, b_c, b_o, W_ic, W_fc, W_oc}.");
AddOutput("Batch", "(LoDTensor) save the reorganized input as batch info. ") AddOutput("BatchGate",
"(LoDTensor) This LoDTensor contains input gate, forget gate "
"and output gate aftern the nonlinear computation. This "
"LoDTensor has the same shape with the reorganized input, which "
"was also be called batch input. The LoD size is 2. The first "
"LoD is the batch offsets and the second LoD contains the "
"indexes, which denote the position of reorganized sequence "
"in the raw input.")
.AsIntermediate(); .AsIntermediate();
AddOutput("Hidden", AddOutput("Hidden",
"(LoDTensor) the hidden state lod tensor of LSTM operator. " "(LoDTensor) the hidden state lod tensor of LSTM operator. "
...@@ -118,25 +125,25 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -118,25 +125,25 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Cell", AddOutput("Cell",
"(LoDTensor) the cell state lod tensor of LSTM operator. " "(LoDTensor) the cell state lod tensor of LSTM operator. "
"The shape and lod is the same with the `Input`."); "The shape and lod is the same with the `Input`.");
AddAttr<bool>("use_peepholes", AddAttr<bool>("usePeepholes",
"(bool, defalut: True) " "(bool, defalut: True) "
"whether to enable diagonal/peephole connections.") "whether to enable diagonal/peephole connections.")
.SetDefault(true); .SetDefault(true);
AddAttr<bool>("is_reverse", AddAttr<bool>("isReverse",
"(bool, defalut: False) " "(bool, defalut: False) "
"whether to compute reversed LSTM.") "whether to compute reversed LSTM.")
.SetDefault(true); .SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"gate_activation", "gateActivation",
"(string, defalut: sigmoid)" "(string, defalut: sigmoid)"
"The activation for input gate, forget gate and output " "The activation for input gate, forget gate and output "
"gate, `sigmoid` by defalut.") "gate, `sigmoid` by defalut.")
.SetDefault("sigmoid"); .SetDefault("sigmoid");
AddAttr<std::string>("cell_activation", AddAttr<std::string>("cellActivation",
"(string, defalut: tanh)" "(string, defalut: tanh)"
"The activation for cell output, `tanh` by defalut.") "The activation for cell output, `tanh` by defalut.")
.SetDefault("tanh"); .SetDefault("tanh");
AddAttr<std::string>("candidate_activation", AddAttr<std::string>("candidateActivation",
"(string, defalut: tanh)" "(string, defalut: tanh)"
"The activation for candidate hidden state, " "The activation for candidate hidden state, "
"`tanh` by defalut.") "`tanh` by defalut.")
...@@ -173,7 +180,7 @@ are the cell input and cell output activation functions, `tanh` is usually ...@@ -173,7 +180,7 @@ are the cell input and cell output activation functions, `tanh` is usually
used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state, used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state. which is computed based on the current input and the previous hidden state.
Set `use_peepholes` False to disable peephole connection [2]. The formula Set `usePeepholes` False to disable peephole connection [2]. The formula
is omitted here. is omitted here.
@note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$ @note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
...@@ -196,7 +203,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { ...@@ -196,7 +203,7 @@ class LSTMGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected: protected:
void InferShape(framework::InferShapeContextBase* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
"Input(Hidden@GRAD) should not be null"); "Input(Hidden@GRAD) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cell")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cell")),
......
...@@ -14,30 +14,120 @@ limitations under the License. */ ...@@ -14,30 +14,120 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using framework::LoDTensor; using framework::LoDTensor;
using framework::Tensor; using framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T> template <typename Place, typename T>
class LSTMKernel : public framework::OpKernel<T> { class LSTMKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* input_t = ctx.Input<framework::LoDTensor>("Input"); auto* input = ctx.Input<framework::LoDTensor>("Input");
auto* batch_t = ctx.Input<framework::LoDTensor>("Batch"); auto* weight = ctx.Input<framework::Tensor>("Weight");
auto* bias_t = ctx.Input<framework::LoDTensor>("Bias"); auto* bias = ctx.Input<framework::Tensor>("Bias");
bool is_reverse = ctx.Attr<bool>("is_reverse");
LoDTensor2BatchFunctor<Place, T> to_batch(ctx.device_context(), input_t, auto* batch_gate = ctx.Output<framework::LoDTensor>("BatchGate");
batch_t, is_reverse); batch_gate->mutable_data<T>(ctx.GetPlace());
auto* hidden_out = ctx.Output<framework::LoDTensor>("Hidden");
auto in_dims = input_t->dims(); hidden_out->mutable_data<T>(ctx.GetPlace());
auto* cell_out = ctx.Output<framework::LoDTensor>("Cell");
cell_out->mutable_data<T>(ctx.GetPlace());
// Now the function ShareLoD in InferShape is not implemented.
// So copy LoD here.
ctx.ShareLoD("Input", "Hidden");
ctx.ShareLoD("Input", "Cell");
bool is_reverse = ctx.Attr<bool>("isReverse");
math::LoDTensor2BatchFunctor<Place, T> to_batch;
to_batch(ctx.device_context(), *input, *batch_gate, is_reverse);
auto in_dims = input->dims();
int frame_size = in_dims[1]; int frame_size = in_dims[1];
if (bias_t) { if (bias) {
Eigen::array<int, 2> extents({{1, 4 * frame_size}});
Eigen::array<int, 2> offsets({{0, 0}});
auto b = EigenMatrix<T>::From(*bias); auto b = EigenMatrix<T>::From(*bias);
auto gate = EigenMatrix<T>::From(*batch_gate);
gate.device(ctx.GetEigenDevice<Place>()) =
gate +
b.slice(offsets, extents)
.reshape(Eigen::array<int, 2>({{1, frame_size * 4}}))
.broadcast(
Eigen::array<int, 2>({{static_cast<int>(in_dims[0]), 1}}));
}
math::LstmMetaValue<T> lstm_value;
T* bias_data = const_cast<T*>(bias->data<T>());
// the code styple in LstmMetaValue will be updated later.
lstm_value.checkIg = bias_data + 4 * frame_size;
lstm_value.checkFg = lstm_value.checkIg + frame_size;
lstm_value.checkOg = lstm_value.checkFg + frame_size;
lstm_value.prevStateValue = nullptr;
framework::LoDTensor batch_out;
batch_out.mutable_data<T>(in_dims, ctx.GetPlace());
framework::LoDTensor batch_cell;
batch_cell.mutable_data<T>(in_dims, ctx.GetPlace());
framework::LoDTensor batch_cell_pre_act;
batch_cell_pre_act.mutable_data<T>(in_dims, ctx.GetPlace());
auto batch_lod = batch_gate->lod()[0];
int num_batch = batch_lod.size() - 1;
auto gate_act = ctx.Attr<std::string>("gateActivation");
auto cell_act = ctx.Attr<std::string>("cellActivation");
auto cand_act = ctx.Attr<std::string>("candidateActivation");
for (int n = 0; n < num_batch; n++) {
int bstart = batch_lod[n];
int bend = batch_lod[n + 1];
Tensor gate_t = batch_gate->Slice<T>(bstart, bend);
Tensor out_t = batch_out.Slice<T>(bstart, bend);
Tensor cell_t = batch_cell.Slice<T>(bstart, bend);
Tensor cell_pre_act_t = batch_cell_pre_act.Slice<T>(bstart, bend);
int cur_batch_size = bend - bstart;
if (n != 0) {
int pre_end = batch_lod[n - 1];
auto pre_hidden_t = batch_out.Slice<T>(pre_end, bstart);
math::matmul<Place, T>(ctx.device_context(), pre_hidden_t, false,
*weight, false, static_cast<T>(1.0), &gate_t,
static_cast<T>(0.0));
}
// else if : how to pass the state from
// last mini-batch will be supported later
lstm_value.gateValue = gate_t.data<T>();
lstm_value.outputValue = out_t.data<T>();
lstm_value.stateValue = cell_t.data<T>();
lstm_value.stateActiveValue = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<Place, T>::compute(ctx.device_context(), lstm_value,
frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
lstm_value.prevStateValue = lstm_value.stateValue;
} }
math::Batch2LoDTensorFunctor<Place, T> to_seq;
batch_out.set_lod(batch_gate->lod());
// restore the output hidden in LoDTensor from the batch hidden
to_seq(ctx.device_context(), batch_out, *hidden_out);
batch_out.set_lod(batch_gate->lod());
// restore the output cell state in LoDTensor from the batch cell
to_seq(ctx.device_context(), batch_cell, *cell_out);
} }
}; };
......
...@@ -5,13 +5,16 @@ if(WITH_GPU) ...@@ -5,13 +5,16 @@ if(WITH_GPU)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context)
else() else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(softmax SRCS softmax.cc DEPS operator)
cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context)
endif() endif()
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
......
...@@ -16,15 +16,30 @@ limitations under the License. */ ...@@ -16,15 +16,30 @@ limitations under the License. */
#define HL_ACTIVATION_FUNCTIONS_H_ #define HL_ACTIVATION_FUNCTIONS_H_
#include "hl_functions.h" #include "hl_functions.h"
#include "paddle/operators/math/lstm_compute.h"
/** /**
* Active functions: sigmoid, relu, tanh and linear. * Active functions: sigmoid, relu, tanh and linear.
*/ */
#define HPPL_ACTIVE_FUNCTION \ #define FLOAT_ACTIVE_FUNCTION \
{ \
hppl::typef::sigmoid, hppl::typef::relu, hppl::typef::tanh, \
hppl::typef::linear \
}
#define DOUBLE_ACTIVE_FUNCTION \
{ \
hppl::typed::sigmoid, hppl::typed::relu, hppl::typed::tanh, \
hppl::typed::linear \
}
#define AVX_ACTIVE_FUNCTION \
{ hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear } { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
namespace hppl { namespace hppl {
using activation_mode_t = paddle::operators::math::activation_mode_t;
/** /**
* Hppl supports sigmoid, relu, tanh, linear active functions * Hppl supports sigmoid, relu, tanh, linear active functions
* for neural networks' forward and backward activation. * for neural networks' forward and backward activation.
...@@ -36,25 +51,134 @@ class Active { ...@@ -36,25 +51,134 @@ class Active {
typedef T (*backward)(T, T); typedef T (*backward)(T, T);
}; };
template <typename T>
struct ForwardActType;
template <>
struct ForwardActType<float> {
using type = Active<float>::forward;
};
template <>
struct ForwardActType<double> {
using type = Active<double>::forward;
};
template <typename T>
struct BackwardActType;
template <>
struct BackwardActType<float> {
using type = Active<float>::backward;
};
template <>
struct BackwardActType<double> {
using type = Active<double>::backward;
};
#ifdef __NVCC__ #ifdef __NVCC__
namespace gpu { namespace gpu {
static __device__ Active<float>::forward forward[] = HPPL_ACTIVE_FUNCTION; static __device__ Active<float>::forward forward[] = FLOAT_ACTIVE_FUNCTION;
static __device__ Active<float>::backward backward[] = HPPL_ACTIVE_FUNCTION; static __device__ Active<float>::backward backward[] = FLOAT_ACTIVE_FUNCTION;
static __device__ Active<double>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static __device__ Active<double>::backward backward[] = HPPL_ACTIVE_FUNCTION; static __device__ Active<double>::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION;
static __device__ Active<double>::backward backward_d[] =
DOUBLE_ACTIVE_FUNCTION;
template <typename T>
struct ForwardAct {
__device__ typename ForwardActType<T>::type operator()(
activation_mode_t type);
};
template <>
struct ForwardAct<float> {
__device__ ForwardActType<float>::type operator()(activation_mode_t type) {
return forward[type];
}
};
template <>
struct ForwardAct<double> {
__device__ ForwardActType<double>::type operator()(activation_mode_t type) {
return forward_d[type];
}
};
template <typename T>
struct BackwardAct {
__device__ typename BackwardActType<T>::type operator()(
activation_mode_t type);
};
template <>
struct BackwardAct<float> {
__device__ BackwardActType<float>::type operator()(activation_mode_t type) {
return backward[type];
}
};
template <>
struct BackwardAct<double> {
__device__ BackwardActType<double>::type operator()(activation_mode_t type) {
return backward_d[type];
}
};
} // namespace gpu } // namespace gpu
#else #else
namespace cpu { namespace cpu {
static Active<float>::forward forward[] = HPPL_ACTIVE_FUNCTION; static Active<float>::forward forward[] = FLOAT_ACTIVE_FUNCTION;
static Active<float>::backward backward[] = HPPL_ACTIVE_FUNCTION; static Active<float>::backward backward[] = FLOAT_ACTIVE_FUNCTION;
static Active<double>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active<double>::backward backward[] = HPPL_ACTIVE_FUNCTION; static Active<double>::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION;
static Active<double>::backward backward_d[] = DOUBLE_ACTIVE_FUNCTION;
template <typename T>
struct ForwardAct {
typename ForwardActType<T>::type operator()(activation_mode_t type);
};
template <>
struct ForwardAct<float> {
ForwardActType<float>::type operator()(activation_mode_t type) {
return forward[type];
}
};
template <>
struct ForwardAct<double> {
ForwardActType<double>::type operator()(activation_mode_t type) {
return forward_d[type];
}
};
template <typename T>
struct BackwardAct {
typename BackwardActType<T>::type operator()(activation_mode_t type);
};
template <>
struct BackwardAct<float> {
BackwardActType<float>::type operator()(activation_mode_t type) {
return backward[type];
}
};
template <>
struct BackwardAct<double> {
BackwardActType<double>::type operator()(activation_mode_t type) {
return backward_d[type];
}
};
} // namespace cpu } // namespace cpu
#ifdef __AVX__ #ifdef __AVX__
namespace avx { namespace avx {
static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION; static Active<__m256>::forward forward[] = AVX_ACTIVE_FUNCTION;
static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION; static Active<__m256>::backward backward[] = AVX_ACTIVE_FUNCTION;
} // namespace avx } // namespace avx
#endif #endif
#endif #endif
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <math.h>
#include "/paddle/operators/math/detail/hl_functions.h"
namespace hppl {
real relu(const real a) { return a > 0.0f ? a : 0.0f; }
real sigmoid(const real a) {
const real min = SIGMOID_THRESHOLD_MIN;
const real max = SIGMOID_THRESHOLD_MAX;
real tmp = (a < min) ? min : ((a > max) ? max : a);
return 1.0 / (1.0 + exp(-tmp));
}
real tanh(const real a) {
real tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
real linear(const real a) { return a; }
real relu(const real a, const real b) { return a * (b > 0.0f ? 1.0f : 0.0f); }
real sigmoid(const real a, const real b) { return a * b * (1 - b); }
real tanh(const real a, const real b) { return a * (1.0f - b * b); }
real linear(const real a, const real b) { return a; }
} // namespace hppl
...@@ -25,31 +25,94 @@ limitations under the License. */ ...@@ -25,31 +25,94 @@ limitations under the License. */
*/ */
#define SIGMOID_THRESHOLD_MAX 13.0 #define SIGMOID_THRESHOLD_MAX 13.0
/**
* The maximum input value for exp, used to avoid overflow problem.
* currently only used for tanh function.
*/
#define EXP_MAX_INPUT 40.0
#ifndef __NVCC__ #ifndef __NVCC__
namespace hppl { namespace hppl {
namespace typef {
/*
* forward activation
*/
float relu(const float a) {
return a > static_cast<float>(0.0) ? a : static_cast<float>(0.0);
}
float sigmoid(const float a) {
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
float tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<float>(1.0) / (static_cast<float>(1.0) + exp(-tmp));
}
float tanh(const float a) {
float tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
float linear(const float a) { return a; }
/*
* backward activation
*/
float relu(const float a, const float b) { return a * (b > 0.0 ? 1.0 : 0.0); }
float sigmoid(const float a, const float b) {
return a * b * (static_cast<float>(1) - b);
}
float tanh(const float a, const float b) {
return a * (static_cast<float>(1) - b * b);
}
float linear(const float a, const float b) { return a; }
} // namespace typef
namespace typed {
/* /*
* forward activation * forward activation
*/ */
template <typename T> double relu(const double a) {
T relu(const T a); return a > static_cast<double>(0.0) ? a : static_cast<double>(0.0);
template <typename T> }
T sigmoid(const T a);
template <typename T> double sigmoid(const double a) {
T tanh(const T a); const double min = SIGMOID_THRESHOLD_MIN;
template <typename T> const double max = SIGMOID_THRESHOLD_MAX;
T linear(const T a); double tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<double>(1.0) / (static_cast<double>(1.0) + exp(-tmp));
}
double tanh(const double a) {
double tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
double linear(const double a) { return a; }
/* /*
* backward activation * backward activation
*/ */
template <typename T> double relu(const double a, const double b) {
T relu(const T a, const T b); return a * (b > 0.0 ? 1.0 : 0.0);
template <typename T> }
T sigmoid(const T a, const T b);
template <typename T> double sigmoid(const double a, const double b) {
T tanh(const T a, const T b); return a * b * (static_cast<double>(1) - b);
template <typename T> }
T linear(const T a, const T b);
double tanh(const double a, const double b) {
return a * (static_cast<double>(1) - b * b);
}
double linear(const double a, const double b) { return a; }
} // namespace typed
} // namespace hppl } // namespace hppl
#ifdef __AVX__ #ifdef __AVX__
......
...@@ -18,13 +18,10 @@ limitations under the License. */ ...@@ -18,13 +18,10 @@ limitations under the License. */
#include "hl_base.h" #include "hl_base.h"
namespace hppl { namespace hppl {
namespace typef {
template <typename T> __device__ static float relu(const float a) { return a > 0.0f ? a : 0.0f; }
__device__ static T relu(const T a) {
return a > 0.0f ? a : 0.0f;
}
template <>
__device__ static float sigmoid(const float a) { __device__ static float sigmoid(const float a) {
const float min = SIGMOID_THRESHOLD_MIN; const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX; const float max = SIGMOID_THRESHOLD_MAX;
...@@ -32,7 +29,32 @@ __device__ static float sigmoid(const float a) { ...@@ -32,7 +29,32 @@ __device__ static float sigmoid(const float a) {
return __fdividef(1.0f, 1.0f + __expf(-tmp)); return __fdividef(1.0f, 1.0f + __expf(-tmp));
} }
template <> __device__ static float tanh(const float a) {
return __fdividef(2.0f, (1.0f + __expf(-2.0f * a))) - 1.0f;
}
__device__ static float linear(const float a) { return a; }
__device__ static float relu(const float a, const float b) {
return a * (b > 0.0f ? 1.0f : 0.0f);
}
__device__ static float sigmoid(const float a, const float b) {
return a * b * (1.0f - b);
}
__device__ static float tanh(const float a, const float b) {
return a * (1.0f - b * b);
}
__device__ static float linear(const float a, const float b) { return a; }
} // namespace typef
namespace typed {
__device__ static double relu(const double a) { return a > 0.0 ? a : 0.0; }
__device__ static double sigmoid(const double a) { __device__ static double sigmoid(const double a) {
const double min = SIGMOID_THRESHOLD_MIN; const double min = SIGMOID_THRESHOLD_MIN;
const double max = SIGMOID_THRESHOLD_MAX; const double max = SIGMOID_THRESHOLD_MAX;
...@@ -40,40 +62,27 @@ __device__ static double sigmoid(const double a) { ...@@ -40,40 +62,27 @@ __device__ static double sigmoid(const double a) {
return 1.0 / (1.0 + exp(-tmp)); return 1.0 / (1.0 + exp(-tmp));
} }
template <>
__device__ static float tanh(const float a) {
return __fdividef(2.0f, (1.0f + __expf(-2.0f * a))) - 1.0f;
}
template <>
__device__ static double tanh(const double a) { __device__ static double tanh(const double a) {
return (2.0 / (1.0 + exp(-2.0 * a))) - 1.0; return (2.0 / (1.0 + exp(-2.0 * a))) - 1.0;
} }
template <typename T> __device__ static double linear(const double a) { return a; }
__device__ static T linear(const T a) {
return a;
}
template <typename T> __device__ static double relu(const double a, const double b) {
__device__ static T relu(const T a, const T b) { return a * (b > 0.0 ? 1.0 : 0.0);
return a * (b > 0.0f ? 1.0f : 0.0f);
} }
template <typename T> __device__ static double sigmoid(const double a, const double b) {
__device__ static T sigmoid(const T a, const T b) {
return a * b * (1 - b); return a * b * (1 - b);
} }
template <typename T> __device__ static double tanh(const double a, const double b) {
__device__ static T tanh(const T a, const T b) { return a * (1.0 - b * b);
return a * (1.0f - b * b);
} }
template <typename T> __device__ static double linear(const double a, const double b) { return a; }
__device__ static T linear(const T a, const T b) {
return a; } // namespace typef
}
} // namespace hppl } // namespace hppl
......
...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <type_traits>
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/operators/math/lstm_compute.h" #include "paddle/operators/math/lstm_compute.h"
namespace paddle { namespace paddle {
...@@ -23,7 +25,8 @@ namespace detail { ...@@ -23,7 +25,8 @@ namespace detail {
#ifndef __NVCC__ #ifndef __NVCC__
template <class T, class Op> template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize, void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frameSize,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
...@@ -57,9 +60,10 @@ void naive_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize, ...@@ -57,9 +60,10 @@ void naive_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize,
rPrevState = value.prevStateValue[i]; rPrevState = value.prevStateValue[i];
} }
hppl::cpu::ForwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, hppl::cpu::forward[active_node], rOut, rCheckI, rCheckF, rCheckO, act(active_node), act(active_gate),
hppl::cpu::forward[active_gate], hppl::cpu::forward[active_state]); act(active_state));
valueIn[i] = rValueIn; valueIn[i] = rValueIn;
valueIg[i] = rValueIg; valueIg[i] = rValueIg;
...@@ -72,8 +76,8 @@ void naive_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize, ...@@ -72,8 +76,8 @@ void naive_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize,
} }
template <class T, class Op> template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad, void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
int frameSize, LstmMetaGrad<T> grad, int frameSize,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
...@@ -123,11 +127,11 @@ void naive_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad, ...@@ -123,11 +127,11 @@ void naive_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad,
rPrevState = value.prevStateValue[i]; rPrevState = value.prevStateValue[i];
} }
hppl::cpu::BackwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
rCheckOGrad, hppl::cpu::backward[active_node], rCheckOGrad, act(active_node), act(active_gate), act(active_state));
hppl::cpu::backward[active_gate], hppl::cpu::backward[active_state]);
gradIn[i] = rGradIn; gradIn[i] = rGradIn;
gradIg[i] = rGradIg; gradIg[i] = rGradIg;
...@@ -144,8 +148,8 @@ void naive_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad, ...@@ -144,8 +148,8 @@ void naive_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad,
} }
} }
template <class Op> template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize, void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, int frameSize,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
...@@ -195,9 +199,9 @@ void avx_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize, ...@@ -195,9 +199,9 @@ void avx_lstm_forward_one_sequence(Op op, lstm_value value, int frameSize,
#endif #endif
} }
template <class Op> template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad, void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
int frameSize, LstmMetaGrad<T> grad, int frameSize,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
...@@ -271,13 +275,13 @@ void avx_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad, ...@@ -271,13 +275,13 @@ void avx_lstm_backward_one_sequence(Op op, lstm_value value, lstm_grad grad,
} }
template <class T, class Op> template <class T, class Op>
void cpu_lstm_forward(Op op, lstm_value value, int frameSize, void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frameSize,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence(op, value, frameSize, active_node, avx_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
active_gate, active_state); active_gate, active_state);
} else { } else {
naive_lstm_forward_one_sequence<T>(op, value, frameSize, active_node, naive_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
active_gate, active_state); active_gate, active_state);
...@@ -285,13 +289,13 @@ void cpu_lstm_forward(Op op, lstm_value value, int frameSize, ...@@ -285,13 +289,13 @@ void cpu_lstm_forward(Op op, lstm_value value, int frameSize,
} }
template <class T, class Op> template <class T, class Op>
void cpu_lstm_backward(Op op, lstm_value value, lstm_grad grad, int frameSize, void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
activation_mode_t active_node, int frameSize, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence(op, value, grad, frameSize, active_node, avx_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
active_gate, active_state); active_gate, active_state);
} else { } else {
naive_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node, naive_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
active_gate, active_state); active_gate, active_state);
......
...@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "paddle/operators/math/detail/lstm_kernel.h" #include <type_traits>
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/operators/math/lstm_compute.h" #include "paddle/operators/math/lstm_compute.h"
#include "paddle/platform/cuda_helper.h" #include "paddle/platform/cuda_helper.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -27,10 +29,11 @@ namespace detail { ...@@ -27,10 +29,11 @@ namespace detail {
* grid(frameBlocks, batchBlocks) * grid(frameBlocks, batchBlocks)
*/ */
template <class T, class Op, bool isBatch> template <class T, class Op, bool isBatch>
__global__ void KeLstmForward(Op op, lstm_value value, int frameSize, __global__ void KeLstmForward(
int batchSize, activation_mode_t active_node, Op op, LstmMetaValue<T> value, int frameSize, int batchSize,
activation_mode_t active_gate, typename hppl::ForwardActType<T>::type active_node,
activation_mode_t active_state) { typename hppl::ForwardActType<T>::type active_gate,
typename hppl::ForwardActType<T>::type active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frameIdx >= frameSize) return;
...@@ -67,8 +70,7 @@ __global__ void KeLstmForward(Op op, lstm_value value, int frameSize, ...@@ -67,8 +70,7 @@ __global__ void KeLstmForward(Op op, lstm_value value, int frameSize,
} }
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, hppl::gpu::forward[active_node], rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
hppl::gpu::forward[active_gate], hppl::gpu::forward[active_state]);
value.gateValue[frameIdx] = rValueIn; value.gateValue[frameIdx] = rValueIn;
value.gateValue[frameIdx + frameSize] = rValueIg; value.gateValue[frameIdx + frameSize] = rValueIg;
...@@ -85,11 +87,11 @@ __global__ void KeLstmForward(Op op, lstm_value value, int frameSize, ...@@ -85,11 +87,11 @@ __global__ void KeLstmForward(Op op, lstm_value value, int frameSize,
* grid(frameBlocks, batchBlocks) * grid(frameBlocks, batchBlocks)
*/ */
template <class T, class Op, bool isBatch> template <class T, class Op, bool isBatch>
__global__ void KeLstmBackward(Op op, lstm_value value, lstm_grad grad, __global__ void KeLstmBackward(
int frameSize, int batchSize, Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad, int frameSize,
activation_mode_t active_node, int batchSize, typename hppl::BackwardActType<T>::type active_node,
activation_mode_t active_gate, typename hppl::BackwardActType<T>::type active_gate,
activation_mode_t active_state) { typename hppl::BackwardActType<T>::type active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frameIdx >= frameSize) return;
...@@ -143,8 +145,7 @@ __global__ void KeLstmBackward(Op op, lstm_value value, lstm_grad grad, ...@@ -143,8 +145,7 @@ __global__ void KeLstmBackward(Op op, lstm_value value, lstm_grad grad,
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg, op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg,
rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad,
rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad,
hppl::gpu::backward[active_node], hppl::gpu::backward[active_gate], active_node, active_gate, active_state);
hppl::gpu::backward[active_state]);
grad.gateGrad[frameIdx] = rGradIn; grad.gateGrad[frameIdx] = rGradIn;
grad.gateGrad[frameIdx + frameSize] = rGradIg; grad.gateGrad[frameIdx + frameSize] = rGradIg;
...@@ -177,7 +178,8 @@ __global__ void KeLstmBackward(Op op, lstm_value value, lstm_grad grad, ...@@ -177,7 +178,8 @@ __global__ void KeLstmBackward(Op op, lstm_value value, lstm_grad grad,
} }
template <class T, class Op> template <class T, class Op>
void gpu_lstm_forward(Op op, lstm_value value, int frameSize, int batchSize, void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, int frameSize, int batchSize,
activation_mode_t active_node, activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
...@@ -194,22 +196,30 @@ void gpu_lstm_forward(Op op, lstm_value value, int frameSize, int batchSize, ...@@ -194,22 +196,30 @@ void gpu_lstm_forward(Op op, lstm_value value, int frameSize, int batchSize,
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
} }
using type = typename hppl::ForwardActType<T>::type;
hppl::gpu::ForwardAct<T> act;
type act_node = act(active_node);
type act_gate = act(active_gate);
type act_state = act(active_state);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
if (batchSize == 1) { if (batchSize == 1) {
KeLstmForward<T, Op, KeLstmForward<T, Op,
/* isBatch= */ false><<<grid, threads, 0, STREAM_DEFAULT>>>( /* isBatch= */ false><<<grid, threads, 0, stream>>>(
op, value, frameSize, batchSize, active_node, active_gate, op, value, frameSize, batchSize, act_node, act_gate, act_state);
active_state);
} else { } else {
KeLstmForward<T, Op, KeLstmForward<T, Op,
/* isBatch= */ true><<<grid, threads, 0, STREAM_DEFAULT>>>( /* isBatch= */ true><<<grid, threads, 0, stream>>>(
op, value, frameSize, batchSize, active_node, active_gate, op, value, frameSize, batchSize, act_node, act_gate, act_state);
active_state);
} }
} }
template <class T, class Op> template <class T, class Op>
void gpu_lstm_backward(Op op, lstm_value value, lstm_grad grad, int frameSize, void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
int batchSize, activation_mode_t active_node, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate, activation_mode_t active_gate,
activation_mode_t active_state) { activation_mode_t active_state) {
dim3 threads; dim3 threads;
...@@ -225,16 +235,22 @@ void gpu_lstm_backward(Op op, lstm_value value, lstm_grad grad, int frameSize, ...@@ -225,16 +235,22 @@ void gpu_lstm_backward(Op op, lstm_value value, lstm_grad grad, int frameSize,
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
} }
using type = typename hppl::BackwardActType<T>::type;
hppl::gpu::BackwardAct<T> act;
type act_node = act(active_node);
type act_gate = act(active_gate);
type act_state = act(active_state);
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
if (batchSize == 1) { if (batchSize == 1) {
KeLstmBackward<T, Op, KeLstmBackward<T, Op,
/* isBatch= */ false><<<grid, threads, 0, STREAM_DEFAULT>>>( /* isBatch= */ false><<<grid, threads, 0, stream>>>(
op, value, grad, frameSize, batchSize, active_node, active_gate, op, value, grad, frameSize, batchSize, act_node, act_gate, act_state);
active_state);
} else { } else {
KeLstmBackward<T, Op, KeLstmBackward<T, Op,
/* isBatch= */ true><<<grid, threads, 0, STREAM_DEFAULT>>>( /* isBatch= */ true><<<grid, threads, 0, stream>>>(
op, value, grad, frameSize, batchSize, active_node, active_gate, op, value, grad, frameSize, batchSize, act_node, act_gate, act_state);
active_state);
} }
} }
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "hl_activation_functions.h" #include "paddle/operators/math/detail/hl_activation_functions.h"
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
#define INLINE __device__ inline #define INLINE __device__ inline
...@@ -33,9 +33,9 @@ class lstm { ...@@ -33,9 +33,9 @@ class lstm {
INLINE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg, INLINE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
T &prevState, T &state, T &stateAtv, T &output, T &prevState, T &state, T &stateAtv, T &output,
T &checkI, T &checkF, T &checkO, T &checkI, T &checkF, T &checkO,
Active<T>::forward actInput, typename hppl::ForwardActType<T>::type actInput,
Active<T>::forward actGate, typename hppl::ForwardActType<T>::type actGate,
Active<T>::forward actState) { typename hppl::ForwardActType<T>::type actState) {
valueIn = actInput(valueIn); valueIn = actInput(valueIn);
valueIg = actGate(valueIg + prevState * checkI); valueIg = actGate(valueIg + prevState * checkI);
valueFg = actGate(valueFg + prevState * checkF); valueFg = actGate(valueFg + prevState * checkF);
...@@ -53,9 +53,9 @@ class lstm { ...@@ -53,9 +53,9 @@ class lstm {
__m256 &valueOg, __m256 &prevState, __m256 &state, __m256 &valueOg, __m256 &prevState, __m256 &state,
__m256 &stateAtv, __m256 &output, __m256 &checkI, __m256 &stateAtv, __m256 &output, __m256 &checkI,
__m256 &checkF, __m256 &checkO, __m256 &checkF, __m256 &checkO,
Active<__m256>::forward actInput, hppl::Active<__m256>::forward actInput,
Active<__m256>::forward actGate, hppl::Active<__m256>::forward actGate,
Active<__m256>::forward actState) { hppl::Active<__m256>::forward actState) {
valueIn = actInput(valueIn); valueIn = actInput(valueIn);
valueIg = actGate(_mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI))); valueIg = actGate(_mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)));
valueFg = actGate(_mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF))); valueFg = actGate(_mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)));
...@@ -81,9 +81,9 @@ class lstm { ...@@ -81,9 +81,9 @@ class lstm {
T &prevState, T &prevStateGrad, T &state, T &stateGrad, T &prevState, T &prevStateGrad, T &state, T &stateGrad,
T &stateAtv, T &outputGrad, T &checkI, T &checkF, T &stateAtv, T &outputGrad, T &checkI, T &checkF,
T &checkO, T &checkIGrad, T &checkFGrad, T &checkOGrad, T &checkO, T &checkIGrad, T &checkFGrad, T &checkOGrad,
Active<T>::backward actInput, typename hppl::BackwardActType<T>::type actInput,
Active<T>::backward actGate, typename hppl::BackwardActType<T>::type actGate,
Active<T>::backward actState) { typename hppl::BackwardActType<T>::type actState) {
gradOg = actGate(outputGrad * stateAtv, valueOg); gradOg = actGate(outputGrad * stateAtv, valueOg);
stateGrad += actState(outputGrad * valueOg, stateAtv) + gradOg * checkO; stateGrad += actState(outputGrad * valueOg, stateAtv) + gradOg * checkO;
gradIn = actInput(stateGrad * valueIg, valueIn); gradIn = actInput(stateGrad * valueIg, valueIn);
...@@ -106,9 +106,10 @@ class lstm { ...@@ -106,9 +106,10 @@ class lstm {
__m256 &stateGrad, __m256 &stateAtv, __m256 &stateGrad, __m256 &stateAtv,
__m256 &outputGrad, __m256 &checkI, __m256 &checkF, __m256 &outputGrad, __m256 &checkI, __m256 &checkF,
__m256 &checkO, __m256 &checkIGrad, __m256 &checkFGrad, __m256 &checkO, __m256 &checkIGrad, __m256 &checkFGrad,
__m256 &checkOGrad, Active<__m256>::backward actInput, __m256 &checkOGrad,
Active<__m256>::backward actGate, hppl::Active<__m256>::backward actInput,
Active<__m256>::backward actState) { hppl::Active<__m256>::backward actGate,
hppl::Active<__m256>::backward actState) {
gradOg = actGate(_mm256_mul_ps(outputGrad, stateAtv), valueOg); gradOg = actGate(_mm256_mul_ps(outputGrad, stateAtv), valueOg);
stateGrad = _mm256_add_ps( stateGrad = _mm256_add_ps(
actState(_mm256_mul_ps(outputGrad, valueOg), stateAtv), stateGrad); actState(_mm256_mul_ps(outputGrad, valueOg), stateAtv), stateGrad);
...@@ -134,5 +135,3 @@ class lstm { ...@@ -134,5 +135,3 @@ class lstm {
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif /* HL_LSTM_OPS_CUH_ */
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "LstmCompute.h" #include "paddle/operators/math/lstm_compute.h"
#include "paddle/operators/math/detail/lstm_cpu_kernel.h" #include "paddle/operators/math/detail/lstm_cpu_kernel.h"
#include "paddle/operators/math/detail/lstm_kernel.h" #include "paddle/operators/math/detail/lstm_kernel.h"
...@@ -22,19 +22,20 @@ namespace math { ...@@ -22,19 +22,20 @@ namespace math {
template <class T> template <class T>
struct LstmUnitFunctor<platform::CPUPlace, T> { struct LstmUnitFunctor<platform::CPUPlace, T> {
static void compute(lstm_value value, int frame_size, int batch_size, static void compute(const platform::DeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
std::string gate_act, std::string cell_act, std::string gate_act, std::string cell_act,
std::string cand_act) { std::string cand_act) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frameSize, detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
ActiveType(cand_act), ActiveType(gate_act), ActiveType(cand_act), ActiveType(gate_act),
ActiveType(cell_act)); ActiveType(cell_act));
value.gateValue += frameSize * 4; value.gateValue += frame_size * 4;
value.stateValue += frameSize; value.stateValue += frame_size;
value.stateActiveValue += frameSize; value.stateActiveValue += frame_size;
value.outputValue += frameSize; value.outputValue += frame_size;
if (value.prevStateValue) { if (value.prevStateValue) {
value.prevStateValue += frameSize; value.prevStateValue += frame_size;
} }
} }
} }
...@@ -42,31 +43,36 @@ struct LstmUnitFunctor<platform::CPUPlace, T> { ...@@ -42,31 +43,36 @@ struct LstmUnitFunctor<platform::CPUPlace, T> {
template <class T> template <class T>
struct LstmUnitGradFunctor<platform::CPUPlace, T> { struct LstmUnitGradFunctor<platform::CPUPlace, T> {
static void compute(lstm_value value, lstm_grad grad, int frame_size, static void compute(const platform::DeviceContext& context,
int batch_size, std::string gate_act, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size, std::string gate_act,
std::string cell_act, std::string cand_act) { std::string cell_act, std::string cand_act) {
for (int b = 0; b < batchSize; b++) { for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad, detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
frameSize, ActiveType(cand_act), frame_size, ActiveType(cand_act),
ActiveType(gate_act), ActiveType(cell_act)); ActiveType(gate_act), ActiveType(cell_act));
value.gateValue += frameSize * 4; value.gateValue += frame_size * 4;
value.stateValue += frameSize; value.stateValue += frame_size;
value.stateActiveValue += frameSize; value.stateActiveValue += frame_size;
value.outputValue += frameSize; value.outputValue += frame_size;
if (value.prevStateValue) { if (value.prevStateValue) {
value.prevStateValue += frameSize; value.prevStateValue += frame_size;
} }
grad.gateGrad += frameSize * 4; grad.gateGrad += frame_size * 4;
grad.stateGrad += frameSize; grad.stateGrad += frame_size;
grad.stateActiveGrad += frameSize; grad.stateActiveGrad += frame_size;
grad.outputGrad += frameSize; grad.outputGrad += frame_size;
if (grad.prevStateGrad) { if (grad.prevStateGrad) {
grad.prevStateGrad += frameSize; grad.prevStateGrad += frame_size;
} }
} }
}; }
};
template class LstmUnitFunctor<platform::CPUPlace, float>;
template class LstmUnitGradFunctor<platform::CPUPlace, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "LstmCompute.h" #include "paddle/operators/math/detail/lstm_gpu_kernel.h"
#include "paddle/operators/math/detail/lstm_cpu_kernel.h"
#include "paddle/operators/math/detail/lstm_kernel.h" #include "paddle/operators/math/detail/lstm_kernel.h"
#include "paddle/operators/math/lstm_compute.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -22,19 +22,20 @@ namespace math { ...@@ -22,19 +22,20 @@ namespace math {
template <class T> template <class T>
struct LstmUnitFunctor<platform::GPUPlace, T> { struct LstmUnitFunctor<platform::GPUPlace, T> {
static void compute(lstm_value value, int frame_size, int batch_size, static void compute(const platform::DeviceContext& context,
LstmMetaValue<T> value, int frame_size, int batch_size,
std::string gate_act, std::string cell_act, std::string gate_act, std::string cell_act,
std::string cand_act) { std::string cand_act) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
detail::gpu_lstm_forward(detail::forward::lstm<T>(), value, frameSize, detail::gpu_lstm_forward(context, detail::forward::lstm<T>(), value,
ActiveType(cand_act), ActiveType(gate_act), frame_size, batch_size, ActiveType(cand_act),
ActiveType(cell_act)); ActiveType(gate_act), ActiveType(cell_act));
value.gateValue += frameSize * 4; value.gateValue += frame_size * 4;
value.stateValue += frameSize; value.stateValue += frame_size;
value.stateActiveValue += frameSize; value.stateActiveValue += frame_size;
value.outputValue += frameSize; value.outputValue += frame_size;
if (value.prevStateValue) { if (value.prevStateValue) {
value.prevStateValue += frameSize; value.prevStateValue += frame_size;
} }
} }
} }
...@@ -42,31 +43,37 @@ struct LstmUnitFunctor<platform::GPUPlace, T> { ...@@ -42,31 +43,37 @@ struct LstmUnitFunctor<platform::GPUPlace, T> {
template <class T> template <class T>
struct LstmUnitGradFunctor<platform::GPUPlace, T> { struct LstmUnitGradFunctor<platform::GPUPlace, T> {
static void compute(lstm_value value, lstm_grad grad, int frame_size, static void compute(const platform::DeviceContext& context,
int batch_size, std::string gate_act, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size, std::string gate_act,
std::string cell_act, std::string cand_act) { std::string cell_act, std::string cand_act) {
for (int b = 0; b < batchSize; b++) { for (int b = 0; b < batch_size; b++) {
detail::gpu_lstm_backward(detail::backward::lstm<T>(), value, grad, detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value,
frameSize, ActiveType(cand_act), grad, frame_size, batch_size,
ActiveType(gate_act), ActiveType(cell_act)); ActiveType(cand_act), ActiveType(gate_act),
ActiveType(cell_act));
value.gateValue += frameSize * 4; value.gateValue += frame_size * 4;
value.stateValue += frameSize; value.stateValue += frame_size;
value.stateActiveValue += frameSize; value.stateActiveValue += frame_size;
value.outputValue += frameSize; value.outputValue += frame_size;
if (value.prevStateValue) { if (value.prevStateValue) {
value.prevStateValue += frameSize; value.prevStateValue += frame_size;
} }
grad.gateGrad += frameSize * 4; grad.gateGrad += frame_size * 4;
grad.stateGrad += frameSize; grad.stateGrad += frame_size;
grad.stateActiveGrad += frameSize; grad.stateActiveGrad += frame_size;
grad.outputGrad += frameSize; grad.outputGrad += frame_size;
if (grad.prevStateGrad) { if (grad.prevStateGrad) {
grad.prevStateGrad += frameSize; grad.prevStateGrad += frame_size;
} }
} }
}; }
};
template class LstmUnitFunctor<platform::GPUPlace, float>;
template class LstmUnitGradFunctor<platform::GPUPlace, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -14,7 +14,8 @@ limitations under the License. */ ...@@ -14,7 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/platform/macros.h" #include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -28,28 +29,28 @@ typedef enum { ...@@ -28,28 +29,28 @@ typedef enum {
HL_ACTIVATION_END HL_ACTIVATION_END
} activation_mode_t; } activation_mode_t;
template <T> template <class T>
struct lstm_value { struct LstmMetaValue {
real *gateValue; T *gateValue;
real *prevStateValue; T *prevStateValue;
real *stateValue; T *stateValue;
real *stateActiveValue; T *stateActiveValue;
real *outputValue; T *outputValue;
real *checkIg; T *checkIg;
real *checkFg; T *checkFg;
real *checkOg; T *checkOg;
}; };
template <T> template <class T>
struct lstm_grad { struct LstmMetaGrad {
real *gateGrad; T *gateGrad;
real *prevStateGrad; T *prevStateGrad;
real *stateGrad; T *stateGrad;
real *stateActiveGrad; T *stateActiveGrad;
real *outputGrad; T *outputGrad;
real *checkIgGrad; T *checkIgGrad;
real *checkFgGrad; T *checkFgGrad;
real *checkOgGrad; T *checkOgGrad;
}; };
activation_mode_t ActiveType(const std::string &type) { activation_mode_t ActiveType(const std::string &type) {
...@@ -69,7 +70,8 @@ activation_mode_t ActiveType(const std::string &type) { ...@@ -69,7 +70,8 @@ activation_mode_t ActiveType(const std::string &type) {
template <typename Place, typename T> template <typename Place, typename T>
class LstmUnitFunctor { class LstmUnitFunctor {
public: public:
static void compute(lstm_value value, int frame_size, int batch_size, static void compute(const platform::DeviceContext &context,
LstmMetaValue<T> value, int frame_size, int batch_size,
std::string gate_act, std::string cell_act, std::string gate_act, std::string cell_act,
std::string cand_act); std::string cand_act);
}; };
...@@ -77,8 +79,9 @@ class LstmUnitFunctor { ...@@ -77,8 +79,9 @@ class LstmUnitFunctor {
template <typename Place, typename T> template <typename Place, typename T>
class LstmUnitGradFunctor { class LstmUnitGradFunctor {
public: public:
static void compute(lstm_value value, lstm_grad grad, int frame_size, static void compute(const platform::DeviceContext &context,
int batch_size, std::string gate_act, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size, std::string gate_act,
std::string cell_act, std::string cand_act); std::string cell_act, std::string cand_act);
}; };
......
...@@ -22,12 +22,14 @@ template <typename T> ...@@ -22,12 +22,14 @@ template <typename T>
class CopyMatrixRowsFunctor<platform::CPUPlace, T> { class CopyMatrixRowsFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& src, const size_t* index, const framework::LoDTensor& src, const size_t* index,
framework::Tensor& dst, bool is_src_index) { framework::LoDTensor& dst, bool is_src_index) {
auto src_dims = src.dims(); auto src_dims = src.dims();
auto dst_dims = dst.dims(); auto dst_dims = dst.dims();
PADDLE_ENFORCE(src_dims.size(), 2, "The src must be matrix with rank 2."); PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
PADDLE_ENFORCE(dst_dims.size(), 2, "The dst must be matrix with rank 2."); "The src must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
"The dst must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
"The width of src and dst must be same."); "The width of src and dst must be same.");
auto height = dst_dims[0]; auto height = dst_dims[0];
...@@ -50,7 +52,9 @@ template class CopyMatrixRowsFunctor<platform::CPUPlace, float>; ...@@ -50,7 +52,9 @@ template class CopyMatrixRowsFunctor<platform::CPUPlace, float>;
template class CopyMatrixRowsFunctor<platform::CPUPlace, double>; template class CopyMatrixRowsFunctor<platform::CPUPlace, double>;
template class LoDTensor2BatchFunctor<platform::CPUPlace, float>; template class LoDTensor2BatchFunctor<platform::CPUPlace, float>;
template class Batch2LoDTensor2Functor<platform::CPUPlace, float>; template class LoDTensor2BatchFunctor<platform::CPUPlace, double>;
template class Batch2LoDTensorFunctor<platform::CPUPlace, float>;
template class Batch2LoDTensorFunctor<platform::CPUPlace, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -19,8 +19,8 @@ namespace operators { ...@@ -19,8 +19,8 @@ namespace operators {
namespace math { namespace math {
template <typename T, int BlockDimX, int BlockDimY, int GridDimX> template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const int* index, __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index,
int height, int width, int64_t height, int64_t width,
const bool is_src_index) { const bool is_src_index) {
int idx = threadIdx.x; int idx = threadIdx.x;
int idy = threadIdx.y; int idy = threadIdx.y;
...@@ -28,7 +28,7 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const int* index, ...@@ -28,7 +28,7 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const int* index,
while (id < height) { while (id < height) {
int src_idx = is_src_index ? index[id] : id; int src_idx = is_src_index ? index[id] : id;
int dst_idx = is_src_index ? id : index[id]; int dst_idx = is_src_index ? id : index[id];
T* src_data = src + src_idx * width; const T* src_data = src + src_idx * width;
T* dst_data = dst + dst_idx * width; T* dst_data = dst + dst_idx * width;
for (int i = idx; i < width; i += BlockDimX) { for (int i = idx; i < width; i += BlockDimX) {
dst_data[i] = src_data[i]; dst_data[i] = src_data[i];
...@@ -41,12 +41,14 @@ template <typename T> ...@@ -41,12 +41,14 @@ template <typename T>
class CopyMatrixRowsFunctor<platform::GPUPlace, T> { class CopyMatrixRowsFunctor<platform::GPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& src, const size_t* index, const framework::LoDTensor& src, const size_t* index,
framework::Tensor& dst, bool is_src_index) { framework::LoDTensor& dst, bool is_src_index) {
auto src_dims = src.dims(); auto src_dims = src.dims();
auto dst_dims = dst.dims(); auto dst_dims = dst.dims();
PADDLE_ENFORCE(src_dims.size(), 2, "The src must be matrix with rank 2."); PADDLE_ENFORCE_EQ(src_dims.size(), 2,
PADDLE_ENFORCE(dst_dims.size(), 2, "The dst must be matrix with rank 2."); "The src must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
"The dst must be matrix with rank 2.");
PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1], PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
"The width of src and dst must be same."); "The width of src and dst must be same.");
auto height = dst_dims[0]; auto height = dst_dims[0];
...@@ -56,9 +58,10 @@ class CopyMatrixRowsFunctor<platform::GPUPlace, T> { ...@@ -56,9 +58,10 @@ class CopyMatrixRowsFunctor<platform::GPUPlace, T> {
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grid(8, 1); dim3 grid(8, 1);
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(context); auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>( CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
src_data, dst_data, index, height, width); src_data, dst_data, index, height, width, is_src_index);
} }
}; };
...@@ -66,7 +69,9 @@ template class CopyMatrixRowsFunctor<platform::GPUPlace, float>; ...@@ -66,7 +69,9 @@ template class CopyMatrixRowsFunctor<platform::GPUPlace, float>;
template class CopyMatrixRowsFunctor<platform::GPUPlace, double>; template class CopyMatrixRowsFunctor<platform::GPUPlace, double>;
template class LoDTensor2BatchFunctor<platform::GPUPlace, float>; template class LoDTensor2BatchFunctor<platform::GPUPlace, float>;
template class Batch2LoDTensor2Functor<platform::GPUPlace, float>; template class LoDTensor2BatchFunctor<platform::GPUPlace, double>;
template class Batch2LoDTensorFunctor<platform::GPUPlace, float>;
template class Batch2LoDTensorFunctor<platform::GPUPlace, double>;
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -12,6 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor.h"
#include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -25,8 +30,8 @@ class CopyMatrixRowsFunctor { ...@@ -25,8 +30,8 @@ class CopyMatrixRowsFunctor {
// copy the input src to the indexed rows of output dst. // copy the input src to the indexed rows of output dst.
// The indexed rows are based on the input index. // The indexed rows are based on the input index.
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& src, const size_t* index, const framework::LoDTensor& src, const size_t* index,
framework::Tensor& dst, const bool is_src_index); framework::LoDTensor& dst, const bool is_src_index);
}; };
template <typename Place, typename T> template <typename Place, typename T>
...@@ -35,8 +40,8 @@ class LoDTensor2BatchFunctor { ...@@ -35,8 +40,8 @@ class LoDTensor2BatchFunctor {
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& lod_tensor, const framework::LoDTensor& lod_tensor,
framework::LoDTensor& batch, const bool is_reverse) const { framework::LoDTensor& batch, const bool is_reverse) const {
auto lods = lod_tensor->lod(); auto lods = lod_tensor.lod();
PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
auto lod = lods[0]; auto lod = lods[0];
// Calculate the length of each sequence and // Calculate the length of each sequence and
...@@ -47,7 +52,7 @@ class LoDTensor2BatchFunctor { ...@@ -47,7 +52,7 @@ class LoDTensor2BatchFunctor {
// //
struct SeqInfo { struct SeqInfo {
SeqInfo(int start, int length, int seq_idx) SeqInfo(int start, int length, int seq_idx)
: start(start), length(length), seqIdx(seq_idx) {} : start(start), length(length), seq_idx(seq_idx) {}
int start; int start;
int length; int length;
int seq_idx; int seq_idx;
...@@ -78,19 +83,19 @@ class LoDTensor2BatchFunctor { ...@@ -78,19 +83,19 @@ class LoDTensor2BatchFunctor {
// The batch number represents batch size after rearranging the // The batch number represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence. // input LodTensor. It is also the maximum length of input sequence.
auto batch_lods = batch->lod(); auto batch_lods = batch.lod();
if (!batch_lods) { if (batch_lods.size() == 0) {
batch_lods->resize(2); batch_lods.resize(2);
} }
// batch_lods[0] is the start positions for batch LoDTensor // batch_lods[0] is the start positions for batch LoDTensor
int num_batch = (size_t)seq_info[0].length; int num_batch = (size_t)seq_info[0].length;
batch_lods[0]->resize(num_batch + 1); batch_lods[0].resize(num_batch + 1);
// batch_lods[1] is the raw index in the input LoDTensor // batch_lods[1] is the raw index in the input LoDTensor
auto dims = lod_tensor->dims(); auto dims = lod_tensor.dims();
batch_lods[1]->resize(dims[0]); batch_lods[1].resize(dims[0]);
auto* batch_starts = batch_lods[0].data(); size_t* batch_starts = batch_lods[0].data();
auto* seq2batch_idx = batch_lods[1].data(); size_t* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0; batch_starts[0] = 0;
for (size_t n = 0; n < num_batch; n++) { for (size_t n = 0; n < num_batch; n++) {
int batch_id = batch_starts[n]; int batch_id = batch_starts[n];
...@@ -112,17 +117,27 @@ class LoDTensor2BatchFunctor { ...@@ -112,17 +117,27 @@ class LoDTensor2BatchFunctor {
} }
CopyMatrixRowsFunctor<Place, T> to_batch; CopyMatrixRowsFunctor<Place, T> to_batch;
to_batch(context, lod_tensor, batch, true); to_batch(context, lod_tensor, seq2batch_idx, batch, true);
} }
}; };
template <typename Place, typename T> template <typename Place, typename T>
class Batch2LoDTensor2Functor { class Batch2LoDTensorFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::LoDTensor& batch, const framework::LoDTensor& batch,
framework::LoDTensor& lod_tensor, framework::LoDTensor& lod_tensor) const {
const bool is_reverse) const; auto in_lod = batch.lod();
PADDLE_ENFORCE_EQ(in_lod.size(), 2UL,
"The LoD size of input `batch` should be 2.");
auto out_lod = lod_tensor.lod();
PADDLE_ENFORCE_EQ(out_lod[0][0], out_lod[1].size());
PADDLE_ENFORCE_EQ(out_lod[0][0], lod_tensor.dims()[0]);
PADDLE_ENFORCE_EQ(out_lod[0][0], batch.dims()[0]);
CopyMatrixRowsFunctor<Place, T> to_seq;
size_t* index = out_lod[1].data();
to_seq(context, batch, index, lod_tensor, false);
}
}; };
} // namespace math } // namespace math
......
import unittest
import numpy as np
from op_test import OpTest
def identity(x):
return x
def sigmoid(x):
return 1. / (1. + np.exp(-x))
def tanh(x):
return 2. * sigmoid(2. * x) - 1.
def relu(x):
return np.maximum(x, 0)
def lstm(
input, # T x 4D
lod, # 1 x N
h0=None, # N x D
c0=None, # N x D
w_h=None, # D x 4D
w_b=None, # 1 x 4D
w_c=None, # 1 x 3D
is_reverse=False,
gate_act=None,
cell_act=None,
cand_act=None):
def _step(x, w_h, w_c, h_pre, c_pre, gate_act, cell_act, cand_act):
g = np.dot(h_pre, w_h) # 1 x 4D
g = g + x
g = np.reshape(g, (1, g.size))
c, g_i, g_f, g_o = np.split(g, 4, axis=1)
if w_c is None:
g_i = gate_act(g_i) # 1 x D
g_f = gate_act(g_f) # 1 x D
else:
w_ic, w_fc, w_oc = np.split(w_c, 3, axis=1)
g_i = gate_act(g_i + w_ic * c_pre) # 1 x D
g_f = gate_act(g_f + w_fc * c_pre) # 1 x D
c = g_f * c_pre + g_i * cand_act(c) # 1 x D
if w_c is None:
g_o = gate_act(g_o) # 1 x D
else:
_, _, w_oc = np.split(w_c, 3, axis=1)
g_o = gate_act(g_o + w_oc * c) # 1 x D
h = g_o * cell_act(c)
return h, c
offset = lod[0]
batch_size = len(offset) - 1
hidden = []
cell = []
if w_b is not None:
input = input + np.tile(w_b, (offset[-1], 1))
for i in range(batch_size):
# compute one sequence
seq_len = offset[i + 1] - offset[i]
x = input[offset[i]:offset[i + 1], :]
h_pre = h0[i] # 1 x D
c_pre = h0[i] # 1 x D
for j in range(seq_len):
# compute one step
h_pre, c_pre = _step(x[j], w_h, w_c, h_pre, c_pre, gate_act,
cell_act, cand_act)
hidden.append(h_pre.flatten())
cell.append(c_pre.flatten())
hidden = np.array(hidden).astype("float64")
cell = np.array(cell).astype("float64")
assert hidden.shape == (input.shape[0], input.shape[1] / 4)
assert cell.shape == (input.shape[0], input.shape[1] / 4)
return hidden, cell
class LstmUnitTest(OpTest):
def set_data(self):
lod = [[0, 2, 6, 9]]
shape = (9, 64)
x = np.random.normal(size=(9, 4 * 64)).astype("float64")
h0 = np.random.normal(size=(4, 64)).astype("float64")
c0 = np.random.normal(size=(4, 64)).astype("float64")
w = np.random.normal(size=(64, 4 * 64)).astype("float64")
b = np.random.normal(size=(1, 7 * 64)).astype("float64")
w_b = b[:, 4 * 64]
w_c = b[:, 4 * 64:]
h, c = lstm(x, lod, h0, c0, w, w_b, w_c, False, sigmoid, tanh, tanh)
self.inputs = {'Input': x, 'H0': h0, 'C0': c0, 'Weight': w, 'Bias': b}
self.inputs = {'Hidden': h, 'Cell': c}
self.attrs = {
'usePeepholes': True,
'isReverse': False,
'gateActivation': 'sigmoid',
'cellActivation': 'tanh',
'candidateActivation': 'tanh'
}
def setUp(self):
self.set_data()
self.op_type = "lstm"
def test_check_output(self):
self.check_output()
if __name__ == "__main__":
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册