提交 d1fbf50b 编写于 作者: D dangqingqing

Add unit testing for forwad implementation.

上级 2a8dbd13
...@@ -127,7 +127,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) ...@@ -127,7 +127,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(sum_op DEPS net_op) op_library(sum_op DEPS net_op)
op_library(pool_op DEPS pooling) op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lstm_op DEPS sequence2batch) op_library(lstm_op DEPS sequence2batch lstm_compute math_function)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
......
...@@ -44,7 +44,7 @@ class LSTMOp : public framework::OperatorWithKernel { ...@@ -44,7 +44,7 @@ class LSTMOp : public framework::OperatorWithKernel {
"should be the same."); "should be the same.");
} }
int frame_size = x_dims[1]; int frame_size = x_dims[1] / 4;
auto w_dims = ctx->GetInputDim("Weight"); auto w_dims = ctx->GetInputDim("Weight");
PADDLE_ENFORCE_EQ(w_dims.size(), 2, PADDLE_ENFORCE_EQ(w_dims.size(), 2,
"The rank of Input(Weight) should be 2."); "The rank of Input(Weight) should be 2.");
...@@ -71,9 +71,9 @@ class LSTMOp : public framework::OperatorWithKernel { ...@@ -71,9 +71,9 @@ class LSTMOp : public framework::OperatorWithKernel {
"4 * %d if diable peepholes connection", "4 * %d if diable peepholes connection",
frame_size); frame_size);
} }
ctx->SetOutputDim("Hidden", x_dims); ctx->SetOutputDim("Hidden", {x_dims[0], frame_size});
ctx->SetOutputDim("Cell", x_dims); ctx->SetOutputDim("Cell", {x_dims[0], frame_size});
ctx->SetOutputDim("Batch", x_dims); ctx->SetOutputDim("BatchGate", x_dims);
ctx->ShareLoD("Input", "Hidden"); ctx->ShareLoD("Input", "Hidden");
ctx->ShareLoD("Input", "Cell"); ctx->ShareLoD("Input", "Cell");
} }
......
...@@ -52,9 +52,14 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -52,9 +52,14 @@ class LSTMKernel : public framework::OpKernel<T> {
to_batch(ctx.device_context(), *input, *batch_gate, is_reverse); to_batch(ctx.device_context(), *input, *batch_gate, is_reverse);
auto in_dims = input->dims(); auto in_dims = input->dims();
int frame_size = in_dims[1]; int frame_size = in_dims[1] / 4;
framework::DDim dims({in_dims[0], frame_size});
if (bias) { if (bias) {
// framework::Tensor cpu_t;
// cpu_t.mutable_data<T>(in_dims, platform::CPUPlace());
// cpu_t.CopyFrom<T>(*batch_gate, platform::CPUPlace(),
// ctx.device_context());
Eigen::array<int, 2> extents({{1, 4 * frame_size}}); Eigen::array<int, 2> extents({{1, 4 * frame_size}});
Eigen::array<int, 2> offsets({{0, 0}}); Eigen::array<int, 2> offsets({{0, 0}});
auto b = EigenMatrix<T>::From(*bias); auto b = EigenMatrix<T>::From(*bias);
...@@ -76,15 +81,14 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -76,15 +81,14 @@ class LSTMKernel : public framework::OpKernel<T> {
lstm_value.prevStateValue = nullptr; lstm_value.prevStateValue = nullptr;
framework::LoDTensor batch_out; framework::LoDTensor batch_out;
batch_out.mutable_data<T>(in_dims, ctx.GetPlace()); batch_out.mutable_data<T>(dims, ctx.GetPlace());
framework::LoDTensor batch_cell; framework::LoDTensor batch_cell;
batch_cell.mutable_data<T>(in_dims, ctx.GetPlace()); batch_cell.mutable_data<T>(dims, ctx.GetPlace());
framework::LoDTensor batch_cell_pre_act; framework::LoDTensor batch_cell_pre_act;
batch_cell_pre_act.mutable_data<T>(in_dims, ctx.GetPlace()); batch_cell_pre_act.mutable_data<T>(dims, ctx.GetPlace());
auto batch_lod = batch_gate->lod()[0]; auto batch_lod = batch_gate->lod()[0];
int num_batch = batch_lod.size() - 1; int num_batch = batch_lod.size() - 1;
auto gate_act = ctx.Attr<std::string>("gateActivation"); auto gate_act = ctx.Attr<std::string>("gateActivation");
auto cell_act = ctx.Attr<std::string>("cellActivation"); auto cell_act = ctx.Attr<std::string>("cellActivation");
auto cand_act = ctx.Attr<std::string>("candidateActivation"); auto cand_act = ctx.Attr<std::string>("candidateActivation");
...@@ -125,9 +129,12 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -125,9 +129,12 @@ class LSTMKernel : public framework::OpKernel<T> {
// restore the output hidden in LoDTensor from the batch hidden // restore the output hidden in LoDTensor from the batch hidden
to_seq(ctx.device_context(), batch_out, *hidden_out); to_seq(ctx.device_context(), batch_out, *hidden_out);
batch_out.set_lod(batch_gate->lod()); batch_cell.set_lod(batch_gate->lod());
// restore the output cell state in LoDTensor from the batch cell // restore the output cell state in LoDTensor from the batch cell
to_seq(ctx.device_context(), batch_cell, *cell_out); to_seq(ctx.device_context(), batch_cell, *cell_out);
auto t = framework::EigenVector<T>::Flatten(*batch_gate);
t.device(ctx.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
} }
}; };
......
add_subdirectory(detail)
if(WITH_GPU) if(WITH_GPU)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator) nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator)
nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
...@@ -6,7 +8,7 @@ if(WITH_GPU) ...@@ -6,7 +8,7 @@ if(WITH_GPU)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
else() else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
...@@ -14,7 +16,7 @@ else() ...@@ -14,7 +16,7 @@ else()
cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
cc_library(pooling SRCS pooling.cc DEPS device_context) cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
endif() endif()
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
......
...@@ -14,10 +14,12 @@ limitations under the License. */ ...@@ -14,10 +14,12 @@ limitations under the License. */
#include <immintrin.h> #include <immintrin.h>
#include "hl_functions.h" #include "hl_functions.h"
// TODO(qingqing) refine this dependence
#include "paddle/cuda/src/avx_mathfun.h"
namespace hppl { namespace hppl {
extern __m256 exp(__m256 a); __m256 exp(__m256 a) { return exp256_ps(a); }
__m256 relu(const __m256 a) { __m256 relu(const __m256 a) {
__m256 tmp = _mm256_set1_ps(0.0f); __m256 tmp = _mm256_set1_ps(0.0f);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <math.h>
#include "hl_functions.h"
namespace hppl {
namespace typef {
float relu(const float a) {
return a > static_cast<float>(0.0) ? a : static_cast<float>(0.0);
}
float sigmoid(const float a) {
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
float tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<float>(1.0) / (static_cast<float>(1.0) + exp(-tmp));
}
float tanh(const float a) {
float tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
float linear(const float a) { return a; }
float relu(const float a, const float b) { return a * (b > 0.0 ? 1.0 : 0.0); }
float sigmoid(const float a, const float b) {
return a * b * (static_cast<float>(1) - b);
}
float tanh(const float a, const float b) {
return a * (static_cast<float>(1) - b * b);
}
float linear(const float a, const float b) { return a; }
} // namespace typef
namespace typed {
double relu(const double a) {
return a > static_cast<double>(0.0) ? a : static_cast<double>(0.0);
}
double sigmoid(const double a) {
const double min = SIGMOID_THRESHOLD_MIN;
const double max = SIGMOID_THRESHOLD_MAX;
double tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<double>(1.0) / (static_cast<double>(1.0) + exp(-tmp));
}
double tanh(const double a) {
double tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
double linear(const double a) { return a; }
double relu(const double a, const double b) {
return a * (b > 0.0 ? 1.0 : 0.0);
}
double sigmoid(const double a, const double b) {
return a * b * (static_cast<double>(1) - b);
}
double tanh(const double a, const double b) {
return a * (static_cast<double>(1) - b * b);
}
double linear(const double a, const double b) { return a; }
} // namespace typed
} // namespace hppl
...@@ -34,83 +34,28 @@ limitations under the License. */ ...@@ -34,83 +34,28 @@ limitations under the License. */
#ifndef __NVCC__ #ifndef __NVCC__
namespace hppl { namespace hppl {
namespace typef { namespace typef {
/* float relu(const float a);
* forward activation float sigmoid(const float a);
*/ float tanh(const float a);
float relu(const float a) { float linear(const float a);
return a > static_cast<float>(0.0) ? a : static_cast<float>(0.0);
}
float sigmoid(const float a) {
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
float tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<float>(1.0) / (static_cast<float>(1.0) + exp(-tmp));
}
float tanh(const float a) {
float tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
float linear(const float a) { return a; }
/*
* backward activation
*/
float relu(const float a, const float b) { return a * (b > 0.0 ? 1.0 : 0.0); }
float sigmoid(const float a, const float b) { float relu(const float a, const float b);
return a * b * (static_cast<float>(1) - b); float sigmoid(const float a, const float b);
} float tanh(const float a, const float b);
float linear(const float a, const float b);
float tanh(const float a, const float b) {
return a * (static_cast<float>(1) - b * b);
}
float linear(const float a, const float b) { return a; }
} // namespace typef } // namespace typef
namespace typed { namespace typed {
/* double relu(const double a);
* forward activation double sigmoid(const double a);
*/ double tanh(const double a);
double relu(const double a) { double linear(const double a);
return a > static_cast<double>(0.0) ? a : static_cast<double>(0.0);
} double relu(const double a, const double b);
double sigmoid(const double a, const double b);
double sigmoid(const double a) { double tanh(const double a, const double b);
const double min = SIGMOID_THRESHOLD_MIN; double linear(const double a, const double b);
const double max = SIGMOID_THRESHOLD_MAX;
double tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<double>(1.0) / (static_cast<double>(1.0) + exp(-tmp));
}
double tanh(const double a) {
double tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
double linear(const double a) { return a; }
/*
* backward activation
*/
double relu(const double a, const double b) {
return a * (b > 0.0 ? 1.0 : 0.0);
}
double sigmoid(const double a, const double b) {
return a * b * (static_cast<double>(1) - b);
}
double tanh(const double a, const double b) {
return a * (static_cast<double>(1) - b * b);
}
double linear(const double a, const double b) { return a; }
} // namespace typed } // namespace typed
} // namespace hppl } // namespace hppl
......
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/platform/cuda_helper.h" #include "paddle/platform/cuda_helper.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
#include <glog/logging.h>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -29,11 +31,10 @@ namespace detail { ...@@ -29,11 +31,10 @@ namespace detail {
* grid(frameBlocks, batchBlocks) * grid(frameBlocks, batchBlocks)
*/ */
template <class T, class Op, bool isBatch> template <class T, class Op, bool isBatch>
__global__ void KeLstmForward( __global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frameSize,
Op op, LstmMetaValue<T> value, int frameSize, int batchSize, int batchSize, activation_mode_t active_node,
typename hppl::ForwardActType<T>::type active_node, activation_mode_t active_gate,
typename hppl::ForwardActType<T>::type active_gate, activation_mode_t active_state) {
typename hppl::ForwardActType<T>::type active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frameIdx >= frameSize) return;
...@@ -69,8 +70,10 @@ __global__ void KeLstmForward( ...@@ -69,8 +70,10 @@ __global__ void KeLstmForward(
rPrevState = value.prevStateValue[frameIdx]; rPrevState = value.prevStateValue[frameIdx];
} }
hppl::gpu::ForwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); rOut, rCheckI, rCheckF, rCheckO, act(active_node), act(active_gate),
act(active_state));
value.gateValue[frameIdx] = rValueIn; value.gateValue[frameIdx] = rValueIn;
value.gateValue[frameIdx + frameSize] = rValueIg; value.gateValue[frameIdx + frameSize] = rValueIg;
...@@ -87,11 +90,11 @@ __global__ void KeLstmForward( ...@@ -87,11 +90,11 @@ __global__ void KeLstmForward(
* grid(frameBlocks, batchBlocks) * grid(frameBlocks, batchBlocks)
*/ */
template <class T, class Op, bool isBatch> template <class T, class Op, bool isBatch>
__global__ void KeLstmBackward( __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad, int frameSize, LstmMetaGrad<T> grad, int frameSize,
int batchSize, typename hppl::BackwardActType<T>::type active_node, int batchSize, activation_mode_t active_node,
typename hppl::BackwardActType<T>::type active_gate, activation_mode_t active_gate,
typename hppl::BackwardActType<T>::type active_state) { activation_mode_t active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return; if (frameIdx >= frameSize) return;
...@@ -142,10 +145,11 @@ __global__ void KeLstmBackward( ...@@ -142,10 +145,11 @@ __global__ void KeLstmBackward(
rPrevState = value.prevStateValue[frameIdx]; rPrevState = value.prevStateValue[frameIdx];
} }
hppl::gpu::BackwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg, op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg,
rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad,
rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad,
active_node, active_gate, active_state); act(active_node), act(active_gate), act(active_state));
grad.gateGrad[frameIdx] = rGradIn; grad.gateGrad[frameIdx] = rGradIn;
grad.gateGrad[frameIdx + frameSize] = rGradIg; grad.gateGrad[frameIdx + frameSize] = rGradIg;
...@@ -196,22 +200,16 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, ...@@ -196,22 +200,16 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
} }
using type = typename hppl::ForwardActType<T>::type;
hppl::gpu::ForwardAct<T> act;
type act_node = act(active_node);
type act_gate = act(active_gate);
type act_state = act(active_state);
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream(); reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
if (batchSize == 1) { if (batchSize == 1) {
KeLstmForward<T, Op, KeLstmForward<T, Op,
/* isBatch= */ false><<<grid, threads, 0, stream>>>( /* isBatch= */ false><<<grid, threads, 0, stream>>>(
op, value, frameSize, batchSize, act_node, act_gate, act_state); op, value, frameSize, batchSize, active_node, active_gate, active_gate);
} else { } else {
KeLstmForward<T, Op, KeLstmForward<T, Op,
/* isBatch= */ true><<<grid, threads, 0, stream>>>( /* isBatch= */ true><<<grid, threads, 0, stream>>>(
op, value, frameSize, batchSize, act_node, act_gate, act_state); op, value, frameSize, batchSize, active_node, active_gate, active_gate);
} }
} }
...@@ -235,22 +233,18 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op, ...@@ -235,22 +233,18 @@ void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
} }
using type = typename hppl::BackwardActType<T>::type;
hppl::gpu::BackwardAct<T> act;
type act_node = act(active_node);
type act_gate = act(active_gate);
type act_state = act(active_state);
auto stream = auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream(); reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
if (batchSize == 1) { if (batchSize == 1) {
KeLstmBackward<T, Op, KeLstmBackward<T, Op,
/* isBatch= */ false><<<grid, threads, 0, stream>>>( /* isBatch= */ false><<<grid, threads, 0, stream>>>(
op, value, grad, frameSize, batchSize, act_node, act_gate, act_state); op, value, grad, frameSize, batchSize, active_node, active_gate,
active_state);
} else { } else {
KeLstmBackward<T, Op, KeLstmBackward<T, Op,
/* isBatch= */ true><<<grid, threads, 0, stream>>>( /* isBatch= */ true><<<grid, threads, 0, stream>>>(
op, value, grad, frameSize, batchSize, act_node, act_gate, act_state); op, value, grad, frameSize, batchSize, active_node, active_gate,
active_state);
} }
} }
......
...@@ -72,6 +72,8 @@ struct LstmUnitGradFunctor<platform::CPUPlace, T> { ...@@ -72,6 +72,8 @@ struct LstmUnitGradFunctor<platform::CPUPlace, T> {
}; };
template class LstmUnitFunctor<platform::CPUPlace, float>; template class LstmUnitFunctor<platform::CPUPlace, float>;
template class LstmUnitFunctor<platform::CPUPlace, double>;
template class LstmUnitGradFunctor<platform::CPUPlace, float>;
template class LstmUnitGradFunctor<platform::CPUPlace, double>; template class LstmUnitGradFunctor<platform::CPUPlace, double>;
} // namespace math } // namespace math
......
...@@ -26,18 +26,9 @@ struct LstmUnitFunctor<platform::GPUPlace, T> { ...@@ -26,18 +26,9 @@ struct LstmUnitFunctor<platform::GPUPlace, T> {
LstmMetaValue<T> value, int frame_size, int batch_size, LstmMetaValue<T> value, int frame_size, int batch_size,
std::string gate_act, std::string cell_act, std::string gate_act, std::string cell_act,
std::string cand_act) { std::string cand_act) {
for (int b = 0; b < batch_size; b++) { detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
detail::gpu_lstm_forward(context, detail::forward::lstm<T>(), value, frame_size, batch_size, ActiveType(cand_act),
frame_size, batch_size, ActiveType(cand_act), ActiveType(gate_act), ActiveType(cell_act));
ActiveType(gate_act), ActiveType(cell_act));
value.gateValue += frame_size * 4;
value.stateValue += frame_size;
value.stateActiveValue += frame_size;
value.outputValue += frame_size;
if (value.prevStateValue) {
value.prevStateValue += frame_size;
}
}
} }
}; };
...@@ -47,32 +38,15 @@ struct LstmUnitGradFunctor<platform::GPUPlace, T> { ...@@ -47,32 +38,15 @@ struct LstmUnitGradFunctor<platform::GPUPlace, T> {
LstmMetaValue<T> value, LstmMetaGrad<T> grad, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frame_size, int batch_size, std::string gate_act, int frame_size, int batch_size, std::string gate_act,
std::string cell_act, std::string cand_act) { std::string cell_act, std::string cand_act) {
for (int b = 0; b < batch_size; b++) { detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, frame_size, batch_size, ActiveType(cand_act),
grad, frame_size, batch_size, ActiveType(gate_act), ActiveType(cell_act));
ActiveType(cand_act), ActiveType(gate_act),
ActiveType(cell_act));
value.gateValue += frame_size * 4;
value.stateValue += frame_size;
value.stateActiveValue += frame_size;
value.outputValue += frame_size;
if (value.prevStateValue) {
value.prevStateValue += frame_size;
}
grad.gateGrad += frame_size * 4;
grad.stateGrad += frame_size;
grad.stateActiveGrad += frame_size;
grad.outputGrad += frame_size;
if (grad.prevStateGrad) {
grad.prevStateGrad += frame_size;
}
}
} }
}; };
template class LstmUnitFunctor<platform::GPUPlace, float>; template class LstmUnitFunctor<platform::GPUPlace, float>;
template class LstmUnitFunctor<platform::GPUPlace, double>;
template class LstmUnitGradFunctor<platform::GPUPlace, float>;
template class LstmUnitGradFunctor<platform::GPUPlace, double>; template class LstmUnitGradFunctor<platform::GPUPlace, double>;
} // namespace math } // namespace math
......
...@@ -53,7 +53,7 @@ struct LstmMetaGrad { ...@@ -53,7 +53,7 @@ struct LstmMetaGrad {
T *checkOgGrad; T *checkOgGrad;
}; };
activation_mode_t ActiveType(const std::string &type) { inline activation_mode_t ActiveType(const std::string &type) {
if (type == "sigmoid") { if (type == "sigmoid") {
return HL_ACTIVATION_SIGMOID; return HL_ACTIVATION_SIGMOID;
} else if (type == "relu") { } else if (type == "relu") {
......
...@@ -59,7 +59,7 @@ class LoDTensor2BatchFunctor { ...@@ -59,7 +59,7 @@ class LoDTensor2BatchFunctor {
}; };
std::vector<SeqInfo> seq_info; std::vector<SeqInfo> seq_info;
for (size_t seq_id = 0; seq_id < lod.size(); ++seq_id) { for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
int length = lod[seq_id + 1] - lod[seq_id]; int length = lod[seq_id + 1] - lod[seq_id];
seq_info.emplace_back(lod[seq_id], length, seq_id); seq_info.emplace_back(lod[seq_id], length, seq_id);
} }
...@@ -83,10 +83,11 @@ class LoDTensor2BatchFunctor { ...@@ -83,10 +83,11 @@ class LoDTensor2BatchFunctor {
// The batch number represents batch size after rearranging the // The batch number represents batch size after rearranging the
// input LodTensor. It is also the maximum length of input sequence. // input LodTensor. It is also the maximum length of input sequence.
auto batch_lods = batch.lod();
if (batch_lods.size() == 0) { paddle::framework::LoD batch_lods;
batch_lods.resize(2); batch_lods.push_back(std::vector<size_t>{0});
} batch_lods.push_back(std::vector<size_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor // batch_lods[0] is the start positions for batch LoDTensor
int num_batch = (size_t)seq_info[0].length; int num_batch = (size_t)seq_info[0].length;
batch_lods[0].resize(num_batch + 1); batch_lods[0].resize(num_batch + 1);
...@@ -115,6 +116,7 @@ class LoDTensor2BatchFunctor { ...@@ -115,6 +116,7 @@ class LoDTensor2BatchFunctor {
} }
batch_starts[n + 1] = batch_id; batch_starts[n + 1] = batch_id;
} }
batch.set_lod(batch_lods);
CopyMatrixRowsFunctor<Place, T> to_batch; CopyMatrixRowsFunctor<Place, T> to_batch;
to_batch(context, lod_tensor, seq2batch_idx, batch, true); to_batch(context, lod_tensor, seq2batch_idx, batch, true);
...@@ -130,12 +132,13 @@ class Batch2LoDTensorFunctor { ...@@ -130,12 +132,13 @@ class Batch2LoDTensorFunctor {
auto in_lod = batch.lod(); auto in_lod = batch.lod();
PADDLE_ENFORCE_EQ(in_lod.size(), 2UL, PADDLE_ENFORCE_EQ(in_lod.size(), 2UL,
"The LoD size of input `batch` should be 2."); "The LoD size of input `batch` should be 2.");
auto out_lod = lod_tensor.lod(); auto out_lod = lod_tensor.lod()[0];
PADDLE_ENFORCE_EQ(out_lod[0][0], out_lod[1].size()); auto num = out_lod[out_lod.size() - 1];
PADDLE_ENFORCE_EQ(out_lod[0][0], lod_tensor.dims()[0]); PADDLE_ENFORCE_EQ(num, lod_tensor.dims()[0]);
PADDLE_ENFORCE_EQ(out_lod[0][0], batch.dims()[0]); PADDLE_ENFORCE_EQ(num, in_lod[1].size());
PADDLE_ENFORCE_EQ(num, batch.dims()[0]);
CopyMatrixRowsFunctor<Place, T> to_seq; CopyMatrixRowsFunctor<Place, T> to_seq;
size_t* index = out_lod[1].data(); size_t* index = in_lod[1].data();
to_seq(context, batch, index, lod_tensor, false); to_seq(context, batch, index, lod_tensor, false);
} }
}; };
......
...@@ -2,17 +2,26 @@ import unittest ...@@ -2,17 +2,26 @@ import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
SIGMOID_THRESHOLD_MIN = -40.0
SIGMOID_THRESHOLD_MAX = 13.0
EXP_MAX_INPUT = 40.0
def identity(x): def identity(x):
return x return x
def sigmoid(x): def sigmoid(x):
return 1. / (1. + np.exp(-x)) y = np.copy(x)
y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
return 1. / (1. + np.exp(-y))
def tanh(x): def tanh(x):
return 2. * sigmoid(2. * x) - 1. y = -2. * x
y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
return (2. / (1. + np.exp(y))) - 1.
def relu(x): def relu(x):
...@@ -35,7 +44,7 @@ def lstm( ...@@ -35,7 +44,7 @@ def lstm(
g = np.dot(h_pre, w_h) # 1 x 4D g = np.dot(h_pre, w_h) # 1 x 4D
g = g + x g = g + x
g = np.reshape(g, (1, g.size)) g = np.reshape(g, (1, g.size))
c, g_i, g_f, g_o = np.split(g, 4, axis=1) c_tmp, g_i, g_f, g_o = np.split(g, 4, axis=1)
if w_c is None: if w_c is None:
g_i = gate_act(g_i) # 1 x D g_i = gate_act(g_i) # 1 x D
g_f = gate_act(g_f) # 1 x D g_f = gate_act(g_f) # 1 x D
...@@ -43,7 +52,7 @@ def lstm( ...@@ -43,7 +52,7 @@ def lstm(
w_ic, w_fc, w_oc = np.split(w_c, 3, axis=1) w_ic, w_fc, w_oc = np.split(w_c, 3, axis=1)
g_i = gate_act(g_i + w_ic * c_pre) # 1 x D g_i = gate_act(g_i + w_ic * c_pre) # 1 x D
g_f = gate_act(g_f + w_fc * c_pre) # 1 x D g_f = gate_act(g_f + w_fc * c_pre) # 1 x D
c = g_f * c_pre + g_i * cand_act(c) # 1 x D c = g_f * c_pre + g_i * cand_act(c_tmp) # 1 x D
if w_c is None: if w_c is None:
g_o = gate_act(g_o) # 1 x D g_o = gate_act(g_o) # 1 x D
...@@ -51,12 +60,14 @@ def lstm( ...@@ -51,12 +60,14 @@ def lstm(
_, _, w_oc = np.split(w_c, 3, axis=1) _, _, w_oc = np.split(w_c, 3, axis=1)
g_o = gate_act(g_o + w_oc * c) # 1 x D g_o = gate_act(g_o + w_oc * c) # 1 x D
h = g_o * cell_act(c) h = g_o * cell_act(c)
return h, c bg = np.concatenate((cand_act(c_tmp), g_i, g_f, g_o), axis=1)
return h, c, bg
offset = lod[0] offset = lod[0]
batch_size = len(offset) - 1 batch_size = len(offset) - 1
hidden = [] hidden = []
cell = [] cell = []
gate = []
if w_b is not None: if w_b is not None:
input = input + np.tile(w_b, (offset[-1], 1)) input = input + np.tile(w_b, (offset[-1], 1))
for i in range(batch_size): for i in range(batch_size):
...@@ -64,44 +75,62 @@ def lstm( ...@@ -64,44 +75,62 @@ def lstm(
seq_len = offset[i + 1] - offset[i] seq_len = offset[i + 1] - offset[i]
x = input[offset[i]:offset[i + 1], :] x = input[offset[i]:offset[i + 1], :]
h_pre = h0[i] # 1 x D h_pre = h0[i] # 1 x D
c_pre = h0[i] # 1 x D c_pre = c0[i] # 1 x D
for j in range(seq_len): for j in range(seq_len):
# compute one step # compute one step
h_pre, c_pre = _step(x[j], w_h, w_c, h_pre, c_pre, gate_act, h_pre, c_pre, g_pre = _step(x[j], w_h, w_c, h_pre, c_pre, gate_act,
cell_act, cand_act) cell_act, cand_act)
hidden.append(h_pre.flatten()) hidden.append(h_pre.flatten())
cell.append(c_pre.flatten()) cell.append(c_pre.flatten())
gate.append(g_pre.flatten())
hidden = np.array(hidden).astype("float64") hidden = np.array(hidden).astype("float64")
cell = np.array(cell).astype("float64") cell = np.array(cell).astype("float64")
gate = np.array(gate).astype("float64")
assert gate.shape == input.shape
assert hidden.shape == (input.shape[0], input.shape[1] / 4) assert hidden.shape == (input.shape[0], input.shape[1] / 4)
assert cell.shape == (input.shape[0], input.shape[1] / 4) assert cell.shape == (input.shape[0], input.shape[1] / 4)
return hidden, cell return hidden, cell, gate
class LstmUnitTest(OpTest): class LstmUnitTest(OpTest):
def set_data(self): def set_data(self):
lod = [[0, 2, 6, 9]] D = 4
shape = (9, 64) #lod = [[0, 2, 6, 9]]
lod = [[0, 1]]
x = np.random.normal(size=(9, 4 * 64)).astype("float64") shape = (1, D)
h0 = np.random.normal(size=(4, 64)).astype("float64")
c0 = np.random.normal(size=(4, 64)).astype("float64") x = np.random.normal(size=(1, 4 * D)).astype("float64")
w = np.random.normal(size=(64, 4 * 64)).astype("float64") h0 = np.zeros((4, D)).astype("float64")
b = np.random.normal(size=(1, 7 * 64)).astype("float64") c0 = np.zeros((4, D)).astype("float64")
w = np.random.normal(size=(D, 4 * D)).astype("float64")
w_b = b[:, 4 * 64] b = np.random.normal(size=(1, 7 * D)).astype("float64")
w_c = b[:, 4 * 64:]
h, c = lstm(x, lod, h0, c0, w, w_b, w_c, False, sigmoid, tanh, tanh) w_b = b[:, 0:4 * D]
w_c = b[:, 4 * D:]
self.inputs = {'Input': x, 'H0': h0, 'C0': c0, 'Weight': w, 'Bias': b} #h, c, g = lstm(x, lod, h0, c0, w, w_b, w_c, False, sigmoid, tanh, tanh)
self.inputs = {'Hidden': h, 'Cell': c} h, c, g = lstm(x, lod, h0, c0, w, w_b, w_c, False, identity, identity,
identity)
g_sort = np.zeros_like(x)
#idx = [2,6,0,3,7,1,4,8,5]
#for i, j in enumerate(idx):
# g_sort[i, :] = g[j, :]
self.inputs = {
'Input': (x, lod),
'H0': h0,
'C0': c0,
'Weight': w,
'Bias': b
}
self.outputs = {'Hidden': h, 'Cell': c, 'BatchGate': g_sort}
self.attrs = { self.attrs = {
'usePeepholes': True, 'usePeepholes': True,
'isReverse': False, 'isReverse': False,
'gateActivation': 'sigmoid', 'gateActivation': 'linear',
'cellActivation': 'tanh', 'cellActivation': 'linear',
'candidateActivation': 'tanh' 'candidateActivation': 'linear'
} }
def setUp(self): def setUp(self):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册