提交 6a630f27 编写于 作者: C caoying03

Merge branch 'develop' into crf

......@@ -127,6 +127,7 @@ include(external/warpctc) # download, build, install warpctc
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/nccl)
include(cudnn) # set cudnn libraries, must before configure
include(configure) # add paddle env configuration
......@@ -159,7 +160,7 @@ set(EXTERNAL_LIBS
if(WITH_GPU)
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
endif(WITH_GPU)
......
......@@ -62,11 +62,11 @@ else()
FIND_PACKAGE(CUDA REQUIRED)
if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
endif()
if(NOT CUDNN_FOUND)
message(FATAL_ERROR "Paddle need cudnn to compile")
message(FATAL_ERROR "Paddle needs cudnn to compile")
endif()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
......
INCLUDE(ExternalProject)
SET(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
INCLUDE_DIRECTORIES(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
if(WITH_DSO)
# If we use DSO, we do not build nccl, just download the dependencies
set(NCCL_BUILD_COMMAND "")
set(NCCL_INSTALL_COMMAND "")
set(NCCL_INSTALL_DIR "")
else()
# otherwise, we build nccl and link it.
set(NCCL_BUILD_COMMAND "make -j 8")
set(NCCL_INSTALL_COMMAND "make install")
SET(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
endif()
ExternalProject_Add(
extern_nccl
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/NVIDIA/nccl.git"
GIT_TAG "v1.3.4-1"
PREFIX "${NCCL_SOURCE_DIR}"
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND "${NCCL_BUILD_COMMAND}"
INSTALL_COMMAND "${NCCL_INSTALL_COMMAND}"
INSTALL_DIR "${NCCL_INSTALL_DIR}"
TEST_COMMAND ""
)
if (WITH_DSO)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
add_library(nccl STATIC ${dummyfile})
else()
add_library(nccl INTERFACE)
endif()
else()
ADD_LIBRARY(nccl STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET nccl PROPERTY IMPORTED_LOCATION
${NCCL_INSTALL_DIR}/lib/libnccl.a)
endif()
add_dependencies(nccl extern_nccl)
LIST(APPEND external_project_dependencies nccl)
......@@ -174,7 +174,7 @@ decoder_inputs = paddle.layer.fc(
1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimzier` 更新网络参数时应用;后者在激活函数反向计算时被调用;
2. 截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度;
除此之外,还可以通过减小学习或者对数据进行归一化处理来解决这类问题。
除此之外,还可以通过减小学习或者对数据进行归一化处理来解决这类问题。
5. 如何调用 infer 接口输出多个layer的预测结果
-----------------------------------------------
......
......@@ -87,11 +87,8 @@ class OpInfoMap {
}
}
template <typename Callback>
void IterAllInfo(Callback callback) {
for (auto& it : map_) {
callback(it.first, it.second);
}
const std::unordered_map<std::string, const OpInfo>& map() const {
return map_;
}
private:
......
......@@ -18,6 +18,10 @@ limitations under the License. */
namespace paddle {
namespace framework {
VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); }
void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); }
void VarDescBind::SetShape(const std::vector<int64_t> &dims) {
VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
}
......
......@@ -75,9 +75,9 @@ class VarDescBind {
int32_t GetLodLevel() const;
VarDesc::VarType GetType() const { return desc_.type(); }
VarDesc::VarType GetType() const;
void SetType(VarDesc::VarType type) { desc_.set_type(type); }
void SetType(VarDesc::VarType type);
bool Persistable() const { return desc_.persistable(); }
......
......@@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
copyInVal_ = nullptr;
if (act.grad && algo == algorithm::eltwise_tanh) {
// tanh need save src input for backward
inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
CHECK(copyInVal_) << "should not be emptry";
pipelineFwd_.push_back(*copyInVal_);
......@@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
algorithm algo = getAlgo(this->getName());
float alpha = getBwdAlpha();
float beta = getBeta();
grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
auto eng = CPUEngine::Instance().getEngine();
auto bwdDesc = eltwise_bwd::desc(
algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
......@@ -230,7 +230,7 @@ void MKLDNNActivation::resetFwd(Argument& act) {
int ic = cnt_ / bs / ih / iw;
CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
val_ = MKLDNNMatrix::create(
act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
{bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
CHECK(val_);
val_->downSpatial();
}
......
......@@ -21,8 +21,8 @@ namespace paddle {
typedef enum {
MKLDNN_BASE = 1, // basical info of MKLDNN
MKLDNN_TESTS = 1, // gtest info of MKLDNN
MKLDNN_SIZES = 2, // size info of MKLDNN
MKLDNN_FMTS = 3, // format info of MKLDNN
MKLDNN_FMTS = 2, // format info of MKLDNN
MKLDNN_SIZES = 3, // size info of MKLDNN
MKLDNN_ALL = 4, // show all info of MKLDNN
} MKLDNN_LOG_LEVEL;
......
......@@ -116,8 +116,6 @@ void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
resetFwdBuffers(fwdPD_, in, wgt, bias, out);
resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
printValueFormatFlow();
}
void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
......@@ -135,12 +133,6 @@ void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out);
resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
printGradFormatFlow();
}
void MKLDNNConvLayer::updateInputData() {
cpuInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
......@@ -211,11 +203,18 @@ void MKLDNNConvLayer::resetFwdBuffers(
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(pd);
resetInValue(pd, in);
resetInValue(
in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
resetOutValue(out, pd->dst_primitive_desc());
resetWgtBiasValue(pd, wgt, bias);
resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
resetOutValue(pd, out);
if (biases_ && biases_->getW()) {
resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
} else {
bias = nullptr;
}
}
void MKLDNNConvLayer::resetFwdPipeline(
......@@ -225,104 +224,12 @@ void MKLDNNConvLayer::resetFwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
if (cvtInVal_) {
pipeline.push_back(*cvtInVal_);
}
if (bias) {
fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
} else {
fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
}
pipeline.push_back(*fwd_);
if (cvtOutVal_) {
pipeline.push_back(*cvtOutVal_);
}
}
void MKLDNNConvLayer::resetInValue(
std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& in) {
const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc());
// create buffer and reorder if input value do not match
cpuInVal_ = nullptr;
cvtInVal_ = nullptr;
MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr);
if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
in = dnnIn;
return;
}
if (dnnIn) {
if (dnnIn->getFormat() == format::nc) {
CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format";
// create a new one with nchw format and same data
memory::dims inDims = memory::dims{bs_, ic_, 1, 1};
dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
}
if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
in = dnnIn;
return;
}
cpuInVal_ = dnnIn;
in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
CHECK(cvtInVal_) << "should not be emptry";
} else {
memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
// create new mkldnn matrix
in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
CHECK(cvtInVal_) << "should not be emptry";
} else {
in = cpuInVal_;
}
}
}
void MKLDNNConvLayer::resetWgtBiasValue(
std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc());
VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
bias = (biases_ && biases_->getW())
? MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc())
: nullptr;
}
void MKLDNNConvLayer::resetOutValue(
std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& out) {
out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc());
// create reorder if output value has cpu device and pd do not match
cpuOutVal_ = nullptr;
cvtOutVal_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) {
out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc());
cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
CHECK(cvtOutVal_) << "should not be empty";
} else {
cpuOut->setData(output_.value->getData());
cpuOutVal_ = out;
}
// when output is cpu device, change the mkldnn output value and make them
// share the same data. Then if next layer use inputlayer->getOuputValue()
// to achieve the input value, it will get the right data.
output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
return;
}
output_.value = std::dynamic_pointer_cast<Matrix>(out);
}
void MKLDNNConvLayer::resetBwdWgtPD(
......@@ -331,8 +238,8 @@ void MKLDNNConvLayer::resetBwdWgtPD(
loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
// create backward weight using input, output and weight value memory desc
CHECK(inVal_) << "Should have input value";
CHECK(outVal_) << "Should have output value";
CHECK(inVal_) << "Should have internal input value";
CHECK(outVal_) << "Should have internal output value";
CHECK(wgtVal_) << "Should have weight value";
algorithm algo = algorithm::convolution_direct;
padding_kind padKind = padding_kind::zero;
......@@ -372,8 +279,8 @@ void MKLDNNConvLayer::resetBwdDataPD(
memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
CHECK(inVal_) << "Should have input value";
CHECK(outVal_) << "Should have output value";
CHECK(inVal_) << "Should have internal input value";
CHECK(outVal_) << "Should have internal output value";
// create backward data using input and output value memory desc
// but using weight memory desc with any format
auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
......@@ -399,12 +306,27 @@ void MKLDNNConvLayer::resetBwdBuffers(
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(wgtPD);
resetOutGrad(wgtPD, out);
resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
resetWgtBiasGrad(wgtPD, wgt, bias);
resetWithMatrix(
wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
CHECK(wgtVal_ != nullptr &&
wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
<< "primitive desc of weight grad and value should be equal";
resetInGrad(dataPD, in);
bias = nullptr;
if (biases_ && biases_->getWGrad()) {
resetWithMatrix(
bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
CHECK(bias && biasVal_ &&
bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
<< "primitive desc of bias grad should equal the bias value";
}
if (dataPD == nullptr) {
return;
}
resetInGrad(in, dataPD->diff_src_primitive_desc());
resetWgtValBwdData(dataPD, wgtValBwdData_);
}
......@@ -416,10 +338,7 @@ void MKLDNNConvLayer::resetBwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
if (cvtOutGrad_) {
pipeline.push_back(*cvtOutGrad_);
}
CHECK(inVal_);
// add bwdWgt handle
if (bias) {
bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias));
......@@ -431,99 +350,13 @@ void MKLDNNConvLayer::resetBwdPipeline(
if (dataPD == nullptr) {
return;
}
if (cvtWgtVal_) {
pipeline.push_back(*cvtWgtVal_);
}
// add bwdData handle
CHECK(wgtValBwdData_) << "Should have weight memory";
bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
pipeline.push_back(*bwdData_);
if (cvtInGrad_) {
pipeline.push_back(*cvtInGrad_);
}
}
void MKLDNNConvLayer::resetOutGrad(
std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
cpuOutGrad_ = nullptr;
cvtOutGrad_ = nullptr;
CHECK(outVal_ != nullptr &&
outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc())
<< "primitive desc of out grad and value should be equal";
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
// always share the same grad data of CPU output
// then the activation can get the right grad from output_.grad
output_.grad->setData(cpuOut->getData());
// same PrimitiveDesc with cpuInVal_
CHECK(cpuOutVal_);
cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
// create reorder if primitive desc does not match
if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
CHECK(cvtOutGrad_);
} else {
out = cpuOutGrad_;
}
}
}
void MKLDNNConvLayer::resetWgtBiasGrad(
std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
wgt = MKLDNNMatrix::create(weight_->getWGrad(),
wgtPD->diff_weights_primitive_desc());
CHECK(nullptr != wgtVal_ &&
wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
<< "primitive desc of weight grad and value should be equal";
VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat();
bias = nullptr;
if (biasVal_ == nullptr) {
return;
}
bias = MKLDNNMatrix::create(biases_->getWGrad(),
wgtPD->diff_bias_primitive_desc());
CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
<< "primitive desc of bias grad should equal the bias value";
}
void MKLDNNConvLayer::resetInGrad(
std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
MKLDNNMatrixPtr& in) {
in = nullptr;
cpuInGrad_ = nullptr;
cvtInGrad_ = nullptr;
if (dataPD == nullptr) {
return;
}
if (inputIsOnlyMKLDNN()) {
MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc());
CHECK(nullptr != inVal_ &&
in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
<< "primitive desc of input grad and value should be equal";
} else {
const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
// same PrimitiveDesc with cpuInVal_
CHECK(cpuInVal_);
cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
in = cpuInGrad_;
// create reorder if PrimitiveDesc does not match
if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) {
in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE),
dataPD->diff_src_primitive_desc());
cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
CHECK(cvtInGrad_);
}
}
}
void MKLDNNConvLayer::resetWgtValBwdData(
......@@ -537,8 +370,7 @@ void MKLDNNConvLayer::resetWgtValBwdData(
// since the primitive_desc would be different with wgtVal_
CHECK(wgtVal_) << "should have weight value";
if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
wgtValBwdData_ =
MKLDNNMatrix::create(nullptr, dataPD->weights_primitive_desc());
wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
CHECK(cvtWgtVal_);
} else {
......
......@@ -48,17 +48,6 @@ protected:
// save forward primitive_desc, which can be used backward
std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
// MKLDNNMatrixPtr which should be created from CPU Device
MKLDNNMatrixPtr cpuInVal_;
MKLDNNMatrixPtr cpuInGrad_;
MKLDNNMatrixPtr cpuOutVal_;
MKLDNNMatrixPtr cpuOutGrad_;
// convert handle between CPU device and MKLDNN device
std::shared_ptr<mkldnn::reorder> cvtInVal_;
std::shared_ptr<mkldnn::reorder> cvtInGrad_;
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// whether the weight has been init
bool hasInitedWgt_;
......@@ -94,8 +83,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void updateWeights(const UpdateCallback& callback) override;
void convertWeightsFromPaddle() override;
......@@ -109,26 +96,6 @@ public:
<< ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
}
void printValueFormatFlow() override {
if (cpuInVal_) {
VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>";
}
MKLDNNLayer::printValueFormatFlow();
if (cpuOutVal_) {
VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat();
}
}
void printGradFormatFlow() override {
if (cpuInGrad_) {
VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<";
}
MKLDNNLayer::printGradFormatFlow();
if (cpuOutGrad_) {
VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat();
}
}
protected:
/**
* load the dims settings of this conv
......@@ -162,23 +129,6 @@ protected:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* reset MKLDNNMatrix of input value
*/
void resetInValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in);
/**
* reset MKLDNNMatrix of weight and bias value
*/
void resetWgtBiasValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias);
/**
* reset MKLDNNMatrix of output value
*/
void resetOutValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& out);
/**
* reset the backward weight primitive descriptor.
*/
......@@ -207,22 +157,6 @@ protected:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* reset MKLDNNMatrix of output grad
*/
void resetOutGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
MKLDNNMatrixPtr& out);
/**
* reset MKLDNNMatrix of weight and bias grad
*/
void resetWgtBiasGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias);
/**
* reset MKLDNNMatrix of input grad
*/
void resetInGrad(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
MKLDNNMatrixPtr& in);
/**
* reset MKLDNNMatrix of weight value for backward data
* since the primitive_desc would be different with wgtVal_
......
......@@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
hasInitedWgt_ = true;
}
......@@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
CHECK(wgtVal_) << "should have been initialized";
bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
}
......@@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
resetFwdPD(fwdPD_, in, wgt, bias, out);
resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
printValueFormatFlow();
}
void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
......@@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
resetBwdDataPD(bwdDataPD, in, out);
resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
printGradFormatFlow();
}
void MKLDNNFcLayer::updateInputData() {
inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
......@@ -139,51 +131,30 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetInValue(in);
resetWgtBiasValue(wgt, bias);
resetOutValue(out);
}
void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
if (inputIsOnlyMKLDNN()) {
const MatrixPtr& dnnIn = getInputValue(0);
in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
CHECK(in) << "Input should be MKLDNNMatrix";
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
in = MKLDNNMatrix::create(
cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
CHECK(in);
in->downSpatial();
}
void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
auto outPD =
MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
resetOutValue(out, outPD);
format wgtFmt = format::oihw;
if (inVal_->getFormat() == format::nChw8c) {
if (in->getFormat() == format::nChw8c) {
wgtFmt = format::oIhw8i;
} else if (inVal_->getFormat() == format::nChw16c) {
} else if (in->getFormat() == format::nChw16c) {
wgtFmt = format::oIhw16i;
}
wgt = MKLDNNMatrix::create(
weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
auto wgtPD =
MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
resetWithMatrix(wgt, weight_->getW(), wgtPD);
wgt->downSpatial();
VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
bias = (biases_ && biases_->getW())
? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
: nullptr;
}
void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
if (!outputIsOnlyMKLDNN()) {
// fc cpu output value do not need create convert, just share data
getOutput(CPU_DEVICE).value->setData(out->getData());
if (biases_ && biases_->getW()) {
auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
resetWithMatrix(bias, biases_->getW(), biasPD);
} else {
bias = nullptr;
}
output_.value = std::dynamic_pointer_cast<Matrix>(out);
}
void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
......@@ -219,7 +190,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
} else {
fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
}
pipeline.push_back(*fwd_);
}
......@@ -227,44 +197,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
resetOutGrad(out);
resetWgtBiasGrad(wgt, bias);
resetInGrad(in);
}
void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
CHECK(outVal_);
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
output_.grad->setData(cpuOut->getData());
out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
}
}
CHECK(inVal_ && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc());
void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias) {
CHECK(wgtVal_);
wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
if (biasVal_) {
resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
} else {
bias = nullptr;
if (biasVal_ == nullptr) {
return;
}
bias =
MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc());
}
void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
in = nullptr;
if (inputLayers_[0]->getOutput().grad == nullptr) {
return;
}
CHECK(inVal_);
MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
}
void MKLDNNFcLayer::resetBwdWgtPD(
......
......@@ -66,8 +66,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void updateWeights(const UpdateCallback& callback) override;
void convertWeightsFromPaddle() override;
......@@ -84,9 +82,6 @@ protected:
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetInValue(MKLDNNMatrixPtr& in);
void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
void resetOutValue(MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr wgt,
......@@ -109,9 +104,6 @@ protected:
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetOutGrad(MKLDNNMatrixPtr& out);
void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
void resetInGrad(MKLDNNMatrixPtr& in);
void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNLayer.h"
using namespace mkldnn; // NOLINT
typedef memory::format format;
namespace paddle {
bool MKLDNNLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
<< "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet";
// set device id before Layer::init
setDevice(MKLDNN_DEVICE);
// change param device to MKLDNN device
setParamsDevice(MKLDNN_DEVICE, parameterMap);
if (!Layer::init(layerMap, parameterMap)) {
return false;
}
setOutputMap();
checkCPUOutputsNumber();
stream_.reset(new MKLDNNStream());
engine_ = CPUEngine::Instance().getEngine();
return true;
}
void MKLDNNLayer::forward(PassType passType) {
passType_ = passType;
{
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
CHECK(!inputLayers_.empty());
copySeqInfoToOutputs();
size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
if (inputElemenCnt_ != elemenCnt) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
// reset when input total sizes changed, not only the batchsize
inputElemenCnt_ = elemenCnt;
pipelineFwd_.clear();
reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
// all cpu device output grad or value share output's
shareCPUDevice();
resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
// MKLDNNLayer output value should be MKLDNNMatrix
// so external output value is necessary.
// Then external input value is not necessary,
// since input may be mkldnn internal buffer.
CHECK(extOutVal_) << "external output value is necessary";
output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
CHECK(inVal_ && outVal_) << "internal memories are necessary";
if (cvtInVal_) {
pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
}
if (cvtOutVal_) {
pipelineFwd_.push_back(*cvtOutVal_);
}
convertWeightsFromPaddle();
printSizeInfo();
printValueFormat();
needResetBwd_ = true;
}
if (inputLayers_[0]->getType() == "data") {
// Update input value data when input layer is "data" type,
// since the input value data address might be changed.
CHECK(extInVal_);
extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
if (!outputOnlyMKLDNN_) {
clearGrads();
}
stream_->submit(pipelineFwd_);
}
{
REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
forwardActivation();
}
}
void MKLDNNLayer::backward(const UpdateCallback& callback) {
if (needResetBwd_) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
pipelineBwd_.clear();
pipelineMergeGrad_.clear();
mergeGrad_ = nullptr;
resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
// external output grad is not necessary
// since output may be mkldnn internal buffer or merge them directly.
CHECK(outGrad_) << "internal output grad is necessary";
if (extOutGrad_) {
CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
<< "the external buffer should share the same data with output_.grad";
}
if (cvtOutGrad_) {
pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
}
if (cvtInGrad_) {
pipelineBwd_.push_back(*cvtInGrad_);
}
printGradFormat();
needResetBwd_ = false;
}
// merge grad must before backward activation
if (mergeGrad_) {
REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
stream_->submit(pipelineMergeGrad_);
}
{
REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
backwardActivation();
}
{
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
stream_->submit(pipelineBwd_);
}
{
REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
updateWeights(callback);
}
}
void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
const Argument& input = inputLayers_[0]->getOutput();
batchsize = input.getBatchSize();
int h = input.getFrameHeight();
int w = input.getFrameWidth();
if (h != 0) {
height = h;
}
if (w != 0) {
width = w;
}
}
void MKLDNNLayer::reshapeOutput(size_t height, size_t width) {
output_.setFrameHeight(height);
output_.setFrameWidth(width);
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
outputOtherDevice_[i].setFrameHeight(height);
outputOtherDevice_[i].setFrameWidth(width);
}
}
void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
const MatrixPtr& mat,
memory::primitive_desc pd) {
dnn = nullptr;
if (mat == nullptr) {
return;
}
dnn = MKLDNNMatrix::create(pd, mat);
}
void MKLDNNLayer::resetInValue(
MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
cvtInVal_ = nullptr;
extInVal_ = nullptr;
in = nullptr;
CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
auto extPD = MKLDNNMatrix::createPrimitiveDesc(
{bs_, ic_, ih_, iw_}, format::nchw, engine_);
const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
if (in == nullptr || in->getFormat() == format::nc) {
in = MKLDNNMatrix::create(extPD, inMat);
}
extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
if (in->getFormat() == format::nc) {
CHECK(ih_ == 1 && iw_ == 1);
}
if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
return;
}
// need create reorder
in = MKLDNNMatrix::create(*intPD);
extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
CHECK(cvtInVal_) << "should not be emptry";
}
void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
memory::primitive_desc intPD) {
cvtOutVal_ = nullptr;
out = MKLDNNMatrix::create(intPD, output_.value);
extOutVal_ = out;
if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
return;
}
// need create reorder
CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
extOutVal_ = MKLDNNMatrix::create(
memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
out = MKLDNNMatrix::create(intPD);
cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
CHECK(cvtOutVal_) << "should not be empty";
}
void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
memory::primitive_desc intPD) {
cvtInGrad_ = nullptr;
extInGrad_ = nullptr;
in = nullptr;
LayerPtr& input = inputLayers_[0];
if (input->getOutputGrad() == nullptr) {
// no need input grad
return;
}
CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
<< "only support input is MKLDNN layer or only have one output layer";
// when input is a mkldnn branch node,
// this layer will save input grad to a internal buffer,
// and the mkldnn input layer will merge them to actual prev->output_.grad
const MatrixPtr& inMat =
input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
in = MKLDNNMatrix::create(intPD, inMat);
Argument& arg = input->getOutput(this->getName());
arg.grad = std::dynamic_pointer_cast<Matrix>(in);
CHECK(inVal_);
CHECK(inVal_->getPrimitiveDesc() == intPD) << "the primitive desc must equal";
if (inputIsOnlyMKLDNN()) {
return;
}
extInGrad_ = in;
if (isPaddleFormat(extInGrad_->getFormat())) {
return;
}
// need create reorder
// TODO(TJ): add macro definition to simplify it
CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
<< "should have external input value and the format must be nchw(nc)";
extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
<< "should have internal input value and primitive desc must equal";
in = MKLDNNMatrix::create(intPD);
cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
CHECK(cvtInGrad_);
}
void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
memory::primitive_desc intPD) {
cvtOutGrad_ = nullptr;
extOutGrad_ = nullptr;
out = nullptr;
MatrixPtr& outMat = output_.grad;
out = MKLDNNMatrix::create(intPD, outMat);
resetMergeGrad(out);
if (outputIsOnlyMKLDNN()) {
return;
}
CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
extOutGrad_ = out;
if (isPaddleFormat(extOutGrad_->getFormat())) {
return;
}
// need create reorder
CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
<< "should have external output value and the format must be nchw(nc)";
extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
<< "should have internal output value and primitive desc must equal";
out = MKLDNNMatrix::create(intPD);
cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
CHECK(cvtOutGrad_);
}
void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
mergeGrad_ = nullptr;
pipelineMergeGrad_.clear();
if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
// do not merge when output is not all MKLDNN or only one output
return;
}
CHECK(out) << "should have reset internal ouput grad";
std::vector<double> scales(outputMap_.size(), 1.0);
std::vector<memory::primitive_desc> srcPDs;
std::vector<primitive::at> srcs;
for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
MKLDNNMatrixPtr src =
std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
CHECK(src) << "should be MKLDNNMatrix";
auto srcDims = src->getDims();
auto dstDims = out->getDims();
CHECK_EQ(srcDims.size(), dstDims.size());
for (size_t i = 0; i < srcDims.size(); ++i) {
CHECK_EQ(srcDims[i], dstDims[i]);
}
VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first
<< ", format " << src->getFormat();
srcPDs.push_back(src->getPrimitiveDesc());
srcs.push_back(*src);
}
// TODO(TJ): remove me when mkldnn sum support different formats
for (size_t i = 1; i < srcPDs.size(); ++i) {
CHECK(srcPDs[0] == srcPDs[i]);
}
tmpOutGrad_ = out;
tmpCvt_ = nullptr;
if (out->getPrimitiveDesc() != srcPDs[0]) {
tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
CHECK(tmpCvt_);
pipelineMergeGrad_.push_back(*tmpCvt_);
}
auto sumPD =
sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
}
} // namespace paddle
......@@ -58,11 +58,31 @@ protected:
std::vector<mkldnn::primitive> pipelineFwd_;
std::vector<mkldnn::primitive> pipelineBwd_;
// MKLDNNMatrixPtr with internal format
/* Value and grad are seperated as internal and external buffers.
* Each MKLDNNLayer must init or reset internal buffer at least,
* and the external buffer format is always nchw of nc(when h==w==1),
* which is the same format as paddle.
* The output_.value and output_.grad always save the external data,
* when mixed with cpu device.
* When all layers are mkldnn layers, they could save internal data.
*/
// below MKLDNNMatrix buffers are all internal buffers
MKLDNNMatrixPtr inVal_;
MKLDNNMatrixPtr inGrad_;
MKLDNNMatrixPtr outVal_;
MKLDNNMatrixPtr outGrad_;
// below are external value and grad
MKLDNNMatrixPtr extInVal_;
MKLDNNMatrixPtr extInGrad_;
MKLDNNMatrixPtr extOutVal_;
MKLDNNMatrixPtr extOutGrad_;
// convert handle between external and internal buffers
std::shared_ptr<mkldnn::reorder> cvtInVal_;
std::shared_ptr<mkldnn::reorder> cvtInGrad_;
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// weight and bias are always internal buffers
MKLDNNMatrixPtr wgtVal_;
MKLDNNMatrixPtr wgtGrad_;
MKLDNNMatrixPtr biasVal_;
......@@ -91,6 +111,7 @@ public:
oh_(0),
ow_(0),
needResetBwd_(true),
outputOnlyMKLDNN_(false),
engine_(mkldnn::engine::cpu, 0),
stream_(nullptr),
fwd_(nullptr),
......@@ -99,92 +120,9 @@ public:
~MKLDNNLayer() {}
virtual bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
<< "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet";
// set device id before Layer::init
setDevice(MKLDNN_DEVICE);
// change param device to MKLDNN device
setParamsDevice(MKLDNN_DEVICE, parameterMap);
if (!Layer::init(layerMap, parameterMap)) {
return false;
}
setOutputMap();
checkCPUOutputsNumber();
stream_.reset(new MKLDNNStream());
engine_ = CPUEngine::Instance().getEngine();
return true;
}
void forward(PassType passType) override {
passType_ = passType;
{
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
CHECK(!inputLayers_.empty());
copySeqInfoToOutputs();
size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
if (inputElemenCnt_ != elemenCnt) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
// reset when input total sizes changed, not only the batchsize
inputElemenCnt_ = elemenCnt;
pipelineFwd_.clear();
reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
convertWeightsFromPaddle();
needResetBwd_ = true;
}
if (inputLayers_[0]->getType() == "data") {
updateInputData();
}
if (!outputOnlyMKLDNN_) {
clearGrads();
}
stream_->submit(pipelineFwd_);
}
/* activation */ {
REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
forwardActivation();
}
}
void backward(const UpdateCallback& callback) override {
if (needResetBwd_) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
pipelineBwd_.clear();
pipelineMergeGrad_.clear();
mergeGrad_ = nullptr;
resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
needResetBwd_ = false;
}
// merge grad must before backward activation
if (mergeGrad_) {
REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
stream_->submit(pipelineMergeGrad_);
}
{
REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
backwardActivation();
}
{
REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
stream_->submit(pipelineBwd_);
}
{
REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
updateWeights(callback);
}
}
virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
virtual void forward(PassType passType);
virtual void backward(const UpdateCallback& callback);
/**
* reshape the input image sizes
......@@ -195,7 +133,7 @@ public:
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
/**
* reset the mkldnn forward primitve and memory
* reset the mkldnn forward primitve and memories
* only would be called when input size changes
*/
virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
......@@ -205,7 +143,7 @@ public:
MKLDNNMatrixPtr& out) = 0;
/**
* reset the mkldnn backward primitve and memory for mkldnn fc
* reset the mkldnn backward primitve and memories
* only would be called when needed
*/
virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
......@@ -214,12 +152,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) = 0;
/**
* Update input value data when input layer is "data" type.
* Since the input value data address might be changed.
*/
virtual void updateInputData() {}
/**
* Update weights and biases if necessary.
*/
......@@ -246,131 +178,78 @@ protected:
/**
* reshape the input image sizes and input batchsize
*/
virtual void reshapeInput(int& batchsize, int& height, int& width) {
const Argument& input = inputLayers_[0]->getOutput();
batchsize = input.getBatchSize();
int h = input.getFrameHeight();
int w = input.getFrameWidth();
if (h != 0) {
height = h;
}
if (w != 0) {
width = w;
}
}
void reshapeInput(int& batchsize, int& height, int& width);
/**
* reshape output image sizes
*/
virtual void reshapeOutput(size_t height, size_t width) {
output_.setFrameHeight(height);
output_.setFrameWidth(width);
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
outputOtherDevice_[i].setFrameHeight(height);
outputOtherDevice_[i].setFrameWidth(width);
}
}
void reshapeOutput(size_t height, size_t width);
/**
* reset the output grad matrix from primitive desc.
* and reset the merge grad primitive if needed.
* note: when this layer has serval outputs,
* it could not be mixed with cpu device,
* since it can not get memory desc from cpu device.
* reset MKLDNNMatrix from Matrix and internal primitive desc.
* reset nullptr if matrix or primitive desc is empty
*/
virtual void resetOutGrad(MKLDNNMatrixPtr& out,
mkldnn::memory::primitive_desc pd) {
CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
mergeGrad_ = nullptr;
pipelineMergeGrad_.clear();
out = MKLDNNMatrix::create(output_.grad, pd);
if (outputMap_.size() <= 1) {
return;
}
std::vector<double> scales(outputMap_.size(), 1.0);
std::vector<mkldnn::memory::primitive_desc> srcPDs;
std::vector<mkldnn::primitive::at> srcs;
for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
MKLDNNMatrixPtr src =
std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
CHECK(src) << "should be MKLDNNMatrix";
auto srcDims = src->getDims();
auto dstDims = out->getDims();
CHECK_EQ(srcDims.size(), dstDims.size());
for (size_t i = 0; i < srcDims.size(); ++i) {
CHECK_EQ(srcDims[i], dstDims[i]);
}
srcPDs.push_back(src->getPrimitiveDesc());
srcs.push_back(*src);
}
// TODO(TJ): remove me when mkldnn sum support different formats
for (size_t i = 1; i < srcPDs.size(); ++i) {
CHECK(srcPDs[0] == srcPDs[i]);
}
tmpOutGrad_ = nullptr;
tmpCvt_ = nullptr;
if (out->getPrimitiveDesc() != srcPDs[0]) {
tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
CHECK(tmpCvt_);
pipelineMergeGrad_.push_back(*tmpCvt_);
} else {
tmpOutGrad_ = out;
}
void resetWithMatrix(MKLDNNMatrixPtr& dnn,
const MatrixPtr& mat,
mkldnn::memory::primitive_desc pd);
auto sumPD = mkldnn::sum::primitive_desc(
tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
}
/**
* reset input value from input MKLDNNMatrix and internal primitive desc.
* reset both internal and external buffer and create reorder if necessary.
*/
void resetInValue(
MKLDNNMatrixPtr& in,
const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
/**
* reset input grad from primitive desc.
* this function is avaiable for input is only mkldnn
* or input do not care cpu device
* reset output value from internal primitive desc.
* reset both internal and external buffer and create reorder if necessary.
*/
virtual void resetInGrad(MKLDNNMatrixPtr& in,
mkldnn::memory::primitive_desc pd) {
LayerPtr& input = inputLayers_[0];
const MatrixPtr& grad =
input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
in = MKLDNNMatrix::create(grad, pd);
Argument& arg = input->getOutput(this->getName());
arg.grad = std::dynamic_pointer_cast<Matrix>(in);
}
void resetOutValue(MKLDNNMatrixPtr& out,
mkldnn::memory::primitive_desc intPD);
/**
* print info about sizes
* reset input grad from internal primitive desc.
* reset both internal and external buffer and create reorder if necessary.
*/
virtual void printSizeInfo() {
VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
<< ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
<< ", oh: " << oh_ << ", ow: " << ow_;
}
void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
/**
* Print the mkldnn memory format flow of value
* reset output grad from internal primitive desc.
* merge grad if necessary.
* reset both internal and external buffer and create reorder if necessary.
* note: about merge grad, when this layer has several outputs,
* it could not be mixed with cpu device,
* since it can not get memory desc from cpu device.
*/
virtual void printValueFormatFlow() {
if (inVal_ && outVal_) {
VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>> "
<< outVal_->getFormat();
}
}
void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
/**
* reset the merge grad primitive if necessary.
* note: do not support the grads mixed with cpu device,
* since it can not get memory desc from cpu device.
*/
void resetMergeGrad(MKLDNNMatrixPtr& out);
protected:
/**
* Set deviceId of this layer.
*/
void setDevice(int id) { deviceId_ = id; }
/**
* Print the mkldnn memory format flow of grad
* check the format is nchw or nc,
* which is supported by Paddle default memory layout
*/
virtual void printGradFormatFlow() {
if (inGrad_ && outGrad_) {
VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<< "
<< outGrad_->getFormat();
bool isPaddleFormat(mkldnn::memory::format fmt) {
if (fmt == mkldnn::memory::format::nchw ||
fmt == mkldnn::memory::format::nc) {
return true;
} else {
return false;
}
}
protected:
/**
* If input only has MKLDNN device.
* Otherwise, only support the previous layer using CPU device.
......@@ -380,7 +259,6 @@ protected:
if (prevDevice == MKLDNN_DEVICE) {
return true;
} else {
// do not support GPU yet
CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
return false;
}
......@@ -400,20 +278,76 @@ protected:
}
/**
* Set deviceId of this layer.
* print info about sizes
*/
void setDevice(int id) { deviceId_ = id; }
virtual void printSizeInfo() {
VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
<< ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
<< ", oh: " << oh_ << ", ow: " << ow_;
}
/**
* print the mkldnn memory format of value
*/
virtual void printValueFormat() {
if (extInVal_) {
VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> ";
}
if (inVal_) {
VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
if (wgtVal_) {
VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
}
if (biasVal_) {
VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
}
}
/**
* print the mkldnn memory format of grad
*/
virtual void printGradFormat() {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
if (inGrad_) {
VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<";
}
if (extInGrad_) {
VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< ";
}
if (wgtGrad_) {
VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
}
if (biasGrad_) {
VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
}
}
private:
/**
* clear all grad
*/
void clearGrads() {
if (output_.grad) {
output_.grad->zeroMem();
}
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
if (outputOtherDevice_[i].grad) {
outputOtherDevice_[i].grad->zeroMem();
}
}
}
/**
* Set deviceId of the params used in this layer.
......@@ -449,6 +383,19 @@ private:
}
}
/**
* if have cpu device, share value and grad data with output_
*/
void shareCPUDevice() {
if (outputIsOnlyMKLDNN()) {
return;
}
for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
outputOtherDevice_[i].value = output_.value;
outputOtherDevice_[i].grad = output_.grad;
}
}
/**
* Check the cpu device number of outputOtherDevice_.
* should have only one at most.
......
......@@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
resetFwdPD(fwdPD_, in, out);
resetFwdPipeline(pipeline, fwdPD_, in, out);
printValueFormatFlow();
}
void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
......@@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
resetBwdPD(pd, in, out);
resetBwdPipeline(pipeline, pd, in, out);
printGradFormatFlow();
}
void MKLDNNPoolLayer::updateInputData() {
inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
}
void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
resetInValue(in);
resetOutValue(out);
}
void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
if (inputIsOnlyMKLDNN()) {
const MatrixPtr& dnnIn = getInputValue(0);
in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
CHECK(in) << "Input should be MKLDNNMatrix";
} else {
CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
in = MKLDNNMatrix::create(
cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
}
}
void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
CHECK(inVal_) << "Should reset input value first";
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
out = MKLDNNMatrix::create(
output_.value, outDims, inVal_->getFormat(), engine_);
// create reorder if output value has cpu device and pd do not match
cpuOutVal_ = nullptr;
cvtOutVal_ = nullptr;
if (!outputIsOnlyMKLDNN()) {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc());
cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
CHECK(cvtOutVal_) << "should not be emptry";
} else {
cpuOut->setData(output_.value->getData());
cpuOutVal_ = out;
}
output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
return;
}
output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
CHECK(in);
auto outPD =
MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
resetOutValue(out, outPD);
}
void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr out) {
memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
memory::dims kernels = memory::dims{fh_, fw_};
memory::dims strides = memory::dims{sh_, sw_};
memory::dims padL = memory::dims{ph_, pw_};
......@@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline(
? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
: std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
pipeline.push_back(*fwd_);
if (cvtOutVal_) {
pipeline.push_back(*cvtOutVal_);
}
}
void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
resetOutGrad(out);
resetInGrad(in);
}
void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
cpuOutGrad_ = nullptr;
cvtOutGrad_ = nullptr;
CHECK(outVal_);
if (outputIsOnlyMKLDNN()) {
MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
} else {
const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
// always share the same grad data of CPU output
// then the activation can get the right grad from output_.grad
output_.grad->setData(cpuOut->getData());
cpuOutGrad_ = MKLDNNMatrix::create(
cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
CHECK(cvtOutGrad_) << "should not be emptry";
} else {
out = cpuOutGrad_;
}
}
}
void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
in = nullptr;
if (inputLayers_[0]->getOutput().grad == nullptr) {
return;
}
CHECK(inVal_);
MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
CHECK(inVal_ && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc());
}
void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
pd = nullptr;
if (in == nullptr) {
return;
}
memory::dims kernels = memory::dims{fh_, fw_};
memory::dims strides = memory::dims{sh_, sw_};
memory::dims padL = memory::dims{ph_, pw_};
memory::dims padR = getPaddingR();
CHECK(in);
CHECK(out);
auto bwdDesc = pool_bwd::desc(poolAlgo_,
in->getMemoryDesc(),
......@@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline(
std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
if (cvtOutGrad_) {
pipeline.push_back(*cvtOutGrad_);
if (pd == nullptr) {
return;
}
bwdData_ =
......
......@@ -38,13 +38,6 @@ protected:
// pooling_avg or pooling_max
mkldnn::algorithm poolAlgo_;
// MKLDNNMatrixPtr which should be created from CPU Device
MKLDNNMatrixPtr cpuOutVal_;
MKLDNNMatrixPtr cpuOutGrad_;
// convert handle between CPU device and MKLDNN device
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
// save forward primitive_desc, which can be used backward
std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
// according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
......@@ -74,8 +67,6 @@ public:
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override;
void updateInputData() override;
void printSizeInfo() override {
MKLDNNLayer::printSizeInfo();
VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
......@@ -90,8 +81,6 @@ protected:
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetInValue(MKLDNNMatrixPtr& in);
void resetOutValue(MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
MKLDNNMatrixPtr out);
......@@ -106,8 +95,6 @@ protected:
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetOutGrad(MKLDNNMatrixPtr& out);
void resetInGrad(MKLDNNMatrixPtr& in);
void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
......
......@@ -97,7 +97,7 @@ void MKLDNNTester::randomWgtDatas() {
parameters_[REF][i]->randomize();
dnnValue->copyFrom(*refValue);
VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
printVector(dnnValue);
}
}
......@@ -109,7 +109,7 @@ void MKLDNNTester::randomBotDatas() {
dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
dataLayers_[DNN][i]->getOutputValue()->copyFrom(
*(dataLayers_[REF][i]->getOutputValue()));
VLOG(lvl_) << "Input " << i << " data:";
VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i;
printMatrix(dataLayers_[REF][i]->getOutputValue());
}
}
......@@ -118,12 +118,12 @@ void MKLDNNTester::randomTopDiffs() {
refLayer_->getOutputGrad()->randomizeUniform();
dnnLayer_->getOutput(CPU_DEVICE)
.grad->copyFrom(*(refLayer_->getOutputGrad()));
VLOG(lvl_) << "Random Backward Input, TopDiff: ";
VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
printMatrix(refLayer_->getOutputGrad());
}
void MKLDNNTester::checkForward() {
VLOG(MKLDNN_ALL) << "Check Forward";
VLOG(MKLDNN_TESTS) << "Check Forward";
printTopDatas();
double delta =
compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
......@@ -131,15 +131,15 @@ void MKLDNNTester::checkForward() {
}
void MKLDNNTester::checkBackwardData() {
VLOG(MKLDNN_ALL) << "Check Backward Data";
VLOG(MKLDNN_TESTS) << "Check Backward Data";
// TODO(TJ): uncomment me when batch norm ready
// const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
printMatrix(dnnDiff);
VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
printMatrix(refDiff);
double delta = compareMatrix(dnnDiff, refDiff);
......@@ -153,7 +153,7 @@ void MKLDNNTester::checkBackwardData() {
}
void MKLDNNTester::checkBackwardWgts() {
VLOG(MKLDNN_ALL) << "Check Backward Weight";
VLOG(MKLDNN_TESTS) << "Check Backward Weight";
CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
vector<VectorPtr> dnnWgts; // used to temply save mkldnn weights
saveWgt(parameters_[DNN], dnnWgts);
......@@ -165,9 +165,11 @@ void MKLDNNTester::checkBackwardWgts() {
for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value"
<< parameters_[DNN][i]->getName();
printVector(dnn);
VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
VLOG(MKLDNN_ALL) << "Reference Result: weight value "
<< parameters_[REF][i]->getName();
printVector(ref);
double delta = compareVector(dnn, ref);
......@@ -240,7 +242,8 @@ void MKLDNNTester::printTopDatas() {
}
for (int n = 0; n < NUM; ++n) {
VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
<< " Forward Result: OutputValue";
printMatrix(testLayers_[n]->getOutputValue());
}
}
......@@ -252,7 +255,7 @@ void MKLDNNTester::printMatrix(const MatrixPtr& m) {
std::ostringstream ostr;
m->print(ostr);
VLOG(lvl_) << std::endl << ostr.str();
VLOG(MKLDNN_ALL) << std::endl << ostr.str();
}
void MKLDNNTester::printVector(const VectorPtr& v) {
......@@ -262,7 +265,7 @@ void MKLDNNTester::printVector(const VectorPtr& v) {
std::ostringstream ostr;
v->print(ostr, v->getSize());
VLOG(lvl_) << std::endl << ostr.str();
VLOG(MKLDNN_ALL) << std::endl << ostr.str();
}
double MKLDNNTester::getDelta(const real* d1,
......@@ -314,7 +317,7 @@ void MKLDNNTester::runOnce() {
UpdateCallback updateCallback = [](Parameter* para) {
auto& grad = para->getBuf(PARAMETER_GRADIENT);
auto& value = para->getBuf(PARAMETER_VALUE);
real lr = 1e-3;
real lr = 1e-2;
value->add(*grad, lr);
grad->zeroMem();
};
......@@ -340,10 +343,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
size_t batchSize,
size_t inputImgH,
size_t inputImgW,
bool printDetails,
size_t iter,
float epsilon,
bool log,
int level) {
float epsilon) {
CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
<< "should be MKLDNN layer or MKLDNN activation";
......@@ -359,10 +361,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
ih_ = inputImgH;
iw_ = inputImgW;
log_ = printDetails;
iter_ = iter;
eps_ = epsilon;
log_ = log;
lvl_ = level;
// Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
reset(dnn, ref, batchSize);
......@@ -531,9 +532,11 @@ void MKLDNNTester::getOutResult(const std::string& configPath,
void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size();
for (size_t i = 0; i < ref.outValues.size(); i++) {
EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
}
VLOG(MKLDNN_TESTS) << "compare param size: " << ref.outValues.size();
for (size_t i = 0; i < ref.paraValues.size(); i++) {
EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
}
......@@ -544,9 +547,10 @@ void MKLDNNTester::runBranchesTest(const std::string& configPath,
float eps) {
DataIn in;
initArgument(in, configPath, iter);
DataOut outCpu, outDnn;
VLOG(MKLDNN_TESTS) << "runing cpu network";
getOutResult(configPath, in, outCpu, false, iter);
VLOG(MKLDNN_TESTS) << "runing mkldnn network";
getOutResult(configPath, in, outDnn, true, iter);
compareResult(outCpu, outDnn, eps);
......
......@@ -58,8 +58,6 @@ protected:
size_t iter_;
/// whether to print out the details
bool log_;
/// vlog level to print the matrix details datas
int lvl_;
/// epsilon
float eps_;
/// input image size, default 1
......@@ -70,7 +68,6 @@ public:
iter_ = iter;
eps_ = epsilon;
log_ = false;
lvl_ = MKLDNN_ALL;
}
~MKLDNNTester() {}
......@@ -81,10 +78,9 @@ public:
size_t batchSize,
size_t inputImgH = 1,
size_t inputImgW = 1,
bool printDetails = false,
size_t iter = 3,
float epsilon = 1e-4,
bool log = false,
int level = MKLDNN_ALL);
float epsilon = 1e-4);
static void runBranchesTest(const std::string& configPath,
size_t iter = 3,
float eps = 1e-4);
......
......@@ -51,7 +51,10 @@ def test_sparse_non_value_no_seq(setting, filename):
yield [(i + 1) * (j + 1) for j in xrange(10)]
@provider(input_types=[sparse_vector(30000, seq_type=SequenceType.NO_SEQUENCE)])
@provider(input_types=[
sparse_float_vector(
30000, seq_type=SequenceType.NO_SEQUENCE)
])
def test_sparse_value_no_seq(setting, filename):
for i in xrange(200):
yield [((i + 1) * (j + 1), float(j) / float(i + 1)) for j in xrange(10)]
......
......@@ -18,7 +18,7 @@ using namespace mkldnn; // NOLINT
namespace paddle {
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) {
memory::desc md = pd.desc();
size_t ndims = md.data.ndims;
int* dims = md.data.dims;
......@@ -41,12 +41,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
}
MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
memory::dims dims,
MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims,
memory::format fmt,
engine& eg,
MatrixPtr m,
mkldnn::memory::data_type dtype) {
return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
return create(createPrimitiveDesc(dims, fmt, eg, dtype), m);
}
std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,
......
......@@ -40,24 +40,37 @@ public:
/**
* Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
*/
static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
MatrixPtr m = nullptr);
/**
* Create MKLDNNMatrix from a MatrixPtr and memory details info
*/
static MKLDNNMatrixPtr create(
MatrixPtr m,
mkldnn::memory::dims dims,
mkldnn::memory::format fmt,
mkldnn::engine& eg,
MatrixPtr m = nullptr,
mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
/**
* Create primitive descriptor.
* default with f32 dtype
*/
static mkldnn::memory::primitive_desc createPrimitiveDesc(
const mkldnn::memory::dims dims,
const mkldnn::memory::format& fmt,
const mkldnn::engine& eg,
const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
}
/**
* Create Memory descriptor.
* default with any format and f32 dtype
*/
static mkldnn::memory::desc createMemoryDesc(
const mkldnn::memory::dims& dims,
const mkldnn::memory::dims dims,
const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
return mkldnn::memory::desc(dims, dtype, fmt);
......
......@@ -115,7 +115,8 @@ set(DEPS_OPS
softmax_with_cross_entropy_op
sum_op
pool_op
pool_with_index_op)
pool_with_index_op
lstm_op)
op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
......@@ -126,6 +127,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(sum_op DEPS net_op)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
......
......@@ -114,7 +114,7 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
// im2col
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0], strides[1],
paddings[0], paddings[1]);
paddings[0], paddings[0], paddings[1], paddings[1]);
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
......@@ -213,7 +213,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
Tensor in_grad_slice =
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
col2im(context.device_context(), in_grad_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
}
}
}
......@@ -235,7 +236,8 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
strides[1], paddings[0], paddings[0], paddings[1],
paddings[1]);
// gemm
Tensor filter_grad_slice =
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2dtranspose_op.h"
namespace paddle {
namespace operators {
void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of Conv2DTransposeOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Filter"),
"Input(Filter) of Conv2DTransposeOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Output"),
"Output(Output) of Conv2DTransposeOp should not be null.");
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
for (size_t i = 0; i < paddings.size(); ++i) {
PADDLE_ENFORCE_EQ(paddings[i], 0,
"No Padding allowed in conv transpose op.");
}
PADDLE_ENFORCE_EQ(in_dims.size(), 4,
"Conv2DTransposeOp input should be 4-D tensor.");
PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
"Conv2DTransposeOp filter should be 4-D tensor.");
PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
"input and kernel input dimension should be equal.");
auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
ctx->SetOutputDim("Output",
{in_dims[0], filter_dims[1], output_height, output_width});
}
Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"Input",
"(Tensor) The input tensor of convolution transpose operator. "
"The format of input tensor is NCHW. Where N is batch size, C is the "
"number of input channels, H and W is the height and width of image.");
AddInput("Filter",
"(Tensor) The filter tensor of convolution transpose operator."
"The format of the filter tensor is CMHW, where C is the number of "
"output image channels, M is the number of input image channels, "
"H and W is height and width of filter. "
"We enforce groups number == 1 and padding == 0 in "
"convolution transpose Scenario.");
AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator."
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides",
"strides of convolution transpose operator.")
.SetDefault({1, 1});
AddAttr<std::vector<int>>("paddings",
"paddings of convolution transpose operator.")
.SetDefault({0, 0});
AddComment(R"DOC(
The convolution transpose operation calculates the output based on the input, filter
and strides, paddings, groups parameters. The size of each dimension of the
parameters is checked in the infer-shape.
)DOC");
}
void Conv2DTransposeOpGrad::InferShape(
framework::InferShapeContext* ctx) const {
auto in_dims = ctx->GetInputDim("Input");
auto filter_dims = ctx->GetInputDim("Filter");
if (ctx->HasOutput(framework::GradVarName("Input"))) {
ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
}
if (ctx->HasOutput(framework::GradVarName("Filter"))) {
ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
}
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(conv2dtranspose, ops::Conv2DTransposeOp,
ops::Conv2DTransposeOpMaker, conv2dtranspose_grad,
ops::Conv2DTransposeOpGrad);
REGISTER_OP_CPU_KERNEL(
conv2dtranspose,
ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
conv2dtranspose_grad,
ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/conv2dtranspose_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
conv2dtranspose,
ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>);
REGISTER_OP_GPU_KERNEL(
conv2dtranspose_grad,
ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/im2col.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
// Define Op classes in .h file so that other conv transpose
// operator implementations can reuse the code.
class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
Conv2DTransposeOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker);
};
class Conv2DTransposeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override;
};
class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override;
};
template <typename Place, typename T>
class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
// The filter will be reshaped, so it should not be constant pointer
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* output = context.Output<Tensor>("Output");
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// TODO(Zhuoyuan): Paddings can be added in future.
// groups will alway be disabled in conv2dtranspose.
const int batch_size = input->dims()[0];
const int m = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int k_h = filter.dims()[2];
const int k_w = filter.dims()[3];
const int c = output->dims()[1]; // output channels
const int o_h = output->dims()[2];
const int o_w = output->dims()[3];
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO, Place, T>
col2im;
// use col_shape in the im2col and col2im calculation
DDim col_shape = {c, k_h, k_w, h, w};
// use col_matrix_shape in the gemm calculation
DDim col_matrix_shape = {c * k_h * k_w, h * w};
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
Tensor col_matrix;
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
DDim output_shape = {c, o_h, o_w};
DDim input_matrix_shape = {m, h * w};
DDim filter_matrix_shape = {m, c * k_h * k_w};
filter.Resize(filter_matrix_shape);
// convolution transpose: gemm + col2im (similar to conv-backward on input)
output->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*output);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
for (int i = 0; i < batch_size; i++) {
// batch with size (M, h * w)
Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// filter size: (M, c * k_h * k_w)
// output size: (c, o_h, o_w)
Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
// col_matrix = filter * input_batch
// of shape (c * k_h * k_w, h * w)
math::matmul<Place, T>(context.device_context(), filter, true,
input_batch, false, T(1.0), &col_matrix, T(0.0));
col2im(context.device_context(), output_batch, col, strides[0],
strides[1], 0, 0, 0, 0);
}
}
};
template <typename Place, typename T>
class GemmConv2DTransposeGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("Input");
const Tensor* output_grad =
context.Input<Tensor>(framework::GradVarName("Output"));
// For filter, we do not use const pointer b/c we will do reshape,
// but we should avoid modifying its value.
Tensor filter = *context.Input<Tensor>("Filter");
Tensor* input_grad =
context.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad =
context.Output<Tensor>(framework::GradVarName("Filter"));
std::vector<int> strides = context.Attr<std::vector<int>>("strides");
// Actually, no paddings and groups allowed in conv transpose.
std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
const int batch_size = input->dims()[0];
const int m = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int k_h = filter.dims()[2];
const int k_w = filter.dims()[3];
const int c = output_grad->dims()[1]; // output channels
const int o_h = output_grad->dims()[2];
const int o_w = output_grad->dims()[3];
// Only im2col functor required for bp to get to the right shape
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, Place, T>
im2col;
// use col_shape in the im2col and col2im calculation
DDim col_shape = {c, k_h, k_w, h, w};
// use col_matrix_shape in the gemm calculation
DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
Tensor col;
col.mutable_data<T>(col_shape, context.GetPlace());
// col_matrix shares the same piece of data with col,
// but will be reshaped into a two-dimensional matrix shape
// to call the matrix multiplication interface.
DDim output_shape = {c, o_h, o_w};
DDim input_matrix_shape = {m, h * w};
DDim filter_matrix_shape = {m, c * k_h * k_w};
filter.Resize(filter_matrix_shape);
// convolution transpose grad on input:
// im2col + gemm (similar to conv-forward)
// input need to compute gradient
if (input_grad) {
Tensor col_matrix;
col_matrix.ShareDataWith(col);
DDim col_matrix_shape = {c * k_h * k_w, h * w};
col_matrix.Resize(col_matrix_shape);
input_grad->mutable_data<T>(context.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
for (int i = 0; i < batch_size; i++) {
// batch with size (c, o_h * o_w)
Tensor output_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_shape);
// filter of size (m, c * k_h * k_w)
// batch with size (m, h, w)
Tensor input_grad_batch =
input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
// im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w)
im2col(context.device_context(), output_grad_batch, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
// gemm: dx = filter * dy
// (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, c, h)
math::matmul<Place, T>(context.device_context(), filter, false,
col_matrix, false, T(1.0), &input_grad_batch,
T(0.0));
}
}
// filter gradient required
if (filter_grad) {
Tensor col_matrix_f;
col_matrix_f.ShareDataWith(col);
DDim col_matrix_shape_f = {c * h * w, k_h * k_w};
col_matrix_f.Resize(col_matrix_shape_f);
filter_grad->mutable_data<T>(context.GetPlace());
Tensor filter_grad_ = *filter_grad;
filter_grad_.Resize(filter_matrix_shape);
auto t = framework::EigenVector<T>::Flatten(filter_grad_);
t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
for (int i = 0; i < batch_size; ++i) {
// batch with size (c, o_h, o_w)
Tensor output_grad_batch =
output_grad->Slice(i, i + 1).Resize(output_shape);
// input batch
Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
// im2col: (c * h * w, k_h * k_w)
im2col(context.device_context(), output_grad_batch, col, strides[0],
strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
// gemm: d_filter = x * y_grad^T
// (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, c, h)
math::matmul<Place, T>(context.device_context(), in_batch, false,
col_matrix_f, true, T(1.0), &filter_grad_,
T(1.0));
}
}
}
};
} // namespace operators
} // namespace paddle
......@@ -59,7 +59,8 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
"The input should be a k-D tensor(k > 0 and k < 7)");
AddInput("Y",
"The input used as reference for cropping"
" with the same dimension as X. ");
" with the same dimension as X. ")
.AsDispensable();
AddOutput("Out",
"The output of crop op "
"with the same dimension as X.");
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
class FCOp : public NetOp {
public:
FCOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE(!Inputs("X").empty(),
"Inputs(X) of FCOp should not be null.");
PADDLE_ENFORCE(!Inputs("W").empty(),
"Inputs(W) of FCOp should not be null.");
PADDLE_ENFORCE(!Outputs("MulOut").empty(),
"Outputs(MulOut) of FCOp should not be null.");
PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
"Output(Out) of FCOp should not be null.");
auto x = Inputs("X");
auto w = Inputs("W");
auto mul_out = Outputs("MulOut");
PADDLE_ENFORCE_EQ(
x.size(), w.size(),
"The size of inputs X(%d) should be the same as that of weights W(%d).",
x.size(), w.size());
PADDLE_ENFORCE_EQ(mul_out.size(), x.size(),
"The size of intermediate mul_out(%d) should be the same "
"as that of inputs X(%d).",
mul_out.size(), x.size());
size_t n = x.size();
PADDLE_ENFORCE_GE(n, static_cast<size_t>(1),
"The size of inputs X(%d) should be no less than 1.", n);
auto x_num_col_dims = Attr<std::vector<int>>("xNumColDims");
// Set all values or set no values (use the default value)
if (!x_num_col_dims.empty()) {
PADDLE_ENFORCE_EQ(x_num_col_dims.size(), n,
"The size of attribute xNumColDims(%d) should be the "
"same as that of inputs X(%d).",
x_num_col_dims.size(), n);
} else {
x_num_col_dims.resize(n);
for (size_t i = 0; i < n; i++) {
x_num_col_dims[i] = 1;
}
}
// mul_out[i] = X[i] * W[i]
for (size_t i = 0; i < n; i++) {
framework::AttributeMap mul_attr;
mul_attr["x_num_col_dims"] = static_cast<int>(x_num_col_dims[i]);
mul_attr["y_num_col_dims"] = static_cast<int>(1);
AppendOp(
framework::OpRegistry::CreateOp("mul", {{"X", {x[i]}}, {"Y", {w[i]}}},
{{"Out", {mul_out[i]}}}, mul_attr));
}
// sum_out = X[0] * W[0] + ... + X[n-1] * W[n-1]
auto sum_out = mul_out[0];
if (n > 1) {
PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName,
"Output(SumOut) of FCOp should not be null when the "
"size of Inputs(X) > 1.");
sum_out = Output("SumOut");
AppendOp(framework::OpRegistry::CreateOp("sum", {{"X", {mul_out}}},
{{"Out", {sum_out}}}, {}));
} else {
if (Output("SumOut") != framework::kEmptyVarName) {
this->Rename(Output("SumOut"), framework::kEmptyVarName);
}
}
// add_out = sum_out + b
auto b = Input("B");
auto add_out = sum_out;
if (b != framework::kEmptyVarName) {
PADDLE_ENFORCE_NE(
Output("AddOut"), framework::kEmptyVarName,
"Output(AddOut) of FCOp should not be null when Input(B) is set.");
add_out = Output("AddOut");
AppendOp(framework::OpRegistry::CreateOp(
"elementwise_add", {{"X", {sum_out}}, {"Y", {Input("B")}}},
{{"Out", {add_out}}}, {}));
} else {
if (Output("AddOut") != framework::kEmptyVarName) {
this->Rename(Output("AddOut"), framework::kEmptyVarName);
}
}
auto activation = Attr<std::string>("activation");
AppendOp(framework::OpRegistry::CreateOp(activation, {{"X", {add_out}}},
{{"Y", {Output("Out")}}}, {}));
CompleteAddOp(false);
}
};
class FCOpMaker : public framework::OpProtoAndCheckerMaker {
public:
FCOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(A vector of Tensors) each input Tensor can be of arbitrary "
"dimension, and will be reshaped to a 2-D matrix of size "
"(minibatch, number_of_input_features) according to attribute "
"xNumColDims.")
.AsDuplicable();
AddInput("W",
"(A vector of Tensors) the weights of FC operator, a "
"vector of 2-D matrix of size "
"(number_of_input_features, number_of_neurons).")
.AsDuplicable();
AddInput("B",
"(Tensor) the bias of FC operator, a 1-D vector of size "
"number_of_neurons.");
AddOutput("Out",
"(Tensor) the activated output matrix of FC operator, a 2-D "
"matrix of size (minibatch, number_of_neurons).");
AddOutput("MulOut",
"(A vector of Tensors) the intermediate outputs of FC operator, "
"each Tensor saving the product of X_i * W_i.")
.AsIntermediate()
.AsDuplicable();
AddOutput(
"SumOut",
"(Tensor) the intermediate output of FC operator, "
"saving the sum of the products of X and W, that is sum{X_i * W_i}.")
.AsIntermediate();
AddOutput("AddOut",
"(Tensor) the non-actived output of FC operator, "
"saving sum{X_i * W_i} + B.")
.AsIntermediate();
AddAttr<std::string>(
"activation",
"(string, default identity) the activation type of FC operator.")
.SetDefault("identity")
.InEnum({"identity", "sigmoid", "softmax"});
AddAttr<std::vector<int>>(
"xNumColDims",
"(std::vector<int>) The inputs Tensors of FC operator can be of "
"more than 2 dimensions. In that case, each input Tensor `X_i` will be "
"reshaped to a 2-D matrix. The matrix's first dimension "
"(the length of column) will be the product of `X_i`'s last "
"`xNumColDims_i` dimensions, that is "
"`X_i.dims[0] x ... x X_i.dims[xNumColDims_i - 1]`. "
"The matrix's second dimension (the length of row) will be the product "
"of `X_i`'s first `rank - xNumColDims_i` dimensions, that is "
"`X_i.dims[xNumColDims_i] x ... x X_i.dims[rank - 1]`)")
.SetDefault(std::vector<int>{});
AddComment(R"DOC(
Fully Connected Operator, known as Fully Connected Layer or Inner Product Layer
in Convolutional Neural Networks. Neurons in a fully connected layer have
full connections to all activations in the previous layer.
It computes an inner product of a set of
learned weights with a matrix multiplication followed by a bias offset
(optionally).
Equation:
Out = Act(sum_n{X_i * W_i} + B)
where X_i is Tensor that will be reshaped to a 2-D matrix of size (M x K),
usually M is the minibatch size and K is the number of input features.
W_i is a 2-D matrix of size (K x N), where N means the number of neurons
in the fully connected layer. B is a 1-D vector of size N.
Thus, the output Out is a 2-D matrix of size (M x N).
Activation type can be set to `identity` (default), `sigmoid` or `softmax`.
All the inputs can carry the LoD (Level of Details) information,
or not. But the output only shares the LoD with first input (`X[0]`).
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(fc, ops::FCOp, ops::FCOpMaker);
......@@ -54,8 +54,7 @@ class GRUUnitOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(
weight_width, frame_size * 3,
"The shape of Weight matrix must be [frame_size, frame_size * 3].");
auto bias = Input("Bias");
if (bias != framework::kEmptyVarName) {
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
int bias_height = bias_dims[0];
int bias_width = bias_dims[1];
......@@ -89,7 +88,8 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
"weights of output candidate with shape [frame_size, frame_size]");
AddInput("Bias",
"(Tensor) Bias vector with shape [1, frame_size * 3] concating "
"bias of the update gate, reset gate and output candidate.");
"bias of the update gate, reset gate and output candidate.")
.AsDispensable();
AddOutput("Gate",
"(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
"output of update gate, reset gate and output candidate")
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/net_op.h"
#include "paddle/operators/scale_op.h"
namespace paddle {
namespace operators {
// The identity operator is an alias of the scale operator. This is also an
// example for creating an alias for an existing operator.
template <typename AttrType>
class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
public:
IdentityOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input tensor of identity operator.");
AddOutput("Y", "The output tensor of identity operator.");
AddComment(R"DOC(
The identity operator is an alias of the scale operator
with the attribute scale fixed to 1.0.
)DOC");
}
};
template <typename AttrType>
class IdentityOp : public NetOp {
public:
IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
"Input(X) of IdentityOp should not be null.");
PADDLE_ENFORCE_NE(Output("Y"), framework::kEmptyVarName,
"Output(Y) of IdentityOp should not be null.");
AppendOp(framework::OpRegistry::CreateOp(
"scale", {{"X", {Input("X")}}}, {{"Out", {Output("Y")}}},
{{"scale", static_cast<AttrType>(1)}}));
CompleteAddOp(false);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
ops::IdentityOpMaker<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
class InterpOp : public NetOp {
public:
InterpOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: NetOp(type, inputs, outputs, attrs) {
PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
"Input(X) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Input("Y"), framework::kEmptyVarName,
"Input(Y) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Input("W"), framework::kEmptyVarName,
"Input(W) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Output("SubOut"), framework::kEmptyVarName,
"Output(SubOut) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Output("MulOut"), framework::kEmptyVarName,
"Output(MulOut) of InterpOp should not be null.");
PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
"Output(Out) of InterpOp should not be null.");
// SubOut = X - Y
auto x = Input("X");
auto y = Input("Y");
auto sub_out = Output("SubOut");
AppendOp(framework::OpRegistry::CreateOp(
"elementwise_sub", {{"X", {x}}, {"Y", {y}}}, {{"Out", {sub_out}}}, {}));
// MulOut = SubOut * W = (X - Y) * W
auto w = Input("W");
auto mul_out = Output("MulOut");
AppendOp(framework::OpRegistry::CreateOp(
"elementwise_mul", {{"X", {sub_out}}, {"Y", {w}}}, {{"Out", {mul_out}}},
{{"axis", 0}}));
// Out = MulOut + Y = (X - Y) * W + Y = X * W + Y * (1 - W)
AppendOp(framework::OpRegistry::CreateOp("elementwise_add",
{{"X", {mul_out}}, {"Y", {y}}},
{{"Out", {Output("Out")}}}, {}));
CompleteAddOp(false);
}
};
class InterpOpMaker : public framework::OpProtoAndCheckerMaker {
public:
InterpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(Tensor), 2-D Matrix of shape [batch_size, data_dim]"
"containing data samples, the first input of interp_op");
AddInput("Y",
"(Tensor), 2-D Matrix of shape `[batch_size, data_dim]`"
"containing data samples, the second input of interp_op");
AddInput("W",
"(Tensor), 1-D Vector of shape [batch_size],"
"the interpolated values in the half-open interval [0.0, 1.0)");
AddOutput("SubOut",
"(Tensor), the intermediate subtraction outputs, saving X - Y.")
.AsIntermediate();
AddOutput("MulOut",
"(Tensor), the intermediate multiplication outputs,"
"saving the elementwise multiplication of (X - Y) and W.")
.AsIntermediate();
AddOutput("Out",
"(Tensor), the output of interp_op, same shape with X,"
"returns the first-dimensional piecewise linear interpolant "
"between X and Y");
AddComment(R"DOC(
Linear Interpolation with two inputs, used in NEURAL TURING MACHINE.
Equation:
Out.row[i] = X.row[i] * W[i] + Y.row[i] * (1 - W[i])
= (X.row[i] - Y.row[i]) * W[i] + Y.row[i]
Example:
X = [[1,2],[3,4]],
Y = [[2,1],[4,3]],
W = [0.3, 0.4]
Then, Out = [[1.7,1.3],[3.6,3.4]]
where 1.7 = 1*0.3+2*(1-0.3),
1.3 = 2*0.3+1*(1-0.3),
3.6 = 3*0.4+4*(1-0.4),
3.4 = 4*0.4+3*(1-0.4)
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(interp, ops::InterpOp, ops::InterpOpMaker);
......@@ -499,8 +499,6 @@ class LinearChainCrfGradOpKernel<platform::CPUPlace, T>
auto alpha_mat = EigenMatrix<T>::From(*alpha);
auto beta_mat = EigenMatrix<T>::From(*beta);
auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
x_grad_mat.setConstant(ll_grad);
auto* place = ctx.GetEigenDevice<platform::CPUPlace>();
x_grad_mat.device(*place) = alpha_mat * beta_mat;
x_grad_mat /= x_grad_mat.sum(Eigen::DSizes<int, 1>(1))
......@@ -556,7 +554,9 @@ REGISTER_OP(linear_chain_crf, ops::LinearChainCrfOp, ops::LinearChainCrfOpMaker,
linear_chain_crf_grad, ops::LinearChainCrfGradOp);
REGISTER_OP_CPU_KERNEL(
linear_chain_crf,
ops::LinearChainCrfOpKernel<paddle::platform::CPUPlace, float>);
ops::LinearChainCrfOpKernel<paddle::platform::CPUPlace, float>,
ops::LinearChainCrfOpKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
linear_chain_crf_grad,
ops::LinearChainCrfGradOpKernel<paddle::platform::CPUPlace, float>);
ops::LinearChainCrfGradOpKernel<paddle::platform::CPUPlace, float>,
ops::LinearChainCrfGradOpKernel<paddle::platform::CPUPlace, double>);
......@@ -32,6 +32,9 @@ class LookupTableOp : public framework::OperatorWithKernel {
auto table_dims = ctx->GetInputDim("W");
auto ids_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
ctx->ShareLoD("Ids", /*->*/ "Out");
}
......@@ -53,7 +56,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
" which is a learnable parameter.");
AddInput("Ids",
"An input with type int32 or int64"
"contains the ids to be looked up in W.");
"contains the ids to be looked up in W."
"Ids must be a column vector with rank = 2."
"The 2nd dimension size must be 1");
AddOutput("Out", "The lookup results, which have the same type with W.");
AddComment(R"DOC(
This operator is used to perform lookups on the parameter W,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/lstm_op.h"
namespace paddle {
namespace operators {
class LSTMOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
"Output(Hidden) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Cell"),
"Output(Cell) of LSTM should not be null.");
auto x_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
if (ctx->HasInput("H0")) {
PADDLE_ENFORCE(ctx->HasInput("C0"),
"Input(Cell) and Input(Hidden) of LSTM should not "
"be null at the same time.");
auto h_dims = ctx->GetInputDim("H0");
auto c_dims = ctx->GetInputDim("C0");
PADDLE_ENFORCE(h_dims == c_dims,
"The dimension of Input(H0) and Input(C0) "
"should be the same.");
}
int frame_size = x_dims[1] / 4;
auto w_dims = ctx->GetInputDim("Weight");
PADDLE_ENFORCE_EQ(w_dims.size(), 2,
"The rank of Input(Weight) should be 2.");
PADDLE_ENFORCE_EQ(w_dims[0], frame_size,
"The first dimension of Input(Weight) "
"should be %d.",
frame_size);
PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
"The second dimension of Input(Weight) "
"should be 4 * %d.",
frame_size);
auto b_dims = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1,
"The first dimension of Input(Bias) should be 1.");
if (ctx->Attrs().Get<bool>("usePeepholes")) {
PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
"The second dimension of Input(Bias) should be "
"7 * %d if enable peepholes connection",
frame_size);
} else {
PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
"The second dimension of Input(Bias) should be "
"4 * %d if disable peepholes connection",
frame_size);
}
ctx->SetOutputDim("Hidden", {x_dims[0], frame_size});
ctx->SetOutputDim("Cell", {x_dims[0], frame_size});
ctx->SetOutputDim("BatchGate", x_dims);
ctx->ShareLoD("Input", "Hidden");
ctx->ShareLoD("Input", "Cell");
}
};
class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LSTMOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Input",
"(LoDTensor) the first input is a LodTensor, which support "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X 4D), where, T is the "
"total time steps in this mini-batch, D is the hidden size.");
AddInput("H0",
"(Tensor, optional) the initial hidden state is an optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size, D is the hidden size.");
AddInput("C0",
"(Tensor, optional) the initial cell state is an optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size. `H0` and `C0` can be NULL but only at the same time");
AddInput("Weight",
"(Tensor) the learnable hidden-hidden weights."
" - The shape is (D x 4D), where D is the hidden size. "
" - Weight = {W_ch, W_ih, W_fh, W_oh}");
AddInput("Bias",
"(Tensor) the learnable weights, which contains two parts: "
"input-hidden bias weight and peephole connections weight if "
"setting `usePeepholes` True. "
"1. `usePeepholes = False` "
" - The shape is (1 x 4D). "
" - Bias = {b_c, b_i, b_f, b_o}."
"2. `usePeepholes = True` "
" - The shape is (1 x 7D). "
" - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
AddOutput("BatchGate",
"(LoDTensor) This LoDTensor contains input gate, forget gate "
"and output gate after the nonlinear computation. This "
"LoDTensor has the same shape with the reorganized input, which "
"was also be called batch input. The LoD size is 2. The first "
"LoD is the batch offsets and the second LoD contains the "
"indexes, which denote the position of reorganized sequence "
"in the raw input.")
.AsIntermediate();
AddOutput("Hidden",
"(LoDTensor) the hidden state lod tensor of LSTM operator. "
"The shape and lod is the same with the `Input`.");
AddOutput("Cell",
"(LoDTensor) the cell state lod tensor of LSTM operator. "
"The shape and lod is the same with the `Input`.");
AddAttr<bool>("usePeepholes",
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections.")
.SetDefault(true);
AddAttr<bool>("isReverse",
"(bool, defalut: False) "
"whether to compute reversed LSTM.")
.SetDefault(false);
AddAttr<std::string>(
"gateActivation",
"(string, default: sigmoid)"
"The activation for input gate, forget gate and output "
"gate, `sigmoid` by default.")
.SetDefault("sigmoid");
AddAttr<std::string>("cellActivation",
"(string, default: tanh)"
"The activation for cell output, `tanh` by defalut.")
.SetDefault("tanh");
AddAttr<std::string>("candidateActivation",
"(string, default: tanh)"
"The activation for candidate hidden state, "
"`tanh` by default.")
.SetDefault("tanh");
AddComment(R"DOC(Long-Short Term Memory (LSTM) Operator
The defalut implementation is diagonal/peephole connection [1], the formula is
as follows
i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i)
f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f)
\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o)
c_t = f_t ⊙ c_{t-1} + i_t ⊙ \tilde{c_t}
h_t = o_t ⊙ act_h(c_t)
where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix
of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$
are diagonal weight matrices for peephole connections. In our implenmention,
We use vectors to reprenset these diagonal weight matrices. The b terms
denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$
is the non-line actications, such as logistic sigmoid function, and
\f$i, f, o\f$ and \f$c\f$ are respectively the input gate, forget gate,
output gate and cell activation vectors, all of which are the same size as
the cell output activation vector \f$h\f$.
The ⊙ is the element-wise product of the vectors, \f$act_g\f$ and \f$act_h\f$
are the cell input and cell output activation functions, `tanh` is usually
used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Set `usePeepholes` False to disable peephole connection [2]. The formula
is omitted here.
@note These \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
operations on the input x_{t} were NOT included in this operator.
Users can choose to use fully-connect operator before LSTM operator.
[1] Hasim Sak, Andrew Senior, and Francoise Beaufays. Long short-term memory
recurrent neural network architectures for large scale acoustic modeling.
INTERSPEECH, 2014.
[2] S. Hochreiter and J. Schmidhuber. Long Short-Term Memory.
Neural Computation, 9(8):1735-1780, 1997.
)DOC");
}
};
class LSTMGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
"Input(Hidden@GRAD) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cell")),
"Input(Cell@GRAD) should not be null");
ctx->SetOutputDim(framework::GradVarName("Weight"),
ctx->GetInputDim("Weight"));
ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->GetInputDim("Bias"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel<paddle::platform::CPUPlace, float>,
ops::LSTMKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(lstm_grad,
ops::LSTMGradKernel<paddle::platform::CPUPlace, float>,
ops::LSTMGradKernel<paddle::platform::CPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/lstm_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel<paddle::platform::GPUPlace, float>,
ops::LSTMKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(lstm_grad,
ops::LSTMGradKernel<paddle::platform::GPUPlace, float>,
ops::LSTMGradKernel<paddle::platform::GPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h"
namespace paddle {
namespace operators {
using framework::LoDTensor;
using framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T>
class LSTMKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<framework::LoDTensor>("Input");
auto* weight = ctx.Input<framework::Tensor>("Weight");
auto* bias = ctx.Input<framework::Tensor>("Bias");
auto* batch_gate = ctx.Output<framework::LoDTensor>("BatchGate");
batch_gate->mutable_data<T>(ctx.GetPlace());
auto* hidden_out = ctx.Output<framework::LoDTensor>("Hidden");
hidden_out->mutable_data<T>(ctx.GetPlace());
auto* cell_out = ctx.Output<framework::LoDTensor>("Cell");
cell_out->mutable_data<T>(ctx.GetPlace());
// Now the function ShareLoD in InferShape is not implemented.
// So copy LoD here.
ctx.ShareLoD("Input", "Hidden");
ctx.ShareLoD("Input", "Cell");
bool is_reverse = ctx.Attr<bool>("isReverse");
math::LoDTensor2BatchFunctor<Place, T> to_batch;
to_batch(ctx.device_context(), *input, *batch_gate, is_reverse);
auto in_dims = input->dims();
int frame_size = static_cast<int>(in_dims[1] / 4);
framework::DDim dims({in_dims[0], frame_size});
if (bias) {
Eigen::array<int, 2> extents({{1, 4 * frame_size}});
Eigen::array<int, 2> offsets({{0, 0}});
auto b = EigenMatrix<T>::From(*bias);
auto gate = EigenMatrix<T>::From(*batch_gate);
gate.device(ctx.GetEigenDevice<Place>()) =
gate +
b.slice(offsets, extents)
.reshape(Eigen::array<int, 2>({{1, frame_size * 4}}))
.broadcast(
Eigen::array<int, 2>({{static_cast<int>(in_dims[0]), 1}}));
}
math::LstmMetaValue<T> lstm_value;
T* bias_data = const_cast<T*>(bias->data<T>());
// the code style in LstmMetaValue will be updated later.
lstm_value.checkIg = bias_data + 4 * frame_size;
lstm_value.checkFg = lstm_value.checkIg + frame_size;
lstm_value.checkOg = lstm_value.checkFg + frame_size;
lstm_value.prevStateValue = nullptr;
framework::LoDTensor batch_out, batch_cell, batch_cell_pre_act;
batch_out.mutable_data<T>(dims, ctx.GetPlace());
batch_cell.mutable_data<T>(dims, ctx.GetPlace());
batch_cell_pre_act.mutable_data<T>(dims, ctx.GetPlace());
auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
auto gate_act = ctx.Attr<std::string>("gateActivation");
auto cell_act = ctx.Attr<std::string>("cellActivation");
auto cand_act = ctx.Attr<std::string>("candidateActivation");
for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
Tensor gate_t = batch_gate->Slice(bstart, bend);
Tensor out_t = batch_out.Slice(bstart, bend);
Tensor cell_t = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act_t = batch_cell_pre_act.Slice(bstart, bend);
int cur_batch_size = bend - bstart;
if (n != 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size;
auto pre_hidden_t = batch_out.Slice(pre_h_start, pre_h_end);
math::matmul<Place, T>(ctx.device_context(), pre_hidden_t, false,
*weight, false, static_cast<T>(1.0), &gate_t,
static_cast<T>(1.0));
}
// else if : FIXME support the initial hidden and cell
lstm_value.gateValue = gate_t.data<T>();
lstm_value.outputValue = out_t.data<T>();
lstm_value.stateValue = cell_t.data<T>();
lstm_value.stateActiveValue = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<Place, T>::compute(ctx.device_context(), lstm_value,
frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
lstm_value.prevStateValue = lstm_value.stateValue;
}
math::Batch2LoDTensorFunctor<Place, T> to_seq;
batch_out.set_lod(batch_gate->lod());
// restore the output hidden in LoDTensor from the batch hidden
to_seq(ctx.device_context(), batch_out, *hidden_out);
batch_cell.set_lod(batch_gate->lod());
// restore the output cell state in LoDTensor from the batch cell
to_seq(ctx.device_context(), batch_cell, *cell_out);
}
};
template <typename Place, typename T>
class LSTMGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {}
};
} // namespace operators
} // namespace paddle
......@@ -19,7 +19,6 @@
namespace paddle {
namespace operators {
using framework::LoDTensor;
using framework::Tensor;
template <typename T>
......
add_subdirectory(detail)
if(WITH_GPU)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
......@@ -7,6 +9,8 @@ if(WITH_GPU)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
......@@ -14,6 +18,8 @@ else()
cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(vol2col SRCS vol2col.cc DEPS device_context)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
......
......@@ -22,8 +22,6 @@ namespace {
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
const int N, const int D) {
// TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
// CUDA_1D_KERNEL_LOOP(i, N) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
......
if(WITH_AVX)
cc_library(activation_functions SRCS hl_cpu_functions.cc hl_avx_functions.cc)
else()
cc_library(activation_functions SRCS hl_cpu_functions.cc)
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_ACTIVATION_FUNCTIONS_H_
#define HL_ACTIVATION_FUNCTIONS_H_
#include "hl_functions.h"
#include "paddle/operators/math/lstm_compute.h"
/**
* Active functions: sigmoid, relu, tanh and linear.
*/
#define FLOAT_ACTIVE_FUNCTION \
{ \
hppl::typef::sigmoid, hppl::typef::relu, hppl::typef::tanh, \
hppl::typef::linear \
}
#define DOUBLE_ACTIVE_FUNCTION \
{ \
hppl::typed::sigmoid, hppl::typed::relu, hppl::typed::tanh, \
hppl::typed::linear \
}
#define AVX_ACTIVE_FUNCTION \
{ hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
namespace hppl {
using activation_mode_t = paddle::operators::math::activation_mode_t;
/**
* Hppl supports sigmoid, relu, tanh, linear active functions
* for neural networks' forward and backward activation.
*/
template <class T>
class Active {
public:
typedef T (*forward)(T);
typedef T (*backward)(T, T);
};
template <typename T>
struct ForwardActType;
template <>
struct ForwardActType<float> {
using type = Active<float>::forward;
};
template <>
struct ForwardActType<double> {
using type = Active<double>::forward;
};
template <typename T>
struct BackwardActType;
template <>
struct BackwardActType<float> {
using type = Active<float>::backward;
};
template <>
struct BackwardActType<double> {
using type = Active<double>::backward;
};
#ifdef __NVCC__
namespace gpu {
static __device__ Active<float>::forward forward[] = FLOAT_ACTIVE_FUNCTION;
static __device__ Active<float>::backward backward[] = FLOAT_ACTIVE_FUNCTION;
static __device__ Active<double>::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION;
static __device__ Active<double>::backward backward_d[] =
DOUBLE_ACTIVE_FUNCTION;
template <typename T>
struct ForwardAct {
__device__ typename ForwardActType<T>::type operator()(
activation_mode_t type);
};
template <>
struct ForwardAct<float> {
__device__ ForwardActType<float>::type operator()(activation_mode_t type) {
return forward[type];
}
};
template <>
struct ForwardAct<double> {
__device__ ForwardActType<double>::type operator()(activation_mode_t type) {
return forward_d[type];
}
};
template <typename T>
struct BackwardAct {
__device__ typename BackwardActType<T>::type operator()(
activation_mode_t type);
};
template <>
struct BackwardAct<float> {
__device__ BackwardActType<float>::type operator()(activation_mode_t type) {
return backward[type];
}
};
template <>
struct BackwardAct<double> {
__device__ BackwardActType<double>::type operator()(activation_mode_t type) {
return backward_d[type];
}
};
} // namespace gpu
#else
namespace cpu {
static Active<float>::forward forward[] = FLOAT_ACTIVE_FUNCTION;
static Active<float>::backward backward[] = FLOAT_ACTIVE_FUNCTION;
static Active<double>::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION;
static Active<double>::backward backward_d[] = DOUBLE_ACTIVE_FUNCTION;
template <typename T>
struct ForwardAct {
typename ForwardActType<T>::type operator()(activation_mode_t type);
};
template <>
struct ForwardAct<float> {
ForwardActType<float>::type operator()(activation_mode_t type) {
return forward[type];
}
};
template <>
struct ForwardAct<double> {
ForwardActType<double>::type operator()(activation_mode_t type) {
return forward_d[type];
}
};
template <typename T>
struct BackwardAct {
typename BackwardActType<T>::type operator()(activation_mode_t type);
};
template <>
struct BackwardAct<float> {
BackwardActType<float>::type operator()(activation_mode_t type) {
return backward[type];
}
};
template <>
struct BackwardAct<double> {
BackwardActType<double>::type operator()(activation_mode_t type) {
return backward_d[type];
}
};
} // namespace cpu
#ifdef __AVX__
namespace avx {
static Active<__m256>::forward forward[] = AVX_ACTIVE_FUNCTION;
static Active<__m256>::backward backward[] = AVX_ACTIVE_FUNCTION;
} // namespace avx
#endif
#endif
} // namespace hppl
#endif // HL_ACTIVATION_FUNCTIONS_H_
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <immintrin.h>
#include "hl_functions.h"
// TODO(qingqing) refine this dependence
#include "paddle/cuda/src/avx_mathfun.h"
namespace hppl {
__m256 exp(__m256 a) { return exp256_ps(a); }
__m256 relu(const __m256 a) {
__m256 tmp = _mm256_set1_ps(0.0f);
return _mm256_max_ps(a, tmp);
}
__m256 sigmoid(const __m256 a) {
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
__m256 tmp = _mm256_max_ps(a, min);
tmp = _mm256_min_ps(tmp, max);
tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
tmp = exp(tmp);
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
return tmp;
}
__m256 tanh(const __m256 a) {
__m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
__m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
tmp = _mm256_min_ps(tmp, max);
tmp = exp(tmp);
return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
_mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
_mm256_set1_ps(1.0f));
}
__m256 linear(const __m256 a) { return a; }
__m256 relu(const __m256 a, const __m256 b) {
return _mm256_mul_ps(
a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
_mm256_set1_ps(1.0f)));
}
__m256 sigmoid(const __m256 a, const __m256 b) {
return _mm256_mul_ps(_mm256_mul_ps(a, b),
_mm256_sub_ps(_mm256_set1_ps(1.0f), b));
}
__m256 tanh(const __m256 a, const __m256 b) {
return _mm256_mul_ps(
a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
}
__m256 linear(const __m256 a, const __m256 b) { return a; }
} // namespace hppl
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_AVX_FUNCTIONS_H_
#define HL_AVX_FUNCTIONS_H_
#include <immintrin.h>
namespace hppl {
__m256 relu(const __m256 a);
__m256 sigmoid(const __m256 a);
__m256 tanh(const __m256 a);
__m256 linear(const __m256 a);
__m256 relu(const __m256 a, const __m256 b);
__m256 sigmoid(const __m256 a, const __m256 b);
__m256 tanh(const __m256 a, const __m256 b);
__m256 linear(const __m256 a, const __m256 b);
} // namespace hppl
#endif // HL_AVX_FUNCTIONS_H_
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <math.h>
#include "hl_functions.h"
namespace hppl {
namespace typef {
float relu(const float a) {
return a > static_cast<float>(0.0) ? a : static_cast<float>(0.0);
}
float sigmoid(const float a) {
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
float tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<float>(1.0) / (static_cast<float>(1.0) + exp(-tmp));
}
float tanh(const float a) {
float tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
float linear(const float a) { return a; }
float relu(const float a, const float b) { return a * (b > 0.0 ? 1.0 : 0.0); }
float sigmoid(const float a, const float b) {
return a * b * (static_cast<float>(1) - b);
}
float tanh(const float a, const float b) {
return a * (static_cast<float>(1) - b * b);
}
float linear(const float a, const float b) { return a; }
} // namespace typef
namespace typed {
double relu(const double a) {
return a > static_cast<double>(0.0) ? a : static_cast<double>(0.0);
}
double sigmoid(const double a) {
const double min = SIGMOID_THRESHOLD_MIN;
const double max = SIGMOID_THRESHOLD_MAX;
double tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<double>(1.0) / (static_cast<double>(1.0) + exp(-tmp));
}
double tanh(const double a) {
double tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(tmp))) - 1.0;
}
double linear(const double a) { return a; }
double relu(const double a, const double b) {
return a * (b > 0.0 ? 1.0 : 0.0);
}
double sigmoid(const double a, const double b) {
return a * b * (static_cast<double>(1) - b);
}
double tanh(const double a, const double b) {
return a * (static_cast<double>(1) - b * b);
}
double linear(const double a, const double b) { return a; }
} // namespace typed
} // namespace hppl
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_FUNCTIONS_H_
#define HL_FUNCTIONS_H_
/**
* sigmoid threshold maximum
*/
#define SIGMOID_THRESHOLD_MIN -40.0
/**
* sigmoid threshold minimum
*/
#define SIGMOID_THRESHOLD_MAX 13.0
/**
* The maximum input value for exp, used to avoid overflow problem.
* currently only used for tanh function.
*/
#define EXP_MAX_INPUT 40.0
#ifndef __NVCC__
namespace hppl {
namespace typef {
float relu(const float a);
float sigmoid(const float a);
float tanh(const float a);
float linear(const float a);
float relu(const float a, const float b);
float sigmoid(const float a, const float b);
float tanh(const float a, const float b);
float linear(const float a, const float b);
} // namespace typef
namespace typed {
double relu(const double a);
double sigmoid(const double a);
double tanh(const double a);
double linear(const double a);
double relu(const double a, const double b);
double sigmoid(const double a, const double b);
double tanh(const double a, const double b);
double linear(const double a, const double b);
} // namespace typed
} // namespace hppl
#ifdef __AVX__
#include "hl_avx_functions.h"
#endif
#else
#include "hl_gpu_functions.h"
#endif
#endif // HL_FUNCTIONS_H_
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_GPU_FUNCTIONS_CUH_
#define HL_GPU_FUNCTIONS_CUH_
#include "hl_base.h"
namespace hppl {
namespace typef {
__device__ static float relu(const float a) { return a > 0.0f ? a : 0.0f; }
__device__ static float sigmoid(const float a) {
const float min = SIGMOID_THRESHOLD_MIN;
const float max = SIGMOID_THRESHOLD_MAX;
float tmp = (a < min) ? min : ((a > max) ? max : a);
return __fdividef(1.0f, 1.0f + __expf(-tmp));
}
__device__ static float tanh(const float a) {
float tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return __fdividef(2.0f, (1.0f + __expf(-2.0f * tmp))) - 1.0f;
}
__device__ static float linear(const float a) { return a; }
__device__ static float relu(const float a, const float b) {
return a * (b > 0.0f ? 1.0f : 0.0f);
}
__device__ static float sigmoid(const float a, const float b) {
return a * b * (1.0f - b);
}
__device__ static float tanh(const float a, const float b) {
return a * (1.0f - b * b);
}
__device__ static float linear(const float a, const float b) { return a; }
} // namespace typef
namespace typed {
__device__ static double relu(const double a) { return a > 0.0 ? a : 0.0; }
__device__ static double sigmoid(const double a) {
const double min = SIGMOID_THRESHOLD_MIN;
const double max = SIGMOID_THRESHOLD_MAX;
double tmp = (a < min) ? min : ((a > max) ? max : a);
return 1.0 / (1.0 + exp(-tmp));
}
__device__ static double tanh(const double a) {
double tmp = -2.0 * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (2.0 / (1.0 + exp(-2.0 * a))) - 1.0;
}
__device__ static double linear(const double a) { return a; }
__device__ static double relu(const double a, const double b) {
return a * (b > 0.0 ? 1.0 : 0.0);
}
__device__ static double sigmoid(const double a, const double b) {
return a * b * (1 - b);
}
__device__ static double tanh(const double a, const double b) {
return a * (1.0 - b * b);
}
__device__ static double linear(const double a, const double b) { return a; }
} // namespace typef
} // namespace hppl
#endif // HL_GPU_FUNCTIONS_CUH_
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/operators/math/lstm_compute.h"
namespace paddle {
namespace operators {
namespace math {
namespace detail {
#ifndef __NVCC__
template <class T, class Op>
void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rCheckI;
T rCheckF;
T rCheckO;
T rState;
T rPrevState = 0;
T rStateAtv;
T rOut;
T *valueIn = value.gateValue;
T *valueIg = value.gateValue + frameSize;
T *valueFg = value.gateValue + frameSize * 2;
T *valueOg = value.gateValue + frameSize * 3;
for (int i = 0; i < frameSize; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = value.checkIg[i];
rCheckF = value.checkFg[i];
rCheckO = value.checkOg[i];
if (value.prevStateValue) {
rPrevState = value.prevStateValue[i];
}
hppl::cpu::ForwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, act(active_node), act(active_gate),
act(active_state));
valueIn[i] = rValueIn;
valueIg[i] = rValueIg;
valueFg[i] = rValueFg;
valueOg[i] = rValueOg;
value.stateValue[i] = rState;
value.stateActiveValue[i] = rStateAtv;
value.outputValue[i] = rOut;
}
}
template <class T, class Op>
void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rGradIn;
T rGradIg;
T rGradFg;
T rGradOg;
T rPrevState = 0;
T rPrevStateGrad;
T rState;
T rStateGrad;
T rStateAtv;
T rOutputGrad;
T rCheckI;
T rCheckF;
T rCheckO;
T rCheckIGrad;
T rCheckFGrad;
T rCheckOGrad;
T *valueIn = value.gateValue;
T *valueIg = value.gateValue + frameSize;
T *valueFg = value.gateValue + frameSize * 2;
T *valueOg = value.gateValue + frameSize * 3;
T *gradIn = grad.gateGrad;
T *gradIg = grad.gateGrad + frameSize;
T *gradFg = grad.gateGrad + frameSize * 2;
T *gradOg = grad.gateGrad + frameSize * 3;
for (int i = 0; i < frameSize; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = value.checkIg[i];
rCheckF = value.checkFg[i];
rCheckO = value.checkOg[i];
rState = value.stateValue[i];
rStateAtv = value.stateActiveValue[i];
rOutputGrad = grad.outputGrad[i];
rStateGrad = grad.stateGrad[i];
if (value.prevStateValue) {
rPrevState = value.prevStateValue[i];
}
hppl::cpu::BackwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
rCheckOGrad, act(active_node), act(active_gate), act(active_state));
gradIn[i] = rGradIn;
gradIg[i] = rGradIg;
gradFg[i] = rGradFg;
gradOg[i] = rGradOg;
grad.stateGrad[i] = rStateGrad;
if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad;
if (value.prevStateValue) {
if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad;
if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad;
}
if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad;
}
}
template <class T, class Op>
void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
#ifdef __AVX__
__m256 rValueIn;
__m256 rValueIg;
__m256 rValueFg;
__m256 rValueOg;
__m256 rCheckI;
__m256 rCheckF;
__m256 rCheckO;
__m256 rState;
__m256 rPrevState = _mm256_set1_ps(0.0f);
__m256 rStateAtv;
__m256 rOut;
__m256 *valueIn = (__m256 *)value.gateValue;
__m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
__m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
__m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
for (int i = 0; i < frameSize / 8; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = ((__m256 *)value.checkIg)[i];
rCheckF = ((__m256 *)value.checkFg)[i];
rCheckO = ((__m256 *)value.checkOg)[i];
if (value.prevStateValue) {
rPrevState = ((__m256 *)value.prevStateValue)[i];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, hppl::avx::forward[active_node],
hppl::avx::forward[active_gate], hppl::avx::forward[active_state]);
valueIn[i] = rValueIn;
valueIg[i] = rValueIg;
valueFg[i] = rValueFg;
valueOg[i] = rValueOg;
((__m256 *)value.stateValue)[i] = rState;
((__m256 *)value.stateActiveValue)[i] = rStateAtv;
((__m256 *)value.outputValue)[i] = rOut;
}
#endif
}
template <class T, class Op>
void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
#ifdef __AVX__
__m256 rValueIn;
__m256 rValueIg;
__m256 rValueFg;
__m256 rValueOg;
__m256 rGradIn;
__m256 rGradIg;
__m256 rGradFg;
__m256 rGradOg;
__m256 rPrevState = _mm256_set1_ps(0.0f);
__m256 rPrevStateGrad;
__m256 rStateGrad;
__m256 rState;
__m256 rStateAtv;
__m256 rOutputGrad;
__m256 rCheckI;
__m256 rCheckF;
__m256 rCheckO;
__m256 rCheckIGrad;
__m256 rCheckFGrad;
__m256 rCheckOGrad;
__m256 *valueIn = (__m256 *)value.gateValue;
__m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
__m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
__m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
__m256 *gradIn = (__m256 *)grad.gateGrad;
__m256 *gradIg = (__m256 *)(grad.gateGrad + frameSize);
__m256 *gradFg = (__m256 *)(grad.gateGrad + frameSize * 2);
__m256 *gradOg = (__m256 *)(grad.gateGrad + frameSize * 3);
for (int i = 0; i < frameSize / 8; i++) {
rValueIn = valueIn[i];
rValueIg = valueIg[i];
rValueFg = valueFg[i];
rValueOg = valueOg[i];
rCheckI = ((__m256 *)value.checkIg)[i];
rCheckF = ((__m256 *)value.checkFg)[i];
rCheckO = ((__m256 *)value.checkOg)[i];
rState = ((__m256 *)value.stateValue)[i];
rStateAtv = ((__m256 *)value.stateActiveValue)[i];
rOutputGrad = ((__m256 *)grad.outputGrad)[i];
rStateGrad = ((__m256 *)grad.stateGrad)[i];
if (value.prevStateValue) {
rPrevState = ((__m256 *)value.prevStateValue)[i];
}
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
rCheckOGrad, hppl::avx::backward[active_node],
hppl::avx::backward[active_gate], hppl::avx::backward[active_state]);
gradIn[i] = rGradIn;
gradIg[i] = rGradIg;
gradFg[i] = rGradFg;
gradOg[i] = rGradOg;
((__m256 *)grad.stateGrad)[i] = rStateGrad;
if (grad.prevStateGrad) ((__m256 *)grad.prevStateGrad)[i] = rPrevStateGrad;
if (value.prevStateValue) {
if (grad.checkIgGrad) ((__m256 *)grad.checkIgGrad)[i] += rCheckIGrad;
if (grad.checkFgGrad) ((__m256 *)grad.checkFgGrad)[i] += rCheckFGrad;
}
if (grad.checkOgGrad) ((__m256 *)grad.checkOgGrad)[i] += rCheckOGrad;
}
#endif
}
template <class T, class Op>
void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frameSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
active_gate, active_state);
} else {
naive_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
active_gate, active_state);
}
}
template <class T, class Op>
void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frameSize, activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
avx_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
active_gate, active_state);
} else {
naive_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
active_gate, active_state);
}
}
#endif
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <type_traits>
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/platform/cuda_helper.h"
#include "paddle/platform/device_context.h"
#include <glog/logging.h>
namespace paddle {
namespace operators {
namespace math {
namespace detail {
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class T, class Op, bool isBatch>
__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
value.gateValue += batchIdx * frameSize * 4;
value.outputValue += batchIdx * frameSize;
value.stateValue += batchIdx * frameSize;
value.stateActiveValue += batchIdx * frameSize;
}
T rState;
T rPrevState = 0;
T rStateAtv;
T rOut;
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rCheckI = value.checkIg[frameIdx];
T rCheckF = value.checkFg[frameIdx];
T rCheckO = value.checkOg[frameIdx];
rValueIn = value.gateValue[frameIdx];
rValueIg = value.gateValue[frameIdx + frameSize];
rValueFg = value.gateValue[frameIdx + frameSize * 2];
rValueOg = value.gateValue[frameIdx + frameSize * 3];
if (value.prevStateValue) {
if (isBatch) value.prevStateValue += batchIdx * frameSize;
rPrevState = value.prevStateValue[frameIdx];
}
hppl::gpu::ForwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
rOut, rCheckI, rCheckF, rCheckO, act(active_node), act(active_gate),
act(active_state));
value.gateValue[frameIdx] = rValueIn;
value.gateValue[frameIdx + frameSize] = rValueIg;
value.gateValue[frameIdx + frameSize * 2] = rValueFg;
value.gateValue[frameIdx + frameSize * 3] = rValueOg;
value.stateValue[frameIdx] = rState;
value.stateActiveValue[frameIdx] = rStateAtv;
value.outputValue[frameIdx] = rOut;
}
/*
* threads(framePerBlock, batchPerBlock)
* grid(frameBlocks, batchBlocks)
*/
template <class T, class Op, bool isBatch>
__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
LstmMetaGrad<T> grad, int frameSize,
int batchSize, activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
if (frameIdx >= frameSize) return;
int batchIdx = 0;
if (isBatch) {
batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
if (batchIdx >= batchSize) return;
value.gateValue += batchIdx * frameSize * 4;
value.stateValue += batchIdx * frameSize;
value.stateActiveValue += batchIdx * frameSize;
grad.gateGrad += batchIdx * frameSize * 4;
grad.stateGrad += batchIdx * frameSize;
grad.outputGrad += batchIdx * frameSize;
}
T rValueIn;
T rValueIg;
T rValueFg;
T rValueOg;
T rGradIn;
T rGradIg;
T rGradFg;
T rGradOg;
T rPrevState = 0;
T rPrevStateGrad;
T rState;
T rStateGrad;
T rStateAtv;
T rOutputGrad;
T rCheckI = value.checkIg[frameIdx];
T rCheckF = value.checkFg[frameIdx];
T rCheckO = value.checkOg[frameIdx];
T rCheckIGrad;
T rCheckFGrad;
T rCheckOGrad;
rValueIn = value.gateValue[frameIdx];
rValueIg = value.gateValue[frameIdx + frameSize];
rValueFg = value.gateValue[frameIdx + frameSize * 2];
rValueOg = value.gateValue[frameIdx + frameSize * 3];
rState = value.stateValue[frameIdx];
rStateAtv = value.stateActiveValue[frameIdx];
rOutputGrad = grad.outputGrad[frameIdx];
rStateGrad = grad.stateGrad[frameIdx];
if (value.prevStateValue) {
if (isBatch) value.prevStateValue += batchIdx * frameSize;
rPrevState = value.prevStateValue[frameIdx];
}
hppl::gpu::BackwardAct<T> act;
op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg,
rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad,
rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad,
act(active_node), act(active_gate), act(active_state));
grad.gateGrad[frameIdx] = rGradIn;
grad.gateGrad[frameIdx + frameSize] = rGradIg;
grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
grad.stateGrad[frameIdx] = rStateGrad;
if (grad.prevStateGrad) {
if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
grad.prevStateGrad[frameIdx] = rPrevStateGrad;
}
if (isBatch) {
if (value.prevStateValue) {
if (grad.checkIgGrad)
paddle::platform::CudaAtomicAdd(grad.checkIgGrad + frameIdx,
rCheckIGrad);
if (grad.checkFgGrad)
paddle::platform::CudaAtomicAdd(grad.checkFgGrad + frameIdx,
rCheckFGrad);
}
if (grad.checkOgGrad)
paddle::platform::CudaAtomicAdd(grad.checkOgGrad + frameIdx, rCheckOGrad);
} else {
if (value.prevStateValue) {
if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
}
if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
}
}
template <class T, class Op>
void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
dim3 threads;
dim3 grid;
if (batchSize == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1);
grid = dim3(frameBlocks, 1);
} else {
/* framePerBlock = 32 batchPerBlock = 32 */
threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
}
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
if (batchSize == 1) {
KeLstmForward<T, Op,
/* isBatch= */ false><<<grid, threads, 0, stream>>>(
op, value, frameSize, batchSize, active_node, active_gate,
active_state);
} else {
KeLstmForward<T, Op,
/* isBatch= */ true><<<grid, threads, 0, stream>>>(
op, value, frameSize, batchSize, active_node, active_gate,
active_state);
}
}
template <class T, class Op>
void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
LstmMetaValue<T> value, LstmMetaGrad<T> grad,
int frameSize, int batchSize,
activation_mode_t active_node,
activation_mode_t active_gate,
activation_mode_t active_state) {
dim3 threads;
dim3 grid;
if (batchSize == 1) {
int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
int frameBlocks = (frameSize + 1024 - 1) / 1024;
threads = dim3(framePerBlock, 1);
grid = dim3(frameBlocks, 1);
} else {
/* framePerBlock = 32 batchPerBlock = 32 */
threads = dim3(32, 32);
grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
}
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
if (batchSize == 1) {
KeLstmBackward<T, Op,
/* isBatch= */ false><<<grid, threads, 0, stream>>>(
op, value, grad, frameSize, batchSize, active_node, active_gate,
active_state);
} else {
KeLstmBackward<T, Op,
/* isBatch= */ true><<<grid, threads, 0, stream>>>(
op, value, grad, frameSize, batchSize, active_node, active_gate,
active_state);
}
}
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/detail/hl_activation_functions.h"
#include "paddle/platform/hostdevice.h"
#include <type_traits>
namespace paddle {
namespace operators {
namespace math {
namespace detail {
namespace forward {
template <class T>
class lstm {
public:
HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
T &prevState, T &state, T &stateAtv, T &output,
T &checkI, T &checkF, T &checkO,
typename hppl::ForwardActType<T>::type actInput,
typename hppl::ForwardActType<T>::type actGate,
typename hppl::ForwardActType<T>::type actState) {
valueIn = actInput(valueIn);
valueIg = actGate(valueIg + prevState * checkI);
valueFg = actGate(valueFg + prevState * checkF);
state = valueIn * valueIg + prevState * valueFg;
valueOg = actGate(valueOg + state * checkO);
stateAtv = actState(state);
output = valueOg * stateAtv;
}
#ifndef __NVCC__
#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default
static const bool avx = false;
#else
// Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value;
HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg,
__m256 &valueOg, __m256 &prevState, __m256 &state,
__m256 &stateAtv, __m256 &output, __m256 &checkI,
__m256 &checkF, __m256 &checkO,
hppl::Active<__m256>::forward actInput,
hppl::Active<__m256>::forward actGate,
hppl::Active<__m256>::forward actState) {
valueIn = actInput(valueIn);
valueIg = actGate(_mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)));
valueFg = actGate(_mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)));
state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg),
_mm256_mul_ps(prevState, valueFg));
valueOg = actGate(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)));
stateAtv = actState(state);
output = _mm256_mul_ps(valueOg, stateAtv);
}
#endif
#endif
};
} // namespace forward
namespace backward {
template <class T>
class lstm {
public:
HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
T &gradIn, T &gradIg, T &gradFg, T &gradOg,
T &prevState, T &prevStateGrad, T &state,
T &stateGrad, T &stateAtv, T &outputGrad,
T &checkI, T &checkF, T &checkO, T &checkIGrad,
T &checkFGrad, T &checkOGrad,
typename hppl::BackwardActType<T>::type actInput,
typename hppl::BackwardActType<T>::type actGate,
typename hppl::BackwardActType<T>::type actState) {
gradOg = actGate(outputGrad * stateAtv, valueOg);
stateGrad += actState(outputGrad * valueOg, stateAtv) + gradOg * checkO;
gradIn = actInput(stateGrad * valueIg, valueIn);
gradIg = actGate(stateGrad * valueIn, valueIg);
gradFg = actGate(stateGrad * prevState, valueFg);
prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg;
checkIGrad = gradIg * prevState;
checkFGrad = gradFg * prevState;
checkOGrad = gradOg * state;
}
#ifndef __NVCC__
#ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default
static const bool avx = false;
#else
// Only float support AVX optimization
static const bool avx = std::is_same<T, float>::value;
HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg,
__m256 &valueOg, __m256 &gradIn, __m256 &gradIg,
__m256 &gradFg, __m256 &gradOg, __m256 &prevState,
__m256 &prevStateGrad, __m256 &state,
__m256 &stateGrad, __m256 &stateAtv,
__m256 &outputGrad, __m256 &checkI, __m256 &checkF,
__m256 &checkO, __m256 &checkIGrad,
__m256 &checkFGrad, __m256 &checkOGrad,
hppl::Active<__m256>::backward actInput,
hppl::Active<__m256>::backward actGate,
hppl::Active<__m256>::backward actState) {
gradOg = actGate(_mm256_mul_ps(outputGrad, stateAtv), valueOg);
stateGrad = _mm256_add_ps(
actState(_mm256_mul_ps(outputGrad, valueOg), stateAtv), stateGrad);
stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad);
gradIn = actInput(_mm256_mul_ps(stateGrad, valueIg), valueIn);
gradIg = actGate(_mm256_mul_ps(stateGrad, valueIn), valueIg);
gradFg = actGate(_mm256_mul_ps(stateGrad, prevState), valueFg);
prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI),
_mm256_mul_ps(gradFg, checkF));
prevStateGrad =
_mm256_add_ps(_mm256_mul_ps(stateGrad, valueFg), prevStateGrad);
checkIGrad = _mm256_mul_ps(gradIg, prevState);
checkFGrad = _mm256_mul_ps(gradFg, prevState);
checkOGrad = _mm256_mul_ps(gradOg, state);
}
#endif
#endif
};
} // namespace backward
} // namespace detail
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -29,8 +29,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width) {
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
......@@ -41,6 +41,22 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int filter_width = col.dims()[2];
int output_height = col.dims()[3];
int output_width = col.dims()[4];
PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) /
stride_height +
1,
output_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) /
stride_width +
1,
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent.");
int channels_col = input_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
......@@ -52,16 +68,14 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int c_im = c / filter_width / filter_height;
for (int h = 0; h < output_height; ++h) {
for (int w = 0; w < output_width; ++w) {
int im_row_idx = h * stride_height + h_offset;
int im_col_idx = w * stride_width + w_offset;
if ((im_row_idx - padding_height) < 0 ||
(im_row_idx - padding_height) >= input_height ||
(im_col_idx - padding_width) < 0 ||
(im_col_idx - padding_width) >= input_width) {
int im_row_idx = h * stride_height + h_offset - padding_up;
int im_col_idx = w * stride_width + w_offset - padding_left;
if (im_row_idx < 0 || im_row_idx >= input_height || im_col_idx < 0 ||
im_col_idx >= input_width) {
col_data[(c * output_height + h) * output_width + w] = T(0);
} else {
im_row_idx += c_im * input_height - padding_height;
im_col_idx -= padding_width;
im_row_idx += c_im * input_height;
col_data[(c * output_height + h) * output_width + w] =
im_data[im_row_idx * input_width + im_col_idx];
}
......@@ -82,7 +96,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int padding_height, int padding_width) {
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -92,6 +107,22 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
int filter_width = col.dims()[2];
int output_height = col.dims()[3];
int output_width = col.dims()[4];
PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) /
stride_height +
1,
output_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) /
stride_width +
1,
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent.");
int channels_col = input_channels * filter_height * filter_width;
T* im_data = im.data<T>();
......@@ -103,14 +134,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
int c_im = c / filter_width / filter_height;
for (int h = 0; h < output_height; ++h) {
for (int w = 0; w < output_width; ++w) {
int im_row_idx = h * stride_height + h_offset;
int im_col_idx = w * stride_width + w_offset;
if ((im_row_idx - padding_height) >= 0 &&
(im_row_idx - padding_height) < input_height &&
(im_col_idx - padding_width) >= 0 &&
(im_col_idx - padding_width) < input_width) {
im_row_idx += c_im * input_height - padding_height;
im_col_idx -= padding_width;
int im_row_idx = h * stride_height + h_offset - padding_up;
int im_col_idx = w * stride_width + w_offset - padding_left;
if ((im_row_idx) >= 0 && (im_row_idx) < input_height &&
(im_col_idx) >= 0 && (im_col_idx) < input_width) {
im_row_idx += c_im * input_height;
im_data[im_row_idx * input_width + im_col_idx] +=
col_data[(c * output_height + h) * output_width + w];
}
......@@ -140,8 +169,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width) {
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -152,6 +181,21 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
int output_height = col.dims()[0];
int output_width = col.dims()[1];
PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) /
stride_height +
1,
output_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) /
stride_width +
1,
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent.");
const T* im_data = im.data<T>();
T* col_data = col.data<T>();
......@@ -163,10 +207,10 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_row_offset =
col_row_idx * stride_height + filter_row_idx - padding_height;
col_row_idx * stride_height + filter_row_idx - padding_up;
int im_col_offset =
col_col_idx * stride_width + filter_col_idx - padding_width;
int col_offset = (((col_row_idx * output_width + col_col_idx) *
col_col_idx * stride_width + filter_col_idx - padding_left;
int col_offset = ((((col_row_idx)*output_width + col_col_idx) *
input_channels +
channel) *
filter_height +
......@@ -201,7 +245,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int padding_height, int padding_width) {
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -212,6 +257,21 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
int output_height = col.dims()[0];
int output_width = col.dims()[1];
PADDLE_ENFORCE_EQ(
(input_height + padding_up + padding_down - filter_height) /
stride_height +
1,
output_height,
"Output_height and padding(padding_up, padding_down) are "
"inconsistent.");
PADDLE_ENFORCE_EQ(
(input_width + padding_left + padding_right - filter_width) /
stride_width +
1,
output_width,
"output_width and padding(padding_left, padding_right) are "
"inconsistent.");
T* im_data = im.data<T>();
const T* col_data = col.data<T>();
......@@ -223,9 +283,9 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
for (int filter_col_idx = 0; filter_col_idx < filter_width;
++filter_col_idx) {
int im_row_offset =
col_row_idx * stride_height + filter_row_idx - padding_height;
col_row_idx * stride_height + filter_row_idx - padding_up;
int im_col_offset =
col_col_idx * stride_width + filter_col_idx - padding_width;
col_col_idx * stride_width + filter_col_idx - padding_left;
int col_offset = (((col_row_idx * output_width + col_col_idx) *
input_channels +
channel) *
......
......@@ -66,8 +66,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width) {
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
......@@ -79,6 +79,15 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
int output_height = col.dims()[3];
int output_width = col.dims()[4];
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
stride_height +
1 ==
output_height);
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
stride_width +
1 ==
output_width);
int num_outputs = input_channels * output_height * output_width;
int blocks = (num_outputs + 1024 - 1) / 1024;
int block_x = 512;
......@@ -89,8 +98,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
im.data<T>(), num_outputs, input_height, input_width, filter_height,
filter_width, stride_height, stride_width, padding_height,
padding_width, output_height, output_width, col.data<T>());
filter_width, stride_height, stride_width, padding_up, padding_left,
output_height, output_width, col.data<T>());
}
};
......@@ -152,7 +161,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int padding_height, int padding_width) {
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
......@@ -164,8 +174,18 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
int output_height = col.dims()[3];
int output_width = col.dims()[4];
size_t num_kernels = input_channels * (input_height + 2 * padding_height) *
(input_width + 2 * padding_width);
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
stride_height +
1 ==
output_height);
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
stride_width +
1 ==
output_width);
size_t num_kernels = input_channels *
(input_height + padding_up + padding_down) *
(input_width + padding_left + padding_right);
size_t blocks = (num_kernels + 1024 - 1) / 1024;
size_t block_x = 512;
......@@ -178,10 +198,10 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
col2im<T><<<grid, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
num_kernels, col.data<T>(), input_height + 2 * padding_height,
input_width + 2 * padding_width, input_channels, filter_height,
filter_width, stride_height, stride_width, padding_height,
padding_width, output_height, output_width, im.data<T>());
num_kernels, col.data<T>(), input_height + padding_up + padding_down,
input_width + padding_left + padding_left, input_channels,
filter_height, filter_width, stride_height, stride_width, padding_up,
padding_left, output_height, output_width, im.data<T>());
}
};
......@@ -238,8 +258,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width) {
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -250,6 +270,15 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
int output_height = col.dims()[0];
int output_width = col.dims()[1];
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
stride_height +
1 ==
output_height);
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
stride_width +
1 ==
output_width);
int block_dim_x = 0;
int block_dim_y = 0;
if (filter_height <= 4 && filter_width <= 4) {
......@@ -274,8 +303,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
filter_height, filter_width, stride_height, stride_width,
padding_height, padding_width, output_height, output_width);
filter_height, filter_width, stride_height, stride_width, padding_up,
padding_left, output_height, output_width);
}
};
......@@ -322,7 +351,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int padding_height, int padding_width) {
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right) {
PADDLE_ENFORCE(im.dims().size() == 3);
PADDLE_ENFORCE(col.dims().size() == 5);
int input_channels = im.dims()[0];
......@@ -333,6 +363,15 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
int output_height = col.dims()[0];
int output_width = col.dims()[1];
PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
stride_height +
1 ==
output_height);
PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
stride_width +
1 ==
output_width);
int block_dim_x = 0;
int block_dim_y = 0;
if (filter_height <= 4 && filter_width <= 4) {
......@@ -357,8 +396,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
filter_height, filter_width, stride_height, stride_width,
padding_height, padding_width, output_height, output_width);
filter_height, filter_width, stride_height, stride_width, padding_up,
padding_left, output_height, output_width);
}
};
......
......@@ -74,8 +74,8 @@ class Im2ColFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& im, framework::Tensor& col,
int stride_height, int stride_width, int padding_height,
int padding_width);
int stride_height, int stride_width, int padding_up,
int padding_down, int padding_left, int padding_right);
};
template <ColFormat Format, typename Place, typename T>
......@@ -83,7 +83,8 @@ class Col2ImFunctor {
public:
void operator()(const platform::DeviceContext& context, framework::Tensor& im,
const framework::Tensor& col, int stride_height,
int stride_width, int padding_height, int padding_width);
int stride_width, int padding_up, int padding_down,
int padding_left, int padding_right);
};
} // namespace math
......
......@@ -35,6 +35,12 @@ void testIm2col() {
*
* output_ocf = [0, 1, 3, 4
* 1, 2, 4, 5]
*
* col2im_cfo = [0, 2, 2
* 3, 4, 5]
*
* col2im_ocf = [0, 2, 2
* 3, 4, 5]
*/
int input_height = 2;
int input_width = 3;
......@@ -59,7 +65,7 @@ void testIm2col() {
new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
#else
PADDLE_THROW("no GPU support");
#endif // PADDLE_ONLY_CPU
#endif // PADDLE_WITH_CUDA
}
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
......@@ -71,6 +77,7 @@ void testIm2col() {
output_ocf.mutable_data<float>(
{output_height, output_width, 1, filter_size, filter_size}, *place);
// Im2Col
paddle::operators::math::Im2ColFunctor<
paddle::operators::math::ColFormat::kCFO, Place, float>
im2col;
......@@ -78,8 +85,13 @@ void testIm2col() {
paddle::operators::math::ColFormat::kOCF, Place, float>
im2col_ocf;
im2col(*context, input, output_cfo, stride, stride, padding, padding);
im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding);
im2col(*context, input, output_cfo, stride, stride, padding, padding, padding,
padding);
im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding,
padding, padding);
float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
float* out_cfo_ptr;
if (paddle::platform::is_cpu_place(*place)) {
......@@ -88,14 +100,9 @@ void testIm2col() {
output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context);
out_cfo_ptr = output_tmp.data<float>();
}
EXPECT_EQ(out_cfo_ptr[0], 0);
EXPECT_EQ(out_cfo_ptr[1], 1);
EXPECT_EQ(out_cfo_ptr[2], 1);
EXPECT_EQ(out_cfo_ptr[3], 2);
EXPECT_EQ(out_cfo_ptr[4], 3);
EXPECT_EQ(out_cfo_ptr[5], 4);
EXPECT_EQ(out_cfo_ptr[6], 4);
EXPECT_EQ(out_cfo_ptr[7], 5);
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
}
float* out_ocf_ptr;
if (paddle::platform::is_cpu_place(*place)) {
......@@ -104,14 +111,60 @@ void testIm2col() {
output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context);
out_ocf_ptr = output_tmp.data<float>();
}
EXPECT_EQ(out_ocf_ptr[0], 0);
EXPECT_EQ(out_ocf_ptr[1], 1);
EXPECT_EQ(out_ocf_ptr[2], 3);
EXPECT_EQ(out_ocf_ptr[3], 4);
EXPECT_EQ(out_ocf_ptr[4], 1);
EXPECT_EQ(out_ocf_ptr[5], 2);
EXPECT_EQ(out_ocf_ptr[6], 4);
EXPECT_EQ(out_ocf_ptr[7], 5);
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
}
// Col2Im: kCFO
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kCFO, Place, float>
col2im;
paddle::operators::math::Col2ImFunctor<
paddle::operators::math::ColFormat::kOCF, Place, float>
col2im_ocf;
float col2im_data[] = {0, 2, 2, 3, 8, 5};
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom(input_tmp, *place, *context);
}
col2im(*context, input, output_cfo, stride, stride, padding, padding, padding,
padding);
float* in_ptr;
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
// Col2Im: kOCF
memset(input_ptr, 0, 6 * sizeof(float));
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom(input_tmp, *place, *context);
}
col2im_ocf(*context, input, output_ocf, stride, stride, padding, padding,
padding, padding);
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
in_ptr = input_tmp.data<float>();
}
for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]);
}
}
TEST(math, im2col) {
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
......@@ -25,3 +25,4 @@ nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册