From 2a98cba2d2249e43de32040b91ef07eaccb72334 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Fri, 15 Sep 2017 17:12:19 +0800
Subject: [PATCH] enable mkldnn_pool forward and backward

---
 paddle/gserver/layers/MKLDNNPoolLayer.cpp | 183 ++++++++++++++++++++--
 paddle/gserver/layers/MKLDNNPoolLayer.h   |  45 ++++++
 2 files changed, 216 insertions(+), 12 deletions(-)

diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index ed7684f2b..7ef7ee494 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "MKLDNNPoolLayer.h"
+#include "paddle/math/MathUtils.h"
 #include "paddle/utils/Logging.h"
 
 using namespace mkldnn;  // NOLINT
@@ -28,17 +29,49 @@ bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
     return false;
   }
 
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  ic_ = conf.channels();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  oc_ = ic_;
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  fh_ = conf.size_y();
+  fw_ = conf.size_x();
+  ph_ = conf.padding_y();
+  pw_ = conf.padding();
+  sh_ = conf.stride_y();
+  sw_ = conf.stride();
+
+  const std::string& type = conf.pool_type();
+  if (type == "max-projection") {
+    poolAlgo_ = algorithm::pooling_max;
+  } else if (type == "avg-projection") {
+    // TODO(TJ): support choosing exclude or include when paddle supports it;
+    // paddle only supports pooling_avg_exclude_padding yet
+    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
+  } else {
+    LOG(FATAL) << "unknown pooling type!";
+  }
+
   return true;
 }
 
 void MKLDNNPoolLayer::reshape(
     int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
+  // ic_ and oc cannot be changed
+  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+      << "Input channel cannot be changed";
   // cal output sizes
-  // oc can not be changed
-
+  // paddle uses caffeMode = false (ceil mode) for pooling
+  oh = outputSize(ih, fh_, ph_, sh_, false);
+  ow = outputSize(iw, fw_, pw_, sw_, false);
   reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
 
   printSizeInfo();
 }
 
@@ -81,40 +114,166 @@ void MKLDNNPoolLayer::updateInputData() {
 void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
   resetInValue(in);
+  resetOutValue(out);
 }
 
-void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {}
+void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
+  if (inputIsOnlyMKLDNN()) {
+    const MatrixPtr& dnnIn = getInputValue(0);
+    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
+    CHECK(in) << "Input should be MKLDNNMatrix";
+  } else {
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
+    in = MKLDNNMatrix::create(
+        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
+  }
+}
 
-void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {}
+void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
+  CHECK(inVal_) << "Should reset input value first";
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  out = MKLDNNMatrix::create(
+      output_.value, outDims, inVal_->getFormat(), engine_);
+  output_.value = std::dynamic_pointer_cast<Matrix>(out);
+
+  // create reorder if output value is on CPU device and the pds do not match
+  cpuOutVal_ = nullptr;
+  cvtOutVal_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
+    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
+    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
+      CHECK(cvtOutVal_) << "should not be empty";
+    } else {
+      // CPU output shares the same data as the MKLDNN output
+      cpuOut->setData(out->getData());
+      cpuOutVal_ = out;
+    }
+  }
+}
 
 void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr in,
-                                 MKLDNNMatrixPtr out) {}
+                                 MKLDNNMatrixPtr out) {
+  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  padding_kind padKind = padding_kind::zero;
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = pool_fwd::desc(pk,
+                                poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padKind);
+  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
+
+  // prepare workspace if necessary
+  workspace_ =
+      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
 
 void MKLDNNPoolLayer::resetFwdPipeline(
-    std::vector<mkldnn::primitive>& pipeline,
+    std::vector<primitive>& pipeline,
     std::shared_ptr<pool_fwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {}
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  fwd_ = workspace_
+             ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
+             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+
+  if (cvtOutVal_) {
+    pipeline.push_back(*cvtOutVal_);
+  }
+}
 
 void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
   resetOutGrad(out);
+  resetInGrad(in);
 }
 
-void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {}
+void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
+  CHECK(outVal_) << "Should have output value";
+  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+
+  // create reorder if output grad is on CPU device and the pds do not match
+  cpuOutGrad_ = nullptr;
+  cvtOutGrad_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    cpuOutGrad_ = MKLDNNMatrix::create(
+        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
+    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
+      CHECK(cvtOutGrad_) << "should not be empty";
+    } else {
+      // share the same data as the CPU output grad
+      output_.grad->setData(cpuOut->getData());
+      out = cpuOutGrad_;
+    }
+  }
+}
 
-void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {}
+void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
+  in = nullptr;
+  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
+  if (inGrad == nullptr) {
+    return;
+  }
+  CHECK(inVal_);
+  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+}
 
 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr& in,
-                                 MKLDNNMatrixPtr& out) {}
+                                 MKLDNNMatrixPtr& out) {
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  CHECK(in);
+  CHECK(out);
+  auto bwdDesc = pool_bwd::desc(poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padding_kind::zero);
+  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
 
 void MKLDNNPoolLayer::resetBwdPipeline(
-    std::vector<mkldnn::primitive>& pipeline,
+    std::vector<primitive>& pipeline,
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {}
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  if (cvtOutGrad_) {
+    pipeline.push_back(*cvtOutGrad_);
+  }
+
+  bwdData_ =
+      workspace_
+          ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
+          : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
+  pipeline.push_back(*bwdData_);
+}
 
 } // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
index d797ee955..891e15a7e 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -28,8 +28,28 @@ typedef mkldnn::pooling_backward pool_bwd;
  */
 class MKLDNNPoolLayer : public MKLDNNLayer {
 protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // filter (kernel) height and width
+  int fh_, fw_;
+
+  // pooling_avg or pooling_max
+  mkldnn::algorithm poolAlgo_;
+
+  // MKLDNNMatrixPtr which should be created from CPU device
+  MKLDNNMatrixPtr cpuOutVal_;
+  MKLDNNMatrixPtr cpuOutGrad_;
+  // convert handles between the CPU device and the MKLDNN device
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_pooling_forward.cpp, pooling needs a workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
 
 public:
   explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
@@ -56,6 +76,13 @@ public:
 
   void updateInputData() override;
 
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_;
+  }
+
 protected:
   /**
    * Forward functions: reset buffers(input, output),
@@ -88,6 +115,24 @@ protected:
                       std::shared_ptr<pool_bwd::primitive_desc>& pd,
                       MKLDNNMatrixPtr& in,
                       MKLDNNMatrixPtr& out);
+
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_pooling_forward.cpp
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
+    for (int i = 0; i < 2; ++i) {
+      if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
+        ++padR[0];
+      }
+      if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
+        ++padR[1];
+      }
+    }
+    return padR;
+  }
 };
 
 } // namespace paddle
-- 
GitLab
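
A note on the padding arithmetic in this patch: reshape() computes the pooling output size with outputSize(..., caffeMode = false), i.e. ceil mode, while MKL-DNN derives output sizes in floor mode from the memory descs. getPaddingR() therefore grows the bottom/right padding until the floor-mode formula reproduces Paddle's ceil-mode output size. Below is a minimal standalone sketch of that arithmetic; the helper names (outputSizeCeil, paddingR) are illustrative, not Paddle or MKL-DNN APIs.

#include <cassert>

// Ceil-mode output size, matching paddle/math/MathUtils.h with caffeMode = false:
//   out = (image - filter + 2 * pad + stride - 1) / stride + 1
static int outputSizeCeil(int image, int filter, int pad, int stride) {
  return (image - filter + 2 * pad + stride - 1) / stride + 1;
}

// One-dimensional mirror of getPaddingR(): start from the left/top padding and
// grow the right/bottom padding until MKL-DNN's floor-mode formula reaches
// outSize. Like the layer, bound the growth at two increments, following the
// mkl-dnn test_pooling_forward.cpp reference cited in the header.
static int paddingR(int image, int filter, int padL, int stride, int outSize) {
  int padR = padL;
  for (int i = 0; i < 2; ++i) {
    if ((image + padL + padR - filter) / stride + 1 < outSize) {
      ++padR;
    }
  }
  return padR;
}

int main() {
  // Example: ih = 6, fh = 3, ph = 0, sh = 2. Ceil mode gives oh = 3, but floor
  // mode with padR = 0 reaches only (6 - 3) / 2 + 1 = 2, so one extra row of
  // bottom padding is needed.
  const int oh = outputSizeCeil(6, 3, 0, 2);
  assert(oh == 3);
  assert(paddingR(6, 3, 0, 2, oh) == 1);
  return 0;
}

Without the extra bottom/right padding, the MKL-DNN primitive desc would describe a smaller output than the oh_ x ow_ buffer the layer allocates; matching the sizes this way keeps the forward and backward descriptors consistent.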