Merge pull request #4032 from tensor-tang/mkldnn-conv

Add MKLDNNConvLayer

Merge pull request #4032 from tensor-tang/mkldnn-conv
Add MKLDNNConvLayer
654344b9 · Tao Luo · GitHub · c86e7e2a · f2317b67 · 654344b9
6 changed files
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNConvLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer);
+
+/**
+ * Read the convolution settings from the layer config and create the weight
+ * and the optional bias.
+ *
+ * Only one input layer, shared biases, caffe mode and dilation 1 are
+ * supported; any other setting fails a CHECK.
+ *
+ * @param layerMap passed on to MKLDNNLayer::init
+ * @param parameterMap passed on to MKLDNNLayer::init
+ * @return false when MKLDNNLayer::init fails, true otherwise
+ */
+bool MKLDNNConvLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK(config_.shared_biases()) << "Only support shared biases yet";
+
+  oc_ = config_.num_filters();
+  const ConvConfig& conf = config_.inputs(0).conv_conf();
+  ic_ = conf.channels();
+  fw_ = conf.filter_size();
+  fh_ = conf.filter_size_y();
+  pw_ = conf.padding();
+  ph_ = conf.padding_y();
+  dw_ = conf.dilation();
+  dh_ = conf.dilation_y();
+  sw_ = conf.stride();
+  sh_ = conf.stride_y();
+  gp_ = conf.groups();
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  caffeMode_ = conf.caffe_mode();
+  CHECK(caffeMode_) << "Only support caffe mode yet";
+  CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet";
+  // check group setting; gp_ is used as a divisor below, so it must be positive
+  CHECK_GT(gp_, 0) << "group should be larger than 0";
+  CHECK_EQ((oc_ / gp_) * gp_, oc_) << "group is indivisible for oc";
+  CHECK_EQ((ic_ / gp_) * gp_, ic_) << "group is indivisible for ic";
+
+  // create weight: (oc / gp) rows of ic * fh * fw values each
+  size_t height = oc_ / gp_;
+  size_t width = ic_ * fh_ * fw_;
+  CHECK_EQ(parameters_[0]->getSize(), height * width);
+  weight_ =
+      std::unique_ptr<Weight>(new Weight(height, width, parameters_[0], 0));
+
+  // create biases only when a bias parameter exists
+  if (biasParameter_.get() != nullptr) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+  }
+  return true;
+}
+
+/**
+ * Ask wgtVal_ to reorder its own data from the paddle weight format
+ * (oihw, or goihw when gp_ is not 1). The work is done at most once:
+ * hasInitedWgt_ is set afterwards and later calls return immediately.
+ */
+void MKLDNNConvLayer::convertWeightsFromPaddle() {
+  if (!hasInitedWgt_) {
+    CHECK(wgtVal_) << "should have been initialized";
+    const bool grouped = (gp_ != 1);
+    const auto paddleFmt =
+        grouped ? memory::format::goihw : memory::format::oihw;
+    wgtVal_->reorderDataFrom(wgtVal_, paddleFmt, wgtVal_->getDims());
+    hasInitedWgt_ = true;
+  }
+}
+
+/**
+ * Ask wgtVal_ to reorder its own data to the paddle weight format
+ * (oihw, or goihw when gp_ is not 1).
+ */
+void MKLDNNConvLayer::convertWeightsToPaddle() {
+  CHECK(wgtVal_) << "should have been initialized";
+  const bool grouped = (gp_ != 1);
+  const auto paddleFmt =
+      grouped ? memory::format::goihw : memory::format::oihw;
+  wgtVal_->reorderDataTo(wgtVal_, paddleFmt, wgtVal_->getDims());
+}
+
+/**
+ * Recompute the output height and width from the current input size and
+ * resize the output accordingly. ic is not used here and oc is taken as
+ * given.
+ */
+void MKLDNNConvLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+
+  // filter extent once the dilation is applied
+  const int dilatedFh = (fh_ - 1) * dh_ + 1;
+  const int dilatedFw = (fw_ - 1) * dw_ + 1;
+  // oc can not be changed, only the spatial sizes are recalculated
+  oh = outputSize(ih, dilatedFh, ph_, sh_, caffeMode_);
+  ow = outputSize(iw, dilatedFw, pw_, sw_, caffeMode_);
+
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+  printSizeInfo();
+}
+
+/**
+ * Rebuild the forward pass: the primitive desc (kept in fwdPD_), then the
+ * value buffers, then the primitive pipeline, and finally log the formats.
+ */
+void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
+                               MKLDNNMatrixPtr& in,
+                               MKLDNNMatrixPtr& wgt,
+                               MKLDNNMatrixPtr& bias,
+                               MKLDNNMatrixPtr& out) {
+  // the buffers and the pipeline are both built from fwdPD_
+  resetFwdPD(fwdPD_);
+  resetFwdBuffers(fwdPD_, in, wgt, bias, out);
+  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+
+  printValueFormatFlow();
+}
+
+/**
+ * Rebuild the backward pass: the backward weight and backward data primitive
+ * descs, then the grad buffers, then the primitive pipeline, and finally log
+ * the formats.
+ */
+void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
+                               MKLDNNMatrixPtr& in,
+                               MKLDNNMatrixPtr& wgt,
+                               MKLDNNMatrixPtr& bias,
+                               MKLDNNMatrixPtr& out) {
+  // both descs only live for the duration of this reset
+  std::shared_ptr<conv_bwdWgt::primitive_desc> wgtPD;
+  std::shared_ptr<conv_bwdData::primitive_desc> dataPD;
+  resetBwdWgtPD(wgtPD);
+  resetBwdDataPD(dataPD);
+
+  resetBwdBuffers(wgtPD, dataPD, in, wgt, bias, out);
+  resetBwdPipeline(pipeline, wgtPD, dataPD, in, wgt, bias, out);
+
+  printGradFormatFlow();
+}
+
+/**
+ * Point cpuInVal_ at the data of the current CPU input value.
+ */
+void MKLDNNConvLayer::updateInputData() {
+  const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
+  cpuInVal_->setData(cpuIn->getData());
+}
+
+/**
+ * Run the incremental update of the weight parameter, and of the bias
+ * parameter when a bias with a gradient exists.
+ */
+void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+
+  const bool biasHasGrad = biases_ && biases_->getWGrad();
+  if (!biasHasGrad) {
+    return;
+  }
+  biases_->getParameterPtr()->incUpdate(callback);
+}
+
+/**
+ * Fill the dims describing this convolution. All arguments are outputs.
+ * The weight dims are 4-D without groups and 5-D (leading group dim) with
+ * groups.
+ */
+void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt,
+                                       memory::dims& bias,
+                                       memory::dims& stride,
+                                       memory::dims& dilation,
+                                       memory::dims& padL,
+                                       memory::dims& padR) {
+  if (gp_ == 1) {
+    wgt = memory::dims{oc_, ic_, fh_, fw_};
+  } else {
+    wgt = memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_};
+  }
+  bias = memory::dims{oc_};
+  stride = memory::dims{sh_, sw_};
+  // note: mkldnn dilation starts from 0, so the paddle value is shifted by one
+  dilation = memory::dims{dh_ - 1, dw_ - 1};
+  padL = memory::dims{ph_, pw_};
+  padR = getPaddingR();
+}
+
+/**
+ * Create the forward primitive desc into pd.
+ *
+ * The memory descs are created with the default format of
+ * MKLDNNMatrix::createMemoryDesc. The bias desc is only part of the
+ * convolution desc when a bias value exists.
+ */
+void MKLDNNConvLayer::resetFwdPD(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd) {
+  // dims for conv
+  const memory::dims inDims{bs_, ic_, ih_, iw_};
+  const memory::dims outDims{bs_, oc_, oh_, ow_};
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+
+  // forward_scoring for the test pass, forward_training otherwise
+  prop_kind pk = prop_kind::forward_training;
+  if (passType_ == PASS_TEST) {
+    pk = prop_kind::forward_scoring;
+  }
+  const algorithm algo = algorithm::convolution_direct;
+  const padding_kind padKind = padding_kind::zero;
+
+  // descs shared by the with-bias and the without-bias variant
+  const memory::desc inMD = MKLDNNMatrix::createMemoryDesc(inDims);
+  const memory::desc wgtMD = MKLDNNMatrix::createMemoryDesc(wgtDims);
+  const memory::desc outMD = MKLDNNMatrix::createMemoryDesc(outDims);
+  const bool withBias = biases_ && biases_->getW();
+  conv_fwd::desc fwdDesc =
+      withBias ? conv_fwd::desc(pk,
+                                algo,
+                                inMD,
+                                wgtMD,
+                                MKLDNNMatrix::createMemoryDesc(biasDims),
+                                outMD,
+                                strides,
+                                dilations,
+                                padL,
+                                padR,
+                                padKind)
+               : conv_fwd::desc(pk,
+                                algo,
+                                inMD,
+                                wgtMD,
+                                outMD,
+                                strides,
+                                dilations,
+                                padL,
+                                padR,
+                                padKind);
+  pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_));
+}
+
+/**
+ * Reset the forward value buffers (input, weight and bias, output) from the
+ * forward primitive desc, which must not be null.
+ */
+void MKLDNNConvLayer::resetFwdBuffers(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(pd);
+
+  // input first, then the parameters, then the output
+  resetInValue(pd, in);
+  resetWgtBiasValue(pd, wgt, bias);
+  resetOutValue(pd, out);
+}
+
+/**
+ * Rebuild the forward pipeline: the optional input reorder, the convolution
+ * itself (with or without bias) and the optional output reorder.
+ */
+void MKLDNNConvLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+
+  // reorder of the input value, if one was created
+  if (cvtInVal_) {
+    pipeline.push_back(*cvtInVal_);
+  }
+
+  // the convolution primitive; the bias is passed only when it exists
+  fwd_.reset(bias ? new conv_fwd(*pd, *in, *wgt, *bias, *out)
+                  : new conv_fwd(*pd, *in, *wgt, *out));
+  pipeline.push_back(*fwd_);
+
+  // reorder of the output value, if one was created
+  if (cvtOutVal_) {
+    pipeline.push_back(*cvtOutVal_);
+  }
+}
+
+/**
+ * Reset the MKLDNNMatrix of the input value.
+ *
+ * When the input is only on the MKLDNN device, the input matrix is used
+ * directly; an nc input is re-wrapped as nchw with 1x1 images. Otherwise the
+ * CPU input is wrapped as nchw in cpuInVal_, and a reorder (cvtInVal_) into a
+ * new buffer is created when its primitive desc differs from the one the
+ * convolution expects.
+ */
+void MKLDNNConvLayer::resetInValue(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& in) {
+  const MatrixPtr& inMat = inputLayers_[0]->getOutput().value;
+  in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc());
+
+  // forget the CPU buffer and reorder of the previous reset
+  cpuInVal_ = nullptr;
+  cvtInVal_ = nullptr;
+
+  if (!inputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
+    cpuInVal_ = MKLDNNMatrix::create(
+        cpuIn, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
+    if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
+      // the convolution wants another layout: reorder into a new buffer
+      in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
+      cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
+      CHECK(cvtInVal_) << "should not be emptry";
+    } else {
+      in = cpuInVal_;
+    }
+    return;
+  }
+
+  MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK(dnnIn) << "Input should be MKLDNNMatrix";
+  if (dnnIn->getPrimitiveDesc() != in->getPrimitiveDesc()) {
+    CHECK_EQ(dnnIn->getFormat(), format::nc);
+    CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format";
+    // wrap the same data again, this time as nchw
+    dnnIn = MKLDNNMatrix::create(
+        inMat, memory::dims{bs_, ic_, 1, 1}, format::nchw, engine_);
+    CHECK(dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc());
+  }
+  in = dnnIn;
+}
+
+/**
+ * Reset the MKLDNNMatrix of the weight value and of the bias value.
+ * bias ends up null when there is no bias value.
+ */
+void MKLDNNConvLayer::resetWgtBiasValue(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias) {
+  wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc());
+  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
+
+  if (biases_ && biases_->getW()) {
+    bias = MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc());
+  } else {
+    bias = nullptr;
+  }
+}
+
+/**
+ * Reset the MKLDNNMatrix of the output value.
+ *
+ * out wraps output_.value with the dst primitive desc of pd, and
+ * output_.value is replaced by that matrix. When outputIsOnlyMKLDNN() is
+ * false, cpuOutVal_ wraps the CPU output as nchw: it either shares the data
+ * of out (same primitive desc) or is filled through the cvtOutVal_ reorder.
+ */
+void MKLDNNConvLayer::resetOutValue(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& out) {
+  out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc());
+
+  // change original output value from cpu matrix to mkldnn matrix
+  output_.value = std::dynamic_pointer_cast<Matrix>(out);
+
+  // Clear both the CPU buffer and the reorder of the previous reset, so that
+  // resetFwdPipeline never pushes a stale reorder.
+  cpuOutVal_ = nullptr;
+  cvtOutVal_ = nullptr;
+  // create reorder if output value has cpu device and pd do not match
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
+    memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
+    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
+      CHECK(cvtOutVal_) << "should not be emptry";
+    } else {
+      // CPU output share the same data of MKLDNN output
+      cpuOut->setData(out->getData());
+      cpuOutVal_ = out;
+    }
+  }
+}
+
+/**
+ * Create the backward weight primitive desc into pd.
+ *
+ * It is built from the memory descs of the input, weight, (optional) bias
+ * and output values, with fwdPD_ passed as the forward primitive desc. The
+ * resulting primitive descs are checked against the ones of the values.
+ */
+void MKLDNNConvLayer::resetBwdWgtPD(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& pd) {
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+
+  // create backward weight using input, output and weight value memory desc
+  CHECK(inVal_) << "Should have input value";
+  CHECK(outVal_) << "Should have output value";
+  CHECK(wgtVal_) << "Should have weight value";
+
+  const algorithm algo = algorithm::convolution_direct;
+  const padding_kind padKind = padding_kind::zero;
+  const bool withBias = (biasVal_ != nullptr);
+  const auto wgtDesc = withBias
+                           ? conv_bwdWgt::desc(algo,
+                                               inVal_->getMemoryDesc(),
+                                               wgtVal_->getMemoryDesc(),
+                                               biasVal_->getMemoryDesc(),
+                                               outVal_->getMemoryDesc(),
+                                               strides,
+                                               padL,
+                                               padR,
+                                               padKind)
+                           : conv_bwdWgt::desc(algo,
+                                               inVal_->getMemoryDesc(),
+                                               wgtVal_->getMemoryDesc(),
+                                               outVal_->getMemoryDesc(),
+                                               strides,
+                                               padL,
+                                               padR,
+                                               padKind);
+  pd.reset(new conv_bwdWgt::primitive_desc(wgtDesc, engine_, *fwdPD_));
+
+  // the grads are expected to use the layouts of the matching values
+  CHECK(pd->src_primitive_desc() == inVal_->getPrimitiveDesc())
+      << "primitive desc of in value should equal";
+  CHECK(pd->diff_dst_primitive_desc() == outVal_->getPrimitiveDesc())
+      << "primitive desc of out grad should equal the out value";
+  CHECK(pd->diff_weights_primitive_desc() == wgtVal_->getPrimitiveDesc())
+      << "primitive desc of weight grad should equal the weight value";
+}
+
+/**
+ * Create the backward data primitive desc into pd.
+ *
+ * pd is left untouched when the input layer has no grad. The input and
+ * output memory descs come from the values, while the weight memory desc is
+ * created with the default format of MKLDNNMatrix::createMemoryDesc.
+ */
+void MKLDNNConvLayer::resetBwdDataPD(
+    std::shared_ptr<conv_bwdData::primitive_desc>& pd) {
+  const MatrixPtr& prevGrad = inputLayers_[0]->getOutput().grad;
+  if (prevGrad == nullptr) {
+    return;
+  }
+
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+  CHECK(inVal_) << "Should have input value";
+  CHECK(outVal_) << "Should have output value";
+
+  // create backward data using input and output value memory desc
+  // but using weight memory desc with any format
+  const algorithm algo = algorithm::convolution_direct;
+  const padding_kind padKind = padding_kind::zero;
+  const auto dataDesc =
+      conv_bwdData::desc(algo,
+                         inVal_->getMemoryDesc(),
+                         MKLDNNMatrix::createMemoryDesc(wgtDims),
+                         outVal_->getMemoryDesc(),
+                         strides,
+                         padL,
+                         padR,
+                         padKind);
+  pd.reset(new conv_bwdData::primitive_desc(dataDesc, engine_, *fwdPD_));
+
+  CHECK(pd->diff_src_primitive_desc() == inVal_->getPrimitiveDesc())
+      << "primitive desc of in grad should equal the in value";
+  CHECK(pd->diff_dst_primitive_desc() == outVal_->getPrimitiveDesc())
+      << "primitive desc of out grad should equal";
+}
+
+/**
+ * Reset the backward buffers: output grad, weight and bias grad, input grad
+ * and the weight value used by backward data. wgtPD must not be null.
+ */
+void MKLDNNConvLayer::resetBwdBuffers(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(wgtPD);
+
+  // buffers built from the backward weight desc
+  resetOutGrad(wgtPD, out);
+  resetWgtBiasGrad(wgtPD, wgt, bias);
+
+  // buffers built from the backward data desc
+  resetInGrad(dataPD, in);
+  resetWgtValBwdData(dataPD, wgtValBwdData_);
+}
+
+/**
+ * Rebuild the backward pipeline: the optional output grad reorder and the
+ * backward weight primitive, followed, only when dataPD is not null, by the
+ * optional weight value reorder, the backward data primitive and the
+ * optional input grad reorder.
+ */
+void MKLDNNConvLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+
+  if (cvtOutGrad_) {
+    pipeline.push_back(*cvtOutGrad_);
+  }
+
+  // add bwdWgt handle; the bias grad is passed only when it exists
+  bwdWgt_.reset(bias ? new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias)
+                     : new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt));
+  pipeline.push_back(*bwdWgt_);
+
+  // without a backward data desc there is nothing more to add
+  if (dataPD == nullptr) {
+    return;
+  }
+
+  if (cvtWgtVal_) {
+    pipeline.push_back(*cvtWgtVal_);
+  }
+
+  // add bwdData handle
+  CHECK(wgtValBwdData_) << "Should have weight memory";
+  bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
+  pipeline.push_back(*bwdData_);
+
+  if (cvtInGrad_) {
+    pipeline.push_back(*cvtInGrad_);
+  }
+}
+
+/**
+ * Reset the MKLDNNMatrix of the output grad.
+ *
+ * out wraps output_.grad with the diff_dst primitive desc of wgtPD, which
+ * must equal the one of the output value. When outputIsOnlyMKLDNN() is
+ * false, cpuOutGrad_ wraps the CPU output grad with the primitive desc of
+ * cpuOutVal_; it either replaces out (same primitive desc) or feeds out
+ * through the cvtOutGrad_ reorder.
+ */
+void MKLDNNConvLayer::resetOutGrad(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
+  const MatrixPtr& outGradMat = output_.grad;
+  out = MKLDNNMatrix::create(outGradMat, wgtPD->diff_dst_primitive_desc());
+  CHECK(outVal_ != nullptr &&
+        out->getPrimitiveDesc() == outVal_->getPrimitiveDesc())
+      << "primitive desc of out grad and value should be equal";
+
+  // TODO(TJ): merge outgrad
+  // forget the CPU buffer and reorder of the previous reset
+  cpuOutGrad_ = nullptr;
+  cvtOutGrad_ = nullptr;
+  if (outputIsOnlyMKLDNN()) {
+    return;
+  }
+
+  const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+  // the CPU grad uses the same primitive desc as cpuOutVal_
+  CHECK(cpuOutVal_);
+  cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
+  if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
+    // same layout: share the CPU data instead of reordering
+    outGradMat->setData(cpuOut->getData());
+    out = cpuOutGrad_;
+  } else {
+    cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
+    CHECK(cvtOutGrad_);
+  }
+}
+
+/**
+ * Reset the MKLDNNMatrix of the weight grad and of the bias grad.
+ *
+ * Both grads must use the primitive desc of the matching value. bias is set
+ * to null when there is no bias value, in the same way resetWgtBiasValue
+ * does for the value, so that resetBwdPipeline never sees a bias grad left
+ * from an earlier reset.
+ */
+void MKLDNNConvLayer::resetWgtBiasGrad(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias) {
+  wgt = MKLDNNMatrix::create(weight_->getWGrad(),
+                             wgtPD->diff_weights_primitive_desc());
+  CHECK(nullptr != wgtVal_ &&
+        wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
+      << "primitive desc of weight grad and value should be equal";
+  VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat();
+
+  // no bias value means no bias grad
+  bias = nullptr;
+  if (biasVal_ == nullptr) {
+    return;
+  }
+  bias = MKLDNNMatrix::create(biases_->getWGrad(),
+                              wgtPD->diff_bias_primitive_desc());
+  CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
+      << "primitive desc of bias grad should equal the bias value";
+}
+
+/**
+ * Reset the MKLDNNMatrix of the input grad. Nothing is done when dataPD is
+ * null.
+ *
+ * in wraps the grad of the input layer with the diff_src primitive desc of
+ * dataPD, which must equal the one of the input value. When
+ * inputIsOnlyMKLDNN() is false, cpuInGrad_ wraps the CPU input grad with the
+ * primitive desc of cpuInVal_; it either replaces in (same primitive desc)
+ * or is filled from in through the cvtInGrad_ reorder.
+ */
+void MKLDNNConvLayer::resetInGrad(
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in) {
+  if (dataPD == nullptr) {
+    return;
+  }
+
+  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
+  const MatrixPtr& prevGrad = inputLayers_[0]->getOutput().grad;
+  in = MKLDNNMatrix::create(prevGrad, dataPD->diff_src_primitive_desc());
+  CHECK(nullptr != inVal_ &&
+        in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
+      << "primitive desc of input grad and value should be equal";
+
+  // forget the CPU buffer and reorder of the previous reset
+  cpuInGrad_ = nullptr;
+  cvtInGrad_ = nullptr;
+  if (inputIsOnlyMKLDNN()) {
+    return;
+  }
+
+  const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
+  // the CPU grad uses the same primitive desc as cpuInVal_
+  CHECK(cpuInVal_);
+  cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
+  if (cpuInGrad_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
+    // different layout: compute into the MKLDNN grad, then reorder to CPU
+    const MatrixPtr& dnnIn = getInputGrad(0, MKLDNN_DEVICE);
+    in = MKLDNNMatrix::create(dnnIn, in->getPrimitiveDesc());
+    cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
+    CHECK(cvtInGrad_);
+  } else {
+    in = cpuInGrad_;
+  }
+}
+
+/**
+ * Reset the weight value used by backward data. Nothing is done when dataPD
+ * is null.
+ *
+ * The primitive desc wanted by backward data may differ from the one of
+ * wgtVal_. In that case wgtValBwdData_ becomes a new buffer filled through
+ * the cvtWgtVal_ reorder; otherwise wgtValBwdData_ is wgtVal_ itself and no
+ * reorder is kept.
+ *
+ * @note the result is stored in the member wgtValBwdData_; the wgt argument
+ *       is not written here.
+ */
+void MKLDNNConvLayer::resetWgtValBwdData(
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& wgt) {
+  if (dataPD == nullptr) {
+    return;
+  }
+
+  // create new weight value for backward data, and create reorder if necessary
+  // since the primitive_desc would be different with wgtVal_
+  CHECK(wgtVal_) << "should have weight value";
+  // drop the reorder of a previous reset, otherwise resetBwdPipeline would
+  // still push it when the primitive descs match this time
+  cvtWgtVal_ = nullptr;
+  if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
+    wgtValBwdData_ =
+        MKLDNNMatrix::create(nullptr, dataPD->weights_primitive_desc());
+    cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
+    CHECK(cvtWgtVal_);
+  } else {
+    wgtValBwdData_ = wgtVal_;
+  }
+  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
+                    << wgtValBwdData_->getFormat();
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::convolution_forward conv_fwd;
+typedef mkldnn::convolution_backward_weights conv_bwdWgt;
+typedef mkldnn::convolution_backward_data conv_bwdData;
+
+/**
+ * @brief The convolution layer implemented as a subclass of MKLDNNLayer.
+ *
+ * The config file api is mkldnn_conv
+ */
+class MKLDNNConvLayer : public MKLDNNLayer {
+protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // dilation height and width
+  int dh_, dw_;
+  // filter(kernel) height and width
+  int fh_, fw_;
+  // group number
+  int gp_;
+
+  // in resetBwdData, the format of wgtValBwdData_ is different with wgtVal_
+  MKLDNNMatrixPtr wgtValBwdData_;
+  // convert handle from wgtVal_ to wgtValBwdData_
+  std::shared_ptr<mkldnn::reorder> cvtWgtVal_;
+
+  // save forward primitive_desc, which can be used backward
+  std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
+
+  // MKLDNNMatrixPtr which should be created from CPU Device
+  MKLDNNMatrixPtr cpuInVal_;
+  MKLDNNMatrixPtr cpuInGrad_;
+  MKLDNNMatrixPtr cpuOutVal_;
+  MKLDNNMatrixPtr cpuOutGrad_;
+  // convert handle between CPU device and MKLDNN device
+  std::shared_ptr<mkldnn::reorder> cvtInVal_;
+  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // whether the weight has been init
+  bool hasInitedWgt_;
+
+  // true by default, which impact the calculation of output image size.
+  // details can refer to MathUtils.h
+  bool caffeMode_;
+
+  // weight and bias
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+
+public:
+  explicit MKLDNNConvLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), hasInitedWgt_(false), caffeMode_(true) {}
+
+  ~MKLDNNConvLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateInputData() override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+  void convertWeightsFromPaddle() override;
+
+  void convertWeightsToPaddle() override;
+
+  /**
+   * log the sizes of the base layer followed by the filter, padding, stride
+   * and dilation sizes of this conv
+   */
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
+  }
+
+  /**
+   * log the value formats, with the format of the CPU input before and the
+   * format of the CPU output after the flow of the base layer, when they exist
+   */
+  void printValueFormatFlow() override {
+    if (cpuInVal_) {
+      VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>";
+    }
+    MKLDNNLayer::printValueFormatFlow();
+    if (cpuOutVal_) {
+      VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat();
+    }
+  }
+
+  /**
+   * log the grad formats, with the format of the CPU input grad before and
+   * the format of the CPU output grad after the flow of the base layer, when
+   * they exist
+   */
+  void printGradFormatFlow() override {
+    if (cpuInGrad_) {
+      VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<";
+    }
+    MKLDNNLayer::printGradFormatFlow();
+    if (cpuOutGrad_) {
+      VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat();
+    }
+  }
+
+protected:
+  /**
+   * load the dims settings of this conv
+   */
+  void loadConvSettings(mkldnn::memory::dims& wgt,
+                        mkldnn::memory::dims& bias,
+                        mkldnn::memory::dims& stride,
+                        mkldnn::memory::dims& dilation,
+                        mkldnn::memory::dims& padL,
+                        mkldnn::memory::dims& padR);
+
+  /**
+   * reset the forward primitive descriptor.
+   */
+  void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
+  /**
+   * reset the MKLDNNMatrix buffers used in forward.
+   */
+  void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                       MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  /**
+   * reset the forward pipeline.
+   */
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * reset MKLDNNMatrix of input value
+   */
+  void resetInValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                    MKLDNNMatrixPtr& in);
+  /**
+   * reset MKLDNNMatrix of weight and bias value
+   */
+  void resetWgtBiasValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                         MKLDNNMatrixPtr& wgt,
+                         MKLDNNMatrixPtr& bias);
+  /**
+   * reset MKLDNNMatrix of output value
+   */
+  void resetOutValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                     MKLDNNMatrixPtr& out);
+
+  /**
+   * reset the backward weight primitive descriptor.
+   */
+  void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
+  /**
+   * reset the backward data primitive descriptor.
+   */
+  void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
+  /**
+   * reset the MKLDNNMatrix buffers used in backward.
+   */
+  void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                       std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                       MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  /**
+   * reset the backward pipeline.
+   */
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * reset MKLDNNMatrix of output grad
+   */
+  void resetOutGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                    MKLDNNMatrixPtr& out);
+  /**
+   * reset MKLDNNMatrix of weight and bias grad
+   */
+  void resetWgtBiasGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias);
+  /**
+   * reset MKLDNNMatrix of input grad
+   */
+  void resetInGrad(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                   MKLDNNMatrixPtr& in);
+  /**
+   * reset MKLDNNMatrix of weight value for backward data
+   * since the primitive_desc would be different with wgtVal_
+   */
+  void resetWgtValBwdData(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                          MKLDNNMatrixPtr& wgt);
+
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_convolution_forward_common.hpp
+   *
+   * padR starts as {ph_, pw_}; in each of the two rounds a side is increased
+   * by one when the output size calculated with it differs from oh_ / ow_.
+   * @note: mkldnn dilation start from 0 while paddle start from 1
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
+    for (int i = 0; i < 2; ++i) {
+      if ((ih_ - ((fh_ - 1) * dh_ + 1) + ph_ + padR[0]) / sh_ + 1 != oh_) {
+        ++padR[0];
+      }
+      if ((iw_ - ((fw_ - 1) * dw_ + 1) + pw_ + padR[1]) / sw_ + 1 != ow_) {
+        ++padR[1];
+      }
+    }
+    return padR;
+  }
+};
+
+}  // namespace paddle
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <vector>
 #include "MKLDNNTester.h"
 #include "ModelConfig.pb.h"
+#include "paddle/math/MathUtils.h"

 using namespace paddle;  // NOLINT

@@ -63,6 +64,83 @@ TEST(MKLDNNLayer, FcLayer) {
  testFcLayer({/*bs*/ 15, /*ic*/ 3, /*oc*/ 6, /*ih*/ 16, /*iw*/ 16});
 }

+struct testConvDesc {
+  int bs, gp;      // batch size, group number
+  int ic, ih, iw;  // input channels, image height, image width
+  int oc, oh, ow;  // number of filters, output height, output width
+  int fh, fw;      // filter height, width
+  int ph, pw;      // padding height, width
+  int sh, sw;      // stride height, width
+  int dh, dw;      // dilation height, width
+};
+
+void testConvLayer(const testConvDesc& pm) {
+  // layer type under test and the reference type it is compared against
+  const std::string dnnType = "mkldnn_conv";
+  const std::string refType = "exconv";
+  TestConfig cfg;
+  cfg.layerConfig.set_type(dnnType);
+  cfg.layerConfig.set_num_filters(pm.oc);
+  cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
+  // cfg.layerConfig.set_partial_sum(1); // TODO: check it
+  cfg.layerConfig.set_shared_biases(true);
+  const size_t inputSize = size_t(pm.ic * pm.ih * pm.iw);
+  const size_t weightSize = size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp);
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", inputSize, weightSize});
+  // describe the convolution of the single input
+  ConvConfig* conv = cfg.layerConfig.add_inputs()->mutable_conv_conf();
+  conv->set_caffe_mode(true);
+  conv->set_groups(pm.gp);
+  conv->set_channels(pm.ic);
+  conv->set_img_size_y(pm.ih);
+  conv->set_img_size(pm.iw);
+  conv->set_output_y(pm.oh);
+  conv->set_output_x(pm.ow);
+  conv->set_filter_size_y(pm.fh);
+  conv->set_filter_size(pm.fw);
+  conv->set_padding_y(pm.ph);
+  conv->set_padding(pm.pw);
+  conv->set_stride_y(pm.sh);
+  conv->set_stride(pm.sw);
+  conv->set_dilation_y(pm.dh);
+  conv->set_dilation(pm.dw);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels())
+      << "it is indivisible";
+  // the given output size must agree with outputSize() in caffe mode
+  const int dilatedFh = (pm.fh - 1) * pm.dh + 1;
+  const int dilatedFw = (pm.fw - 1) * pm.dw + 1;
+  int ow = outputSize(pm.iw, dilatedFw, pm.pw, pm.sw, true);
+  int oh = outputSize(pm.ih, dilatedFh, pm.ph, pm.sh, true);
+  CHECK_EQ(ow, pm.ow) << "output size check failed";
+  CHECK_EQ(oh, pm.oh) << "output size check failed";
+  // run with and without bias, with the given batch size and with 1
+  MKLDNNTester tester;
+  for (auto biasSize : {pm.oc, 0}) {
+    cfg.biasSize = biasSize;
+    TestConfig ref = cfg;
+    ref.layerConfig.set_type(refType);
+    for (auto bs : {pm.bs, 1}) {
+      tester.run(cfg, ref, bs, pm.ih, pm.iw);
+    }
+  }
+}
+
+TEST(MKLDNNLayer, ConvLayer) {
+  /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */
+  testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1});
+  testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1});
+  testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1});
+  // grouped convolutions (gp > 1)
+  testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
+}
+
 // TODO(TJ): add branch test

 int main(int argc, char** argv) {

--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -49,6 +49,27 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
 }

+std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,
+                                                     const MKLDNNMatrixPtr& dst,
+                                                     bool checkData) {
+  // Returns nullptr when no reorder is needed; mismatching inputs are fatal.
+  if (src == dst) return nullptr;  // same matrix, or both pointers are null
+  // If only one of them is null, fail loudly instead of dereferencing it.
+  CHECK(src != nullptr && dst != nullptr) << "src or dst matrix is null";
+  if (src->getPrimitiveDesc() == dst->getPrimitiveDesc()) return nullptr;
+  if (checkData && (src->getData() == dst->getData())) {
+    LOG(FATAL) << "can not create reorder with inplace data";
+    return nullptr;
+  }
+  const memory::dims srcDims = src->getDims();
+  const memory::dims dstDims = dst->getDims();
+  CHECK_EQ(srcDims.size(), dstDims.size());  // same number of dims ...
+  for (size_t i = 0; i < srcDims.size(); ++i) {
+    CHECK_EQ(srcDims[i], dstDims[i]);  // ... and the same size in each dim
+  }
+  return std::make_shared<reorder>(*src, *dst);
+}
+
 void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
                                   memory::format srcFmt,
                                   memory::dims targetDim) {

--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -52,6 +52,31 @@ public:
      mkldnn::engine& eg,
      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);

+  /**
+   * Build a memory desc from dims; fmt defaults to any and dtype to f32.
+   */
+  static mkldnn::memory::desc createMemoryDesc(
+      const mkldnn::memory::dims& dims,
+      const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    mkldnn::memory::desc md(dims, dtype, fmt);  // ctor takes dtype before fmt
+    return md;
+  }
+
+  /**
+   * Create a mkldnn::reorder primitive that converts src MKLDNNMatrix to dst.
+   * Returns nullptr when src and dst are the same matrix or have the same
+   * primitive desc; in that case no reorder is created.
+   * checkData: whether to check that src and dst use different data handles.
+   *            If true, a shared data handle (inplace reorder) is fatal;
+   *            otherwise the reorder may be inplace and its result is not
+   *            guaranteed, since not all formats or conversions support it.
+   */
+  static std::shared_ptr<mkldnn::reorder> createReorder(
+      const MKLDNNMatrixPtr& src,
+      const MKLDNNMatrixPtr& dst,
+      bool checkData = true);
+
 public:
  /**
   * Reorder this MKLDNNMatrix from other format.

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2055,20 +2055,26 @@ class ConvLayerBase(LayerBase):
        if num_filters is not None:
            self.config.num_filters = num_filters

+        use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
        use_gpu = int(g_command_config_args.get("use_gpu", 0))
        parallel_nn = int(g_command_config_args.get("parallel_nn", 0))

-        # Automatically select cudnn_type for GPU and exconv for CPU
+        # Automatically select cudnn_type for GPU, exconv for CPU
+        # and mkldnn_conv for MKLDNN
        # if set type=conv, but still reserve the way user specify
-        # exconv or cudnn_conv manually.
+        # exconv, mkldnn_conv or cudnn_conv manually.
        if self.layer_type == "cudnn_conv":
            config_assert(use_gpu, "cudnn_conv only support GPU")

+        if self.layer_type == "mkldnn_conv":
+            config_assert(use_mkldnn, "mkldnn_conv only support MKLDNN")
+
        if (use_gpu == 1 and self.layer_type != "exconv" and
+                self.layer_type != "mkldnn_conv" and
            (parallel_nn == 0 or self.config.device > -1)):
            self.layer_type = "cudnn_conv"
        else:
-            self.layer_type = "exconv"
+            self.layer_type = "mkldnn_conv" if use_mkldnn else "exconv"
        # need to specify layer in config
        self.config.type = self.layer_type

@@ -2100,6 +2106,11 @@ class ConvLayer(ConvLayerBase):
    layer_type = 'exconv'


+@config_layer('mkldnn_conv')
+class ConvLayer(ConvLayerBase):
+    layer_type = 'mkldnn_conv'  # ConvLayerBase requires use_mkldnn
+
+
 @config_layer('cudnn_conv')
 class ConvLayer(ConvLayerBase):
    layer_type = 'cudnn_conv'