From c1914543b0eaef98450314a1b56f4f918aa36ce2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 19 Oct 2017 14:34:44 +0800 Subject: [PATCH] refine mkldnn logic, move reset buffers into MKLDNNLayer --- paddle/gserver/layers/MKLDNNConvLayer.cpp | 233 +++------------- paddle/gserver/layers/MKLDNNConvLayer.h | 66 ----- paddle/gserver/layers/MKLDNNFcLayer.cpp | 101 ++----- paddle/gserver/layers/MKLDNNFcLayer.h | 8 - paddle/gserver/layers/MKLDNNLayer.h | 324 ++++++++++++++++++---- paddle/gserver/layers/MKLDNNPoolLayer.cpp | 103 +------ paddle/gserver/layers/MKLDNNPoolLayer.h | 13 - paddle/math/MKLDNNMatrix.cpp | 2 +- paddle/math/MKLDNNMatrix.h | 14 +- 9 files changed, 358 insertions(+), 506 deletions(-) diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp index 26810a64834..463e6ad0ed7 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.cpp +++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp @@ -116,8 +116,6 @@ void MKLDNNConvLayer::resetFwd(std::vector& pipeline, resetFwdBuffers(fwdPD_, in, wgt, bias, out); resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out); - - printValueFormatFlow(); } void MKLDNNConvLayer::resetBwd(std::vector& pipeline, @@ -135,12 +133,6 @@ void MKLDNNConvLayer::resetBwd(std::vector& pipeline, resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out); resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out); - - printGradFormatFlow(); -} - -void MKLDNNConvLayer::updateInputData() { - cpuInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) { @@ -211,11 +203,18 @@ void MKLDNNConvLayer::resetFwdBuffers( MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { CHECK(pd); - resetInValue(pd, in); + resetInValue( + in, std::make_shared(pd->src_primitive_desc())); + + resetOutValue(out, pd->dst_primitive_desc()); - resetWgtBiasValue(pd, wgt, bias); + resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc()); - resetOutValue(pd, out); + bias = nullptr; + if (biases_ == nullptr || biases_->getW() == nullptr) { + return; + } + resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc()); } void MKLDNNConvLayer::resetFwdPipeline( @@ -225,104 +224,12 @@ void MKLDNNConvLayer::resetFwdPipeline( MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - if (cvtInVal_) { - pipeline.push_back(*cvtInVal_); - } - if (bias) { fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out)); } else { fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out)); } pipeline.push_back(*fwd_); - - if (cvtOutVal_) { - pipeline.push_back(*cvtOutVal_); - } -} - -void MKLDNNConvLayer::resetInValue( - std::shared_ptr& pd, MKLDNNMatrixPtr& in) { - const MatrixPtr& inMat = inputLayers_[0]->getOutputValue(); - in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc()); - - // create buffer and reorder if input value do not match - cpuInVal_ = nullptr; - cvtInVal_ = nullptr; - - MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast(inMat); - CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr); - if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { - in = dnnIn; - return; - } - if (dnnIn) { - if (dnnIn->getFormat() == format::nc) { - CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format"; - // create a new one with nchw format and same data - memory::dims inDims = memory::dims{bs_, ic_, 1, 1}; - dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); - } - if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) { - in = dnnIn; - return; - } - cpuInVal_ 
= dnnIn; - in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); - cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } else { - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_); - if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) { - // create new mkldnn matrix - in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc()); - cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in); - CHECK(cvtInVal_) << "should not be emptry"; - } else { - in = cpuInVal_; - } - } -} - -void MKLDNNConvLayer::resetWgtBiasValue( - std::shared_ptr& pd, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { - wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc()); - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat(); - - bias = (biases_ && biases_->getW()) - ? MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc()) - : nullptr; -} - -void MKLDNNConvLayer::resetOutValue( - std::shared_ptr& pd, MKLDNNMatrixPtr& out) { - out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc()); - - // create reorder if output value has cpu device and pd do not match - cpuOutVal_ = nullptr; - cvtOutVal_ = nullptr; - if (!outputIsOnlyMKLDNN()) { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_); - if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) { - out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc()); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); - CHECK(cvtOutVal_) << "should not be empty"; - } else { - cpuOut->setData(output_.value->getData()); - cpuOutVal_ = out; - } - // when output is cpu device, change the mkldnn output value and make them - // share the same data. Then if next layer use inputlayer->getOuputValue() - // to achieve the input value, it will get the right data. 
- output_.value = std::dynamic_pointer_cast(cpuOutVal_); - return; - } - output_.value = std::dynamic_pointer_cast(out); } void MKLDNNConvLayer::resetBwdWgtPD( @@ -331,8 +238,8 @@ void MKLDNNConvLayer::resetBwdWgtPD( loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); // create backward weight using input, output and weight value memory desc - CHECK(inVal_) << "Should have input value"; - CHECK(outVal_) << "Should have output value"; + CHECK(inVal_) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; CHECK(wgtVal_) << "Should have weight value"; algorithm algo = algorithm::convolution_direct; padding_kind padKind = padding_kind::zero; @@ -372,8 +279,8 @@ void MKLDNNConvLayer::resetBwdDataPD( memory::dims wgtDims, biasDims, strides, dilations, padL, padR; loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR); - CHECK(inVal_) << "Should have input value"; - CHECK(outVal_) << "Should have output value"; + CHECK(inVal_) << "Should have internal input value"; + CHECK(outVal_) << "Should have internal output value"; // create backward data using input and output value memory desc // but using weight memory desc with any format auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct, @@ -399,12 +306,27 @@ void MKLDNNConvLayer::resetBwdBuffers( MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { CHECK(wgtPD); - resetOutGrad(wgtPD, out); + resetOutGrad(out, wgtPD->diff_dst_primitive_desc()); - resetWgtBiasGrad(wgtPD, wgt, bias); + resetWithMatrix( + wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc()); + CHECK(wgtVal_ != nullptr && + wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc()) + << "primitive desc of weight grad and value should be equal"; - resetInGrad(dataPD, in); + bias = nullptr; + if (biases_ && biases_->getWGrad()) { + resetWithMatrix( + bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc()); + CHECK(bias && biasVal_ && + bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc()) + << "primitive desc of bias grad should equal the bias value"; + } + if (dataPD == nullptr) { + return; + } + resetInGrad(in, dataPD->diff_src_primitive_desc()); resetWgtValBwdData(dataPD, wgtValBwdData_); } @@ -416,10 +338,7 @@ void MKLDNNConvLayer::resetBwdPipeline( MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - if (cvtOutGrad_) { - pipeline.push_back(*cvtOutGrad_); - } - + CHECK(inVal_); // add bwdWgt handle if (bias) { bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias)); @@ -431,99 +350,13 @@ void MKLDNNConvLayer::resetBwdPipeline( if (dataPD == nullptr) { return; } - if (cvtWgtVal_) { pipeline.push_back(*cvtWgtVal_); } - // add bwdData handle CHECK(wgtValBwdData_) << "Should have weight memory"; bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in)); pipeline.push_back(*bwdData_); - - if (cvtInGrad_) { - pipeline.push_back(*cvtInGrad_); - } -} - -void MKLDNNConvLayer::resetOutGrad( - std::shared_ptr& wgtPD, MKLDNNMatrixPtr& out) { - cpuOutGrad_ = nullptr; - cvtOutGrad_ = nullptr; - CHECK(outVal_ != nullptr && - outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc()) - << "primitive desc of out grad and value should be equal"; - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - // always share the same grad data of CPU output - // then the activation can get the right grad from output_.grad - output_.grad->setData(cpuOut->getData()); 
- // same PrimitiveDesc with cpuInVal_ - CHECK(cpuOutVal_); - cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc()); - // create reorder if primitive desc does not match - if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); - cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); - CHECK(cvtOutGrad_); - } else { - out = cpuOutGrad_; - } - } -} - -void MKLDNNConvLayer::resetWgtBiasGrad( - std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { - wgt = MKLDNNMatrix::create(weight_->getWGrad(), - wgtPD->diff_weights_primitive_desc()); - CHECK(nullptr != wgtVal_ && - wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc()) - << "primitive desc of weight grad and value should be equal"; - VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat(); - - bias = nullptr; - if (biasVal_ == nullptr) { - return; - } - bias = MKLDNNMatrix::create(biases_->getWGrad(), - wgtPD->diff_bias_primitive_desc()); - CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc()) - << "primitive desc of bias grad should equal the bias value"; -} - -void MKLDNNConvLayer::resetInGrad( - std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in) { - in = nullptr; - cpuInGrad_ = nullptr; - cvtInGrad_ = nullptr; - if (dataPD == nullptr) { - return; - } - - if (inputIsOnlyMKLDNN()) { - MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc()); - CHECK(nullptr != inVal_ && - in->getPrimitiveDesc() == inVal_->getPrimitiveDesc()) - << "primitive desc of input grad and value should be equal"; - } else { - const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE); - // same PrimitiveDesc with cpuInVal_ - CHECK(cpuInVal_); - cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc()); - in = cpuInGrad_; - // create reorder if PrimitiveDesc does not match - if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) { - in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE), - dataPD->diff_src_primitive_desc()); - cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_); - CHECK(cvtInGrad_); - } - } } void MKLDNNConvLayer::resetWgtValBwdData( diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h index f84f2f737c4..1fed0e1c656 100644 --- a/paddle/gserver/layers/MKLDNNConvLayer.h +++ b/paddle/gserver/layers/MKLDNNConvLayer.h @@ -48,17 +48,6 @@ protected: // save forward primitive_desc, which can be used backward std::shared_ptr fwdPD_; - // MKLDNNMatrixPtr which should be created from CPU Device - MKLDNNMatrixPtr cpuInVal_; - MKLDNNMatrixPtr cpuInGrad_; - MKLDNNMatrixPtr cpuOutVal_; - MKLDNNMatrixPtr cpuOutGrad_; - // convert handle between CPU device and MKLDNN device - std::shared_ptr cvtInVal_; - std::shared_ptr cvtInGrad_; - std::shared_ptr cvtOutVal_; - std::shared_ptr cvtOutGrad_; - // whether the weight has been init bool hasInitedWgt_; @@ -94,8 +83,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void updateWeights(const UpdateCallback& callback) override; void convertWeightsFromPaddle() override; @@ -109,26 +96,6 @@ public: << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_; } - void printValueFormatFlow() override { - if (cpuInVal_) { - VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>"; - } - MKLDNNLayer::printValueFormatFlow(); - if (cpuOutVal_) { - VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat(); - } - } - - void printGradFormatFlow() override { - if 
(cpuInGrad_) { - VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<"; - } - MKLDNNLayer::printGradFormatFlow(); - if (cpuOutGrad_) { - VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat(); - } - } - protected: /** * load the dims settings of this conv @@ -162,23 +129,6 @@ protected: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of input value - */ - void resetInValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& in); - /** - * reset MKLDNNMatrix of weight and bias value - */ - void resetWgtBiasValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias); - /** - * reset MKLDNNMatrix of output value - */ - void resetOutValue(std::shared_ptr& pd, - MKLDNNMatrixPtr& out); - /** * reset the backward weight primitive descriptor. */ @@ -207,22 +157,6 @@ protected: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of output grad - */ - void resetOutGrad(std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& out); - /** - * reset MKLDNNMatrix of weight and bias grad - */ - void resetWgtBiasGrad(std::shared_ptr& wgtPD, - MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias); - /** - * reset MKLDNNMatrix of input grad - */ - void resetInGrad(std::shared_ptr& dataPD, - MKLDNNMatrixPtr& in); /** * reset MKLDNNMatrix of weight value for backward data * since the primitive_desc would be different with wgtVal_ diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp index cf19a155681..9f82a3b7475 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.cpp +++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp @@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() { CHECK(wgtVal_) << "should have been initialized"; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; auto targetDim = wgtVal_->getDims(); - auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo; wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim); hasInitedWgt_ = true; } @@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() { CHECK(wgtVal_) << "should have been initialized"; bool hasNoSpatial_ = ih_ == 1 && iw_ == 1; auto targetDim = wgtVal_->getDims(); - auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo; + auto dstFmt = hasNoSpatial_ ? 
format::io : format::ihwo; wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim); } @@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector& pipeline, resetFwdPD(fwdPD_, in, wgt, bias, out); resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out); - - printValueFormatFlow(); } void MKLDNNFcLayer::resetBwd(std::vector& pipeline, @@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector& pipeline, resetBwdDataPD(bwdDataPD, in, out); resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out); - - printGradFormatFlow(); -} - -void MKLDNNFcLayer::updateInputData() { - inVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) { @@ -139,51 +131,33 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { resetInValue(in); + CHECK(in); + in->downSpatial(); - resetWgtBiasValue(wgt, bias); - - resetOutValue(out); -} + // if (extInVal_) { + // extInVal_->downSpatial(); + // } -void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) { - if (inputIsOnlyMKLDNN()) { - const MatrixPtr& dnnIn = getInputValue(0); - in = std::dynamic_pointer_cast(dnnIn); - CHECK(in) << "Input should be MKLDNNMatrix"; - } else { - CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; - const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE); - in = MKLDNNMatrix::create( - cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_); - } - in->downSpatial(); -} + auto outPD = + MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_); + resetOutValue(out, outPD); -void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { format wgtFmt = format::oihw; - if (inVal_->getFormat() == format::nChw8c) { + if (in->getFormat() == format::nChw8c) { wgtFmt = format::oIhw8i; - } else if (inVal_->getFormat() == format::nChw16c) { + } else if (in->getFormat() == format::nChw16c) { wgtFmt = format::oIhw16i; } - wgt = MKLDNNMatrix::create( - weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_); + auto wgtPD = + MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_); + resetWithMatrix(wgt, weight_->getW(), wgtPD); wgt->downSpatial(); - VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat(); - - bias = (biases_ && biases_->getW()) - ? 
MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_) - : nullptr; -} -void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) { - out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_); - if (!outputIsOnlyMKLDNN()) { - // fc cpu output value do not need create convert, just share data - getOutput(CPU_DEVICE).value->setData(out->getData()); + if (biases_ == nullptr || biases_->getW() == nullptr) { + return; } - output_.value = std::dynamic_pointer_cast(out); + auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_); + resetWithMatrix(bias, biases_->getW(), biasPD); } void MKLDNNFcLayer::resetFwdPD(std::shared_ptr& pd, @@ -219,7 +193,6 @@ void MKLDNNFcLayer::resetFwdPipeline( } else { fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out)); } - pipeline.push_back(*fwd_); } @@ -227,44 +200,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) { - resetOutGrad(out); - - resetWgtBiasGrad(wgt, bias); - - resetInGrad(in); -} - -void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) { - CHECK(outVal_); - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - output_.grad->setData(cpuOut->getData()); - out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc()); - } -} + CHECK(inVal_ && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVal_->getPrimitiveDesc()); -void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, - MKLDNNMatrixPtr& bias) { CHECK(wgtVal_); - wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); + resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); bias = nullptr; if (biasVal_ == nullptr) { return; } - bias = - MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc()); -} - -void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) { - in = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; - } - CHECK(inVal_); - MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc()); + resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc()); } void MKLDNNFcLayer::resetBwdWgtPD( diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h index c76878aafab..ee861763ff3 100644 --- a/paddle/gserver/layers/MKLDNNFcLayer.h +++ b/paddle/gserver/layers/MKLDNNFcLayer.h @@ -66,8 +66,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void updateWeights(const UpdateCallback& callback) override; void convertWeightsFromPaddle() override; @@ -84,9 +82,6 @@ protected: MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - void resetInValue(MKLDNNMatrixPtr& in); - void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias); - void resetOutValue(MKLDNNMatrixPtr& out); void resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr wgt, @@ -109,9 +104,6 @@ protected: MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out); - void resetOutGrad(MKLDNNMatrixPtr& out); - void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias); - void resetInGrad(MKLDNNMatrixPtr& in); void resetBwdWgtPD(std::shared_ptr& pd, MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias, diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h index 4e2753eba23..ab59357ad01 100644 --- a/paddle/gserver/layers/MKLDNNLayer.h +++ 
b/paddle/gserver/layers/MKLDNNLayer.h
@@ -58,11 +58,30 @@ protected:
   std::vector pipelineFwd_;
   std::vector pipelineBwd_;
 
-  // MKLDNNMatrixPtr with internal format
+  /// value and grad are separate as internal and external buffers.
+  /// each MKLDNNLayer must init or reset internal buffer at least,
+  /// and the external buffer format is always nchw or nc (when h==w==1),
+  /// which is the same format as paddle.
+  /// When mixed with cpu device, the output_.value and output_.grad
+  /// always save the external data.
+  /// When all layers are mkldnn layers, they could be internal data.
+  /// below MKLDNNMatrix buffers are all internal buffers
   MKLDNNMatrixPtr inVal_;
   MKLDNNMatrixPtr inGrad_;
   MKLDNNMatrixPtr outVal_;
   MKLDNNMatrixPtr outGrad_;
+  // below are external value and grad
+  MKLDNNMatrixPtr extInVal_;
+  MKLDNNMatrixPtr extInGrad_;
+  MKLDNNMatrixPtr extOutVal_;
+  MKLDNNMatrixPtr extOutGrad_;
+  // convert handle between external and internal buffers
+  std::shared_ptr cvtInVal_;
+  std::shared_ptr cvtInGrad_;
+  std::shared_ptr cvtOutVal_;
+  std::shared_ptr cvtOutGrad_;
+
+  // weight and bias are always internal buffers
   MKLDNNMatrixPtr wgtVal_;
   MKLDNNMatrixPtr wgtGrad_;
   MKLDNNMatrixPtr biasVal_;
@@ -91,6 +110,7 @@ public:
         oh_(0),
         ow_(0),
         needResetBwd_(true),
+        outputOnlyMKLDNN_(false),
         engine_(mkldnn::engine::cpu, 0),
         stream_(nullptr),
         fwd_(nullptr),
@@ -128,20 +148,39 @@ public:
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
     CHECK(!inputLayers_.empty());
     copySeqInfoToOutputs();
-    size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
+    size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
     if (inputElemenCnt_ != elemenCnt) {
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
       // reset when input total sizes changed, not only the batchsize
       inputElemenCnt_ = elemenCnt;
       pipelineFwd_.clear();
       reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
+      // all cpu device output grad or value share output's
+      shareCPUDevice();
       resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
+      // MKLDNNLayer output value should be MKLDNNMatrix
+      // so external output value is necessary.
+      // then external input value is not necessary,
+      // since input may be mkldnn internal buffer.
+      CHECK(extOutVal_) << "external output value is necessary";
+      output_.value = std::dynamic_pointer_cast(extOutVal_);
+      CHECK(inVal_ && outVal_) << "internal memories are necessary";
+      if (cvtInVal_) {
+        pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
+      }
+      if (cvtOutVal_) {
+        pipelineFwd_.push_back(*cvtOutVal_);
+      }
       convertWeightsFromPaddle();
+      printValueFormat();
       needResetBwd_ = true;
     }
 
     if (inputLayers_[0]->getType() == "data") {
-      updateInputData();
+      // Update input value data when input layer is "data" type,
+      // since the input value data address might be changed.
+      CHECK(extInVal_);
+      extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
     }
 
     if (!outputOnlyMKLDNN_) {
@@ -149,8 +188,7 @@ public:
     }
     stream_->submit(pipelineFwd_);
   }
-
-  /* activation */ {
+  {
     REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
     forwardActivation();
   }
@@ -163,6 +201,16 @@ public:
       pipelineMergeGrad_.clear();
       mergeGrad_ = nullptr;
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
+      // external output grad is not necessary
+      // since output may be mkldnn internal buffer or merge them directly.
+      CHECK(outGrad_) << "internal output grad is necessary";
+      if (cvtOutGrad_) {
+        pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
+      }
+      if (cvtInGrad_) {
+        pipelineBwd_.push_back(*cvtInGrad_);
+      }
+      printGradFormat();
       needResetBwd_ = false;
     }
 
@@ -179,7 +227,6 @@ public:
       REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
       stream_->submit(pipelineBwd_);
     }
-
     {
       REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
       updateWeights(callback);
@@ -195,7 +242,7 @@ public:
                        int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
 
   /**
-   * reset the mkldnn forward primitve and memory
+   * reset the mkldnn forward primitive and memories
    * only would be called when input size changes
   */
   virtual void resetFwd(std::vector& pipeline,
@@ -205,7 +252,7 @@ public:
                         MKLDNNMatrixPtr& out) = 0;
 
   /**
-   * reset the mkldnn backward primitve and memory for mkldnn fc
+   * reset the mkldnn backward primitive and memories
    * only would be called when needed
   */
   virtual void resetBwd(std::vector& pipeline,
@@ -214,12 +261,6 @@ public:
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out) = 0;
 
-  /**
-   * Update input value data when input layer is "data" type.
-   * Since the input value data address might be changed.
-   */
-  virtual void updateInputData() {}
-
   /**
    * Update weights and biases if necessary.
   */
@@ -272,21 +313,167 @@ protected:
   }
 
   /**
-   * reset the output grad matrix from primitive desc.
-   * and reset the merge grad primitive if needed.
-   * note: when this layer has serval outputs,
+   * reset MKLDNNMatrix from Matrix and internal primitive desc.
+   * reset nullptr if matrix or primitive desc is empty
+   */
+  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                       const MatrixPtr& mat,
+                       mkldnn::memory::primitive_desc pd) {
+    dnn = nullptr;
+    if (mat == nullptr) {
+      return;
+    }
+    dnn = MKLDNNMatrix::create(mat, pd);
+  }
+
+  /**
+   * reset input value from input MKLDNNMatrix and internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInValue(
+      MKLDNNMatrixPtr& in,
+      const std::shared_ptr& intPD = nullptr) {
+    cvtInVal_ = nullptr;
+    extInVal_ = nullptr;
+    in = nullptr;
+    CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
+    auto extPD = MKLDNNMatrix::createPrimitiveDesc(
+        {bs_, ic_, ih_, iw_}, mkldnn::memory::format::nchw, engine_);
+    const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+    in = std::dynamic_pointer_cast(inMat);
+    CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
+    if (in == nullptr || in->getFormat() == mkldnn::memory::format::nc) {
+      in = MKLDNNMatrix::create(inMat, extPD);
+    }
+    extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
+    if (in->getFormat() == mkldnn::memory::format::nc) {
+      CHECK(ih_ == 1 && iw_ == 1);
+    }
+    if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
+      return;
+    }
+    // need create reorder
+    in = MKLDNNMatrix::create(nullptr, *intPD);
+    extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(inMat, extPD);
+    cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
+    CHECK(cvtInVal_) << "should not be empty";
+  }
+
+  /**
+   * reset output value from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetOutValue(MKLDNNMatrixPtr& out,
+                     mkldnn::memory::primitive_desc intPD) {
+    cvtOutVal_ = nullptr;
+    out = MKLDNNMatrix::create(output_.value, intPD);
+    extOutVal_ = out;
+    if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
+      return;
+    }
+    // need create reorder
+    CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
+    extOutVal_ = MKLDNNMatrix::create(output_.value,
+                                      {bs_, oc_, oh_, ow_},
+                                      mkldnn::memory::format::nchw,
+                                      engine_);
+    out = MKLDNNMatrix::create(nullptr, intPD);
+    cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
+    CHECK(cvtOutVal_) << "should not be empty";
+  }
+
+  /**
+   * reset input grad from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD) {
+    cvtInGrad_ = nullptr;
+    extInGrad_ = nullptr;
+    in = nullptr;
+    LayerPtr& input = inputLayers_[0];
+    if (input->getOutputGrad() == nullptr) {
+      // no need input grad
+      return;
+    }
+    CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
+        << "only support input is MKLDNN layer or only have one output layer";
+    // when input is a mkldnn branch node,
+    // this layer will save input grad to an internal buffer,
+    // and the mkldnn input layer will merge them to actual prev->output_.grad
+    const MatrixPtr& inMat =
+        input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
+    in = MKLDNNMatrix::create(inMat, intPD);
+    Argument& arg = input->getOutput(this->getName());
+    arg.grad = std::dynamic_pointer_cast(in);
+    CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+        << "should have internal input value and primitive desc must equal";
+    if (inputIsOnlyMKLDNN()) {
+      return;
+    }
+
+    extInGrad_ = in;
+    if (isPaddleFormat(extInGrad_->getFormat())) {
+      return;
+    }
+    // need create reorder
+    CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+        << "should have external input value and the format must be nchw(nc)";
+    extInGrad_ = MKLDNNMatrix::create(inMat, extInVal_->getPrimitiveDesc());
+    CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+        << "should have internal input value and primitive desc must equal";
+    in = MKLDNNMatrix::create(nullptr, intPD);
+    cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
+    CHECK(cvtInGrad_);
+  }
+
+  /**
+   * reset output grad from internal primitive desc.
+   * merge grad if necessary.
+   * reset both internal and external buffer and create reorder if necessary.
+   * note: about merge grad, when this layer has several outputs,
    * it could not be mixed with cpu device,
    * since it can not get memory desc from cpu device.
   */
-  virtual void resetOutGrad(MKLDNNMatrixPtr& out,
-                            mkldnn::memory::primitive_desc pd) {
-    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
+  void resetOutGrad(MKLDNNMatrixPtr& out,
+                    mkldnn::memory::primitive_desc intPD) {
+    cvtOutGrad_ = nullptr;
+    extOutGrad_ = nullptr;
+    out = nullptr;
+    MatrixPtr& outMat = output_.grad;
+    out = MKLDNNMatrix::create(outMat, intPD);
+    resetMergeGrad(out);
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+    CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
+    extOutGrad_ = out;
+    if (isPaddleFormat(extOutGrad_->getFormat())) {
+      return;
+    }
+    // need create reorder
+    CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
+        << "should have external output value and the format must be nchw(nc)";
+    extOutGrad_ = MKLDNNMatrix::create(outMat, extOutVal_->getPrimitiveDesc());
+    CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
+        << "should have internal output value and primitive desc must equal";
+    out = MKLDNNMatrix::create(nullptr, intPD);
+    cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
+    CHECK(cvtOutGrad_);
+  }
+
+  /**
+   * reset the merge grad primitive if necessary.
+   * note: do not support grads mixed with cpu device,
+   * since it can not get memory desc from cpu device.
+   */
+  virtual void resetMergeGrad(MKLDNNMatrixPtr& out) {
     mergeGrad_ = nullptr;
     pipelineMergeGrad_.clear();
-    out = MKLDNNMatrix::create(output_.grad, pd);
-    if (outputMap_.size() <= 1) {
+    if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
+      // do not merge when output is not all MKLDNN or only one output
       return;
     }
+    CHECK(out) << "should have reset internal output grad";
     std::vector scales(outputMap_.size(), 1.0);
     std::vector srcPDs;
     std::vector srcs;
@@ -309,15 +496,13 @@ protected:
     for (size_t i = 1; i < srcPDs.size(); ++i) {
      CHECK(srcPDs[0] == srcPDs[i]);
    }
-    tmpOutGrad_ = nullptr;
+    tmpOutGrad_ = out;
     tmpCvt_ = nullptr;
     if (out->getPrimitiveDesc() != srcPDs[0]) {
       tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
       tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
       CHECK(tmpCvt_);
       pipelineMergeGrad_.push_back(*tmpCvt_);
-    } else {
-      tmpOutGrad_ = out;
     }
 
     auto sumPD = mkldnn::sum::primitive_desc(
@@ -326,21 +511,6 @@ protected:
     pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
   }
 
-  /**
-   * reset input grad from primitive desc.
-   * this function is avaiable for input is only mkldnn
-   * or input do not care cpu device
-   */
-  virtual void resetInGrad(MKLDNNMatrixPtr& in,
-                           mkldnn::memory::primitive_desc pd) {
-    LayerPtr& input = inputLayers_[0];
-    const MatrixPtr& grad =
-        input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
-    in = MKLDNNMatrix::create(grad, pd);
-    Argument& arg = input->getOutput(this->getName());
-    arg.grad = std::dynamic_pointer_cast(in);
-  }
-
   /**
    * print info about sizes
   */
@@ -351,22 +521,50 @@ protected:
   }
 
   /**
-   * Print the mkldnn memory format flow of value
+   * print the mkldnn memory format of value
   */
-  virtual void printValueFormatFlow() {
-    if (inVal_ && outVal_) {
-      VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>> "
-                        << outVal_->getFormat();
+  virtual void printValueFormat() {
+    if (extInVal_) {
+      VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> ";
+    }
+    if (inVal_) {
+      VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>";
+    }
+    if (outVal_) {
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
+    }
+    if (extOutVal_) {
+      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+    }
+    if (wgtVal_) {
+      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
+    }
+    if (biasVal_) {
+      VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
     }
   }
 
   /**
-   * Print the mkldnn memory format flow of grad
+   * print the mkldnn memory format of grad
   */
-  virtual void printGradFormatFlow() {
-    if (inGrad_ && outGrad_) {
-      VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<< "
-                        << outGrad_->getFormat();
+  virtual void printGradFormat() {
+    if (extInGrad_) {
+      VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< ";
+    }
+    if (inGrad_) {
+      VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<";
+    }
+    if (outGrad_) {
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+    }
+    if (extOutGrad_) {
+      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
+    }
+    if (wgtGrad_) {
+      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
    }
+    if (biasGrad_) {
+      VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
     }
   }
 
@@ -405,6 +603,19 @@ protected:
   void setDevice(int id) { deviceId_ = id; }
 
 private:
+  /**
+   * check the format is nchw or nc,
+   * which is supported by Paddle default memory layout
+   */
+  bool isPaddleFormat(mkldnn::memory::format fmt) {
+    if (fmt == mkldnn::memory::format::nchw ||
+        fmt == mkldnn::memory::format::nc) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
   /**
    * clear all grad
   */
@@ -449,6 +660,19 @@ private:
     }
   }
 
+  /**
+   * if there is a cpu device, share value and grad data with output_
+   */
+  void shareCPUDevice() {
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].value = output_.value;
+      outputOtherDevice_[i].grad = output_.grad;
+    }
+  }
+
   /**
    * Check the cpu device number of outputOtherDevice_.
    * should have only one at most.
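(Reviewer note, not part of the patch.) To make the new split between internal and external buffers concrete, here is a rough sketch of what a derived layer's forward buffer setup looks like once it relies only on the MKLDNNLayer helpers introduced above (resetInValue, resetOutValue, resetWithMatrix). It mirrors the MKLDNNConvLayer::resetFwdBuffers change earlier in this patch; the layer name MyDNNLayer is a stand-in, fwdPD_, weight_ and biases_ are assumed members as in the conv/fc layers, and the template argument of std::make_shared is inferred because angle brackets are stripped in this rendering.

void MyDNNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                 MKLDNNMatrixPtr& wgt,
                                 MKLDNNMatrixPtr& bias,
                                 MKLDNNMatrixPtr& out) {
  CHECK(fwdPD_);
  // internal input buffer; the base class also prepares the external nchw
  // buffer and a reorder primitive whenever the primitive descs differ
  resetInValue(in,
               std::make_shared<mkldnn::memory::primitive_desc>(
                   fwdPD_->src_primitive_desc()));
  // internal output buffer; external buffer and reorder handled the same way
  resetOutValue(out, fwdPD_->dst_primitive_desc());
  // weight and bias stay internal only, wrapped around the paddle matrices
  resetWithMatrix(wgt, weight_->getW(), fwdPD_->weights_primitive_desc());
  bias = nullptr;
  if (biases_ && biases_->getW()) {
    resetWithMatrix(bias, biases_->getW(), fwdPD_->bias_primitive_desc());
  }
}

The backward side is symmetric: resetOutGrad(out, outVal_->getPrimitiveDesc()) and resetInGrad(in, inVal_->getPrimitiveDesc()) now own the cpu-device reorders and the multi-output grad merge that each layer previously implemented by hand.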
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp index 0e53e2d1b7e..6e89260f499 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp +++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp @@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector& pipeline, resetFwdPD(fwdPD_, in, out); resetFwdPipeline(pipeline, fwdPD_, in, out); - - printValueFormatFlow(); } void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, @@ -101,65 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector& pipeline, resetBwdPD(pd, in, out); resetBwdPipeline(pipeline, pd, in, out); - - printGradFormatFlow(); -} - -void MKLDNNPoolLayer::updateInputData() { - inVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); } void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { resetInValue(in); - resetOutValue(out); -} - -void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) { - if (inputIsOnlyMKLDNN()) { - const MatrixPtr& dnnIn = getInputValue(0); - in = std::dynamic_pointer_cast(dnnIn); - CHECK(in) << "Input should be MKLDNNMatrix"; - } else { - CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet"; - const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE); - in = MKLDNNMatrix::create( - cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_); - } -} - -void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) { - CHECK(inVal_) << "Should reset input value first"; memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; - out = MKLDNNMatrix::create( - output_.value, outDims, inVal_->getFormat(), engine_); - - // create reorder if output value has cpu device and pd do not match - cpuOutVal_ = nullptr; - cvtOutVal_ = nullptr; - if (!outputIsOnlyMKLDNN()) { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value; - cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_); - if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc()); - cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_); - CHECK(cvtOutVal_) << "should not be emptry"; - } else { - cpuOut->setData(output_.value->getData()); - cpuOutVal_ = out; - } - output_.value = std::dynamic_pointer_cast(cpuOutVal_); - return; - } - output_.value = std::dynamic_pointer_cast(outVal_); + CHECK(in); + auto outPD = + MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_); + resetOutValue(out, outPD); } void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr out) { - memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_}; - memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_}; memory::dims kernels = memory::dims{fh_, fw_}; memory::dims strides = memory::dims{sh_, sw_}; memory::dims padL = memory::dims{ph_, pw_}; @@ -194,58 +149,26 @@ void MKLDNNPoolLayer::resetFwdPipeline( ? 
std::make_shared(pool_fwd(*pd, *in, *out, *workspace_)) : std::make_shared(pool_fwd(*pd, *in, *out)); pipeline.push_back(*fwd_); - - if (cvtOutVal_) { - pipeline.push_back(*cvtOutVal_); - } } void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { - resetOutGrad(out); - - resetInGrad(in); -} -void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) { - cpuOutGrad_ = nullptr; - cvtOutGrad_ = nullptr; - CHECK(outVal_); - if (outputIsOnlyMKLDNN()) { - MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc()); - } else { - const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad; - // always share the same grad data of CPU output - // then the activation can get the right grad from output_.grad - output_.grad->setData(cpuOut->getData()); - cpuOutGrad_ = MKLDNNMatrix::create( - cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_); - if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) { - out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc()); - cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out); - CHECK(cvtOutGrad_) << "should not be emptry"; - } else { - out = cpuOutGrad_; - } - } -} - -void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) { - in = nullptr; - if (inputLayers_[0]->getOutput().grad == nullptr) { - return; - } - CHECK(inVal_); - MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc()); + CHECK(inVal_ && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVal_->getPrimitiveDesc()); } void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } memory::dims kernels = memory::dims{fh_, fw_}; memory::dims strides = memory::dims{sh_, sw_}; memory::dims padL = memory::dims{ph_, pw_}; memory::dims padR = getPaddingR(); - CHECK(in); CHECK(out); auto bwdDesc = pool_bwd::desc(poolAlgo_, in->getMemoryDesc(), @@ -263,8 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline( std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out) { - if (cvtOutGrad_) { - pipeline.push_back(*cvtOutGrad_); + if (pd == nullptr) { + return; } bwdData_ = diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h index 891e15a7efc..c5ec87828bf 100644 --- a/paddle/gserver/layers/MKLDNNPoolLayer.h +++ b/paddle/gserver/layers/MKLDNNPoolLayer.h @@ -38,13 +38,6 @@ protected: // pooling_avg or pooling_max mkldnn::algorithm poolAlgo_; - // MKLDNNMatrixPtr which should be created from CPU Device - MKLDNNMatrixPtr cpuOutVal_; - MKLDNNMatrixPtr cpuOutGrad_; - // convert handle between CPU device and MKLDNN device - std::shared_ptr cvtOutVal_; - std::shared_ptr cvtOutGrad_; - // save forward primitive_desc, which can be used backward std::shared_ptr fwdPD_; // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/ @@ -74,8 +67,6 @@ public: MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& out) override; - void updateInputData() override; - void printSizeInfo() override { MKLDNNLayer::printSizeInfo(); VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_ @@ -90,8 +81,6 @@ protected: * reset pipeline. */ void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); - void resetInValue(MKLDNNMatrixPtr& in); - void resetOutValue(MKLDNNMatrixPtr& out); void resetFwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr in, MKLDNNMatrixPtr out); @@ -106,8 +95,6 @@ protected: * reset pipeline. 
*/ void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); - void resetOutGrad(MKLDNNMatrixPtr& out); - void resetInGrad(MKLDNNMatrixPtr& in); void resetBwdPD(std::shared_ptr& pd, MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp index 0778bb63b7b..c606560473a 100644 --- a/paddle/math/MKLDNNMatrix.cpp +++ b/paddle/math/MKLDNNMatrix.cpp @@ -46,7 +46,7 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::format fmt, engine& eg, mkldnn::memory::data_type dtype) { - return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg)); + return create(m, createPrimitiveDesc(dims, fmt, eg, dtype)); } std::shared_ptr MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src, diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h index c843115eb9a..9e3f29eb575 100644 --- a/paddle/math/MKLDNNMatrix.h +++ b/paddle/math/MKLDNNMatrix.h @@ -52,12 +52,24 @@ public: mkldnn::engine& eg, mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32); + /** + * Create primitive descriptor. + * default with f32 dtype + */ + static mkldnn::memory::primitive_desc createPrimitiveDesc( + const mkldnn::memory::dims dims, + const mkldnn::memory::format& fmt, + const mkldnn::engine& eg, + const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { + return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg); + } + /** * Create Memory descriptor. * default with any format and f32 dtype */ static mkldnn::memory::desc createMemoryDesc( - const mkldnn::memory::dims& dims, + const mkldnn::memory::dims dims, const mkldnn::memory::format& fmt = mkldnn::memory::format::any, const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) { return mkldnn::memory::desc(dims, dtype, fmt); -- GitLab
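(Reviewer note, not part of the patch.) A small usage sketch of the MKLDNNMatrix::createPrimitiveDesc helper added in MKLDNNMatrix.h above, written the way the fc and pool layers now call it; illustrative only, assuming the usual bs_/oc_ size fields and engine_ member inside a layer.

// one call builds a memory primitive descriptor (f32 data type by default),
// replacing the memory::primitive_desc(memory::desc(dims, dtype, fmt), eg)
// spelling that every call site used before
auto outPD = MKLDNNMatrix::createPrimitiveDesc(
    {bs_, oc_}, mkldnn::memory::format::nc, engine_);
// wrap an existing paddle matrix with that layout, or pass nullptr to let
// MKLDNNMatrix manage its own internal buffer
auto dnnOut = MKLDNNMatrix::create(output_.value, outPD);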