diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 4002a3d0747a86ab7b495ffe52247521831b71b8..9813a556076bc2666869a85225feaf10f345217a 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -86,6 +86,7 @@ protected:
   /// Also used in 'use_mkldnn' case.
   std::vector<Argument> outputOtherDevice_;
   /// If there are several outputs, map them by each name.
+  /// MKLDNNLayer uses it only to merge output grads.
   std::map<std::string, Argument*> outputMap_;
   /// Used to merge grad on different devices.
   MatrixPtr tmpGrad_;
@@ -325,6 +326,11 @@ public:
     outputMap_[name] = output;
   }

+  /**
+   * Get the output map size, if the layer has multiple outputs.
+   */
+  size_t getOutputMapSize() { return outputMap_.size(); }
+
   /**
    * Get the output based on layer's name.
    */
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 0d6742e909635c1097b4fe21bbb304f8a71af5cb..93b35e46a75313a31c1f379cde617c1d0d7ab68c 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -225,8 +225,6 @@ void MKLDNNConvLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (cvtInVal_) {
     pipeline.push_back(*cvtInVal_);
   }
@@ -412,8 +410,6 @@ void MKLDNNConvLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (cvtOutGrad_) {
     pipeline.push_back(*cvtOutGrad_);
   }
@@ -446,28 +442,27 @@ void MKLDNNConvLayer::resetOutGrad(
     std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
     MKLDNNMatrixPtr& out) {
-  const MatrixPtr& outMat = output_.grad;
-  out = MKLDNNMatrix::create(outMat, wgtPD->diff_dst_primitive_desc());
-  CHECK(outVal_ != nullptr &&
-        out->getPrimitiveDesc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad and value should be equal";
-
-  // TODO(TJ): merge outgrad
-  // create reorder if has output grad does not match
   cpuOutGrad_ = nullptr;
   cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_ != nullptr &&
+        outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc())
+      << "primitive desc of out grad and value should be equal";
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    outMat->setData(cpuOut->getData());
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuOutVal_);
     cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
-    if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
-      out = cpuOutGrad_;
-    } else {
-      out = MKLDNNMatrix::create(nullptr, wgtPD->diff_dst_primitive_desc());
+    // create reorder if primitive desc does not match
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_);
+    } else {
+      // share the same data of CPU output
+      output_.grad->setData(cpuOut->getData());
+      out = cpuOutGrad_;
     }
   }
 }
@@ -496,32 +491,30 @@ void MKLDNNConvLayer::resetWgtBiasGrad(

 void MKLDNNConvLayer::resetInGrad(
     std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
     MKLDNNMatrixPtr& in) {
+  in = nullptr;
+  cpuInGrad_ = nullptr;
+  cvtInGrad_ = nullptr;
   if (dataPD == nullptr) {
     return;
   }
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
-  in = MKLDNNMatrix::create(inputLayers_[0]->getOutput().grad,
-                            dataPD->diff_src_primitive_desc());
-  CHECK(nullptr != inVal_ &&
-        in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of input grad and value should be equal";
-
-  // create reorder if has output grad does not match
-  cpuInGrad_ = nullptr;
-  cvtInGrad_ = nullptr;
-  if (!inputIsOnlyMKLDNN()) {
+  if (inputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc());
+    CHECK(nullptr != inVal_ &&
+          in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
+        << "primitive desc of input grad and value should be equal";
+  } else {
     const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuInVal_);
     cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
-    if (cpuInGrad_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      const MatrixPtr& dnnIn = getInputGrad(0, MKLDNN_DEVICE);
-      in = MKLDNNMatrix::create(dnnIn, in->getPrimitiveDesc());
+    in = cpuInGrad_;
+    // create reorder if PrimitiveDesc does not match
+    if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) {
+      in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE),
+                                dataPD->diff_src_primitive_desc());
       cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
      CHECK(cvtInGrad_);
-    } else {
-      in = cpuInGrad_;
     }
   }
 }
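Note on the conv changes above: both `resetOutGrad` and `resetInGrad` now follow one rule. If the plain `nchw` layout on the CPU side already matches the primitive desc MKL-DNN chose, the `MKLDNNMatrix` simply aliases the CPU buffer (`setData`); only on a mismatch is a reorder primitive appended to the pipeline. Below is a minimal standalone sketch of that decision against the MKL-DNN v0.x C++ API this code builds on; the dims and the blocked `nChw8c` format are made-up illustration values, not taken from this patch:

```cpp
#include <mkldnn.hpp>
#include <vector>

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims dims = {2, 8, 4, 4};  // bs, oc, oh, ow (illustrative)

  // CPU side is always plain nchw; the primitive may pick a blocked format.
  memory::primitive_desc cpuPD(
      {dims, memory::data_type::f32, memory::format::nchw}, eng);
  memory::primitive_desc dnnPD(
      {dims, memory::data_type::f32, memory::format::nChw8c}, eng);

  std::vector<float> cpuBuf(2 * 8 * 4 * 4, 1.f), dnnBuf(cpuBuf.size());
  memory cpuGrad(cpuPD, cpuBuf.data());
  memory dnnGrad(dnnPD, dnnBuf.data());

  std::vector<primitive> pipeline;
  if (cpuPD != dnnPD) {
    // layouts differ: convert the CPU grad into the MKLDNN layout,
    // the analogue of cvtOutGrad_ / cvtInGrad_ in the patch
    pipeline.push_back(reorder(cpuGrad, dnnGrad));
  }
  // else: the layer would simply alias the CPU buffer instead

  stream(stream::kind::eager).submit(pipeline).wait();
  return 0;
}
```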
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index e829456d6afd7cc844f752d4571cd9f90c73997f..11d3553abf7d5dbc2f259e382ed0b525c4747f55 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -214,8 +214,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (bias) {
     fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
   } else {
@@ -237,19 +235,14 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
 }

 void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  // TODO(TJ): merge outgrad
-  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  output_.grad->setData(getOutput(device).grad->getData());
-  // for MKLDNN device:
-  // can not directly cast outputgrad to mkldnnmatrix,
-  // since each layer can not write the inputgrad to mkldnn inputgrad.
-  // So just create from matrix with outputvalue format.
-  // for CPU device:
-  // fc do not need to convert from cpu device since output is always nc format
-  // only need create from cpu device
   CHECK(outVal_);
-  out =
-      MKLDNNMatrix::create(getOutput(device).grad, outVal_->getPrimitiveDesc());
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    output_.grad->setData(cpuOut->getData());
+    out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
+  }
 }

 void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
@@ -267,13 +260,11 @@ void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,

 void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
   in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
     return;
   }
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
   CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }

 void MKLDNNFcLayer::resetBwdWgtPD(
@@ -314,7 +305,6 @@ void MKLDNNFcLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   CHECK(inVal_);
   if (bias) {
     bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
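The long comment deleted from `MKLDNNFcLayer::resetOutGrad` made a point worth keeping: an FC output is two-dimensional (batch x size), and for 2-D data MKL-DNN keeps the plain `nc` layout on both the CPU and the MKLDNN side, so the CPU grad can always be wrapped without a reorder. A small sketch of that invariant, assuming the same MKL-DNN v0.x API (sizes are arbitrary):

```cpp
#include <mkldnn.hpp>
#include <cassert>

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims dims = {2, 16};  // bs, oc (illustrative)

  memory::primitive_desc cpuPD(
      {dims, memory::data_type::f32, memory::format::nc}, eng);
  memory::primitive_desc dnnPD(
      {dims, memory::data_type::f32, memory::format::nc}, eng);

  // Identical primitive descs: no reorder is ever needed, which is why
  // the FC path above can share one buffer via output_.grad->setData(...).
  assert(cpuPD == dnnPD);
  return 0;
}
```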
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index c09fd89462ef4fdaeaae3e122f96b0cc6ce373ea..41d74d08a90589761ae93cfe06bceb33bf9c94cf 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -65,6 +65,11 @@ protected:
   MKLDNNMatrixPtr biasVal_;
   MKLDNNMatrixPtr biasGrad_;

+  // merge grad primitive
+  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  // tmp input argument to save input grad, only used when merging grads
+  Argument tmpInArg_;
+
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
@@ -99,6 +104,7 @@ public:
     if (!Layer::init(layerMap, parameterMap)) {
       return false;
     }
+    setOutputMap();
     checkCPUOutputsNumber();

     stream_.reset(new MKLDNNStream());
@@ -118,6 +124,7 @@ public:
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
       // reset when input total sizes changed, not only the batchsize
       inputElemenCnt_ = elemenCnt;
+      pipelineFwd_.clear();
       reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
       resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
       if (outVal_) {
@@ -144,6 +151,7 @@ public:
   void backward(const UpdateCallback& callback) override {
     if (needResetBwd_) {
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+      pipelineBwd_.clear();
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
       needResetBwd_ = false;
     }
@@ -247,6 +255,58 @@ protected:
     }
   }

+  /**
+   * Reset the output grad matrix from the given primitive desc,
+   * and reset the merge-grad primitive if needed.
+   * Note: when this layer has several outputs,
+   * mixing with the CPU device is not supported,
+   * because the memory desc cannot be obtained from a CPU device.
+   */
+  virtual void resetOutGrad(MKLDNNMatrixPtr& out,
+                            mkldnn::memory::primitive_desc pd) {
+    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
+    mergeGrad_ = nullptr;
+    out = MKLDNNMatrix::create(output_.grad, pd);
+    if (outputMap_.size() <= 1) {
+      return;
+    }
+    std::vector<double> scales;
+    std::vector<mkldnn::memory::primitive_desc> srcPDs;
+    std::vector<mkldnn::primitive::at> srcs;
+    for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+      MKLDNNMatrixPtr src =
+          std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+      CHECK(src) << "should be MKLDNNMatrix";
+      auto srcDims = src->getDims();
+      auto dstDims = out->getDims();
+      CHECK_EQ(srcDims.size(), dstDims.size());
+      for (size_t i = 0; i < srcDims.size(); ++i) {
+        CHECK_EQ(srcDims[i], dstDims[i]);
+      }
+      srcPDs.push_back(src->getPrimitiveDesc());
+      srcs.push_back(*src);
+      scales.push_back(1.0);
+    }
+    auto sumPD = mkldnn::sum::primitive_desc(pd.desc(), scales, srcPDs);
+    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *out));
+    pipelineBwd_.insert(pipelineBwd_.begin(), *mergeGrad_);
+  }
+
+  /**
+   * Reset the input grad from the given primitive desc.
+   * This function applies when the input is MKLDNN only,
+   * or when the input does not care about the CPU device.
+   */
+  virtual void resetInGrad(MKLDNNMatrixPtr& in,
+                           mkldnn::memory::primitive_desc pd) {
+    LayerPtr& input = inputLayers_[0];
+    const MatrixPtr& grad =
+        input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
+    in = MKLDNNMatrix::create(grad, pd);
+    Argument& arg = input->getOutput(this->getName());
+    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  }
+
   /**
    * print info about sizes
    */
@@ -334,6 +394,16 @@ private:
     }
   }

+  /**
+   * Set output map of prev layers.
+   */
+  void setOutputMap() {
+    outputMap_.clear();
+    for (size_t i = 0; i < inputLayers_.size(); ++i) {
+      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
+    }
+  }
+
   /**
    * Check the cpu device number of outputOtherDevice_.
    * should have only one at most.
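A standalone sketch of the merge-grad step that the new `MKLDNNLayer::resetOutGrad` assembles: when a layer has more than one consumer, each consumer's gradient is accumulated into one destination with a `mkldnn::sum` primitive, all scales set to 1.0. This assumes the MKL-DNN v0.x API vendored at the time, which took `double` scales (later 0.x releases switched to `float`); buffer names and sizes are illustrative:

```cpp
#include <mkldnn.hpp>
#include <cassert>
#include <vector>

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims dims = {2, 16};
  memory::desc md(dims, memory::data_type::f32, memory::format::nc);
  memory::primitive_desc pd(md, eng);

  std::vector<float> g0(32, 1.f), g1(32, 2.f), merged(32, 0.f);
  memory src0(pd, g0.data()), src1(pd, g1.data()), dst(pd, merged.data());

  std::vector<double> scales = {1.0, 1.0};
  std::vector<memory::primitive_desc> srcPDs = {pd, pd};
  std::vector<primitive::at> srcs = {src0, src1};

  // sum descriptor: destination desc plus per-source scales and descs,
  // mirroring the sumPD / mergeGrad_ construction in the patch
  sum::primitive_desc sumPD(md, scales, srcPDs);
  sum mergeGrad(sumPD, srcs, dst);

  std::vector<primitive> pipeline = {mergeGrad};
  stream(stream::kind::eager).submit(pipeline).wait();
  assert(merged[0] == 3.f);  // 1.0 * g0 + 1.0 * g1
  return 0;
}
```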
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index b62dfb7c54258a593aa50d5b30096423f375c69d..5de23e1378836bd3baee1d9c8942a9a575c9dd06 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -187,7 +187,6 @@ void MKLDNNPoolLayer::resetFwdPipeline(
     std::shared_ptr<pool_fwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   fwd_ = workspace_
              ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
              : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
@@ -205,17 +204,17 @@ void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
   resetInGrad(in);
 }
 void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  CHECK(outVal_) << "Should have output value";
-  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
-
-  // create reorder if output value has cpu device and pd do not match
   cpuOutGrad_ = nullptr;
   cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_);
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
     cpuOutGrad_ = MKLDNNMatrix::create(
         cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
-    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_) << "should not be emptry";
     } else {
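In the pool hunk above, the CPU-side grad is built explicitly as `{bs_, oc_, oh_, ow_}` in `nchw` and then compared with the value's primitive desc to decide whether a reorder is needed. If you want to see which concrete layout a primitive desc actually carries, the v0.x API exposes it through the underlying C struct; a small sketch with invented dims (not values from the patch):

```cpp
#include <mkldnn.hpp>
#include <iostream>

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);
  memory::dims dims = {2, 8, 6, 6};  // bs, oc, oh, ow (illustrative)

  memory::primitive_desc cpuPD(
      {dims, memory::data_type::f32, memory::format::nchw}, eng);
  memory::primitive_desc dnnPD(
      {dims, memory::data_type::f32, memory::format::nChw8c}, eng);

  // The C-level desc records the concrete layout that was chosen;
  // comparing the two primitive descs drives the cvtOutGrad_ decision.
  auto fmt = static_cast<memory::format>(dnnPD.desc().data.format);
  std::cout << "format enum: " << fmt
            << ", needs reorder: " << (cpuPD != dnnPD) << std::endl;
  return 0;
}
```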
@@ -228,12 +227,11 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {

 void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
   in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
     return;
   }
   CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }

 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
@@ -261,7 +259,6 @@ void MKLDNNPoolLayer::resetBwdPipeline(
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   if (cvtOutGrad_) {
     pipeline.push_back(*cvtOutGrad_);
   }
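Tying the pieces together: `setOutputMap()` in MKLDNNLayer.h registers this layer, via `Layer::setOutput`, in every input layer's `outputMap_`, so the new `getOutputMapSize()` in Layer.h reports how many MKLDNN consumers an output has; a count above one is what triggers the merge-grad path. A toy model of that registration with simplified stand-in types (`ToyLayer` and this `Argument` are not the Paddle classes):

```cpp
#include <cassert>
#include <map>
#include <memory>
#include <string>

// Simplified stand-ins for paddle::Argument and paddle::Layer.
struct Argument { std::shared_ptr<float> grad; };

struct ToyLayer {
  std::string name;
  std::map<std::string, Argument*> outputMap;  // mirrors Layer::outputMap_

  // what Layer::setOutput does: remember the consumer's Argument slot
  void setOutput(const std::string& consumer, Argument* arg) {
    outputMap[consumer] = arg;
  }
  size_t getOutputMapSize() const { return outputMap.size(); }
};

int main() {
  ToyLayer conv{"conv"};
  Argument branchA, branchB;  // each consumer's private tmpInArg_

  // what MKLDNNLayer::setOutputMap() does in each consumer's init()
  conv.setOutput("branch_a", &branchA);
  conv.setOutput("branch_b", &branchB);

  // two consumers: resetOutGrad would build the sum (merge) primitive
  assert(conv.getOutputMapSize() == 2);
  return 0;
}
```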