diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 4002a3d0747a86ab7b495ffe52247521831b71b8..9813a556076bc2666869a85225feaf10f345217a 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -86,6 +86,7 @@ protected:
   /// Also used in 'use_mkldnn' case.
   std::vector<Argument> outputOtherDevice_;
   /// If there are several outputs, map them by each name.
+  /// MKLDNNLayer uses it only to merge the output grad.
   std::map<std::string, Argument*> outputMap_;
   /// Used to merge grad on different devices.
   MatrixPtr tmpGrad_;
@@ -325,6 +326,11 @@ public:
     outputMap_[name] = output;
   }
 
+  /**
+   * Get the output map size, if the layer has multiple outputs.
+   */
+  size_t getOutputMapSize() { return outputMap_.size(); }
+
   /**
    * Get the output based on layer's name.
    */
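
Note: getOutputMapSize() is what lets a producer layer detect that several
consumers have registered on it (each consumer calls setOutput() with its own
name and a private Argument slot). Below is a minimal standalone sketch of
that bookkeeping, using toy stand-ins for Paddle's Layer/Argument classes,
not the real ones:

    #include <cassert>
    #include <map>
    #include <string>

    struct Argument { float* grad = nullptr; };  // toy version of paddle::Argument

    struct Layer {  // toy producer with the same outputMap_ bookkeeping
      std::map<std::string, Argument*> outputMap_;
      void setOutput(const std::string& name, Argument* output) {
        outputMap_[name] = output;
      }
      size_t getOutputMapSize() { return outputMap_.size(); }
    };

    int main() {
      Layer producer;
      Argument slotA, slotB;  // like the tmpInArg_ of two consumer layers
      producer.setOutput("branch_a", &slotA);
      producer.setOutput("branch_b", &slotB);
      // more than one registered consumer means the producer
      // must merge the grads written into slotA and slotB
      assert(producer.getOutputMapSize() == 2);
      return 0;
    }
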
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 0d6742e909635c1097b4fe21bbb304f8a71af5cb..8b67a1ef4ffdd42559f8078873ed135751d56674 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -225,8 +225,6 @@ void MKLDNNConvLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (cvtInVal_) {
     pipeline.push_back(*cvtInVal_);
   }
@@ -245,7 +243,7 @@ void MKLDNNConvLayer::resetFwdPipeline(
 void MKLDNNConvLayer::resetInValue(
     std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& in) {
-  const MatrixPtr& inMat = inputLayers_[0]->getOutput().value;
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
   in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc());
 
   // create buffer and reorder if input value do not match
@@ -310,15 +308,20 @@ void MKLDNNConvLayer::resetOutValue(
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
     memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
     cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
-    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+    if (cpuOutVal_->getPrimitiveDesc() != pd->dst_primitive_desc()) {
+      out = MKLDNNMatrix::create(nullptr, pd->dst_primitive_desc());
       cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
-      CHECK(cvtOutVal_) << "should not be emptry";
+      CHECK(cvtOutVal_) << "should not be empty";
     } else {
-      // CPU output share the same data of MKLDNN output
-      cpuOut->setData(out->getData());
       cpuOutVal_ = out;
     }
+    // when the output is on the cpu device, change the mkldnn output value to
+    // share the same data as the cpu matrix. Then if the next layer uses
+    // inputLayers_[0]->getOutputValue() to fetch its input value, it will get
+    // the right data.
+    output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
+    return;
   }
+  output_.value = std::dynamic_pointer_cast<Matrix>(out);
 }
 
 void MKLDNNConvLayer::resetBwdWgtPD(
@@ -412,8 +415,6 @@ void MKLDNNConvLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (cvtOutGrad_) {
     pipeline.push_back(*cvtOutGrad_);
   }
@@ -446,28 +447,27 @@ void MKLDNNConvLayer::resetBwdPipeline(
 void MKLDNNConvLayer::resetOutGrad(
     std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
-  const MatrixPtr& outMat = output_.grad;
-  out = MKLDNNMatrix::create(outMat, wgtPD->diff_dst_primitive_desc());
-  CHECK(outVal_ != nullptr &&
-        out->getPrimitiveDesc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad and value should be equal";
-
-  // TODO(TJ): merge outgrad
-  // create reorder if has output grad does not match
   cpuOutGrad_ = nullptr;
   cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_ != nullptr &&
+        outVal_->getPrimitiveDesc() == wgtPD->diff_dst_primitive_desc())
+      << "primitive desc of out grad and value should be equal";
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    outMat->setData(cpuOut->getData());
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuOutVal_);
     cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
-    if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
-      out = cpuOutGrad_;
-    } else {
-      out = MKLDNNMatrix::create(nullptr, wgtPD->diff_dst_primitive_desc());
+    // create reorder if the primitive desc does not match
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_);
+    } else {
+      // share the same data as the cpu output
+      output_.grad->setData(cpuOut->getData());
+      out = cpuOutGrad_;
     }
   }
 }
@@ -496,32 +496,30 @@ void MKLDNNConvLayer::resetWgtBiasGrad(
 void MKLDNNConvLayer::resetInGrad(
     std::shared_ptr<conv_bwdData::primitive_desc>& dataPD, MKLDNNMatrixPtr& in) {
+  in = nullptr;
+  cpuInGrad_ = nullptr;
+  cvtInGrad_ = nullptr;
   if (dataPD == nullptr) {
     return;
   }
-
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
-  in = MKLDNNMatrix::create(inputLayers_[0]->getOutput().grad,
-                            dataPD->diff_src_primitive_desc());
-  CHECK(nullptr != inVal_ &&
-        in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of input grad and value should be equal";
-
-  // create reorder if has output grad does not match
-  cpuInGrad_ = nullptr;
-  cvtInGrad_ = nullptr;
-  if (!inputIsOnlyMKLDNN()) {
+  if (inputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetInGrad(in, dataPD->diff_src_primitive_desc());
+    CHECK(nullptr != inVal_ &&
+          in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
+        << "primitive desc of input grad and value should be equal";
+  } else {
     const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuInVal_);
     cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
-    if (cpuInGrad_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      const MatrixPtr& dnnIn = getInputGrad(0, MKLDNN_DEVICE);
-      in = MKLDNNMatrix::create(dnnIn, in->getPrimitiveDesc());
+    in = cpuInGrad_;
+    // create reorder if the PrimitiveDesc does not match
+    if (cpuInGrad_->getPrimitiveDesc() != dataPD->diff_src_primitive_desc()) {
+      in = MKLDNNMatrix::create(getInputGrad(0, MKLDNN_DEVICE),
                                 dataPD->diff_src_primitive_desc());
       cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
       CHECK(cvtInGrad_);
-    } else {
-      in = cpuInGrad_;
     }
   }
 }
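
Note: the new resetOutValue/resetOutGrad logic replaces the old setData()-style
sharing with direct pointer sharing: since MKLDNNMatrix derives (indirectly)
from Matrix, output_.value can simply point at the dnn matrix, and the next
layer's getOutputValue() then reads the converted buffer. A toy sketch of that
idea, with simplified types and no primitive descs:

    #include <cassert>
    #include <memory>
    #include <vector>

    struct Matrix { std::vector<float> data; };
    struct MKLDNNMatrix : Matrix { /* memory format details omitted */ };

    int main() {
      auto dnnOut = std::make_shared<MKLDNNMatrix>();
      dnnOut->data = {1.f, 2.f, 3.f};
      // what resetOutValue does when no reorder is needed:
      std::shared_ptr<Matrix> outputValue =
          std::dynamic_pointer_cast<Matrix>(dnnOut);
      // the consumer now sees the very same buffer, no copy involved
      assert(outputValue->data.data() == dnnOut->data.data());
      return 0;
    }
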
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index e829456d6afd7cc844f752d4571cd9f90c73997f..cf19a155681f3a1ceb20af67245c8f2b8fa8fa73 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -180,10 +180,10 @@ void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
 void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
   out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
   if (!outputIsOnlyMKLDNN()) {
-    // fc cpu output value do not need create convert
-    // just share point
+    // the fc cpu output value does not need a convert, just share the data
     getOutput(CPU_DEVICE).value->setData(out->getData());
   }
+  output_.value = std::dynamic_pointer_cast<Matrix>(out);
 }
 
 void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
@@ -214,8 +214,6 @@ void MKLDNNFcLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (bias) {
     fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
   } else {
@@ -237,19 +235,14 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
 }
 
 void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  // TODO(TJ): merge outgrad
-  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  output_.grad->setData(getOutput(device).grad->getData());
-  // for MKLDNN device:
-  // can not directly cast outputgrad to mkldnnmatrix,
-  // since each layer can not write the inputgrad to mkldnn inputgrad.
-  // So just create from matrix with outputvalue format.
-  // for CPU device:
-  // fc do not need to convert from cpu device since output is always nc format
-  // only need create from cpu device
   CHECK(outVal_);
-  out =
-      MKLDNNMatrix::create(getOutput(device).grad, outVal_->getPrimitiveDesc());
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    output_.grad->setData(cpuOut->getData());
+    out = MKLDNNMatrix::create(cpuOut, outVal_->getPrimitiveDesc());
+  }
 }
 
 void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
@@ -267,13 +260,11 @@ void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
 
 void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
   in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
     return;
   }
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
   CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNFcLayer::resetBwdWgtPD(
@@ -314,7 +305,6 @@ void MKLDNNFcLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   CHECK(inVal_);
   if (bias) {
     bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index c09fd89462ef4fdaeaae3e122f96b0cc6ce373ea..5f9923da769781287e39a3aaaf92248dfe09f225 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -65,6 +65,17 @@ protected:
   MKLDNNMatrixPtr biasVal_;
   MKLDNNMatrixPtr biasGrad_;
 
+  // merge grad primitive
+  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  std::vector<mkldnn::primitive> pipelineMergeGrad_;
+  // tmp input argument to save input grad, only used to merge grad
+  Argument tmpInArg_;
+  // since mkldnn sum does not support different formats
+  // (refer to https://github.com/01org/mkl-dnn/issues/134),
+  // we need to create the reorder manually and save the tmp MKLDNNMatrix
+  MKLDNNMatrixPtr tmpOutGrad_;
+  std::shared_ptr<mkldnn::reorder> tmpCvt_;
+
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
@@ -99,6 +110,7 @@ public:
     if (!Layer::init(layerMap, parameterMap)) {
       return false;
     }
+    setOutputMap();
     checkCPUOutputsNumber();
 
     stream_.reset(new MKLDNNStream());
@@ -118,12 +130,9 @@
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
       // reset when input total sizes changed, not only the batchsize
       inputElemenCnt_ = elemenCnt;
+      pipelineFwd_.clear();
       reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
       resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-      if (outVal_) {
-        // change original output value to mkldnn output value
-        output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
-      }
       convertWeightsFromPaddle();
       needResetBwd_ = true;
     }
@@ -144,9 +153,18 @@
   void backward(const UpdateCallback& callback) override {
     if (needResetBwd_) {
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+      pipelineBwd_.clear();
+      pipelineMergeGrad_.clear();
+      mergeGrad_ = nullptr;
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
       needResetBwd_ = false;
     }
+
+    // the merge grad must run before the backward activation
+    if (mergeGrad_) {
+      REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
+      stream_->submit(pipelineMergeGrad_);
+    }
     {
       REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
       backwardActivation();
@@ -247,6 +265,76 @@ protected:
     }
   }
 
+  /**
+   * reset the output grad matrix from the primitive desc,
+   * and reset the merge grad primitive if needed.
+   * note: when this layer has several outputs,
+   * it can not be mixed with a cpu device,
+   * since it can not get a memory desc from the cpu device.
+   */
+  virtual void resetOutGrad(MKLDNNMatrixPtr& out,
+                            mkldnn::memory::primitive_desc pd) {
+    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
+    mergeGrad_ = nullptr;
+    pipelineMergeGrad_.clear();
+    out = MKLDNNMatrix::create(output_.grad, pd);
+    if (outputMap_.size() <= 1) {
+      return;
+    }
+    std::vector<double> scales(outputMap_.size(), 1.0);
+    std::vector<mkldnn::memory::primitive_desc> srcPDs;
+    std::vector<mkldnn::primitive::at> srcs;
+    for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+      MKLDNNMatrixPtr src =
+          std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+      VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
+      CHECK(src) << "should be MKLDNNMatrix";
+      auto srcDims = src->getDims();
+      auto dstDims = out->getDims();
+      CHECK_EQ(srcDims.size(), dstDims.size());
+      for (size_t i = 0; i < srcDims.size(); ++i) {
+        CHECK_EQ(srcDims[i], dstDims[i]);
+      }
+      srcPDs.push_back(src->getPrimitiveDesc());
+      srcs.push_back(*src);
+    }
+
+    // TODO(TJ): remove me when mkldnn sum supports different formats
+    for (size_t i = 1; i < srcPDs.size(); ++i) {
+      CHECK(srcPDs[0] == srcPDs[i]);
+    }
+    tmpOutGrad_ = nullptr;
+    tmpCvt_ = nullptr;
+    if (out->getPrimitiveDesc() != srcPDs[0]) {
+      tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
+      tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+      CHECK(tmpCvt_);
+      pipelineMergeGrad_.push_back(*tmpCvt_);
+    } else {
+      tmpOutGrad_ = out;
+    }
+
+    auto sumPD = mkldnn::sum::primitive_desc(
+        tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
+    pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
+  }
+
+  /**
+   * reset the input grad from the primitive desc.
+   * this function is available when the input is only mkldnn,
+   * or when the input does not care about the cpu device.
+   */
+  virtual void resetInGrad(MKLDNNMatrixPtr& in,
+                           mkldnn::memory::primitive_desc pd) {
+    LayerPtr& input = inputLayers_[0];
+    const MatrixPtr& grad =
+        input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
+    in = MKLDNNMatrix::create(grad, pd);
+    Argument& arg = input->getOutput(this->getName());
+    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  }
+
   /**
    * print info about sizes
    */
@@ -334,6 +422,16 @@ private:
     }
   }
 
+  /**
+   * Register this layer in the output map of its previous layers.
+   */
+  void setOutputMap() {
+    outputMap_.clear();
+    for (size_t i = 0; i < inputLayers_.size(); ++i) {
+      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
+    }
+  }
+
   /**
    * Check the cpu device number of outputOtherDevice_.
    * should have only one at most.
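
Note: resetOutGrad() above is the heart of this change. When outputMap_ holds
more than one entry, the layer's true output grad is the elementwise sum of
every consumer's grad, which mkldnn::sum computes with all scales set to 1.0
(plus a manual reorder when the destination format differs, since mkl-dnn's
sum cannot mix formats yet). A plain-C++ toy of the same arithmetic, not the
real mkldnn primitives:

    #include <cassert>
    #include <vector>

    // merge one grad buffer per registered consumer, scale 1.0 each
    std::vector<float> mergeGrads(const std::vector<std::vector<float>>& srcs) {
      std::vector<float> dst(srcs[0].size(), 0.f);
      for (const auto& src : srcs) {
        assert(src.size() == dst.size());  // mirrors the CHECK_EQ on dims
        for (size_t i = 0; i < src.size(); ++i) {
          dst[i] += 1.0f * src[i];
        }
      }
      return dst;  // the real code may still reorder this into output_.grad
    }

    int main() {
      std::vector<float> merged = mergeGrads({{1.f, 2.f}, {10.f, 20.f}});
      assert(merged[0] == 11.f && merged[1] == 22.f);
      return 0;
    }
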
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index b62dfb7c54258a593aa50d5b30096423f375c69d..5606aae80ce8e9a1e571d3c057c471b26a59d032 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -142,14 +142,16 @@ void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
     cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
     if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(nullptr, out->getPrimitiveDesc());
       cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
       CHECK(cvtOutVal_) << "should not be emptry";
     } else {
-      // CPU output share the same data of MKLDNN output
-      cpuOut->setData(out->getData());
       cpuOutVal_ = out;
     }
+    output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
+    return;
   }
+  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
 }
 
 void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
@@ -187,7 +189,6 @@ void MKLDNNPoolLayer::resetFwdPipeline(
     std::shared_ptr<pool_fwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   fwd_ = workspace_
              ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
              : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
@@ -205,17 +206,17 @@ void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
   resetInGrad(in);
 }
 void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  CHECK(outVal_) << "Should have output value";
-  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
-
-  // create reorder if output value has cpu device and pd do not match
   cpuOutGrad_ = nullptr;
   cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
+  CHECK(outVal_);
+  if (outputIsOnlyMKLDNN()) {
+    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
+  } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
     cpuOutGrad_ = MKLDNNMatrix::create(
         cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
-    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
+      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_) << "should not be emptry";
     } else {
@@ -228,12 +229,11 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
 
 void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
   in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
     return;
   }
   CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  MKLDNNLayer::resetInGrad(in, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
@@ -261,7 +261,6 @@ void MKLDNNPoolLayer::resetBwdPipeline(
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   if (cvtOutGrad_) {
     pipeline.push_back(*cvtOutGrad_);
   }
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index f59618be9d09d146be52fb51cae84f4d24c15ef1..eaebdd671cfa1b37e5efe149588ca23fdc402a8e 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -124,8 +124,8 @@ void MKLDNNTester::randomTopDiffs() {
 void MKLDNNTester::checkForward() {
   VLOG(MKLDNN_ALL) << "Check Forward";
   printTopDatas();
-  double delta = compareMatrix(dnnLayer_->getOutput(CPU_DEVICE).value,
-                               refLayer_->getOutputValue());
+  double delta =
+      compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
   EXPECT_LE(fabs(delta), eps_);
 }
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 066837ca959e46dbe3b39c661aa1bab11cbf2734..5ebbb99c94bce45d295ae0bf585f2cf864bfc4d4 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -39,15 +39,18 @@ add_test(NAME test_CompareTwoNets
 ################ test_CompareMKLDNNandCPU ######################
 if(WITH_MKLDNN)
-  add_unittest_without_exec(test_CompareMKLDNNandCPU
-    test_CompareTwoNets.cpp)
-  add_test(NAME test_CompareMKLDNNandCPU
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU
-        --config_file_a=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_a=True
-        --config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False
-        --use_gpu=False
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+  macro(gen_command VAR_NAME CONFIG_FILE)
+    set(${VAR_NAME} "${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh" "-d" "${PADDLE_SOURCE_DIR}/python/"
+                    "${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU --use_gpu=False"
+                    "--config_file_a=trainer/tests/${CONFIG_FILE} --use_mkldnn_a=True"
+                    "--config_file_b=trainer/tests/${CONFIG_FILE} --use_mkldnn_b=False"
+                    "WORKING_DIRECTORY" "${PADDLE_SOURCE_DIR}/paddle/")
+  endmacro()
+  add_unittest_without_exec(test_CompareMKLDNNandCPU test_CompareTwoNets.cpp)
+  gen_command(compare_simple_net "sample_trainer_config_simple_net.conf")
+  gen_command(compare_branch_net "sample_trainer_config_branch_net.conf")
+  add_test(NAME test_CompareMKLDNNandCPU_simple_net COMMAND ${compare_simple_net})
+  add_test(NAME test_CompareMKLDNNandCPU_branch_net COMMAND ${compare_branch_net})
 endif()
 
 ############### test_CompareTwoOpts ###################
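
Note: gen_command expands to the full COMMAND argument list of add_test,
including the trailing "WORKING_DIRECTORY" pair, so the two generated tests
differ only in the config file they load; both run the same
test_CompareMKLDNNandCPU binary with use_mkldnn enabled on one side and
disabled on the other. After a WITH_MKLDNN build, both comparisons should be
selectable with something like "ctest -R test_CompareMKLDNNandCPU"
(illustrative invocation).
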
diff --git a/paddle/trainer/tests/sample_trainer_config_branch_net.conf b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
new file mode 100644
index 0000000000000000000000000000000000000000..c2594bc13c250a877a7b8a77e11405671c4d8907
--- /dev/null
+++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
@@ -0,0 +1,103 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+################################### Data Configuration ###################################
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+################################### Algorithm Configuration ###################################
+settings(batch_size = 256,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+################################### Network Configuration ###################################
+data = data_layer(name ="input", size=784)
+
+tmp = img_conv_layer(input=data,
+                     num_channels=1,
+                     filter_size=3,
+                     num_filters=32,
+                     padding=1,
+                     shared_biases=True,
+                     act=ReluActivation())
+
+a1 = img_conv_layer(input=tmp,
+                    filter_size=1,
+                    num_filters=32,
+                    padding=0,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+a2 = img_conv_layer(input=tmp,
+                    filter_size=3,
+                    num_filters=32,
+                    padding=1,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+tmp = concat_layer(input=[a1, a2])
+
+tmp = img_pool_layer(input=tmp,
+                     num_channels=64,
+                     pool_size=3,
+                     stride=2,
+                     padding=1,
+                     pool_type=AvgPooling())
+
+b1 = img_conv_layer(input=tmp,
+                    filter_size=3,
+                    num_filters=64,
+                    padding=1,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+b1 = img_pool_layer(input=b1,
+                    pool_size=3,
+                    stride=1,
+                    padding=1,
+                    pool_type=MaxPooling())
+
+b2 = img_conv_layer(input=tmp,
+                    filter_size=5,
+                    num_filters=64,
+                    padding=2,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+b2 = img_pool_layer(input=b2,
+                    pool_size=5,
+                    stride=1,
+                    padding=2,
+                    pool_type=MaxPooling())
+
+tmp = addto_layer(input=[b1, b2],
+                  act=ReluActivation(),
+                  bias_attr=False)
+
+tmp = img_pool_layer(input=tmp,
+                     pool_size=3,
+                     stride=2,
+                     padding=1,
+                     pool_type=MaxPooling())
+
+tmp = fc_layer(input=tmp, size=64,
+               bias_attr=False,
+               act=TanhActivation())
+
+output = fc_layer(input=tmp, size=10,
+                  bias_attr=True,
+                  act=SoftmaxActivation())
+
+lbl = data_layer(name ="label", size=10)
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)
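
Note: this branch-net config is what exercises the new merge-grad path: tmp
feeds both a1 and a2 (joined by concat_layer), and the pooled tensor feeds
both b1 and b2 (joined by addto_layer), so the layers ahead of each fork have
more than one registered consumer in outputMap_, while the trailing fc layers
keep the single-consumer path covered as well.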