diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..91f0ff5bd326e848c49c4e57ccb9e02e2bdc0626
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -0,0 +1,327 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNLayer.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+bool MKLDNNLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn. "
+                          << "Please set WITH_MKLDNN=ON "
+                          << "and set use_mkldnn=True";
+  CHECK(!useGpu_) << "Do not support GPU yet";
+
+  // the device id must be set before Layer::init is called
+  setDevice(MKLDNN_DEVICE);
+  // switch the parameters to the MKLDNN device
+  setParamsDevice(MKLDNN_DEVICE, parameterMap);
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  setOutputMap();
+  checkCPUOutputsNumber();
+
+  stream_.reset(new MKLDNNStream());
+  engine_ = CPUEngine::Instance().getEngine();
+  return true;
+}
+
+void MKLDNNLayer::forward(PassType passType) {
+  passType_ = passType;
+
+  {
+    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
+    CHECK(!inputLayers_.empty());
+    copySeqInfoToOutputs();
+    size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
+    if (inputElemenCnt_ != elemenCnt) {
+      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
+      // reset whenever the total input size changes, not only the batch size
+      inputElemenCnt_ = elemenCnt;
+      pipelineFwd_.clear();
+      reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
+      // the grads and values of all cpu device outputs share this output's
+      shareCPUDevice();
+      resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
+      // The MKLDNNLayer output value must be an MKLDNNMatrix,
+      // so the external output value is required.
+      // The external input value is optional,
+      // since the input may be an mkldnn internal buffer.
+      CHECK(extOutVal_) << "external output value is necessary";
+      output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+      CHECK(inVal_ && outVal_) << "internal memories are necessary";
+      if (cvtInVal_) {
+        pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
+      }
+      if (cvtOutVal_) {
+        pipelineFwd_.push_back(*cvtOutVal_);
+      }
+      convertWeightsFromPaddle();
+      printSizeInfo();
+      printValueFormat();
+      needResetBwd_ = true;
+    }
+
+    if (inputLayers_[0]->getType() == "data") {
+      // Refresh the input data pointer when the input layer is of "data" type,
+      // since the address of the input value data might have changed.
+      CHECK(extInVal_);
+      extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+    }
+
+    if (!outputOnlyMKLDNN_) {
+      clearGrads();
+    }
+    stream_->submit(pipelineFwd_);
+  }
+  {
+    REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void MKLDNNLayer::backward(const UpdateCallback& callback) {
+  if (needResetBwd_) {
+    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+    pipelineBwd_.clear();
+    pipelineMergeGrad_.clear();
+    mergeGrad_ = nullptr;
+    resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
+    // The external output grad is optional,
+    // since the output may be an mkldnn internal buffer or merged directly.
+    CHECK(outGrad_) << "internal output grad is necessary";
+    if (cvtOutGrad_) {
+      pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
+    }
+    if (cvtInGrad_) {
+      pipelineBwd_.push_back(*cvtInGrad_);
+    }
+    printGradFormat();
+    needResetBwd_ = false;
+  }
+
+  // the grads must be merged before the backward activation runs
+  if (mergeGrad_) {
+    REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
+    stream_->submit(pipelineMergeGrad_);
+  }
+  {
+    REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
+    backwardActivation();
+  }
+  {
+    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
+    stream_->submit(pipelineBwd_);
+  }
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    updateWeights(callback);
+  }
+}
+
+void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
+  // Read the batch size and frame sizes from the first input layer's output.
+  const Argument& prevOut = inputLayers_[0]->getOutput();
+  batchsize = prevOut.getBatchSize();
+  const int frameH = prevOut.getFrameHeight();
+  const int frameW = prevOut.getFrameWidth();
+
+  // A frame height or width of 0 is ignored,
+  // so the value passed in by the caller is kept in that case.
+  height = frameH != 0 ? frameH : height;
+  width = frameW != 0 ? frameW : width;
+}
+
+void MKLDNNLayer::reshapeOutput(size_t height, size_t width) {
+  output_.setFrameHeight(height);
+  output_.setFrameWidth(width);
+  for (auto& other : outputOtherDevice_) {  // same frame size for each entry
+    other.setFrameHeight(height);
+    other.setFrameWidth(width);
+  }
+}
+
+void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                                  const MatrixPtr& mat,
+                                  memory::primitive_desc pd) {
+  // Always drop the old matrix; rebuild it only when a source is given.
+  dnn = nullptr;
+  if (mat != nullptr) {
+    dnn = MKLDNNMatrix::create(pd, mat);
+  }
+}
+
+void MKLDNNLayer::resetInValue(
+    MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
+  cvtInVal_ = nullptr;
+  extInVal_ = nullptr;
+  in = nullptr;
+  CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
+  auto extPD = MKLDNNMatrix::createPrimitiveDesc(
+      {bs_, ic_, ih_, iw_}, format::nchw, engine_);
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
+  if (in == nullptr || in->getFormat() == format::nc) {
+    in = MKLDNNMatrix::create(extPD, inMat);
+  }
+  extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
+  if (in->getFormat() == format::nc) {
+    CHECK(ih_ == 1 && iw_ == 1);
+  }
+  if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
+    return;
+  }
+  // a reorder is needed between the external value and the internal buffer
+  in = MKLDNNMatrix::create(*intPD);
+  extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
+  cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
+  CHECK(cvtInVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
+                                memory::primitive_desc intPD) {
+  cvtOutVal_ = nullptr;
+  out = MKLDNNMatrix::create(intPD, output_.value);
+  extOutVal_ = out;
+  if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
+    return;
+  }
+  // a reorder is needed between the internal and the external nchw buffers
+  CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
+  extOutVal_ = MKLDNNMatrix::create(
+      memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
+  CHECK(cvtOutVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
+                              memory::primitive_desc intPD) {
+  cvtInGrad_ = nullptr;
+  extInGrad_ = nullptr;
+  in = nullptr;
+  LayerPtr& input = inputLayers_[0];
+  if (input->getOutputGrad() == nullptr) {
+    // the input grad is not needed
+    return;
+  }
+  CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
+      << "only support input is MKLDNN layer or only have one output layer";
+  // When the input is an mkldnn branch node,
+  // this layer saves the input grad to an internal buffer,
+  // and the mkldnn input layer merges them into the actual prev->output_.grad
+  const MatrixPtr& inMat =
+      input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
+  in = MKLDNNMatrix::create(intPD, inMat);
+  Argument& arg = input->getOutput(this->getName());
+  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+      << "should have internal input value and primitive desc must equal";
+  if (inputIsOnlyMKLDNN()) {
+    return;
+  }
+
+  extInGrad_ = in;
+  if (isPaddleFormat(extInGrad_->getFormat())) {
+    return;
+  }
+  // a reorder is needed between the internal and the external grad buffers
+  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+      << "should have external input value and the format must be nchw(nc)";
+  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
+  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+      << "should have internal input value and primitive desc must equal";
+  in = MKLDNNMatrix::create(intPD);
+  cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
+  CHECK(cvtInGrad_);
+}
+
+void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
+                               memory::primitive_desc intPD) {
+  cvtOutGrad_ = nullptr;
+  extOutGrad_ = nullptr;
+  out = nullptr;
+  MatrixPtr& outMat = output_.grad;
+  out = MKLDNNMatrix::create(intPD, outMat);
+  resetMergeGrad(out);
+  if (outputIsOnlyMKLDNN()) {
+    return;
+  }
+  CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
+  extOutGrad_ = out;
+  if (isPaddleFormat(extOutGrad_->getFormat())) {
+    return;
+  }
+  // a reorder is needed between the external and the internal grad buffers
+  CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
+      << "should have external output value and the format must be nchw(nc)";
+  extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
+  CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
+      << "should have internal output value and primitive desc must equal";
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
+  CHECK(cvtOutGrad_);
+}
+
+void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
+  mergeGrad_ = nullptr;
+  pipelineMergeGrad_.clear();
+  if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
+    // nothing to merge with a single output or when not all outputs are MKLDNN
+    return;
+  }
+  CHECK(out) << "should have reset internal output grad";
+  std::vector<double> scales(outputMap_.size(), 1.0);
+  std::vector<memory::primitive_desc> srcPDs;
+  std::vector<primitive::at> srcs;
+  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+    MKLDNNMatrixPtr src =
+        std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+    VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
+    CHECK(src) << "should be MKLDNNMatrix";
+    auto srcDims = src->getDims();
+    auto dstDims = out->getDims();
+    CHECK_EQ(srcDims.size(), dstDims.size());
+    for (size_t i = 0; i < srcDims.size(); ++i) {
+      CHECK_EQ(srcDims[i], dstDims[i]);
+    }
+    srcPDs.push_back(src->getPrimitiveDesc());
+    srcs.push_back(*src);
+  }
+
+  // TODO(TJ): remove me when mkldnn sum support different formats
+  for (size_t i = 1; i < srcPDs.size(); ++i) {
+    CHECK(srcPDs[0] == srcPDs[i]);
+  }
+  tmpOutGrad_ = out;
+  tmpCvt_ = nullptr;
+  if (out->getPrimitiveDesc() != srcPDs[0]) {
+    tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
+    tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+    CHECK(tmpCvt_);
+    pipelineMergeGrad_.push_back(*tmpCvt_);
+  }
+
+  auto sumPD =
+      sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+  mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
+  pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 80c67529dafe4d08be4271e8a57489d1770683eb..faad434526fd76d1ddb65984c43ac529ea071612 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -119,119 +119,9 @@ public:
 
   ~MKLDNNLayer() {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                            << "Please set WITH_MKLDNN=ON "
-                            << "and set use_mkldnn=True";
-    CHECK(!useGpu_) << "Do not support GPU yet";
-
-    // set device id before Layer::init
-    setDevice(MKLDNN_DEVICE);
-    // change param device to MKLDNN device
-    setParamsDevice(MKLDNN_DEVICE, parameterMap);
-    if (!Layer::init(layerMap, parameterMap)) {
-      return false;
-    }
-    setOutputMap();
-    checkCPUOutputsNumber();
-
-    stream_.reset(new MKLDNNStream());
-    engine_ = CPUEngine::Instance().getEngine();
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    passType_ = passType;
-
-    {
-      REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-      CHECK(!inputLayers_.empty());
-      copySeqInfoToOutputs();
-      size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
-      if (inputElemenCnt_ != elemenCnt) {
-        VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-        // reset when input total sizes changed, not only the batchsize
-        inputElemenCnt_ = elemenCnt;
-        pipelineFwd_.clear();
-        reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-        // all cpu device output grad or value share output's
-        shareCPUDevice();
-        resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-        // MKLDNNLayer output value should be MKLDNNMatrix
-        // so external output value is necessary.
-        // then external input value is not necessary,
-        // since input may be mkldnn internal buffer.
-        CHECK(extOutVal_) << "external output value is necessary";
-        output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
-        CHECK(inVal_ && outVal_) << "internal memories are necessary";
-        if (cvtInVal_) {
-          pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
-        }
-        if (cvtOutVal_) {
-          pipelineFwd_.push_back(*cvtOutVal_);
-        }
-        convertWeightsFromPaddle();
-        printValueFormat();
-        needResetBwd_ = true;
-      }
-
-      if (inputLayers_[0]->getType() == "data") {
-        // Update input value data when input layer is "data" type,
-        // since the input value data address might be changed.
-        CHECK(extInVal_);
-        extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
-      }
-
-      if (!outputOnlyMKLDNN_) {
-        clearGrads();
-      }
-      stream_->submit(pipelineFwd_);
-    }
-    {
-      REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
-      forwardActivation();
-    }
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    if (needResetBwd_) {
-      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
-      pipelineBwd_.clear();
-      pipelineMergeGrad_.clear();
-      mergeGrad_ = nullptr;
-      resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
-      // external output grad is not necessary
-      // since output may be mkldnn internal buffer or merge them directly.
-      CHECK(outGrad_) << "internal output grad is necessary";
-      if (cvtOutGrad_) {
-        pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
-      }
-      if (cvtInGrad_) {
-        pipelineBwd_.push_back(*cvtInGrad_);
-      }
-      printGradFormat();
-      needResetBwd_ = false;
-    }
-
-    // merge grad must before backward activation
-    if (mergeGrad_) {
-      REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
-      stream_->submit(pipelineMergeGrad_);
-    }
-    {
-      REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
-      backwardActivation();
-    }
-    {
-      REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
-      stream_->submit(pipelineBwd_);
-    }
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      updateWeights(callback);
-    }
-  }
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
   /**
    * reshape the input image sizes
@@ -287,30 +177,12 @@ protected:
   /**
    * reshape the input image sizes and input batchsize
    */
-  virtual void reshapeInput(int& batchsize, int& height, int& width) {
-    const Argument& input = inputLayers_[0]->getOutput();
-    batchsize = input.getBatchSize();
-    int h = input.getFrameHeight();
-    int w = input.getFrameWidth();
-    if (h != 0) {
-      height = h;
-    }
-    if (w != 0) {
-      width = w;
-    }
-  }
+  void reshapeInput(int& batchsize, int& height, int& width);
 
   /**
    * reshape output image sizes
    */
-  virtual void reshapeOutput(size_t height, size_t width) {
-    output_.setFrameHeight(height);
-    output_.setFrameWidth(width);
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      outputOtherDevice_[i].setFrameHeight(height);
-      outputOtherDevice_[i].setFrameWidth(width);
-    }
-  }
+  void reshapeOutput(size_t height, size_t width);
 
   /**
    * reset MKLDNNMatrix from Matrix and internal primitive desc.
@@ -318,13 +190,7 @@ protected:
    */
   void resetWithMatrix(MKLDNNMatrixPtr& dnn,
                        const MatrixPtr& mat,
-                       mkldnn::memory::primitive_desc pd) {
-    dnn = nullptr;
-    if (mat == nullptr) {
-      return;
-    }
-    dnn = MKLDNNMatrix::create(pd, mat);
-  }
+                       mkldnn::memory::primitive_desc pd);
 
   /**
    * reset input value from input MKLDNNMatrix and internal primitive desc.
@@ -332,99 +198,20 @@ protected:
    */
   void resetInValue(
       MKLDNNMatrixPtr& in,
-      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr) {
-    cvtInVal_ = nullptr;
-    extInVal_ = nullptr;
-    in = nullptr;
-    CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
-    auto extPD = MKLDNNMatrix::createPrimitiveDesc(
-        {bs_, ic_, ih_, iw_}, mkldnn::memory::format::nchw, engine_);
-    const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-    CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
-    if (in == nullptr || in->getFormat() == mkldnn::memory::format::nc) {
-      in = MKLDNNMatrix::create(extPD, inMat);
-    }
-    extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
-    if (in->getFormat() == mkldnn::memory::format::nc) {
-      CHECK(ih_ == 1 && iw_ == 1);
-    }
-    if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
-      return;
-    }
-    // need create reorder
-    in = MKLDNNMatrix::create(*intPD);
-    extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
-    cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
-    CHECK(cvtInVal_) << "should not be emptry";
-  }
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
 
   /**
    * reset output value from internal primitive desc.
    * reset both internal and external buffer and create reorder if necessary.
    */
   void resetOutValue(MKLDNNMatrixPtr& out,
-                     mkldnn::memory::primitive_desc intPD) {
-    cvtOutVal_ = nullptr;
-    out = MKLDNNMatrix::create(intPD, output_.value);
-    extOutVal_ = out;
-    if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
-      return;
-    }
-    // need create reorder
-    CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
-    extOutVal_ = MKLDNNMatrix::create(mkldnn::memory::dims{bs_, oc_, oh_, ow_},
-                                      mkldnn::memory::format::nchw,
-                                      engine_,
-                                      output_.value);
-    out = MKLDNNMatrix::create(intPD);
-    cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
-    CHECK(cvtOutVal_) << "should not be empty";
-  }
+                     mkldnn::memory::primitive_desc intPD);
 
   /**
    * reset input grad from internal primitive desc.
    * reset both internal and external buffer and create reorder if necessary.
    */
-  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD) {
-    cvtInGrad_ = nullptr;
-    extInGrad_ = nullptr;
-    in = nullptr;
-    LayerPtr& input = inputLayers_[0];
-    if (input->getOutputGrad() == nullptr) {
-      // no need input grad
-      return;
-    }
-    CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
-        << "only support input is MKLDNN layer or only have one output layer";
-    // when input is a mkldnn branch node,
-    // this layer will save input grad to a internal buffer,
-    // and the mkldnn input layer will merge them to actual prev->output_.grad
-    const MatrixPtr& inMat =
-        input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
-    in = MKLDNNMatrix::create(intPD, inMat);
-    Argument& arg = input->getOutput(this->getName());
-    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-    CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
-        << "should have internal input value and primitive desc must equal";
-    if (inputIsOnlyMKLDNN()) {
-      return;
-    }
-
-    extInGrad_ = in;
-    if (isPaddleFormat(extInGrad_->getFormat())) {
-      return;
-    }
-    // need create reorder
-    CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
-        << "should have external input value and the format must be nchw(nc)";
-    extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
-    CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
-        << "should have internal input value and primitive desc must equal";
-    in = MKLDNNMatrix::create(intPD);
-    cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
-    CHECK(cvtInGrad_);
-  }
+  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
 
   /**
    * reset output grad from internal primitive desc.
@@ -434,81 +221,59 @@ protected:
    *       it could not be mixed with cpu device,
    *       since it can not get memory desc from cpu device.
    */
-  void resetOutGrad(MKLDNNMatrixPtr& out,
-                    mkldnn::memory::primitive_desc intPD) {
-    cvtOutGrad_ = nullptr;
-    extOutGrad_ = nullptr;
-    out = nullptr;
-    MatrixPtr& outMat = output_.grad;
-    out = MKLDNNMatrix::create(intPD, outMat);
-    resetMergeGrad(out);
-    if (outputIsOnlyMKLDNN()) {
-      return;
-    }
-    CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
-    extOutGrad_ = out;
-    if (isPaddleFormat(extOutGrad_->getFormat())) {
-      return;
-    }
-    // need create reorder
-    CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
-        << "should have external output value and the format must be nchw(nc)";
-    extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
-    CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
-        << "should have internal output value and primitive desc must equal";
-    out = MKLDNNMatrix::create(intPD);
-    cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
-    CHECK(cvtOutGrad_);
-  }
+  void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
 
   /**
    * reset the merge grad primitive if necessary.
    * note: do not support the grads are mixed with cpu device,
    *       since it can not get memory desc from cpu device.
    */
-  virtual void resetMergeGrad(MKLDNNMatrixPtr& out) {
-    mergeGrad_ = nullptr;
-    pipelineMergeGrad_.clear();
-    if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
-      // do not merge when output is not all MKLDNN or only one output
-      return;
-    }
-    CHECK(out) << "should have reset internal ouput grad";
-    std::vector<double> scales(outputMap_.size(), 1.0);
-    std::vector<mkldnn::memory::primitive_desc> srcPDs;
-    std::vector<mkldnn::primitive::at> srcs;
-    for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
-      MKLDNNMatrixPtr src =
-          std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
-      VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
-      CHECK(src) << "should be MKLDNNMatrix";
-      auto srcDims = src->getDims();
-      auto dstDims = out->getDims();
-      CHECK_EQ(srcDims.size(), dstDims.size());
-      for (size_t i = 0; i < srcDims.size(); ++i) {
-        CHECK_EQ(srcDims[i], dstDims[i]);
-      }
-      srcPDs.push_back(src->getPrimitiveDesc());
-      srcs.push_back(*src);
-    }
+  void resetMergeGrad(MKLDNNMatrixPtr& out);
+
+protected:
+  /**
+   * Set deviceId of this layer.
+   */
+  void setDevice(int id) { deviceId_ = id; }
 
-    // TODO(TJ): remove me when mkldnn sum support different formats
-    for (size_t i = 1; i < srcPDs.size(); ++i) {
-      CHECK(srcPDs[0] == srcPDs[i]);
+  /**
+   * Check whether the format is nchw or nc,
+   * which are the formats supported by Paddle's default memory layout.
+   */
+  bool isPaddleFormat(mkldnn::memory::format fmt) {
+    if (fmt == mkldnn::memory::format::nchw ||
+        fmt == mkldnn::memory::format::nc) {
+      return true;
+    } else {
+      return false;
     }
-    tmpOutGrad_ = out;
-    tmpCvt_ = nullptr;
-    if (out->getPrimitiveDesc() != srcPDs[0]) {
-      tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
-      tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
-      CHECK(tmpCvt_);
-      pipelineMergeGrad_.push_back(*tmpCvt_);
+  }
+
+  /**
+   * Whether the input only has the MKLDNN device.
+   * Otherwise, the previous layer is only supported on the CPU device.
+   */
+  bool inputIsOnlyMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+      return false;
     }
+  }
 
-    auto sumPD = mkldnn::sum::primitive_desc(
-        tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
-    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
-    pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
+  /**
+   * Whether the output only has the MKLDNN device.
+   * Otherwise, all the other devices must be the CPU device.
+   */
+  bool outputIsOnlyMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
+    return outputOnlyMKLDNN_;
   }
 
   /**
@@ -568,54 +333,7 @@ protected:
     }
   }
 
-protected:
-  /**
-   * If input only has MKLDNN device.
-   * Otherwise, only support the previous layer using CPU device.
-   */
-  bool inputIsOnlyMKLDNN(int index = 0) {
-    int prevDevice = getPrev(index)->getDeviceId();
-    if (prevDevice == MKLDNN_DEVICE) {
-      return true;
-    } else {
-      // do not support GPU yet
-      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
-      return false;
-    }
-  }
-
-  /**
-   * If output only has MKLDNN device.
-   * Otherwise, other devices should only using CPU device.
-   */
-  bool outputIsOnlyMKLDNN() {
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
-          << "Only support other device is CPU yet";
-    }
-    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
-    return outputOnlyMKLDNN_;
-  }
-
-  /**
-   * Set deviceId of this layer.
-   */
-  void setDevice(int id) { deviceId_ = id; }
-
 private:
-  /**
-   * check the format is nchw or nc,
-   * which is supported by Paddle default memory layout
-   */
-  bool isPaddleFormat(mkldnn::memory::format fmt) {
-    if (fmt == mkldnn::memory::format::nchw ||
-        fmt == mkldnn::memory::format::nc) {
-      return true;
-    } else {
-      return false;
-    }
-  }
-
   /**
    * clear all grad
    */