diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index dcf0acb5a2cc7698625a2e254d4c12a32bc9631d..dbadc352a4ccd7483bf67e1025c212f514e32a24 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -21,6 +21,10 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
+#ifdef PADDLE_USE_MKLDNN
+#include "paddle/gserver/layers/MKLDNNLayer.h"
+#endif
+
 #ifndef PADDLE_MOBILE_INFERENCE
 #include "MultiNetwork.h"
 #include "RecurrentGradientMachine.h"
@@ -300,6 +304,17 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
   }
 }
 
+void NeuralNetwork::finish() {
+#ifdef PADDLE_USE_MKLDNN
+  FOR_EACH_R(layer, layers_) {
+    MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
+    if (dnnLayer) {
+      dnnLayer->convertWeightsToPaddle();
+    }
+  }
+#endif
+}
+
 Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
   return getLayer(layerName)->getOutput();
 }
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 56a1ec78460731554c9b47cf3f517f7654dc314f..16971883b42786e2cb48faafd2a7e95d45075ac8 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -134,6 +134,9 @@ public:
 
   const std::string& getName() const { return subModelName_; }
 
+  /// some finish work, like convert the weight format of MKLDNNLayers
+  void finish() override;
+
 protected:
   /**
    * The constructor of NeuralNetwork.
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 8b67a1ef4ffdd42559f8078873ed135751d56674..26810a648343d6203f7937740641325ae8ea6879 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -313,6 +313,7 @@ void MKLDNNConvLayer::resetOutValue(
       cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
       CHECK(cvtOutVal_) << "should not be empty";
     } else {
+      cpuOut->setData(output_.value->getData());
       cpuOutVal_ = out;
     }
     // when output is cpu device, change the mkldnn output value and make them
@@ -456,17 +457,18 @@ void MKLDNNConvLayer::resetOutGrad(
     MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
   } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    // always share the same grad data of CPU output
+    // then the activation can get the right grad from output_.grad
+    output_.grad->setData(cpuOut->getData());
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuOutVal_);
     cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
     // create reorder if primitive desc does not match
     if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_);
     } else {
-      // share the same data of CPU output
-      output_.grad->setData(cpuOut->getData());
       out = cpuOutGrad_;
     }
   }
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 5f9923da769781287e39a3aaaf92248dfe09f225..4e2753eba2350d2c3df81b57fe98270a3c38cb24 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -46,6 +46,9 @@ protected:
   // backward also need reset after reset forward handle
   bool needResetBwd_;
 
+  // is output only mkldnn
+  bool outputOnlyMKLDNN_;
+
   // mkldnn engine, stream and primivtives
   mkldnn::engine engine_;
   std::shared_ptr<MKLDNNStream> stream_;
@@ -141,6 +144,9 @@ public:
         updateInputData();
       }
 
+      if (!outputOnlyMKLDNN_) {
+        clearGrads();
+      }
       stream_->submit(pipelineFwd_);
     }
 
@@ -389,7 +395,8 @@ protected:
       CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
           << "Only support other device is CPU yet";
     }
-    return outputOtherDevice_.size() == 0;
+    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
+    return outputOnlyMKLDNN_;
   }
 
   /**
@@ -398,6 +405,16 @@ protected:
   void setDevice(int id) { deviceId_ = id; }
 
 private:
+  /**
+   * clear all grad
+   */
+  void clearGrads() {
+    output_.grad->zeroMem();
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].grad->zeroMem();
+    }
+  }
+
   /**
    * Set deviceId of the params used in this layer.
    */
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index 5606aae80ce8e9a1e571d3c057c471b26a59d032..0e53e2d1b7e6691909955eeacd345981a9960ec6 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -146,6 +146,7 @@ void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
       cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
       CHECK(cvtOutVal_) << "should not be emptry";
     } else {
+      cpuOut->setData(output_.value->getData());
       cpuOutVal_ = out;
     }
     output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
@@ -213,15 +214,16 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
     MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
   } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    // always share the same grad data of CPU output
+    // then the activation can get the right grad from output_.grad
+    output_.grad->setData(cpuOut->getData());
     cpuOutGrad_ = MKLDNNMatrix::create(
         cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
     if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_) << "should not be emptry";
     } else {
-      // share the same data of CPU output
-      output_.grad->setData(cpuOut->getData());
       out = cpuOutGrad_;
     }
   }
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index fcee19415c13e9731bd47eb53bbff9b52cf6450b..329536afaf6d69676e8c39fdf8b6b8cb87ade5fa 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -26,7 +26,10 @@ if(WITH_MKLDNN)
         test_MKLDNN.cpp
         MKLDNNTester.cpp
         LayerGradUtil.cpp)
-    add_test(NAME test_MKLDNN COMMAND test_MKLDNN)
+    add_test(NAME test_MKLDNN
+        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
+            ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
+            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
 ################ test_CRFLayerGrad ####################
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index eaebdd671cfa1b37e5efe149588ca23fdc402a8e..3bf6a9e176cc1235aa5ddefcedd4253e6afc1342 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "MKLDNNTester.h"
 #include "paddle/gserver/layers/MKLDNNBase.h"
 #include "paddle/gserver/layers/MKLDNNLayer.h"
+#include "paddle/trainer/Trainer.h"
 
 namespace paddle {
 
@@ -315,6 +316,7 @@ void MKLDNNTester::runOnce() {
     auto& value = para->getBuf(PARAMETER_VALUE);
     real lr = 1e-3;
     value->add(*grad, lr);
+    grad->zeroMem();
   };
   randomTopDiffs();
   dnnLayer_->backward(updateCallback);
@@ -411,4 +413,143 @@ void MKLDNNTester::run(const TestConfig& dnn,
   }
 }
 
+void MKLDNNTester::initArgument(DataIn& data,
+                                const std::string& configPath,
+                                const size_t iter) {
+  TrainerConfigHelper config(configPath);
+  size_t batchSize = config.getOptConfig().batch_size();
+  data.inArgs.resize(iter);
+  data.outGrads.resize(iter);
+  data.paraValues.clear();
+  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      Argument arg;
+      arg.value = Matrix::create(batchSize, layerSize, false, false);
+      arg.grad = Matrix::create(batchSize, layerSize, false, false);
+      arg.value->randomizeUniform();
+      arg.value->add(-0.5);
+      arg.value->sigmoid(*arg.value);
+      arg.grad->zeroMem();
+      arg.ids = VectorT<int>::create(batchSize, false);
+      arg.ids->rand(layerSize);
+      generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
+      data.inArgs[i].push_back(arg);
+    }
+  }
+
+  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false);
+      grad->randomizeUniform();
+      data.outGrads[i].push_back(grad);
+    }
+  }
+
+  for (const auto& para_config : config.getModelConfig().parameters()) {
+    VectorPtr value = Vector::create(para_config.size(), false);
+    value->randnorm(0, 2);
+    data.paraValues.push_back(value);
+  }
+}
+
+void MKLDNNTester::getOutResult(const std::string& configPath,
+                                DataIn& in,
+                                DataOut& out,
+                                bool use_mkldnn,
+                                size_t iter) {
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = use_mkldnn;
+  *ThreadLocalRand::getSeed() = 1;
+  srand(1);
+
+  Trainer trainer;
+  auto config = std::make_shared<TrainerConfigHelper>(configPath);
+  trainer.init(config, false);
+  auto gradientMachine = trainer.getGradientMachine();
+  std::vector<ParameterPtr> parameters = gradientMachine->getParameters();
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
+  }
+  UpdateCallback simpleUpdate = [](Parameter* para) {
+    auto& grad = para->getBuf(PARAMETER_GRADIENT);
+    auto& value = para->getBuf(PARAMETER_VALUE);
+    real lr = 1e-2;
+    value->add(*grad, lr);
+    grad->zeroMem();
+  };
+
+  vector<Argument> outArgs;
+  gradientMachine->start();
+  out.outValues.clear();
+  out.paraValues.clear();
+  for (size_t i = 0; i < iter; ++i) {
+    VLOG(MKLDNN_TESTS) << "runing iteration " << i;
+    gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
+    // save forward result
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      MatrixPtr value = Matrix::create(outArgs[k].value->getHeight(),
+                                       outArgs[k].value->getWidth(),
+                                       false,
+                                       false);
+      value->copyFrom(*outArgs[k].value);
+      out.outValues.push_back(value);
+    }
+
+    // random backward input
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      outArgs[k].grad->copyFrom(*in.outGrads[i][k]);
+    }
+    gradientMachine->backward(simpleUpdate);
+  }
+  gradientMachine->finish();
+
+  // save param value
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    VectorPtr val = Vector::create(
+        parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false);
+    val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
+    out.paraValues.push_back(val);
+  }
+}
+
+void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
+  CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
+  CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  for (size_t i = 0; i < ref.outValues.size(); i++) {
+    EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
+  }
+  for (size_t i = 0; i < ref.paraValues.size(); i++) {
+    EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
+  }
+}
+
+void MKLDNNTester::runBranchesTest(const std::string& configPath,
+                                   size_t iter,
+                                   float eps) {
+  DataIn in;
+  initArgument(in, configPath, iter);
+
+  DataOut outCpu, outDnn;
+  getOutResult(configPath, in, outCpu, false, iter);
+  getOutResult(configPath, in, outDnn, true, iter);
+
+  compareResult(outCpu, outDnn, eps);
+}
+
 }  //  namespace paddle
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index 171d176ee757f1164c38d86273bdf9e5aefeda06..51abfcb67e2ec35fe1b0179e742a7d18f08f8a2c 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -33,6 +33,17 @@ class MKLDNNTester {
     NUM = 2,  // Number of total
   };
 
+  struct DataIn {
+    std::vector<std::vector<Argument>> inArgs;
+    std::vector<std::vector<MatrixPtr>> outGrads;
+    std::vector<VectorPtr> paraValues;
+  };
+
+  struct DataOut {
+    std::vector<MatrixPtr> outValues;
+    std::vector<VectorPtr> paraValues;
+  };
+
 protected:
   std::vector<TestConfig> configs_;
   vector<string> layerNames_;
@@ -74,7 +85,17 @@ public:
            float epsilon = 1e-4,
            bool log = false,
            int level = MKLDNN_ALL);
-  void setLogLevel(int lvl) { lvl_ = lvl; }
+  static void runBranchesTest(const std::string& configPath,
+                              size_t iter = 3,
+                              float eps = 1e-4);
+  static void initArgument(DataIn& data,
+                           const std::string& configPath,
+                           size_t iter = 3);
+  static void getOutResult(const std::string& configPath,
+                           DataIn& in,
+                           DataOut& out,
+                           bool use_mkldnn,
+                           size_t iter = 3);
 
 private:
   void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
@@ -101,8 +122,9 @@ private:
   void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
   void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
 
-  double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
-  double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+  static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
+  static double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+  static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4);
 
   /**
    * Get delta percent
@@ -111,11 +133,11 @@ private:
    * else return sum(abs(a-b)) / sum(abs(b))
    * The return value should be smaller than eps when passing.
    */
-  double getDelta(const real* d1,
-                  const real* d2,
-                  size_t len,
-                  const float failRate = 1e-3,
-                  const float thres = 0.1);
+  static double getDelta(const real* d1,
+                         const real* d2,
+                         size_t len,
+                         const float failRate = 1e-3,
+                         const float thres = 0.1);
 };
 
 }  //  namespace paddle
diff --git a/paddle/gserver/tests/mkldnn_branches_conv.conf b/paddle/gserver/tests/mkldnn_branches_conv.conf
new file mode 100644
index 0000000000000000000000000000000000000000..2628509db43e6a5f69a4f5ea956bffdc2837e32a
--- /dev/null
+++ b/paddle/gserver/tests/mkldnn_branches_conv.conf
@@ -0,0 +1,56 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)
+
+def two_conv(input, group_name):
+  out1 = img_conv_layer(input=input,
+            name=group_name+'_conv1',
+            filter_size=1,
+            num_filters=channels,
+            padding=0,
+            shared_biases=True,
+            act=ReluActivation())
+
+  out2 = img_conv_layer(input=input,
+            name=group_name+'_conv2',
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+  return out1, out2
+
+data = data_layer(name ="input", size=channels*16*16)
+
+conv = img_conv_layer(input=data,
+            num_channels=channels,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+a1, a2 = two_conv(input=conv, group_name='a')
+
+concat = concat_layer(input=[a1, a2])
+
+b1, b2 = two_conv(input=conv, group_name='b')
+
+addto = addto_layer(input=[b1, b2])
+
+outputs([concat, addto])
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index a70b2f17f4f1130322f3c50d244f70fdcf34468b..3571fbb9e335fc6652bdbfc3f9e35beabda5044f 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+#include <paddle/utils/PythonUtil.h>
 #include <string>
 #include <vector>
 #include "MKLDNNTester.h"
@@ -40,12 +41,13 @@ DECLARE_bool(use_mkldnn);
 struct testFcDesc {
   int bs;
   int ic;
-  int oc;
   int ih, iw;  // oh == ow == 1
+  int oc;
 };
 
 static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_fc");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_size(pm.oc);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
@@ -86,6 +88,7 @@ struct testConvDesc {
 
 static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_conv");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_num_filters(pm.oc);
   cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
   cfg.layerConfig.set_shared_biases(true);
@@ -158,6 +161,7 @@ struct testPoolDesc {
 
 static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_pool");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
@@ -244,13 +248,26 @@ TEST(MKLDNNActivation, Activations) {
   }
 }
 
-// TODO(TJ): add branch test
+DECLARE_string(config_args);
+TEST(MKLDNNLayer, branches) {
+  std::vector<std::string> cases = {"conv"};
+  for (auto name : cases) {
+    std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf";
+    for (auto channels : {2, 32}) {
+      std::ostringstream oss;
+      oss << "channels=" << channels;
+      FLAGS_config_args = oss.str();
+      MKLDNNTester::runBranchesTest(config);
+    }
+  }
+}
 
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   FLAGS_use_gpu = false;
   FLAGS_use_mkldnn = true;
   initMain(argc, argv);
+  initPython(argc, argv);
   FLAGS_thread_local_rand_use_global_seed = true;
   srand(1);
   return RUN_ALL_TESTS();