diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 2c382a6d4f3761e3f994936368de2775510c7111..31867477c3278f6c69bd851996199e0e9f123e75 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -67,8 +67,14 @@ protected:
 
   // merge grad primitive
   std::shared_ptr<mkldnn::sum> mergeGrad_;
+  std::vector<mkldnn::primitive> pipelineMergeGrad_;
   // tmp input argument to save input grad, only used to merge grad
   Argument tmpInArg_;
+  // since mkldnn sum does not support different formats
+  // (refer to https://github.com/01org/mkl-dnn/issues/134),
+  // we need to create the reorder manually and save a tmp MKLDNNMatrix
+  MKLDNNMatrixPtr tmpOutGrad_;
+  std::shared_ptr<mkldnn::reorder> tmpCvt_;
 
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
@@ -148,9 +154,17 @@ public:
     if (needResetBwd_) {
       VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
       pipelineBwd_.clear();
+      pipelineMergeGrad_.clear();
+      mergeGrad_ = nullptr;
       resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
       needResetBwd_ = false;
     }
+
+    // merging the grads must happen before the backward activation
+    if (mergeGrad_) {
+      REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
+      stream_->submit(pipelineMergeGrad_);
+    }
     {
       REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
       backwardActivation();
@@ -262,6 +276,7 @@ protected:
                       mkldnn::memory::primitive_desc pd) {
     CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
     mergeGrad_ = nullptr;
+    pipelineMergeGrad_.clear();
     out = MKLDNNMatrix::create(output_.grad, pd);
     if (outputMap_.size() <= 1) {
       return;
@@ -272,6 +287,7 @@ protected:
     for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
       MKLDNNMatrixPtr src =
           std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+      VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
       CHECK(src) << "should be MKLDNNMatrix";
       auto srcDims = src->getDims();
       auto dstDims = out->getDims();
@@ -283,9 +299,26 @@ protected:
       srcs.push_back(*src);
       scales.push_back(1.0);
     }
-    auto sumPD = mkldnn::sum::primitive_desc(pd.desc(), scales, srcPDs);
-    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *out));
-    pipelineBwd_.insert(pipelineBwd_.begin(), *mergeGrad_);
+
+    // TODO(TJ): remove me when mkldnn sum supports different formats
+    for (size_t i = 1; i < srcPDs.size(); ++i) {
+      CHECK(srcPDs[0] == srcPDs[i]);
+    }
+    tmpOutGrad_ = nullptr;
+    tmpCvt_ = nullptr;
+    if (out->getPrimitiveDesc() != srcPDs[0]) {
+      tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
+      tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+      CHECK(tmpCvt_);
+      pipelineMergeGrad_.push_back(*tmpCvt_);
+    } else {
+      tmpOutGrad_ = out;
+    }
+
+    auto sumPD = mkldnn::sum::primitive_desc(
+        tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
+    pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
   }
 
   /**
@@ -299,7 +332,7 @@ protected:
     const MatrixPtr& grad = input->getOutputMapSize() > 1 ?
                                nullptr : input->getOutput().grad;
     in = MKLDNNMatrix::create(grad, pd);
-    auto arg = input->getOutput(this->getName());
+    Argument& arg = input->getOutput(this->getName());
     arg.grad = std::dynamic_pointer_cast<Matrix>(in);
   }
 
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 066837ca959e46dbe3b39c661aa1bab11cbf2734..db87ea205316426703d8d9451bd0896064a3bd8a 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -48,6 +48,13 @@ if(WITH_MKLDNN)
             --config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False
             --use_gpu=False
         WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+    add_test(NAME test_CompareMKLDNNandCPU_Branches
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+            ${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU
+            --config_file_a=trainer/tests/sample_trainer_config_branch_net.conf --use_mkldnn_a=True
+            --config_file_b=trainer/tests/sample_trainer_config_branch_net.conf --use_mkldnn_b=False
+            --use_gpu=False
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
 
 ############### test_CompareTwoOpts ###################
diff --git a/paddle/trainer/tests/sample_trainer_config_branch_net.conf b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
new file mode 100644
index 0000000000000000000000000000000000000000..c2594bc13c250a877a7b8a77e11405671c4d8907
--- /dev/null
+++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
@@ -0,0 +1,103 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+################################### Data Configuration ###################################
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+################################### Algorithm Configuration ###################################
+settings(batch_size = 256,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+################################### Network Configuration ###################################
+data = data_layer(name="input", size=784)
+
+tmp = img_conv_layer(input=data,
+                     num_channels=1,
+                     filter_size=3,
+                     num_filters=32,
+                     padding=1,
+                     shared_biases=True,
+                     act=ReluActivation())
+
+a1 = img_conv_layer(input=tmp,
+                    filter_size=1,
+                    num_filters=32,
+                    padding=0,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+a2 = img_conv_layer(input=tmp,
+                    filter_size=3,
+                    num_filters=32,
+                    padding=1,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+tmp = concat_layer(input=[a1, a2])
+
+tmp = img_pool_layer(input=tmp,
+                     num_channels=64,
+                     pool_size=3,
+                     stride=2,
+                     padding=1,
+                     pool_type=AvgPooling())
+
+b1 = img_conv_layer(input=tmp,
+                    filter_size=3,
+                    num_filters=64,
+                    padding=1,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+b1 = img_pool_layer(input=b1,
+                    pool_size=3,
+                    stride=1,
+                    padding=1,
+                    pool_type=MaxPooling())
+
+b2 = img_conv_layer(input=tmp,
+                    filter_size=5,
+                    num_filters=64,
+                    padding=2,
+                    shared_biases=True,
+                    act=ReluActivation())
+
+b2 = img_pool_layer(input=b2,
+                    pool_size=5,
+                    stride=1,
+                    padding=2,
+                    pool_type=MaxPooling())
+
+tmp = addto_layer(input=[b1, b2],
+                  act=ReluActivation(),
+                  bias_attr=False)
+
+tmp = img_pool_layer(input=tmp,
+                     pool_size=3,
+                     stride=2,
+                     padding=1,
+                     pool_type=MaxPooling())
+
+tmp = fc_layer(input=tmp, size=64,
+               bias_attr=False,
+               act=TanhActivation())
+
+output = fc_layer(input=tmp, size=10,
+                  bias_attr=True,
+                  act=SoftmaxActivation())
+
+lbl = data_layer(name="label", size=10)
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)
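
Note on the resetMergeGrad change in MKLDNNLayer.h: mkldnn's sum primitive
(v0.x API) rejects operands whose memory formats differ (see
https://github.com/01org/mkl-dnn/issues/134), so the patch sums the output
gradients into a temporary MKLDNNMatrix in the sources' format and appends a
reorder from that temporary into the real output gradient. The sketch below
shows the same sum-then-reorder pattern with raw mkldnn v0.x primitives; it
is a hypothetical standalone example, not code from this patch, and the
names (srcMD, tmp, pipeline, ...) are illustrative only.

// merge_example.cpp -- sum two nchw gradients into an nChw8c destination.
#include <vector>
#include "mkldnn.hpp"

int main() {
  using namespace mkldnn;
  engine eng(engine::cpu, 0);

  // The two gradient sources share one format; the destination uses another.
  memory::desc srcMD({2, 8, 4, 4}, memory::data_type::f32, memory::format::nchw);
  memory::desc dstMD({2, 8, 4, 4}, memory::data_type::f32, memory::format::nChw8c);
  memory src0({srcMD, eng}), src1({srcMD, eng}), dst({dstMD, eng});

  std::vector<memory::primitive_desc> srcPDs = {src0.get_primitive_desc(),
                                                src1.get_primitive_desc()};
  std::vector<float> scales = {1.0f, 1.0f};
  std::vector<primitive::at> inputs = {src0, src1};
  std::vector<primitive> pipeline;

  // Since sum cannot write across formats, accumulate into a temporary
  // buffer that matches the sources' format...
  memory tmp(srcPDs[0]);
  pipeline.push_back(sum(sum::primitive_desc(srcPDs[0].desc(), scales, srcPDs),
                         inputs, tmp));
  // ...then reorder the merged result into the destination format,
  // mirroring tmpOutGrad_/tmpCvt_ in the patch.
  pipeline.push_back(reorder(tmp, dst));

  stream(stream::kind::eager).submit(pipeline).wait();
  return 0;
}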
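
Note on the resetInGrad change: `auto arg` deduces a value type, so the
Argument returned by input->getOutput(this->getName()) was copied and the
grad assignment only mutated the copy; binding `Argument& arg` makes the
assignment reach the layer's stored output. A minimal illustration of the
pitfall, with hypothetical stand-in types rather than Paddle's:

#include <cassert>
#include <map>
#include <memory>
#include <string>

struct Argument {
  std::shared_ptr<int> grad;  // stands in for Paddle's MatrixPtr grad
};

int main() {
  std::map<std::string, Argument> outputs;
  outputs["next_layer"] = Argument{};

  auto copy = outputs["next_layer"];  // `auto` copies the Argument
  copy.grad = std::make_shared<int>(1);
  assert(outputs["next_layer"].grad == nullptr);  // original untouched

  Argument& ref = outputs["next_layer"];  // binds to the stored Argument
  ref.grad = std::make_shared<int>(2);
  assert(outputs["next_layer"].grad != nullptr);  // assignment sticks
  return 0;
}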
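
The new sample_trainer_config_branch_net.conf exists to exercise these merge
paths: the first conv output feeds both a1 and a2 (joined by concat_layer),
and the pooled concat output feeds both b1 and b2 (joined by addto_layer),
so those layers receive more than one output gradient to merge during
backward. The added test_CompareMKLDNNandCPU_Branches case runs this config
twice, with use_mkldnn=True and use_mkldnn=False, and compares the results;
after building with WITH_MKLDNN it can be run with, for example,
"ctest -R test_CompareMKLDNNandCPU_Branches".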