fix bug: merge grad must before backward act.

and add branch net comparing with cpu result

fix bug: merge grad must before backward act.
and add branch net comparing with cpu result
e1954857 · tensor-tang · 698071cc · e1954857 · e1954857 · e1954857
3 changed file
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -67,8 +67,14 @@ protected:

  // merge grad primitive
  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  std::vector<mkldnn::primitive> pipelineMergeGrad_;
  // tmp input argument to save input grad, only used to merge grad
  Argument tmpInArg_;
+  // since mkldnn sum do not support different formats:
+  // can refer to https://github.com/01org/mkl-dnn/issues/134
+  // so need create reorder manually and save tmp MKLDNNMatrix
+  MKLDNNMatrixPtr tmpOutGrad_;
+  std::shared_ptr<mkldnn::primitive> tmpCvt_;

 public:
  explicit MKLDNNLayer(const LayerConfig& config)
@@ -148,9 +154,17 @@ public:
    if (needResetBwd_) {
      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
      pipelineBwd_.clear();
+      pipelineMergeGrad_.clear();
+      mergeGrad_ = nullptr;
      resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
      needResetBwd_ = false;
    }
+
+    // merge grad must before backward activation
+    if (mergeGrad_) {
+      REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
+      stream_->submit(pipelineMergeGrad_);
+    }
    {
      REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
      backwardActivation();
@@ -262,6 +276,7 @@ protected:
                            mkldnn::memory::primitive_desc pd) {
    CHECK(outputIsOnlyMKLDNN()) << "do not support mixed with other device yet";
    mergeGrad_ = nullptr;
+    pipelineMergeGrad_.clear();
    out = MKLDNNMatrix::create(output_.grad, pd);
    if (outputMap_.size() <= 1) {
      return;
@@ -272,6 +287,7 @@ protected:
    for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
      MKLDNNMatrixPtr src =
          std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+      VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first;
      CHECK(src) << "should be MKLDNNMatrix";
      auto srcDims = src->getDims();
      auto dstDims = out->getDims();
@@ -283,9 +299,26 @@ protected:
      srcs.push_back(*src);
      scales.push_back(1.0);
    }
-    auto sumPD = mkldnn::sum::primitive_desc(pd.desc(), scales, srcPDs);
-    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *out));
-    pipelineBwd_.insert(pipelineBwd_.begin(), *mergeGrad_);
+
+    // TODO(TJ): remove me when mkldnn sum support different formats
+    for (size_t i = 1; i < srcPDs.size(); ++i) {
+      CHECK(srcPDs[0] == srcPDs[i]);
+    }
+    tmpOutGrad_ = nullptr;
+    tmpCvt_ = nullptr;
+    if (out->getPrimitiveDesc() != srcPDs[0]) {
+      tmpOutGrad_ = MKLDNNMatrix::create(nullptr, srcPDs[0]);
+      tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+      CHECK(tmpCvt_);
+      pipelineMergeGrad_.push_back(*tmpCvt_);
+    } else {
+      tmpOutGrad_ = out;
+    }
+
+    auto sumPD = mkldnn::sum::primitive_desc(
+        tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+    mergeGrad_.reset(new mkldnn::sum(sumPD, srcs, *tmpOutGrad_));
+    pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
  }

  /**
@@ -299,7 +332,7 @@ protected:
    const MatrixPtr& grad =
        input->getOutputMapSize() > 1 ? nullptr : input->getOutput().grad;
    in = MKLDNNMatrix::create(grad, pd);
-    auto arg = input->getOutput(this->getName());
+    Argument& arg = input->getOutput(this->getName());
    arg.grad = std::dynamic_pointer_cast<Matrix>(in);
  }


--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -48,6 +48,13 @@ if(WITH_MKLDNN)
              --config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False
              --use_gpu=False
      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+  add_test(NAME test_CompareMKLDNNandCPU_Banches
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+          ${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU
+              --config_file_a=trainer/tests/sample_trainer_config_branch_net.conf --use_mkldnn_a=True
+              --config_file_b=trainer/tests/sample_trainer_config_branch_net.conf --use_mkldnn_b=False
+              --use_gpu=False
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()

 ############### test_CompareTwoOpts ###################

--- a/paddle/trainer/tests/sample_trainer_config_branch_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+################################### Data Configuration ###################################
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+################################### Algorithm Configuration ###################################
+settings(batch_size = 256,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+################################### Network Configuration ###################################
+data = data_layer(name ="input", size=784)
+
+tmp = img_conv_layer(input=data,
+            num_channels=1,
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+a1 = img_conv_layer(input=tmp,
+            filter_size=1,
+            num_filters=32,
+            padding=0,
+            shared_biases=True,
+            act=ReluActivation())
+
+a2 = img_conv_layer(input=tmp,
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+tmp = concat_layer(input=[a1, a2])
+
+tmp = img_pool_layer(input=tmp,
+            num_channels=64,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=AvgPooling())
+
+b1 = img_conv_layer(input=tmp,
+            filter_size=3,
+            num_filters=64,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+b1 = img_pool_layer(input=b1,
+            pool_size=3,
+            stride=1,
+            padding=1,
+            pool_type=MaxPooling())
+
+b2 = img_conv_layer(input=tmp,
+            filter_size=5,
+            num_filters=64,
+            padding=2,
+            shared_biases=True,
+            act=ReluActivation())
+
+b2 = img_pool_layer(input=b2,
+            pool_size=5,
+            stride=1,
+            padding=2,
+            pool_type=MaxPooling())
+
+tmp = addto_layer(input=[b1, b2],
+            act=ReluActivation(),
+            bias_attr=False)
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+
+tmp = fc_layer(input=tmp, size=64,
+            bias_attr=False,
+            act=TanhActivation())
+
+output = fc_layer(input=tmp, size=10,
+            bias_attr=True,
+            act=SoftmaxActivation())
+
+lbl = data_layer(name ="label", size=10)
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)