From 1199aa6876fc8c6759e3e2be7bb89a597d88359a Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Mon, 16 Oct 2017 17:37:39 +0800
Subject: [PATCH] fix bug: clear grad and always share data when output has cpu and add activation in unit tests

---
 paddle/gserver/layers/MKLDNNConvLayer.cpp |  8 +++++---
 paddle/gserver/layers/MKLDNNLayer.h       | 19 ++++++++++++++++++-
 paddle/gserver/layers/MKLDNNPoolLayer.cpp |  8 +++++---
 paddle/gserver/tests/test_MKLDNN.cpp      |  3 +++
 4 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 8b67a1ef4f..26810a6483 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -313,6 +313,7 @@ void MKLDNNConvLayer::resetOutValue(
       cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
       CHECK(cvtOutVal_) << "should not be empty";
     } else {
+      cpuOut->setData(output_.value->getData());
       cpuOutVal_ = out;
     }
     // when output is cpu device, change the mkldnn output value and make them
@@ -456,17 +457,18 @@ void MKLDNNConvLayer::resetOutGrad(
     MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
   } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    // always share the same grad data of CPU output
+    // then the activation can get the right grad from output_.grad
+    output_.grad->setData(cpuOut->getData());
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuOutVal_);
     cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
     // create reorder if primitive desc does not match
     if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_);
     } else {
-      // share the same data of CPU output
-      output_.grad->setData(cpuOut->getData());
       out = cpuOutGrad_;
     }
   }
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 5f9923da76..4e2753eba2 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -46,6 +46,9 @@ protected:
   // backward also need reset after reset forward handle
   bool needResetBwd_;
 
+  // is output only mkldnn
+  bool outputOnlyMKLDNN_;
+
   // mkldnn engine, stream and primivtives
   mkldnn::engine engine_;
   std::shared_ptr<MKLDNNStream> stream_;
@@ -141,6 +144,9 @@ public:
         updateInputData();
       }
 
+      if (!outputOnlyMKLDNN_) {
+        clearGrads();
+      }
       stream_->submit(pipelineFwd_);
     }
 
@@ -389,7 +395,8 @@ protected:
       CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
          << "Only support other device is CPU yet";
     }
-    return outputOtherDevice_.size() == 0;
+    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
+    return outputOnlyMKLDNN_;
   }
 
   /**
@@ -398,6 +405,16 @@ protected:
   void setDevice(int id) { deviceId_ = id; }
 
 private:
+  /**
+   * clear all grad
+   */
+  void clearGrads() {
+    output_.grad->zeroMem();
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].grad->zeroMem();
+    }
+  }
+
   /**
    * Set deviceId of the params used in this layer.
    */
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index 5606aae80c..0e53e2d1b7 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -146,6 +146,7 @@ void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
       cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
       CHECK(cvtOutVal_) << "should not be emptry";
     } else {
+      cpuOut->setData(output_.value->getData());
       cpuOutVal_ = out;
     }
     output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
@@ -213,15 +214,16 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
     MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
   } else {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    // always share the same grad data of CPU output
+    // then the activation can get the right grad from output_.grad
+    output_.grad->setData(cpuOut->getData());
     cpuOutGrad_ = MKLDNNMatrix::create(
         cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
     if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_) << "should not be emptry";
     } else {
-      // share the same data of CPU output
-      output_.grad->setData(cpuOut->getData());
       out = cpuOutGrad_;
     }
   }
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index a70b2f17f4..03515e9469 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -46,6 +46,7 @@ struct testFcDesc {
 
 static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_fc");
+  cfg.layerConfig.set_active_type("sigmoid");
   cfg.layerConfig.set_size(pm.oc);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
@@ -86,6 +87,7 @@ struct testConvDesc {
 
 static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_conv");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_num_filters(pm.oc);
   cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
   cfg.layerConfig.set_shared_biases(true);
@@ -158,6 +160,7 @@ struct testPoolDesc {
 
 static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
  cfg.layerConfig.set_type("mkldnn_pool");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
-- 
GitLab
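
Note (not part of the patch): the fix hinges on two mechanics that are easy to miss when reading the hunks in isolation. First, output_.grad and the CPU-device output grad are made to share one underlying buffer up front (output_.grad->setData(cpuOut->getData())), so the activation's backward writes land exactly where the CPU-side consumer reads them. Second, once another device consumes the output, the shared grad buffers are zeroed before every forward pass (clearGrads()), so stale gradients from the previous iteration cannot leak into the new one. The sketch below is a minimal, self-contained illustration of those two mechanics only; ToyMatrix is a hypothetical stand-in, not the real paddle::Matrix / MKLDNNMatrix API.

```cpp
// Illustration only: ToyMatrix is a hypothetical stand-in for paddle::Matrix,
// not the real API. It mimics setData()/getData()/zeroMem() just closely
// enough to show the buffer sharing and grad clearing the patch relies on.
#include <cassert>
#include <cstring>
#include <vector>

struct ToyMatrix {
  float* data;  // non-owning view of an external buffer
  size_t len;

  void setData(float* d) { data = d; }  // alias another matrix's buffer
  float* getData() const { return data; }
  void zeroMem() { std::memset(data, 0, len * sizeof(float)); }
};

int main() {
  // Buffer owned by the CPU-device output (think getOutput(CPU_DEVICE).grad).
  std::vector<float> cpuBuffer(4, 0.0f);
  ToyMatrix cpuOutGrad{cpuBuffer.data(), cpuBuffer.size()};
  ToyMatrix outputGrad{nullptr, cpuBuffer.size()};  // plays output_.grad

  // Like output_.grad->setData(cpuOut->getData()): both now alias one buffer.
  outputGrad.setData(cpuOutGrad.getData());

  // Pretend a previous iteration left a stale gradient behind.
  outputGrad.getData()[0] = 3.0f;

  // Like clearGrads() before stream_->submit(pipelineFwd_): start from zero.
  outputGrad.zeroMem();
  assert(cpuOutGrad.getData()[0] == 0.0f);

  // The activation's backward writes into output_.grad ...
  outputGrad.getData()[1] = 0.5f;
  // ... and the CPU-device consumer sees the same value without any copy.
  assert(cpuOutGrad.getData()[1] == 0.5f);
  return 0;
}
```

Because output_.grad now always aliases the CPU buffer, the reorder destination can no longer wrap output_.grad itself, which appears to be why the reorder branch switches to MKLDNNMatrix::create(nullptr, ...) and lets the MKLDNN matrix own its storage.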