fix bug: clear grad and always share data when output has cpu

and add activation in unit tests

fix bug: clear grad and always share data when output has cpu
and add activation in unit tests
1199aa68 · tensor-tang · dee7f813 · 1199aa68 · 1199aa68 · 1199aa68
4 changed file
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -313,6 +313,7 @@ void MKLDNNConvLayer::resetOutValue(
      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
      CHECK(cvtOutVal_) << "should not be empty";
    } else {
+      cpuOut->setData(output_.value->getData());
      cpuOutVal_ = out;
    }
    // when output is cpu device, change the mkldnn output value and make them
@@ -456,17 +457,18 @@ void MKLDNNConvLayer::resetOutGrad(
    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
  } else {
    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    // always share the same grad data of CPU output
+    // then the activation can get the right grad from output_.grad
+    output_.grad->setData(cpuOut->getData());
    // same PrimitiveDesc with cpuInVal_
    CHECK(cpuOutVal_);
    cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
    // create reorder if primitive desc does not match
    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
      CHECK(cvtOutGrad_);
    } else {
-      // share the same data of CPU output
-      output_.grad->setData(cpuOut->getData());
      out = cpuOutGrad_;
    }
  }

--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -46,6 +46,9 @@ protected:
  // backward also need reset after reset forward handle
  bool needResetBwd_;
+  // is output only mkldnn
+  bool outputOnlyMKLDNN_;
  // mkldnn engine, stream and primivtives
  mkldnn::engine engine_;
  std::shared_ptr<MKLDNNStream> stream_;
@@ -141,6 +144,9 @@ public:
        updateInputData();
      }
+      if (!outputOnlyMKLDNN_) {
+        clearGrads();
+      }
      stream_->submit(pipelineFwd_);
    }
@@ -389,7 +395,8 @@ protected:
      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
          << "Only support other device is CPU yet";
    }
-    return outputOtherDevice_.size() == 0;
+    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
+    return outputOnlyMKLDNN_;
  }
  /**
@@ -398,6 +405,16 @@ protected:
  void setDevice(int id) { deviceId_ = id; }
 private:
+  /**
+   * clear all grad
+   */
+  void clearGrads() {
+    output_.grad->zeroMem();
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].grad->zeroMem();
+    }
+  }
  /**
   * Set deviceId of the params used in this layer.
   */

--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -146,6 +146,7 @@ void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
      CHECK(cvtOutVal_) << "should not be emptry";
    } else {
+      cpuOut->setData(output_.value->getData());
      cpuOutVal_ = out;
    }
    output_.value = std::dynamic_pointer_cast<Matrix>(cpuOutVal_);
@@ -213,15 +214,16 @@ void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
    MKLDNNLayer::resetOutGrad(out, outVal_->getPrimitiveDesc());
  } else {
    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    // always share the same grad data of CPU output
+    // then the activation can get the right grad from output_.grad
+    output_.grad->setData(cpuOut->getData());
    cpuOutGrad_ = MKLDNNMatrix::create(
        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
    if (cpuOutGrad_->getPrimitiveDesc() != outVal_->getPrimitiveDesc()) {
-      out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+      out = MKLDNNMatrix::create(nullptr, outVal_->getPrimitiveDesc());
      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
      CHECK(cvtOutGrad_) << "should not be emptry";
    } else {
-      // share the same data of CPU output
-      output_.grad->setData(cpuOut->getData());
      out = cpuOutGrad_;
    }
  }

--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -46,6 +46,7 @@ struct testFcDesc {
 static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
  cfg.layerConfig.set_type("mkldnn_fc");
+  cfg.layerConfig.set_active_type("sigmoid");
  cfg.layerConfig.set_size(pm.oc);
  cfg.inputDefs.push_back(
      {INPUT_DATA,
@@ -86,6 +87,7 @@ struct testConvDesc {
 static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
  cfg.layerConfig.set_type("mkldnn_conv");
+  cfg.layerConfig.set_active_type("relu");
  cfg.layerConfig.set_num_filters(pm.oc);
  cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
  cfg.layerConfig.set_shared_biases(true);
@@ -158,6 +160,7 @@ struct testPoolDesc {
 static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
  cfg.layerConfig.set_type("mkldnn_pool");
+  cfg.layerConfig.set_active_type("relu");
  cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
  cfg.inputDefs.push_back(
      {INPUT_DATA,