From bfbd066fdd1c4a81266864bf837d89742b3f2ad6 Mon Sep 17 00:00:00 2001
From: tensor-tang <jian.j.tang@intel.com>
Date: Tue, 29 Aug 2017 19:55:44 +0800
Subject: [PATCH] refine

---
 paddle/gserver/layers/MKLDNNFcLayer.cpp | 117 ++++++++++++------------
 paddle/gserver/layers/MKLDNNFcLayer.h   |   2 +
 paddle/gserver/layers/MKLDNNLayer.h     |  48 +++++++---
 paddle/math/MKLDNNMatrix.cpp            |  25 ++---
 paddle/math/MKLDNNMatrix.h              |  29 +++---
 5 files changed, 118 insertions(+), 103 deletions(-)
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index d38e6a2099..a08cca318e 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -77,6 +77,24 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
   wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
 
+void MKLDNNFcLayer::convertOutputToOtherDevice() {
+  copyOutputInfoToOtherDevice();
+  // find other cpu device and reorder output to cpu device
+  int cnt = 0;
+  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+      // fc cpu output value do not need convert
+      // just share point
+      outputOtherDevice_[i].value = output_.value;
+      ++cnt;
+    }
+  }
+
+  if (cnt > 1) {
+    LOG(WARNING) << "should not have more than one CPU devie";
+  }
+}
+
 void MKLDNNFcLayer::reshape() {
   const Argument& input = getInput(0, getPrev(0)->getDeviceId());
   int batchSize = input.getBatchSize();
@@ -116,7 +134,7 @@ void MKLDNNFcLayer::resetFwd() {
   const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
   const MatrixPtr& out = output_.value;
 
-  if (prevIsMKLDNN()) {
+  if (prevIsOnlyMKLDNN()) {
     const MatrixPtr& in = getInputValue(0);
     inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
     CHECK(inVal_) << "Input should be MKLDNNMatrix";
@@ -136,30 +154,21 @@ void MKLDNNFcLayer::resetFwd() {
 
   // change original output value to mkldnn output value
   output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
-  if (!nextIsMKLDNN()) {
-    Argument cpuOutput;
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
-        cpuOutput = outputOtherDevice_[i];
-      }
-    }
-    cpuOutput.setFrameHeight(output_.getFrameHeight());
-    cpuOutput.setFrameWidth(output_.getFrameWidth());
-
-    // fc cpu output value do not need convert
-    cpuOutput.value = output_.value;
+  if (!nextIsOnlyMKLDNN()) {
+    convertOutputToOtherDevice();
   }
 
   // create forward handle
   prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc =
-      hasBias ? fc_fwd::desc(pk,
-                             inVal_->getMD(),
-                             wgtVal_->getMD(),
-                             biasVal_->getMD(),
-                             outVal_->getMD())
-              : fc_fwd::desc(
-                    pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD());
+  fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                biasVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc())
+                                 : fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc());
   fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
   if (hasBias) {
     fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
@@ -184,36 +193,38 @@ void MKLDNNFcLayer::resetBwd() {
   const MatrixPtr& wgt = weight_->getWGrad();
   const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
 
-  // TODO(TJ): merge topdiffs
-  if (nextIsMKLDNN()) {
+  // TODO(TJ): merge outgrad
+  if (nextIsOnlyMKLDNN()) {
     // can not directly cast outputgrad to mkldnnmatrix,
     // since each layer can not write the inputgrad to mkldnn inputgrad.
     // So just create from matrix with outputvalue format.
     const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
-    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
   } else {
     const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
     // fc do not need to convert from cpu device since output always nc
     // only need create from cpu device
-    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
   }
 
-  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPD());
-  biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPD()) : nullptr;
+  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
+  biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
+                      : nullptr;
 
   // create memory primitive desc
   fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
-                                      inVal_->getMD(),
-                                      wgtGrad_->getMD(),
-                                      outGrad_->getMD());
+                                      inVal_->getMemoryDesc(),
+                                      wgtGrad_->getMemoryDesc(),
+                                      outGrad_->getMemoryDesc());
   fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-  fc_bwdWgt::desc bwdWgtDesc =
-      hasBias ? fc_bwdWgt::desc(inVal_->getMD(),
-                                wgtGrad_->getMD(),
-                                biasGrad_->getMD(),
-                                outGrad_->getMD())
-              : fc_bwdWgt::desc(
-                    inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
+  fc_bwdWgt::desc bwdWgtDesc = hasBias
+                                   ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     biasGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc())
+                                   : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc());
   fc_bwdWgt::primitive_desc bwdWgtPD =
       fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
 
@@ -227,30 +238,20 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdWgt_);
 
   /// backward data
-  if (prevIsMKLDNN()) {
-    const MatrixPtr& in = getInputGrad(0, MKLDNN_DEVICE);
-    if (in == nullptr) {
-      return;
-    }
-    if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
-      // TODO(TJ): use outputMaps_ ways when merge topdiff done
-    } else {
-      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
-    }
+  int device = prevIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  const MatrixPtr& in = getInputGrad(0, device);
+  if (in == nullptr) {
+    return;
+  }
+  if (getInput(0, device).getAllCount() > 1) {
+    // TODO(TJ): use outputMaps_ ways when merge outgrad done
   } else {
-    const MatrixPtr& in = getInputGrad(0, CPU_DEVICE);
-    if (in == nullptr) {
-      return;
-    }
-    if (getInput(0, CPU_DEVICE).getAllCount() > 1) {
-      // TODO(TJ): use outputMaps_ ways when merge topdiff done
-    } else {
-      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
-    }
+    inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
   }
 
-  fc_bwdData::desc bwdDataDesc =
-      fc_bwdData::desc(inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
+                                                  wgtGrad_->getMemoryDesc(),
+                                                  outGrad_->getMemoryDesc());
   fc_bwdData::primitive_desc bwdDataPD =
       fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
 
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index e2657a8d5e..e138a6faf1 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -72,6 +72,8 @@ protected:
    * only would be called when needed
    */
   void resetBwd();
+
+  void convertOutputToOtherDevice() override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index 3dd17a36ff..8fe9630e82 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -86,10 +86,7 @@ public:
     CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
                             << "Please set WITH_MKLDNN=ON "
                             << "and set use_mkldnn=True";
-    if (useGpu_ == true) {
-      LOG(WARNING) << "Do not support GPU yet, will change to useGpu = false";
-      useGpu_ = false;
-    }
+    CHECK(!useGpu_) << "Do not support GPU yet";
 
     // set device id before Layer::init
     setDevice(MKLDNN_DEVICE);
@@ -116,6 +113,12 @@ public:
    */
   virtual void convertWeightsToPaddle() {}
 
+  /**
+   * convert MKLDNN output to other device.
+   * only support CPU device yet
+   */
+  virtual void convertOutputToOtherDevice() {}
+
   /**
    * print info about sizes
    */
@@ -147,22 +150,25 @@ public:
 
 protected:
   /**
-   * If next layer only has MKLDNN type.
-   * Otherwise, only support otherdevice CPU device.
+   * copy image size and sequence info to other device
    */
-  bool nextIsMKLDNN() {
+  void copyOutputInfoToOtherDevice() {
     for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
-          << "Only support other device is CPU yet";
+      outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
+      outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
+      outputOtherDevice_[i].sequenceStartPositions =
+          output_.sequenceStartPositions;
+      outputOtherDevice_[i].subSequenceStartPositions =
+          output_.subSequenceStartPositions;
+      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
     }
-    return outputOtherDevice_.size() == 0;
   }
 
   /**
-   * Is previous layer MKLDNN type.
-   * Otherwise, only support otherdevice CPU device.
+   * Is previous layer only has MKLDNN type.
+   * Otherwise, only support the previous layer using CPU device.
    */
-  bool prevIsMKLDNN(int index = 0) {
+  bool prevIsOnlyMKLDNN(int index = 0) {
     int prevDevice = getPrev(index)->getDeviceId();
     if (prevDevice == MKLDNN_DEVICE) {
       return true;
@@ -173,11 +179,23 @@ protected:
     }
   }
 
+  /**
+   * If output only has MKLDNN device.
+   * Otherwise, other devices should only using CPU device.
+   */
+  bool nextIsOnlyMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    return outputOtherDevice_.size() == 0;
+  }
+
   /**
    * Sync input value data
    */
   void syncInputValue() {
-    if (prevIsMKLDNN()) {
+    if (prevIsOnlyMKLDNN()) {
       return;
     }
     real* iData = getInputValue(0, CPU_DEVICE)->getData();
@@ -190,7 +208,7 @@ protected:
    * Sync output grad data
    */
   void syncOutputGrad() {
-    if (nextIsMKLDNN()) {
+    if (nextIsOnlyMKLDNN()) {
       return;
     }
 
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 32ae3b1bcf..0a355e2644 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -31,7 +31,6 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
   if (m == nullptr) {
     size_t height = dims[0];
     size_t width = cnts / dims[0];
-    // LOG(INFO) << height << "," << width;
     m = Matrix::create(height, width, false, false);
   }
 
@@ -40,10 +39,8 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
   CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
 
   CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
-  size_t width = m->getWidth();
-  size_t height = m->getHeight();
-  real* data = m->getData();
-  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
+  return std::make_shared<MKLDNNMatrix>(
+      m->getData(), m->getHeight(), m->getWidth(), pd);
 }
 
 MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
@@ -51,9 +48,7 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
                                      memory::format fmt,
                                      engine& eg,
                                      mkldnn::memory::data_type dtype) {
-  memory::desc md = memory::desc(dims, dtype, fmt);
-  memory::primitive_desc pd = memory::primitive_desc(md, eg);
-  return create(m, pd);
+  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
 }
 
 void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
@@ -64,9 +59,7 @@ void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
     return;
   }
   CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  real* srcData = getData();
-  real* dstData = m->getData();
-  reorderOnce(srcData, dstData, srcFmt, dstFmt, targetDim);
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
 }
 
 void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
@@ -77,9 +70,7 @@ void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
     return;
   }
   CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  real* srcData = getData();
-  real* dstData = m->getData();
-  reorderOnce(srcData, dstData, srcFmt, dstFmt, targetDim);
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
 }
 
 void MKLDNNMatrix::reorderOnce(void* srcData,
@@ -120,8 +111,9 @@ void MKLDNNMatrix::downSpatial() {
     return;
   }
 
-  memory::dims srcDims = getDims();
+  // TODO(TJ): change H(height) and W(width) if support nhwc or more
   const int H = 2, W = 3;
+  memory::dims srcDims = getDims();
   if (srcDims[H] != 1 || srcDims[W] != 1) {
     // can not down spatial
     return;
@@ -141,13 +133,12 @@ void MKLDNNMatrix::downSpatial() {
   }
   memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
   memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
-  void* data = getData();
   mkldnn_primitive_t result;
   mkldnn::error::wrap_c_api(
       mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
       "could not create a memory primitive");
   reset(result);
-  set_data_handle(data);
+  set_data_handle(getData());
 }
 
 }  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index ea3fd7d461..e50f698b49 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -56,9 +56,9 @@ public:
 public:
   /**
    * Reorder this MKLDNNMatrix from other format.
-   * Support inplace reorder
-   * Pay attention: this function would only reorder the data layout.
-   *                will NOT change this original dim or format info
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change this original dim or format info
    */
   void reorderDataFrom(const MKLDNNMatrixPtr& m,
                        memory::format srcFmt,
@@ -66,9 +66,9 @@ public:
 
   /**
    * Reorder this MKLDNNMatrix to other format.
-   * Support inplace reorder
-   * Pay attention: this function would only reorder the data layout.
-   *                will NOT change the dst dim or format info
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change the dst dim or format info
    */
   void reorderDataTo(const MKLDNNMatrixPtr& m,
                      memory::format dstFmt,
@@ -90,18 +90,20 @@ public:
   /**
    * Get primitive descriptor.
    */
-  mkldnn::memory::primitive_desc getPD() { return this->get_primitive_desc(); }
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }
 
   /**
    * Get memory descriptor.
    */
-  mkldnn::memory::desc getMD() { return getPD().desc(); }
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
 
   /**
    * Get dimensions.
    */
   mkldnn::memory::dims getDims() {
-    mkldnn::memory::desc md = getMD();
+    mkldnn::memory::desc md = getMemoryDesc();
     const int* src = md.data.dims;
     int ndims = md.data.ndims;
     mkldnn::memory::dims dst;
@@ -116,24 +118,25 @@ public:
    * Get format.
    */
   mkldnn::memory::format getFormat() {
-    return (mkldnn::memory::format)(getMD().data.format);
+    return (mkldnn::memory::format)(getMemoryDesc().data.format);
   }
 
   /**
    * Get memory data type.
    */
   mkldnn::memory::data_type getDtype() {
-    return (mkldnn::memory::data_type)(getMD().data.data_type);
+    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
   }
 
   /**
    * Get engine.
    */
-  mkldnn::engine getEngine() { return getPD().get_engine(); }
+  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
 
 protected:
   /**
-   * Do once reorder supported inplace.
+   * Do reorder once.
+   * Can support inplace.
    */
   void reorderOnce(void* srcData,
                    void* dstData,
-- 
GitLab