refine

bfbd066f · tensor-tang · fe51f726 · bfbd066f · bfbd066f · bfbd066f
5 changed files
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -77,6 +77,24 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }

+// Copy the output info to the other devices and share the value with CPU ones.
+void MKLDNNFcLayer::convertOutputToOtherDevice() {
+  copyOutputInfoToOtherDevice();
+  // find the CPU entries among the other devices and hand the output to them
+  int cnt = 0;
+  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+      // fc cpu output value needs no conversion, so just share the pointer
+      outputOtherDevice_[i].value = output_.value;
+      ++cnt;
+    }
+  }
+
+  if (cnt > 1) {
+    LOG(WARNING) << "should not have more than one CPU device";
+  }
+}
+
 void MKLDNNFcLayer::reshape() {
  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
  int batchSize = input.getBatchSize();
@@ -116,7 +134,7 @@ void MKLDNNFcLayer::resetFwd() {
  const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
  const MatrixPtr& out = output_.value;

-  if (prevIsMKLDNN()) {
+  if (prevIsOnlyMKLDNN()) {
    const MatrixPtr& in = getInputValue(0);
    inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
    CHECK(inVal_) << "Input should be MKLDNNMatrix";
@@ -136,30 +154,21 @@ void MKLDNNFcLayer::resetFwd() {

  // change original output value to mkldnn output value
  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
-  if (!nextIsMKLDNN()) {
-    Argument cpuOutput;
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
-        cpuOutput = outputOtherDevice_[i];
-      }
-    }
-    cpuOutput.setFrameHeight(output_.getFrameHeight());
-    cpuOutput.setFrameWidth(output_.getFrameWidth());
-
-    // fc cpu output value do not need convert
-    cpuOutput.value = output_.value;
+  if (!nextIsOnlyMKLDNN()) {
+    convertOutputToOtherDevice();
  }

  // create forward handle
  prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc =
-      hasBias ? fc_fwd::desc(pk,
-                             inVal_->getMD(),
-                             wgtVal_->getMD(),
-                             biasVal_->getMD(),
-                             outVal_->getMD())
-              : fc_fwd::desc(
-                    pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD());
+  fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                biasVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc())
+                                 : fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc());
  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
  if (hasBias) {
    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
@@ -184,36 +193,38 @@ void MKLDNNFcLayer::resetBwd() {
  const MatrixPtr& wgt = weight_->getWGrad();
  const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;

-  // TODO(TJ): merge topdiffs
-  if (nextIsMKLDNN()) {
+  // TODO(TJ): merge outgrad
+  if (nextIsOnlyMKLDNN()) {
    // can not directly cast outputgrad to mkldnnmatrix,
    // since each layer can not write the inputgrad to mkldnn inputgrad.
    // So just create from matrix with outputvalue format.
    const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
-    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
  } else {
    const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
    // fc do not need to convert from cpu device since output always nc
    // only need create from cpu device
-    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
  }

-  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPD());
-  biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPD()) : nullptr;
+  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
+  biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
+                      : nullptr;

  // create memory primitive desc
  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
-                                      inVal_->getMD(),
-                                      wgtGrad_->getMD(),
-                                      outGrad_->getMD());
+                                      inVal_->getMemoryDesc(),
+                                      wgtGrad_->getMemoryDesc(),
+                                      outGrad_->getMemoryDesc());
  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-  fc_bwdWgt::desc bwdWgtDesc =
-      hasBias ? fc_bwdWgt::desc(inVal_->getMD(),
-                                wgtGrad_->getMD(),
-                                biasGrad_->getMD(),
-                                outGrad_->getMD())
-              : fc_bwdWgt::desc(
-                    inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
+  fc_bwdWgt::desc bwdWgtDesc = hasBias
+                                   ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     biasGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc())
+                                   : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc());
  fc_bwdWgt::primitive_desc bwdWgtPD =
      fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);

@@ -227,30 +238,20 @@ void MKLDNNFcLayer::resetBwd() {
  pipelineBwd_.push_back(*bwdWgt_);

  /// backward data
-  if (prevIsMKLDNN()) {
-    const MatrixPtr& in = getInputGrad(0, MKLDNN_DEVICE);
-    if (in == nullptr) {
-      return;
-    }
-    if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
-      // TODO(TJ): use outputMaps_ ways when merge topdiff done
-    } else {
-      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
-    }
+  int device = prevIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  const MatrixPtr& in = getInputGrad(0, device);
+  if (in == nullptr) {
+    return;
+  }
+  if (getInput(0, device).getAllCount() > 1) {
+    // TODO(TJ): use the outputMaps_ approach once outgrad merging is done
  } else {
-    const MatrixPtr& in = getInputGrad(0, CPU_DEVICE);
-    if (in == nullptr) {
-      return;
-    }
-    if (getInput(0, CPU_DEVICE).getAllCount() > 1) {
-      // TODO(TJ): use outputMaps_ ways when merge topdiff done
-    } else {
-      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
-    }
+    inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
  }

-  fc_bwdData::desc bwdDataDesc =
-      fc_bwdData::desc(inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
+                                                  wgtGrad_->getMemoryDesc(),
+                                                  outGrad_->getMemoryDesc());
  fc_bwdData::primitive_desc bwdDataPD =
      fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);


--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -72,6 +72,8 @@ protected:
   * only would be called when needed
   */
  void resetBwd();
+
+  void convertOutputToOtherDevice() override;
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -86,10 +86,7 @@ public:
    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
                            << "Please set WITH_MKLDNN=ON "
                            << "and set use_mkldnn=True";
-    if (useGpu_ == true) {
-      LOG(WARNING) << "Do not support GPU yet, will change to useGpu = false";
-      useGpu_ = false;
-    }
+    CHECK(!useGpu_) << "Do not support GPU yet";

    // set device id before Layer::init
    setDevice(MKLDNN_DEVICE);
@@ -116,6 +113,12 @@ public:
   */
  virtual void convertWeightsToPaddle() {}

+  /**
+   * Convert the MKLDNN output for use by other devices.
+   * Only the CPU device is supported so far; this default does nothing.
+   */
+  virtual void convertOutputToOtherDevice() {}
+
  /**
   * print info about sizes
   */
@@ -147,22 +150,25 @@ public:

 protected:
  /**
-   * If next layer only has MKLDNN type.
-   * Otherwise, only support otherdevice CPU device.
+   * Copy image size (frame height/width) and sequence info to other devices.
   */
-  bool nextIsMKLDNN() {
+  void copyOutputInfoToOtherDevice() {
    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
-          << "Only support other device is CPU yet";
+      outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
+      outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
+      outputOtherDevice_[i].sequenceStartPositions =
+          output_.sequenceStartPositions;
+      outputOtherDevice_[i].subSequenceStartPositions =
+          output_.subSequenceStartPositions;
+      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
    }
-    return outputOtherDevice_.size() == 0;
  }

  /**
-   * Is previous layer MKLDNN type.
-   * Otherwise, only support otherdevice CPU device.
+   * If the previous layer is of MKLDNN type only.
+   * Otherwise, the previous layer is only supported on the CPU device.
   */
-  bool prevIsMKLDNN(int index = 0) {
+  bool prevIsOnlyMKLDNN(int index = 0) {
    int prevDevice = getPrev(index)->getDeviceId();
    if (prevDevice == MKLDNN_DEVICE) {
      return true;
@@ -173,11 +179,23 @@ protected:
    }
  }

+  /**
+   * True if the output has no other device; any other one must be CPU.
+   */
+  bool nextIsOnlyMKLDNN() {
+    const size_t numOthers = outputOtherDevice_.size();
+    for (size_t idx = 0; idx != numOthers; ++idx) {
+      CHECK_EQ(outputOtherDevice_[idx].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    return numOthers == 0;
+  }
+
  /**
   * Sync input value data
   */
  void syncInputValue() {
-    if (prevIsMKLDNN()) {
+    if (prevIsOnlyMKLDNN()) {
      return;
    }
    real* iData = getInputValue(0, CPU_DEVICE)->getData();
@@ -190,7 +208,7 @@ protected:
   * Sync output grad data
   */
  void syncOutputGrad() {
-    if (nextIsMKLDNN()) {
+    if (nextIsOnlyMKLDNN()) {
      return;
    }


--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -31,7 +31,6 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
  if (m == nullptr) {
    size_t height = dims[0];
    size_t width = cnts / dims[0];
-    // LOG(INFO) << height << "," << width;
    m = Matrix::create(height, width, false, false);
  }

@@ -40,10 +39,8 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";

  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
-  size_t width = m->getWidth();
-  size_t height = m->getHeight();
-  real* data = m->getData();
-  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
+  return std::make_shared<MKLDNNMatrix>(
+      m->getData(), m->getHeight(), m->getWidth(), pd);
 }

 MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
@@ -51,9 +48,7 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
                                     memory::format fmt,
                                     engine& eg,
                                     mkldnn::memory::data_type dtype) {
-  memory::desc md = memory::desc(dims, dtype, fmt);
-  memory::primitive_desc pd = memory::primitive_desc(md, eg);
-  return create(m, pd);
+  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
 }

 void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
@@ -64,9 +59,7 @@ void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
    return;
  }
  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  real* srcData = getData();
-  real* dstData = m->getData();
-  reorderOnce(srcData, dstData, srcFmt, dstFmt, targetDim);
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
 }

 void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
@@ -77,9 +70,7 @@ void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
    return;
  }
  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
-  real* srcData = getData();
-  real* dstData = m->getData();
-  reorderOnce(srcData, dstData, srcFmt, dstFmt, targetDim);
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
 }

 void MKLDNNMatrix::reorderOnce(void* srcData,
@@ -120,8 +111,9 @@ void MKLDNNMatrix::downSpatial() {
    return;
  }

-  memory::dims srcDims = getDims();
+  // TODO(TJ): adjust H (height) and W (width) when nhwc or more is supported
  const int H = 2, W = 3;
+  memory::dims srcDims = getDims();
  if (srcDims[H] != 1 || srcDims[W] != 1) {
    // can not down spatial
    return;
@@ -141,13 +133,12 @@ void MKLDNNMatrix::downSpatial() {
  }
  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
-  void* data = getData();
  mkldnn_primitive_t result;
  mkldnn::error::wrap_c_api(
      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
      "could not create a memory primitive");
  reset(result);
-  set_data_handle(data);
+  set_data_handle(getData());
 }

 }  // namespace paddle
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -56,9 +56,9 @@ public:
 public:
  /**
   * Reorder this MKLDNNMatrix from other format.
-   * Support inplace reorder
-   * Pay attention: this function would only reorder the data layout.
-   *                will NOT change this original dim or format info
+   * In-place reordering is supported.
+   * @note: this function only reorders the data layout;
+   *        it does NOT change this matrix's original dim or format info.
   */
  void reorderDataFrom(const MKLDNNMatrixPtr& m,
                       memory::format srcFmt,
@@ -66,9 +66,9 @@ public:

  /**
   * Reorder this MKLDNNMatrix to other format.
-   * Support inplace reorder
-   * Pay attention: this function would only reorder the data layout.
-   *                will NOT change the dst dim or format info
+   * In-place reordering is supported.
+   * @note: this function only reorders the data layout;
+   *        it does NOT change the destination's dim or format info.
   */
  void reorderDataTo(const MKLDNNMatrixPtr& m,
                     memory::format dstFmt,
@@ -90,18 +90,20 @@ public:
  /**
   * Get primitive descriptor.
   */
-  mkldnn::memory::primitive_desc getPD() { return this->get_primitive_desc(); }
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }

  /**
   * Get memory descriptor.
   */
-  mkldnn::memory::desc getMD() { return getPD().desc(); }
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }

  /**
   * Get dimensions.
   */
  mkldnn::memory::dims getDims() {
-    mkldnn::memory::desc md = getMD();
+    mkldnn::memory::desc md = getMemoryDesc();
    const int* src = md.data.dims;
    int ndims = md.data.ndims;
    mkldnn::memory::dims dst;
@@ -116,24 +118,25 @@ public:
   * Get format.
   */
  mkldnn::memory::format getFormat() {
-    return (mkldnn::memory::format)(getMD().data.format);
+    return (mkldnn::memory::format)(getMemoryDesc().data.format);
  }

  /**
   * Get memory data type.
   */
  mkldnn::memory::data_type getDtype() {
-    return (mkldnn::memory::data_type)(getMD().data.data_type);
+    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
  }

  /**
   * Get engine.
   */
-  mkldnn::engine getEngine() { return getPD().get_engine(); }
+  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }

 protected:
  /**
-   * Do once reorder supported inplace.
+   * Perform a single reorder.
+   * In-place reordering is supported.
   */
  void reorderOnce(void* srcData,
                   void* dstData,