pass test, support input CPU device

48d87e5e · tensor-tang · 4eecd0c2 · 48d87e5e · 48d87e5e · 48d87e5e
6 changed files
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -82,6 +82,7 @@ protected:
  Argument output_;
  /// Several outputs stored on different devices, used in 'parallel_nn' case,
  /// and record them by deviceId_.
+  /// Also used in 'use_mkldnn' case.
  std::vector<Argument> outputOtherDevice_;
  /// If there are several outputs, map them by each name.
  std::map<std::string, Argument*> outputMap_;
@@ -177,6 +178,13 @@ protected:
    return inputLayer.getOutput(deviceId_);
  }
+  /**
+   * Get the argument of input layer with deviceId.
+   *
+   * @param inputIndex index into inputLayers_ selecting the input layer;
+   *                   it is not range-checked here.
+   * @param deviceId   device id forwarded to that layer's getOutput().
+   * @return reference to the Argument returned by getOutput(deviceId).
+   */
+  const Argument& getInput(size_t inputIndex, int deviceId) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId);
+  }
  /**
   * Get the forward-input value.
   */
@@ -191,6 +199,13 @@ protected:
    return inputLayer.getOutput(deviceId_).value;
  }
+  /**
+   * Get the forward-input value with deviceId.
+   *
+   * @param inputIndex index into inputLayers_ selecting the input layer;
+   *                   it is not range-checked here.
+   * @param deviceId   device id forwarded to that layer's getOutput().
+   * @return the `value` member of the Argument returned by
+   *         getOutput(deviceId).
+   */
+  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).value;
+  }
  /**
   * Get the forward-input grad.
   */
@@ -205,6 +220,13 @@ protected:
    return inputLayer.getOutput(deviceId_).grad;
  }
+  /**
+   * Get the forward-input grad with deviceId.
+   *
+   * @param inputIndex index into inputLayers_ selecting the input layer;
+   *                   it is not range-checked here.
+   * @param deviceId   device id forwarded to that layer's getOutput().
+   * @return the `grad` member of the Argument returned by
+   *         getOutput(deviceId).
+   */
+  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+  }
  /**
   * Get the forward-input label.
   */
@@ -326,19 +348,6 @@ public:
    if (deviceId == getDeviceId()) {
      return output_;
    } else {
-      bool CPU2MKLDNN =
-          getDeviceId() == CPU_DEVICE && deviceId == MKLDNN_DEVICE;
-      bool MKLDNN2CPU =
-          getDeviceId() == MKLDNN_DEVICE && deviceId == CPU_DEVICE;
-      if (CPU2MKLDNN) {
-        // TODO: do something
-        return output_;
-      } else if (MKLDNN2CPU) {
-        // TODO: do something
-        return output_;
-      }
-      // TODO: handle mkldnn device or add mkldnn device to other
      for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
        if (outputOtherDevice_[i].deviceId == deviceId) {
          return outputOtherDevice_[i];

--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -97,7 +97,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
 }
 void MKLDNNFcLayer::reshape() {
-  const Argument& input = getInput(0);
+  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
  int batchSize = input.getBatchSize();
  if (bs_ == batchSize) {
    return;
@@ -135,35 +135,43 @@ void MKLDNNFcLayer::reshape() {
 void MKLDNNFcLayer::resetFwd() {
  bool hasBias = biases_ && biases_->getW();
-  const MatrixPtr& in = getInputValue(0);
  const MatrixPtr& wgt = weight_->getW();
  const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
  const MatrixPtr& out = output_.value;
-  if (getPrev(0)->getDeviceId() == MKLDNN_DEVICE) {
+  if (prevIsMKLDNN()) {
+    const MatrixPtr& in = getInputValue(0);
    inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
    CHECK(inVal_) << "Input should be MKLDNNMatrix";
-    // TODO:  change input nchw to nc if available
-    // inVal_->downSpatial()
  } else {
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
    inVal_ = MKLDNNMatrix::create(
-        in,
+        in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
-        hasSpatial_ ? memory::dims{bs_, ic_, ih_, iw_} : memory::dims{bs_, ic_},
-        hasSpatial_ ? format::nchw : format::nc,
-        engine_);
  }
+  inVal_->downSpatial();
  wgtVal_ = MKLDNNMatrix::create(
-      wgt,
+      wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
-      hasSpatial_ ? memory::dims{oc_, ic_, ih_, iw_} : memory::dims{oc_, ic_},
+  wgtVal_->downSpatial();
-      hasSpatial_ ? format::oihw : format::oi,
-      engine_);
  biasVal_ =
      hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
  outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
-  // change original output to mkldnn output
+  // change original output value to mkldnn output value
  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  if (!nextIsMKLDNN()) {
+    Argument cpuOutput;
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+        cpuOutput = outputOtherDevice_[i];
+      }
+    }
+    cpuOutput.setFrameHeight(output_.getFrameHeight());
+    cpuOutput.setFrameWidth(output_.getFrameWidth());
+    // fc cpu output value do not need convert
+    cpuOutput.value = output_.value;
+  }
  // create forward handle
  prop_kind pk = prop_kind::forward;
@@ -176,12 +184,13 @@ void MKLDNNFcLayer::resetFwd() {
              : fc_fwd::desc(
                    pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD());
  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
  if (hasBias) {
    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
  } else {
    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
  }
+  printValueFormatFlow();
  pipelineFwd_.clear();
  pipelineFwd_.push_back(*fwd_);
 }
@@ -197,17 +206,24 @@ void MKLDNNFcLayer::resetBwd() {
  CHECK(inVal_) << "Should have input value";
  const MatrixPtr& wgt = weight_->getWGrad();
  const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
-  const MatrixPtr& out = output_.grad;
-  wgtGrad_ = MKLDNNMatrix::create(
+  if (nextIsMKLDNN()) {
-      wgt, wgtVal_->getDims(), wgtVal_->getFormat(), engine_);
+    // can not directly cast outputgrad to mkldnnmatrix,
-  biasGrad_ =
+    // since each layer can not write the inputgrad to mkldnn inputgrad.
-      hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
+    // So just create from matrix with outputvalue format.
+    const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+    // TODO: maybe need merge topdiffs
+  } else {
+    // TODO: merge topdiffs
+    const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
+    // fc do not need to convert from cpu device since output always nc
+    // only need create from cpu device
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+  }
-  outGrad_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
+  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPD());
-  // change original output to mkldnn output
+  biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPD()) : nullptr;
-  // TODO: right?
-  output_.grad = std::dynamic_pointer_cast<Matrix>(outGrad_);
  // create memory primitive desc
  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
@@ -235,21 +251,38 @@ void MKLDNNFcLayer::resetBwd() {
  pipelineBwd_.push_back(*bwdWgt_);
  /// backward data
-  const MatrixPtr& in = getInputGrad(0);
+  if (prevIsMKLDNN()) {
-  if (in == nullptr) {
+    const MatrixPtr& in = getInputGrad(0, MKLDNN_DEVICE);
-    return;
+    if (in == nullptr) {
+      return;
+    }
+    if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
+      // TODO: many mkldnn bots
+      // add sum handle
+    } else {
+      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
+    }
+  } else {
+    const MatrixPtr& in = getInputGrad(0, CPU_DEVICE);
+    if (in == nullptr) {
+      return;
+    }
+    if (getInput(0, CPU_DEVICE).getAllCount() > 1) {
+      // TODO: many  bots
+      // add sum handle
+    } else {
+      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
+    }
  }
  fc_bwdData::desc bwdDataDesc =
      fc_bwdData::desc(inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
  fc_bwdData::primitive_desc bwdDataPD =
      fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-  // TODO: check right, just from ingrad?
-  inGrad_ =
-      MKLDNNMatrix::create(in, inVal_->getDims(), inVal_->getFormat(), engine_);
  CHECK(wgtVal_) << "Should have weight memory";
  bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  printGradFormatFlow();
  pipelineBwd_.push_back(*bwdData_);
 }
@@ -259,11 +292,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
  {
    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
+    syncInputValue();
-    // update input data
-    // since it might be changed if this is after data layer
-    real* iData = getInputValue(0)->getData();
-    inVal_->updateData(iData);
    // just submit forward pipeline
    stream_->submit(pipelineFwd_);
@@ -285,10 +314,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
    resetBwd();
-    // update diff
+    syncOutputGrad();
-    real* oDiff = getOutputGrad()->getData();
-    outGrad_->updateData(oDiff);
    // just sumbmit backward pipeline
    stream_->submit(pipelineBwd_);
  }

--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -125,23 +125,80 @@ public:
                       << ", oh: " << oh_ << ", ow: " << ow_;
  }
-  // TODO(TJ): move to MkldnnMatrix
+  /**
-  // create memory desc
+   * Print the mkldnn memory format flow of value
-  inline mkldnn::memory::desc createMD(
+   */
-      mkldnn::memory::dims dims,
+  virtual void printValueFormatFlow() {
-      mkldnn::memory::format fmt,
+    if (inVal_ && outVal_) {
-      mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
+      VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
-    // TODO(TJ): isFmtSuppoted(fmt)
+                        << " >>> " << outVal_->getFormat();
-    return mkldnn::memory::desc(dims, type, fmt);
+    }
  }
-  void resetMKLDNNOutput(size_t height, size_t width) {
+  /**
-    Layer::resetOutput(height, width);
+   * Print the mkldnn memory format flow of grad
-    // get valu and grad, use mkldnn matrix instaed
+   */
-    // output_.value;
+  virtual void printGradFormatFlow() {
+    if (inGrad_ && outGrad_) {
+      VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
+                        << " <<< " << outGrad_->getFormat();
+    }
  }
 protected:
+  /**
+   * If next layer only has MKLDNN type.
+   * Otherwise, only support otherdevice CPU device.
+   *
+   * Every entry of outputOtherDevice_ is CHECKed to have
+   * deviceId == CPU_DEVICE; any other device id fails the check.
+   *
+   * @return true when outputOtherDevice_ is empty, false otherwise.
+   */
+  bool nextIsMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    return outputOtherDevice_.size() == 0;
+  }
+  /**
+   * Is previous layer MKLDNN type.
+   * Otherwise, only support otherdevice CPU device.
+   *
+   * @param index which previous layer to inspect, passed to getPrev();
+   *              defaults to 0.
+   * @return true when that layer's device id is MKLDNN_DEVICE, false when
+   *         it is CPU_DEVICE. Any other device id fails the CHECK below.
+   */
+  bool prevIsMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      // do not support GPU yet
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+      return false;
+    }
+  }
+  /**
+   * Sync input value data.
+   *
+   * Does nothing when prevIsMKLDNN() is true. Otherwise it takes the data
+   * pointer of input 0's CPU_DEVICE value and hands it to
+   * inVal_->updateData(). Only input 0 is handled here.
+   */
+  void syncInputValue() {
+    if (prevIsMKLDNN()) {
+      return;
+    }
+    real* iData = getInputValue(0, CPU_DEVICE)->getData();
+    // update input data
+    // since it might be changed if this is after data layer
+    inVal_->updateData(iData);
+  }
+  /**
+   * Sync output grad data.
+   *
+   * Does nothing when nextIsMKLDNN() is true. Otherwise it takes the data
+   * pointer of the CPU_DEVICE output grad and hands it to
+   * outGrad_->updateData(). The grad and outGrad_ are dereferenced
+   * without a null check.
+   */
+  void syncOutputGrad() {
+    if (nextIsMKLDNN()) {
+      return;
+    }
+    // update diff
+    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
+    outGrad_->updateData(oDiff);
+  }
  /**
   * Set deviceId of this layer.
   */

--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,13 @@ public:
   */
  virtual void* alloc(size_t size) {
    void* ptr;  // written by posix_memalign below
+#ifdef PADDLE_USE_MKLDNN
+    // Builds with PADDLE_USE_MKLDNN request 4096-byte alignment; for the
+    // memory alignment, refer to
+    // https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
+#else
    CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);  // 32-byte alignment
+#endif
    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
    return ptr;
  }

--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -18,29 +18,74 @@ using namespace mkldnn;  // NOLINT
 namespace paddle {
-MKLDNNMatrixPtr MKLDNNMatrix::create(const MatrixPtr& m,
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
-                                     memory::dims dims,
+  memory::desc md = pd.desc();
-                                     memory::format fmt,
+  size_t ndims = md.data.ndims;
-                                     engine& eg,
+  int* dims = md.data.dims;
-                                     mkldnn::memory::data_type dtype) {
-  CpuMatrixPtr cpuM = std::dynamic_pointer_cast<CpuMatrix>(m);
-  CHECK(cpuM) << "Only support create from CPU matrix yet";
-  size_t ndims = dims.size();
  CHECK(ndims > 0) << "Input dims should not be empty";
-  size_t cnt = 1;
+  size_t cnts = 1;
  for (size_t i = 0; i < ndims; ++i) {
-    cnt *= dims[i];
+    cnts *= dims[i];
  }
-  CHECK_EQ(cnt, m->getElementCnt()) << "Count size does not match";
+  if (m == nullptr) {
+    size_t height = dims[0];
+    size_t width = cnts / dims[0];
+    // LOG(INFO) << height << "," << width;
+    m = Matrix::create(height, width, false, false);
+  }
+  CHECK(m) << " Matrix should not be empty";
+  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
+  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
+  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
  size_t width = m->getWidth();
  size_t height = m->getHeight();
  real* data = m->getData();
+  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
+}
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
+                                     memory::dims dims,
+                                     memory::format fmt,
+                                     engine& eg,
+                                     mkldnn::memory::data_type dtype) {
  memory::desc md = memory::desc(dims, dtype, fmt);
  memory::primitive_desc pd = memory::primitive_desc(md, eg);
-  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
+  return create(m, pd);
+}
+/**
+ * Dimensionality reduction of this matrix's mkldnn memory descriptor.
+ *
+ * When the current format is nchw or oihw and both spatial dims (index 2
+ * and 3) are 1, the descriptor is rebuilt as nc or oi with dims
+ * {dims[0], dims[1]}. The data buffer is kept as is. For any other format,
+ * or when h/w are not both 1, the matrix is left untouched.
+ */
+void MKLDNNMatrix::downSpatial() {
+  int fmt = getFormat();
+  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
+    // only support nchw and oihw yet, later can support more like nhwc, ihwo
+    return;
+  }
+  memory::dims srcDims = getDims();
+  const int H = 2, W = 3;
+  if (srcDims[H] != 1 || srcDims[W] != 1) {
+    // can not down spatial
+    return;
+  }
+  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
+  memory::format dstFmt;
+  switch (fmt) {
+    case memory::format::nchw:
+      dstFmt = memory::format::nc;
+      break;
+    case memory::format::oihw:
+      dstFmt = memory::format::oi;
+      break;
+    default:
+      LOG(FATAL) << "unsupported format";
+  }
+  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
+  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
+  void* data = getData();
+  // A bare `memory(pd, data);` only constructs and destroys a temporary, so
+  // this object would keep its old 4-D descriptor. Instead re-create the
+  // memory primitive owned by *this* from the new primitive_desc (the same
+  // steps mkldnn::memory's constructor performs) and re-attach the buffer.
+  mkldnn_primitive_t result;
+  mkldnn::error::wrap_c_api(
+      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+      "could not create a memory primitive");
+  reset(result);
+  set_data_handle(data);
 }
 }  // namespace paddle
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -39,20 +39,37 @@ public:
               mkldnn::memory::primitive_desc pd)
      : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
-  MKLDNNMatrix(size_t height, size_t width, mkldnn::memory::primitive_desc pd)
-      : CpuMatrix(height, width, false), mkldnn::memory(pd) {
-    set_data_handle(CpuMatrix::getData());
-  }
  ~MKLDNNMatrix() {}
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
+   */
+  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory details info
+   */
  static MKLDNNMatrixPtr create(
-      const MatrixPtr& m,
+      MatrixPtr m,
      mkldnn::memory::dims dims,
      mkldnn::memory::format fmt,
      mkldnn::engine& eg,
      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
+public:
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
+   */
+  void downSpatial();
+  /**
+   * Update the memory data handle.
+   * Caution: This will not check the buffer size of the data,
+   *          it should be covered by the caller.
+   *
+   * @param data pointer passed straight to set_data_handle().
+   */
+  void updateData(void* data) { set_data_handle(data); }
  /**
   * Get primitive descriptor.
   */
@@ -64,12 +81,13 @@ public:
  mkldnn::memory::desc getMD() { return getPD().desc(); }
  /**
-   * Get dims.
+   * Get dimensions.
   */
  mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMD();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
    mkldnn::memory::dims dst;
-    int* src = getMD().data.dims;
-    int ndims = getMD().data.ndims;
    dst.resize(ndims);
    for (int i = 0; i < ndims; ++i) {
      dst[i] = src[i];
@@ -85,11 +103,16 @@ public:
  }
  /**
-   * Update the memory data handle.
+   * Get memory data type.
-   * Caution: This will not check the buffer size of the data,
-   *          it should be coverd by user.
   */
-  void updateData(void* data) { set_data_handle(data); }
+  mkldnn::memory::data_type getDtype() {
+    return (mkldnn::memory::data_type)(getMD().data.data_type);
+  }
+  /**
+   * Get engine.
+   *
+   * @return the engine reported by this memory's primitive descriptor,
+   *         i.e. getPD().get_engine().
+   */
+  mkldnn::engine getEngine() { return getPD().get_engine(); }
 };
 }  // namespace paddle