Merge pull request #3712 from tensor-tang/merge

add MKLDNN_DEVICE

Merge pull request #3712 from tensor-tang/merge
add MKLDNN_DEVICE
322d9ad8 · Tao Luo · GitHub · b45d020f · c5183caa · 322d9ad8
12 changed files
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -51,7 +51,7 @@ ExternalProject_Add(
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS             ${MKLDNN_DEPENDS}
    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
-    GIT_TAG             "v0.9"
+    GIT_TAG             "v0.10"
    PREFIX              ${MKLDNN_SOURCES_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -28,7 +28,7 @@ INCLUDE(ExternalProject)

 SET(MKLML_PROJECT       "extern_mklml")
 SET(MKLML_VER           "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
 SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
 SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
 SET(MKLML_DST_DIR       "mklml")

--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -41,7 +41,7 @@ namespace paddle {
 Layer::Layer(const LayerConfig& config, bool useGpu)
    : config_(config),
      useGpu_(useGpu),
-      deviceId_(-1),
+      deviceId_(CPU_DEVICE),
      needSequenceInfo_(true) {}

 bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {

--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -59,7 +59,12 @@ protected:
  LayerConfig config_;
  /// whether to use GPU
  bool useGpu_;
-  /// Device Id. CPU is -1, and GPU is 0, 1, 2 ...
+  /// Paddle device ID, MKLDNN is -2, CPU is -1
+  enum PADDLE_DEVICE_ID {
+    MKLDNN_DEVICE = -2,
+    CPU_DEVICE = -1,
+  };
+  /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
  int deviceId_;
  /// Input layers
  std::vector<LayerPtr> inputLayers_;
@@ -77,6 +82,7 @@ protected:
  Argument output_;
  /// Several outputs stored on different devices, used in 'parallel_nn' case,
  /// and record them by deviceId_.
+  /// Also used in 'use_mkldnn' case.
  std::vector<Argument> outputOtherDevice_;
  /// If there are several outputs, map them by each name.
  std::map<std::string, Argument*> outputMap_;
@@ -172,6 +178,13 @@ protected:
    return inputLayer.getOutput(deviceId_);
  }

+  /**
+   * Get the argument of input layer with deviceId.
+   */
+  const Argument& getInput(size_t inputIndex, int deviceId) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId);
+  }
+
  /**
   * Get the forward-input value.
   */
@@ -186,6 +199,13 @@ protected:
    return inputLayer.getOutput(deviceId_).value;
  }

+  /**
+   * Get the forward-input value with deviceId.
+   */
+  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).value;
+  }
+
  /**
   * Get the forward-input grad.
   */
@@ -200,6 +220,13 @@ protected:
    return inputLayer.getOutput(deviceId_).grad;
  }

+  /**
+   * Get the forward-input grad with deviceId.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+  }
+
  /**
   * Get the forward-input label.
   */

--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -61,43 +61,42 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
    return;
  }

-  // TODO(TJ): dst format should get from wgtVal_
-  int dstFmt = PARAM_FORMAT_MKLDNN_OI;
-  int srcFmt = weight_->getParameterPtr()->getHeaderFormat();
-  if (srcFmt == dstFmt) {
-    return;
-  }
-
-  // The weight_ is transposed from initial paddle weight
-  MatrixPtr paddleWgt = Matrix::create(
-      weight_->getW()->getData(), iLayerSize_, oc_, false, false);
-
-  // TODO(TJ): remove this print when do not need differ weights
-  std::ostringstream ostr;
-  paddleWgt->print(ostr);
-  VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
-
-  // The mkldnn weight is transposed from initial paddle matrix
-  MatrixPtr paddleWgtT;
-  paddleWgt->transpose(paddleWgtT, true);
-  weight_->getW()->copyFrom(*paddleWgtT);
-  weight_->getParameterPtr()->setHeaderFormat(dstFmt);
+  CHECK(wgtVal_) << "should have been initialized";
+  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
+  auto targetDim = wgtVal_->getDims();
+  auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
  hasInitedWgt_ = true;
 }

 void MKLDNNFcLayer::convertWeightsToPaddle() {
-  MatrixPtr dnnWgt = weight_->getW();
-  MatrixPtr paddleWgt;
-  dnnWgt->transpose(paddleWgt, true);
-
-  // copy paddle weight and override on weight_
-  MatrixPtr dnnWgtT = Matrix::create(
-      dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
-  dnnWgtT->copyFrom(*paddleWgt);
+  CHECK(wgtVal_) << "should have been initialized";
+  bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
+  auto targetDim = wgtVal_->getDims();
+  auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
+}
+
+void MKLDNNFcLayer::convertOutputToOtherDevice() {
+  copyOutputInfoToOtherDevice();
+  // find other cpu device and reorder output to cpu device
+  int cnt = 0;
+  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+      // fc cpu output value do not need convert
+      // just share point
+      outputOtherDevice_[i].value = output_.value;
+      ++cnt;
+    }
+  }
+
+  if (cnt > 1) {
+    LOG(WARNING) << "should not have more than one CPU devie";
+  }
 }

 void MKLDNNFcLayer::reshape() {
-  const Argument& input = getInput(0);
+  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
  int batchSize = input.getBatchSize();
  if (bs_ == batchSize) {
    return;
@@ -111,10 +110,6 @@ void MKLDNNFcLayer::reshape() {
  if (iw_ == 0) {
    iw_ = 1;
  }
-  hasSpatial_ = true;
-  if (ih_ == 1 && iw_ == 1) {
-    hasSpatial_ = false;
-  }
  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
  ic_ = iLayerSize_ / (ih_ * iw_);
  CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
@@ -135,37 +130,53 @@ void MKLDNNFcLayer::reshape() {

 void MKLDNNFcLayer::resetFwd() {
  bool hasBias = biases_ && biases_->getW();
-  real* iData = getInputValue(0)->getData();
-  real* oData = getOutputValue()->getData();
-  real* wData = weight_->getW()->getData();
-  real* bData = hasBias ? biases_->getW()->getData() : NULL;
-
-  // TODO(TJ): below create should be covered in MkldnnMatrix
-  // create memory desc
-  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
-                                 : createMD({bs_, ic_}, format::nc);
-  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
-                                 : createMD({oc_, ic_}, format::oi);
-  memory::desc bMD = bData != NULL ? createMD({oc_}, format::x)
-                                   : createMD({}, format::format_undef);
-  memory::desc oMD = createMD({bs_, oc_}, format::nc);
-
-  // create memory primitive desc and memory self
-  inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
-  wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData));
-  outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData));
+  const MatrixPtr& wgt = weight_->getW();
+  const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
+  const MatrixPtr& out = output_.value;
+
+  if (inputIsOnlyMKLDNN()) {
+    const MatrixPtr& in = getInputValue(0);
+    inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
+    CHECK(inVal_) << "Input should be MKLDNNMatrix";
+  } else {
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
+    inVal_ = MKLDNNMatrix::create(
+        in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
+  }
+  inVal_->downSpatial();
+  wgtVal_ = MKLDNNMatrix::create(
+      wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
+  wgtVal_->downSpatial();
+  biasVal_ =
+      hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
+  outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
+
+  // change original output value to mkldnn output value
+  output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  if (!outputIsOnlyMKLDNN()) {
+    convertOutputToOtherDevice();
+  }

+  // create forward handle
  prop_kind pk = prop_kind::forward;
-  fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD)
-                                       : fc_fwd::desc(pk, iMD, wMD, oMD);
+  fc_fwd::desc fwdDesc = hasBias ? fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                biasVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc())
+                                 : fc_fwd::desc(pk,
+                                                inVal_->getMemoryDesc(),
+                                                wgtVal_->getMemoryDesc(),
+                                                outVal_->getMemoryDesc());
  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-
-  if (bData != NULL) {
-    biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
+  if (hasBias) {
    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
  } else {
    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
  }
+  printValueFormatFlow();
+
  pipelineFwd_.clear();
  pipelineFwd_.push_back(*fwd_);
 }
@@ -175,45 +186,46 @@ void MKLDNNFcLayer::resetBwd() {
    return;
  }
  needResetBwd_ = false;
-
  bool hasBias = biases_ && biases_->getWGrad();
-  real* iData = getInputValue(0)->getData();
-  real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
-  real* oDiff = getOutputGrad()->getData();
-  real* wDiff = weight_->getWGrad()->getData();
-  real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;

  /// backward weight
-  // create memory desc for backward memory
-  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
-                                 : createMD({bs_, ic_}, format::nc);
-  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
-                                 : createMD({oc_, ic_}, format::oi);
-  memory::desc oMD = createMD({bs_, oc_}, format::nc);
-  memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x)
-                                   : createMD({}, format::format_undef);
-
-  if (inVal_) {
-    // update data
-    inVal_->set_data_handle(iData);
-  } else {
-    inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
-  }
-
-  // create memory primitive desc and memory self
-  wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff));
-  outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
-
-  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD);
+  CHECK(inVal_) << "Should have input value";
+  const MatrixPtr& wgt = weight_->getWGrad();
+  const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
+
+  // TODO(TJ): merge outgrad
+  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  // for MKLDNN device:
+  // can not directly cast outputgrad to mkldnnmatrix,
+  // since each layer can not write the inputgrad to mkldnn inputgrad.
+  // So just create from matrix with outputvalue format.
+  // for CPU device:
+  // fc do not need to convert from cpu device since output is always nc format
+  // only need create from cpu device
+  const MatrixPtr& out = getOutput(device).grad;
+  outGrad_ = MKLDNNMatrix::create(out, outVal_->getPrimitiveDesc());
+  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPrimitiveDesc());
+  biasGrad_ = hasBias ? MKLDNNMatrix::create(bias, biasVal_->getPrimitiveDesc())
+                      : nullptr;
+
+  // create memory primitive desc
+  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
+                                      inVal_->getMemoryDesc(),
+                                      wgtGrad_->getMemoryDesc(),
+                                      outGrad_->getMemoryDesc());
  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-  fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL
-                                   ? fc_bwdWgt::desc(iMD, wMD, bMD, oMD)
-                                   : fc_bwdWgt::desc(iMD, wMD, oMD);
+  fc_bwdWgt::desc bwdWgtDesc = hasBias
+                                   ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     biasGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc())
+                                   : fc_bwdWgt::desc(inVal_->getMemoryDesc(),
+                                                     wgtGrad_->getMemoryDesc(),
+                                                     outGrad_->getMemoryDesc());
  fc_bwdWgt::primitive_desc bwdWgtPD =
      fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);

-  if (bDiff != NULL) {
-    biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
+  if (hasBias) {
    bwdWgt_.reset(
        new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
  } else {
@@ -223,15 +235,26 @@ void MKLDNNFcLayer::resetBwd() {
  pipelineBwd_.push_back(*bwdWgt_);

  /// backward data
-  if (iDiff == NULL) {
+  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  const MatrixPtr& in = getInputGrad(0, device);
+  if (in == nullptr) {
    return;
  }
-  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
+  if (getInput(0, device).getAllCount() > 1) {
+    // TODO(TJ): use outputMaps_ ways when merge outgrad done
+  } else {
+    inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
+  }
+
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(inVal_->getMemoryDesc(),
+                                                  wgtGrad_->getMemoryDesc(),
+                                                  outGrad_->getMemoryDesc());
  fc_bwdData::primitive_desc bwdDataPD =
      fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-  inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
+
  CHECK(wgtVal_) << "Should have weight memory";
  bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  printGradFormatFlow();
  pipelineBwd_.push_back(*bwdData_);
 }

@@ -241,11 +264,7 @@ void MKLDNNFcLayer::forward(PassType passType) {

  {
    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-
-    // update input data
-    // since it might be changed if this is after data layer
-    real* iData = getInputValue(0)->getData();
-    inVal_->set_data_handle(iData);
+    syncInputValue();

    // just submit forward pipeline
    stream_->submit(pipelineFwd_);
@@ -267,10 +286,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
    resetBwd();

-    // update diff
-    real* oDiff = getOutputGrad()->getData();
-    outGrad_->set_data_handle(oDiff);
-
+    syncOutputGrad();
    // just sumbmit backward pipeline
    stream_->submit(pipelineBwd_);
  }

--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -32,16 +32,13 @@ protected:
  // if has already init the weight
  bool hasInitedWgt_;

-  // if input layer has image size info (ih>1 && iw>1)
-  bool hasSpatial_;
-
  // fc weight and bias
  std::unique_ptr<Weight> weight_;
  std::unique_ptr<Weight> biases_;

 public:
  explicit MKLDNNFcLayer(const LayerConfig& config)
-      : MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {}
+      : MKLDNNLayer(config), hasInitedWgt_(false) {}

  ~MKLDNNFcLayer() {}

@@ -75,6 +72,8 @@ protected:
   * only would be called when needed
   */
  void resetBwd();
+
+  void convertOutputToOtherDevice() override;
 };

 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "Layer.h"
 #include "MKLDNNBase.h"
 #include "mkldnn.hpp"
+#include "paddle/math/MKLDNNMatrix.h"

 DECLARE_bool(use_mkldnn);
-DECLARE_bool(use_mkldnn_wgt);

 namespace paddle {

@@ -52,15 +52,15 @@ protected:
  std::vector<mkldnn::primitive> pipelineFwd_;
  std::vector<mkldnn::primitive> pipelineBwd_;

-  // TODO(TJ): change below memory as MKLDNNMatrixPtr type
-  std::shared_ptr<mkldnn::memory> inVal_;
-  std::shared_ptr<mkldnn::memory> inGrad_;
-  std::shared_ptr<mkldnn::memory> outVal_;
-  std::shared_ptr<mkldnn::memory> outGrad_;
-  std::shared_ptr<mkldnn::memory> wgtVal_;
-  std::shared_ptr<mkldnn::memory> wgtGrad_;
-  std::shared_ptr<mkldnn::memory> biasVal_;
-  std::shared_ptr<mkldnn::memory> biasGrad_;
+  // MKLDNNMatrixPtr
+  MKLDNNMatrixPtr inVal_;
+  MKLDNNMatrixPtr inGrad_;
+  MKLDNNMatrixPtr outVal_;
+  MKLDNNMatrixPtr outGrad_;
+  MKLDNNMatrixPtr wgtVal_;
+  MKLDNNMatrixPtr wgtGrad_;
+  MKLDNNMatrixPtr biasVal_;
+  MKLDNNMatrixPtr biasGrad_;

 public:
  explicit MKLDNNLayer(const LayerConfig& config)
@@ -83,17 +83,21 @@ public:

  virtual bool init(const LayerMap& layerMap,
                    const ParameterMap& parameterMap) {
+    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
+                            << "Please set WITH_MKLDNN=ON "
+                            << "and set use_mkldnn=True";
+    CHECK(!useGpu_) << "Do not support GPU yet";
+
+    // set device id before Layer::init
+    setDevice(MKLDNN_DEVICE);
+    // change param device to MKLDNN device
+    setParamsDevice(MKLDNN_DEVICE, parameterMap);
    if (!Layer::init(layerMap, parameterMap)) {
      return false;
    }

-    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                            << "Please set WITH_MKLDNN=ON "
-                            << "and set use_mkldnn=True";
    stream_.reset(new MKLDNNStream());
    engine_ = CPUEngine::Instance().getEngine();
-
-    // TODO(TJ): deivecId
    return true;
  }

@@ -109,6 +113,12 @@ public:
   */
  virtual void convertWeightsToPaddle() {}

+  /**
+   * convert MKLDNN output to other device.
+   * only support CPU device yet
+   */
+  virtual void convertOutputToOtherDevice() {}
+
  /**
   * print info about sizes
   */
@@ -118,14 +128,124 @@ public:
                       << ", oh: " << oh_ << ", ow: " << ow_;
  }

-  // TODO(TJ): move to MkldnnMatrix
-  // create memory desc
-  inline mkldnn::memory::desc createMD(
-      mkldnn::memory::dims dims,
-      mkldnn::memory::format fmt,
-      mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
-    // TODO(TJ): isFmtSuppoted(fmt)
-    return mkldnn::memory::desc(dims, type, fmt);
+  /**
+   * Print the mkldnn memory format flow of value
+   */
+  virtual void printValueFormatFlow() {
+    if (inVal_ && outVal_) {
+      VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
+                        << " >>> " << outVal_->getFormat();
+    }
+  }
+
+  /**
+   * Print the mkldnn memory format flow of grad
+   */
+  virtual void printGradFormatFlow() {
+    if (inGrad_ && outGrad_) {
+      VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
+                        << " <<< " << outGrad_->getFormat();
+    }
+  }
+
+protected:
+  /**
+   * copy image size and sequence info to other device
+   * @note: can not directly use Layer::copyOutputToOtherDevice since here only
+   *        copy base info and do not copy data value
+   */
+  void copyOutputInfoToOtherDevice() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
+      outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
+      outputOtherDevice_[i].sequenceStartPositions =
+          output_.sequenceStartPositions;
+      outputOtherDevice_[i].subSequenceStartPositions =
+          output_.subSequenceStartPositions;
+      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+    }
+  }
+
+  /**
+   * If input only has MKLDNN device.
+   * Otherwise, only support the previous layer using CPU device.
+   */
+  bool inputIsOnlyMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      // do not support GPU yet
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+      return false;
+    }
+  }
+
+  /**
+   * If output only has MKLDNN device.
+   * Otherwise, other devices should only using CPU device.
+   */
+  bool outputIsOnlyMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    return outputOtherDevice_.size() == 0;
+  }
+
+  /**
+   * Sync input value data
+   */
+  void syncInputValue() {
+    if (inputIsOnlyMKLDNN()) {
+      return;
+    }
+    real* iData = getInputValue(0, CPU_DEVICE)->getData();
+    // update input data
+    // since it might be changed if this is after data layer
+    inVal_->updateData(iData);
+  }
+
+  /**
+   * Sync output grad data
+   */
+  void syncOutputGrad() {
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+
+    // update diff
+    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
+    outGrad_->updateData(oDiff);
+  }
+
+  /**
+   * Set deviceId of this layer.
+   */
+  void setDevice(int id) { deviceId_ = id; }
+
+  /**
+   * Set deviceId of the params used in this layer.
+   */
+  void setParamsDevice(int id, const ParameterMap& parameterMap) {
+    for (auto& inputConfig : config_.inputs()) {
+      if (inputConfig.has_input_parameter_name()) {
+        ParameterPtr parameter;
+        std::string name = inputConfig.input_parameter_name();
+        CHECK(mapGet(name, parameterMap, &parameter))
+            << "Cannot find input parameter " << name << " for layer "
+            << getName();
+        parameter->setDevice(id);
+      }
+    }
+    if (config_.has_bias_parameter_name()) {
+      ParameterPtr parameter;
+      std::string name = config_.bias_parameter_name();
+      CHECK(mapGet(name, parameterMap, &parameter))
+          << "Cannot find bias parameter " << name << " for layer "
+          << getName();
+      parameter->setDevice(id);
+    }
  }
 };


--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,13 @@ public:
   */
  virtual void* alloc(size_t size) {
    void* ptr;
+#ifdef PADDLE_USE_MKLDNN
+    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+    // memory alignment
+    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
+#else
    CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+#endif
    CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
    return ptr;
  }

--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -14,6 +14,17 @@
 #
 file(GLOB MATH_HEADERS . *.h)
 file(GLOB MATH_SOURCES . *.cpp)
+
+if(NOT WITH_MKLDNN)
+    set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
+    set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
+    list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
+    list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
+    message(STATUS "Skip compiling with MKLDNNMatrix")
+else()
+    message(STATUS "Compile with MKLDNNMatrix")
+endif()
+
 set(MATH_SOURCES
    "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
    "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"

--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNMatrix.h"
+
+using namespace mkldnn;  // NOLINT
+
+namespace paddle {
+
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+  memory::desc md = pd.desc();
+  size_t ndims = md.data.ndims;
+  int* dims = md.data.dims;
+  CHECK(ndims > 0) << "Input dims should not be empty";
+  size_t cnts = 1;
+  for (size_t i = 0; i < ndims; ++i) {
+    cnts *= dims[i];
+  }
+
+  if (m == nullptr) {
+    size_t height = dims[0];
+    size_t width = cnts / dims[0];
+    m = Matrix::create(height, width, false, false);
+  }
+
+  CHECK(m) << " Matrix should not be empty";
+  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
+  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
+
+  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
+  return std::make_shared<MKLDNNMatrix>(
+      m->getData(), m->getHeight(), m->getWidth(), pd);
+}
+
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
+                                     memory::dims dims,
+                                     memory::format fmt,
+                                     engine& eg,
+                                     mkldnn::memory::data_type dtype) {
+  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
+}
+
+/**
+ * Reorder the data of m (laid out as srcFmt) into this matrix's buffer,
+ * laid out in this matrix's own format.
+ * Does nothing when srcFmt already equals this matrix's format.
+ * m may be this matrix itself: reorderOnce handles identical src/dst buffers.
+ */
+void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
+                                   memory::format srcFmt,
+                                   memory::dims targetDim) {
+  memory::format dstFmt = getFormat();
+  if (srcFmt == dstFmt) {
+    return;
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
+  // reorderOnce takes (srcData, dstData): m is the source and this matrix is
+  // the destination. The arguments were previously passed the other way round.
+  reorderOnce(m->getData(), getData(), srcFmt, dstFmt, targetDim);
+}
+
+void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
+                                 memory::format dstFmt,
+                                 memory::dims targetDim) {
+  memory::format srcFmt = getFormat();
+  if (srcFmt == dstFmt) {
+    return;
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "size should equal";
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
+}
+
+void MKLDNNMatrix::reorderOnce(void* srcData,
+                               void* dstData,
+                               memory::format srcFmt,
+                               memory::format dstFmt,
+                               memory::dims dm) {
+  CHECK(srcData);
+  CHECK(dstData);
+  MatrixPtr tmpSrc;
+  if (dstData == srcData) {
+    // inplace data
+    size_t sz = 1;
+    for (size_t i = 0; i < dm.size(); ++i) {
+      sz *= dm[i];
+    }
+    tmpSrc = Matrix::create(sz, 1, false, false);
+    tmpSrc->copyFrom((real*)srcData, sz);
+    srcData = tmpSrc->getData();
+  }
+
+  auto dtype = this->getDtype();
+  auto srcMD = memory::desc(dm, dtype, srcFmt);
+  auto dstMD = memory::desc(dm, dtype, dstFmt);
+
+  auto eg = this->getEngine();
+  auto src = memory(memory::primitive_desc(srcMD, eg), srcData);
+  auto dst = memory(memory::primitive_desc(dstMD, eg), dstData);
+
+  auto r = reorder(src, dst);
+  stream(stream::kind::eager).submit({r}).wait();
+}
+
+void MKLDNNMatrix::downSpatial() {
+  int fmt = getFormat();
+  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
+    // only support nchw and oihw yet, later can support more like nhwc, ihwo
+    return;
+  }
+
+  // TODO(TJ): change H(height) and W(width) if support nhwc or more
+  const int H = 2, W = 3;
+  memory::dims srcDims = getDims();
+  if (srcDims[H] != 1 || srcDims[W] != 1) {
+    // can not down spatial
+    return;
+  }
+
+  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
+  memory::format dstFmt;
+  switch (fmt) {
+    case memory::format::nchw:
+      dstFmt = memory::format::nc;
+      break;
+    case memory::format::oihw:
+      dstFmt = memory::format::oi;
+      break;
+    default:
+      LOG(FATAL) << "unsupported format";
+  }
+  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
+  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
+  mkldnn_primitive_t result;
+  mkldnn::error::wrap_c_api(
+      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+      "could not create a memory primitive");
+  reset(result);
+  set_data_handle(getData());
+}
+
+}  // namespace paddle
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Matrix.h"
+#include "mkldnn.hpp"
+#include "paddle/parameter/Parameter.h"
+
+namespace paddle {
+
+class MKLDNNMatrix;
+typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
+
+/**
+ * @brief MKLDNN Matrix.
+ *
+ */
+class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
+public:
+  MKLDNNMatrix(real* data,
+               size_t height,
+               size_t width,
+               mkldnn::memory::primitive_desc pd)
+      : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
+
+  ~MKLDNNMatrix() {}
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
+   */
+  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory details info
+   */
+  static MKLDNNMatrixPtr create(
+      MatrixPtr m,
+      mkldnn::memory::dims dims,
+      mkldnn::memory::format fmt,
+      mkldnn::engine& eg,
+      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
+
+public:
+  /**
+   * Reorder this MKLDNNMatrix from other format.
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change this original dim or format info
+   */
+  void reorderDataFrom(const MKLDNNMatrixPtr& m,
+                       memory::format srcFmt,
+                       memory::dims targetDim);
+
+  /**
+   * Reorder this MKLDNNMatrix to other format.
+   * Support inplace reorder.
+   * @note: this function would only reorder the data layout.
+   *        will NOT change the dst dim or format info
+   */
+  void reorderDataTo(const MKLDNNMatrixPtr& m,
+                     memory::format dstFmt,
+                     memory::dims targetDim);
+
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
+   */
+  void downSpatial();
+
+  /**
+   * Update the memory data handle.
+   * Caution: This will not check the buffer size of the data,
+   *          it should be covered by user.
+   */
+  void updateData(void* data) { set_data_handle(data); }
+
+  /**
+   * Get primitive descriptor.
+   */
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }
+
+  /**
+   * Get memory descriptor.
+   */
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
+
+  /**
+   * Get dimensions.
+   */
+  mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMemoryDesc();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
+    mkldnn::memory::dims dst;
+    dst.resize(ndims);
+    for (int i = 0; i < ndims; ++i) {
+      dst[i] = src[i];
+    }
+    return dst;
+  }
+
+  /**
+   * Get format.
+   */
+  mkldnn::memory::format getFormat() {
+    return (mkldnn::memory::format)(getMemoryDesc().data.format);
+  }
+
+  /**
+   * Get memory data type.
+   */
+  mkldnn::memory::data_type getDtype() {
+    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
+  }
+
+  /**
+   * Get engine.
+   */
+  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
+
+protected:
+  /**
+   * Do reorder once.
+   * Can support inplace.
+   */
+  void reorderOnce(void* srcData,
+                   void* dstData,
+                   memory::format srcFmt,
+                   memory::format dstFmt,
+                   memory::dims dm);
+};
+
+}  // namespace paddle
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -281,7 +281,11 @@ public:
  /**
   * @brief Set the format in header.
   */
-  void setHeaderFormat(int32_t fmt) { headerFormat_ = fmt; }
+  void setHeaderFormat(int32_t fmt) {
+    CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
+                                        << fmt;
+    headerFormat_ = fmt;
+  }

  /**
   * @brief  Parameter Update Hook.