diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index ec4d093e0cac9a766d17a36827affd2c08e1c618..edef36194aabdb9c122ec3423deb036169a34d7c 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -82,6 +82,7 @@ protected:
   Argument output_;
   /// Several outputs stored on different devices, used in 'parallel_nn' case,
   /// and record them by deviceId_.
+  /// Also used in 'use_mkldnn' case.
   std::vector<Argument> outputOtherDevice_;
   /// If there are several outputs, map them by each name.
   std::map<std::string, ArgumentPtr> outputMap_;
@@ -177,6 +178,13 @@ protected:
     return inputLayer.getOutput(deviceId_);
   }
 
+  /**
+   * Get the argument of input layer with deviceId.
+   */
+  const Argument& getInput(size_t inputIndex, int deviceId) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId);
+  }
+
   /**
    * Get the forward-input value.
    */
@@ -191,6 +199,13 @@ protected:
     return inputLayer.getOutput(deviceId_).value;
   }
 
+  /**
+   * Get the forward-input value with deviceId.
+   */
+  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).value;
+  }
+
   /**
    * Get the forward-input grad.
    */
@@ -205,6 +220,13 @@ protected:
     return inputLayer.getOutput(deviceId_).grad;
   }
 
+  /**
+   * Get the forward-input grad with deviceId.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+  }
+
   /**
    * Get the forward-input label.
    */
@@ -326,19 +348,6 @@ public:
     if (deviceId == getDeviceId()) {
       return output_;
     } else {
-      bool CPU2MKLDNN =
-          getDeviceId() == CPU_DEVICE && deviceId == MKLDNN_DEVICE;
-      bool MKLDNN2CPU =
-          getDeviceId() == MKLDNN_DEVICE && deviceId == CPU_DEVICE;
-      if (CPU2MKLDNN) {
-        // TODO: do something
-        return output_;
-      } else if (MKLDNN2CPU) {
-        // TODO: do something
-        return output_;
-      }
-
-      // TODO: handle mkldnn device or add mkldnn device to other
       for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
         if (outputOtherDevice_[i].deviceId == deviceId) {
           return outputOtherDevice_[i];
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 54631044696329517acae66484285af81a87c708..a3291e6a8fb755286427942e340fbd40c73350ad 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -97,7 +97,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
 }
 
 void MKLDNNFcLayer::reshape() {
-  const Argument& input = getInput(0);
+  const Argument& input = getInput(0, getPrev(0)->getDeviceId());
   int batchSize = input.getBatchSize();
   if (bs_ == batchSize) {
     return;
@@ -135,35 +135,43 @@ void MKLDNNFcLayer::reshape() {
 
 void MKLDNNFcLayer::resetFwd() {
   bool hasBias = biases_ && biases_->getW();
-  const MatrixPtr& in = getInputValue(0);
   const MatrixPtr& wgt = weight_->getW();
   const MatrixPtr& bias = hasBias ? biases_->getW() : nullptr;
   const MatrixPtr& out = output_.value;
 
-  if (getPrev(0)->getDeviceId() == MKLDNN_DEVICE) {
+  if (prevIsMKLDNN()) {
+    const MatrixPtr& in = getInputValue(0);
     inVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(in);
     CHECK(inVal_) << "Input should be MKLDNNMatrix";
-    // TODO: change input nchw to nc if available
-    // inVal_->downSpatial()
   } else {
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& in = getInputValue(0, CPU_DEVICE);
     inVal_ = MKLDNNMatrix::create(
-        in,
-        hasSpatial_ ? memory::dims{bs_, ic_, ih_, iw_} : memory::dims{bs_, ic_},
-        hasSpatial_ ? format::nchw : format::nc,
-        engine_);
+        in, memory::dims{bs_, ic_, ih_, iw_}, format::nchw, engine_);
   }
-
+  inVal_->downSpatial();
   wgtVal_ = MKLDNNMatrix::create(
-      wgt,
-      hasSpatial_ ? memory::dims{oc_, ic_, ih_, iw_} : memory::dims{oc_, ic_},
-      hasSpatial_ ? format::oihw : format::oi,
-      engine_);
+      wgt, memory::dims{oc_, ic_, ih_, iw_}, format::oihw, engine_);
+  wgtVal_->downSpatial();
   biasVal_ =
       hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
   outVal_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
 
-  // change original output to mkldnn output
+  // change original output value to mkldnn output value
   output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
+  if (!nextIsMKLDNN()) {
+    // update the stored CPU device output in place (pointer, not a copy)
+    Argument* cpuOutput = nullptr;
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+        cpuOutput = &outputOtherDevice_[i];
+      }
+    }
+    CHECK(cpuOutput) << "Should have a CPU device output";
+    cpuOutput->setFrameHeight(output_.getFrameHeight());
+    cpuOutput->setFrameWidth(output_.getFrameWidth());
+
+    // fc cpu output value does not need conversion
+    cpuOutput->value = output_.value;
+  }
 
   // create forward handle
   prop_kind pk = prop_kind::forward;
@@ -176,12 +184,13 @@ void MKLDNNFcLayer::resetFwd() {
                     : fc_fwd::desc(
                           pk, inVal_->getMD(), wgtVal_->getMD(), outVal_->getMD());
   fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
-
   if (hasBias) {
     fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
   } else {
     fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
   }
+  printValueFormatFlow();
+
   pipelineFwd_.clear();
   pipelineFwd_.push_back(*fwd_);
 }
@@ -197,17 +206,24 @@ void MKLDNNFcLayer::resetBwd() {
   CHECK(inVal_) << "Should have input value";
   const MatrixPtr& wgt = weight_->getWGrad();
   const MatrixPtr& bias = hasBias ? biases_->getWGrad() : nullptr;
-  const MatrixPtr& out = output_.grad;
 
-  wgtGrad_ = MKLDNNMatrix::create(
-      wgt, wgtVal_->getDims(), wgtVal_->getFormat(), engine_);
-  biasGrad_ =
-      hasBias ? MKLDNNMatrix::create(bias, {oc_}, format::x, engine_) : nullptr;
+  if (nextIsMKLDNN()) {
+    // can not directly cast the output grad to MKLDNNMatrix, since the
+    // next layers can not write their input grad into an mkldnn input
+    // grad; so just create one from the matrix with the output value format
+    const MatrixPtr& out = getOutput(MKLDNN_DEVICE).grad;
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+    // TODO: maybe need to merge top diffs
+  } else {
+    // TODO: merge top diffs
+    const MatrixPtr& out = getOutput(CPU_DEVICE).grad;
+    // fc does not need to convert from the cpu device since the output
+    // is always nc; only need to create from the cpu device
+    outGrad_ = MKLDNNMatrix::create(out, outVal_->getPD());
+  }
 
-  outGrad_ = MKLDNNMatrix::create(out, {bs_, oc_}, format::nc, engine_);
-  // change original output to mkldnn output
-  // TODO: right?
-  output_.grad = std::dynamic_pointer_cast<Matrix>(outGrad_);
+  wgtGrad_ = MKLDNNMatrix::create(wgt, wgtVal_->getPD());
+  biasGrad_ = hasBias ?
+      MKLDNNMatrix::create(bias, biasVal_->getPD()) : nullptr;
 
   // create memory primitive desc
   fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward,
@@ -235,21 +251,38 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdWgt_);
 
   /// backward data
-  const MatrixPtr& in = getInputGrad(0);
-  if (in == nullptr) {
-    return;
+  if (prevIsMKLDNN()) {
+    const MatrixPtr& in = getInputGrad(0, MKLDNN_DEVICE);
+    if (in == nullptr) {
+      return;
+    }
+    if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
+      // TODO: many mkldnn bottoms
+      // add sum handle
+    } else {
+      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
+    }
+  } else {
+    const MatrixPtr& in = getInputGrad(0, CPU_DEVICE);
+    if (in == nullptr) {
+      return;
+    }
+    if (getInput(0, CPU_DEVICE).getAllCount() > 1) {
+      // TODO: many bottoms
+      // add sum handle
+    } else {
+      inGrad_ = MKLDNNMatrix::create(in, inVal_->getPD());
+    }
   }
+
   fc_bwdData::desc bwdDataDesc =
       fc_bwdData::desc(inVal_->getMD(), wgtGrad_->getMD(), outGrad_->getMD());
   fc_bwdData::primitive_desc bwdDataPD =
       fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
-  // TODO: check right, just from ingrad?
-  inGrad_ =
-      MKLDNNMatrix::create(in, inVal_->getDims(), inVal_->getFormat(), engine_);
-
   CHECK(wgtVal_) << "Should have weight memory";
   bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  printGradFormatFlow();
   pipelineBwd_.push_back(*bwdData_);
 }
@@ -259,11 +292,7 @@ void MKLDNNFcLayer::forward(PassType passType) {
 
   {
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-
-    // update input data
-    // since it might be changed if this is after data layer
-    real* iData = getInputValue(0)->getData();
-    inVal_->updateData(iData);
+    syncInputValue();
 
     // just submit forward pipeline
     stream_->submit(pipelineFwd_);
@@ -285,10 +314,7 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
     REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
     resetBwd();
 
-    // update diff
-    real* oDiff = getOutputGrad()->getData();
-    outGrad_->updateData(oDiff);
-
+    syncOutputGrad();
     // just submit backward pipeline
     stream_->submit(pipelineBwd_);
   }
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index fbd62d9aaa306e60265caeece9ace19aa5694256..3dd17a36ff7eef18b0a451c73c2a7eb38cde519f 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -125,23 +125,80 @@ public:
                << ", oh: " << oh_ << ", ow: " << ow_;
   }
 
-  // TODO(TJ): move to MkldnnMatrix
-  // create memory desc
-  inline mkldnn::memory::desc createMD(
-      mkldnn::memory::dims dims,
-      mkldnn::memory::format fmt,
-      mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
-    // TODO(TJ): isFmtSuppoted(fmt)
-    return mkldnn::memory::desc(dims, type, fmt);
+  /**
+   * Print the mkldnn memory format flow of value.
+   */
+  virtual void printValueFormatFlow() {
+    if (inVal_ && outVal_) {
+      VLOG(MKLDNN_FMTS) << "value format flow --- " << inVal_->getFormat()
+                        << " >>> " << outVal_->getFormat();
+    }
   }
 
-  void resetMKLDNNOutput(size_t height, size_t width) {
-    Layer::resetOutput(height, width);
-    // get valu and grad, use mkldnn matrix instaed
-    // output_.value;
+  /**
+   * Print the mkldnn memory format flow of grad.
+   */
+  virtual void printGradFormatFlow() {
+    if (inGrad_ && outGrad_) {
+      VLOG(MKLDNN_FMTS) << "grad format flow --- " << inGrad_->getFormat()
+                        << " <<< " << outGrad_->getFormat();
+    }
   }
 
 protected:
+  /**
+   * Whether the next layers are only of MKLDNN type.
+   * Otherwise, the only supported other device is CPU.
+   */
+  bool nextIsMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support CPU as the other device yet";
+    }
+    return outputOtherDevice_.size() == 0;
+  }
+
+  /**
+   * Whether the previous layer is of MKLDNN type.
+   * Otherwise, the only supported device is CPU.
+   */
+  bool prevIsMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      // do not support GPU yet
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
+      return false;
+    }
+  }
+
+  /**
+   * Sync the input value data from the CPU device.
+   */
+  void syncInputValue() {
+    if (prevIsMKLDNN()) {
+      return;
+    }
+    real* iData = getInputValue(0, CPU_DEVICE)->getData();
+    // update the input data, since it might have been changed
+    // if this layer follows a data layer
+    inVal_->updateData(iData);
+  }
+
+  /**
+   * Sync the output grad data from the CPU device.
+   */
+  void syncOutputGrad() {
+    if (nextIsMKLDNN()) {
+      return;
+    }
+
+    // update diff
+    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
+    outGrad_->updateData(oDiff);
+  }
+
   /**
    * Set deviceId of this layer.
    */
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index 666a8b8368e3e2ebc522902c176d7491d2920d2a..94ef561f066a127496e2849a419835e175c526d7 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,13 @@ public:
    */
   virtual void* alloc(size_t size) {
     void* ptr;
+#ifdef PADDLE_USE_MKLDNN
+    // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+    // for the required memory alignment
+    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
+#else
     CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+#endif
     CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
     return ptr;
   }
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 44fc54278c993aa8b35d34d801f5684ddf397dc4..24d54ec0f73136b2559705feb52262a7f90b8fc7 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -18,29 +18,74 @@ using namespace mkldnn;  // NOLINT
 
 namespace paddle {
 
-MKLDNNMatrixPtr MKLDNNMatrix::create(const MatrixPtr& m,
-                                     memory::dims dims,
-                                     memory::format fmt,
-                                     engine& eg,
-                                     mkldnn::memory::data_type dtype) {
-  CpuMatrixPtr cpuM = std::dynamic_pointer_cast<CpuMatrix>(m);
-  CHECK(cpuM) << "Only support create from CPU matrix yet";
-
-  size_t ndims = dims.size();
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+  memory::desc md = pd.desc();
+  size_t ndims = md.data.ndims;
+  int* dims = md.data.dims;
   CHECK(ndims > 0) << "Input dims should not be empty";
-  size_t cnt = 1;
+  size_t cnts = 1;
   for (size_t i = 0; i < ndims; ++i) {
-    cnt *= dims[i];
+    cnts *= dims[i];
   }
-  CHECK_EQ(cnt, m->getElementCnt()) << "Count size does not match";
 
+  if (m == nullptr) {
+    size_t height = dims[0];
+    size_t width = cnts / dims[0];
+    m = Matrix::create(height, width, false, false);
+  }
+
+  CHECK(m) << "Matrix should not be empty";
+  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
+  CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
+
+  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
   size_t width = m->getWidth();
   size_t height = m->getHeight();
   real* data = m->getData();
+  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
+}
 
+MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
+                                     memory::dims dims,
+                                     memory::format fmt,
+                                     engine& eg,
+                                     mkldnn::memory::data_type dtype) {
   memory::desc md = memory::desc(dims, dtype, fmt);
   memory::primitive_desc pd = memory::primitive_desc(md, eg);
-  return std::make_shared<MKLDNNMatrix>(data, height, width, pd);
+  return create(m, pd);
+}
+
+void MKLDNNMatrix::downSpatial() {
+  int fmt = getFormat();
+  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
+    // only support nchw and oihw yet, later can support more like nhwc, ihwo
+    return;
+  }
+
+  memory::dims srcDims = getDims();
+  const int H = 2, W = 3;
+  if (srcDims[H] != 1 || srcDims[W] != 1) {
+    // can not reduce the spatial dims
+    return;
+  }
+
+  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
+  memory::format dstFmt;
+  switch (fmt) {
+    case memory::format::nchw:
+      dstFmt = memory::format::nc;
+      break;
+    case memory::format::oihw:
+      dstFmt = memory::format::oi;
+      break;
+    default:
+      LOG(FATAL) << "unsupported format";
+  }
+  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
+  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
+  void* data = getData();
+  // rebind this memory to the reduced primitive desc while keeping the
+  // original data handle; a plain temporary `memory(pd, data)` would be
+  // discarded immediately and leave *this unchanged
+  mkldnn_primitive_t result;
+  mkldnn::error::wrap_c_api(
+      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+      "could not create a memory primitive");
+  reset(result);
+  set_data_handle(data);
 }
 
 }  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 54c0a1fdcbc477643848b6374653de15f252bf50..05adc867c2076511da22834a73e0665c4e5cfc68 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -39,20 +39,37 @@ public:
               mkldnn::memory::primitive_desc pd)
       : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
 
-  MKLDNNMatrix(size_t height, size_t width, mkldnn::memory::primitive_desc pd)
-      : CpuMatrix(height, width, false), mkldnn::memory(pd) {
-    set_data_handle(CpuMatrix::getData());
-  }
-
   ~MKLDNNMatrix() {}
 
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and a memory primitive_desc.
+   */
+  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and the memory details
+   * (dims, format, engine and data type).
+   */
   static MKLDNNMatrixPtr create(
-      const MatrixPtr& m,
+      MatrixPtr m,
       mkldnn::memory::dims dims,
       mkldnn::memory::format fmt,
       mkldnn::engine& eg,
       mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
 
+public:
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1.
+   */
+  void downSpatial();
+
+  /**
+   * Update the memory data handle.
+   * Caution: this does not check the buffer size of the data;
+   * that should be covered by the user.
+   */
+  void updateData(void* data) { set_data_handle(data); }
+
   /**
    * Get primitive descriptor.
    */
@@ -64,12 +81,13 @@ public:
   mkldnn::memory::desc getMD() { return getPD().desc(); }
 
   /**
-   * Get dims.
+   * Get dimensions.
   */
   mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMD();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
     mkldnn::memory::dims dst;
-    int* src = getMD().data.dims;
-    int ndims = getMD().data.ndims;
     dst.resize(ndims);
     for (int i = 0; i < ndims; ++i) {
       dst[i] = src[i];
@@ -85,11 +103,16 @@ public:
   }
 
   /**
-   * Update the memory data handle.
-   * Caution: This will not check the buffer size of the data,
-   * it should be coverd by user.
+   * Get memory data type.
    */
-  void updateData(void* data) { set_data_handle(data); }
+  mkldnn::memory::data_type getDtype() {
+    return (mkldnn::memory::data_type)(getMD().data.data_type);
+  }
+
+  /**
+   * Get engine.
+   */
+  mkldnn::engine getEngine() { return getPD().get_engine(); }
 };
 
 }  // namespace paddle
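
For reference, a minimal sketch of how the two create() overloads and downSpatial() compose (not part of the patch; the function name and the {16, 8, 1, 1} weight dims are purely illustrative, and MKL-DNN v0.x is assumed):

// sketch, not part of the patch: exercises the new MKLDNNMatrix API
#include "paddle/math/MKLDNNMatrix.h"

using namespace mkldnn;  // NOLINT

void createAndDownSpatialSketch() {
  engine eg(engine::cpu, 0);

  // a 16x8 CPU weight; the element count must equal oc*ic*ih*iw = 16*8*1*1
  paddle::MatrixPtr wgt = paddle::Matrix::create(16, 8, false, false);
  paddle::MKLDNNMatrixPtr wgtVal = paddle::MKLDNNMatrix::create(
      wgt, memory::dims{16, 8, 1, 1}, memory::format::oihw, eg);

  // ih == iw == 1, so the oihw format collapses to oi
  wgtVal->downSpatial();

  // passing a null MatrixPtr lets the primitive_desc overload allocate the
  // CPU matrix itself (height = dims[0], width = the remaining elements)
  memory::desc md(memory::dims{16, 8}, memory::data_type::f32,
                  memory::format::oi);
  paddle::MKLDNNMatrixPtr allocated =
      paddle::MKLDNNMatrix::create(nullptr, memory::primitive_desc(md, eg));
}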
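
syncInputValue() and syncOutputGrad() lean on updateData() being a zero-copy handle swap: the mkldnn memory is re-pointed at the CPU buffer each iteration, and matching buffer sizes stay the caller's responsibility, as the header warns. A sketch with that check made explicit (the function name and the CHECK_EQ are illustrative, not from the patch):

#include "paddle/math/MKLDNNMatrix.h"

void syncSketch(paddle::MKLDNNMatrixPtr inVal, paddle::MatrixPtr cpuIn) {
  // the caller must guarantee matching sizes; updateData() will not check
  CHECK_EQ(cpuIn->getElementCnt(), inVal->getElementCnt());
  // re-point the mkldnn memory at the CPU buffer; no data is copied
  inVal->updateData(cpuIn->getData());
}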
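
The Allocator change only raises the posix_memalign alignment from 32 to 4096 bytes under PADDLE_USE_MKLDNN, the page-size alignment suggested in mkldnn.hpp. A standalone sketch of that contract (the helper name is illustrative):

#include <cassert>
#include <cstdint>
#include <cstdlib>

void* mkldnnAlignedAllocSketch(size_t size) {
  void* ptr = nullptr;
  // posix_memalign returns 0 on success and fills ptr with an address
  // that is a multiple of the requested 4096-byte alignment
  int ret = posix_memalign(&ptr, 4096ul, size);
  assert(ret == 0 && ptr != nullptr);
  assert(reinterpret_cast<uintptr_t>(ptr) % 4096ul == 0);
  return ptr;
}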