diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 8318c8c519a4cec1610eadd28320ee5ce0b4147d..53433cef35a377a73f87b041fdcfadd848dd2ec9 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -77,24 +77,6 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
   wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
 
-void MKLDNNFcLayer::convertOutputToOtherDevice() {
-  copyOutputInfoToOtherDevice();
-  // find other cpu device and reorder output to cpu device
-  int cnt = 0;
-  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-    if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
-      // fc cpu output value do not need convert
-      // just share point
-      outputOtherDevice_[i].value = output_.value;
-      ++cnt;
-    }
-  }
-
-  if (cnt > 1) {
-    LOG(WARNING) << "should not have more than one CPU devie";
-  }
-}
-
 void MKLDNNFcLayer::reshape() {
   const Argument& input = getInput(0, getPrev(0)->getDeviceId());
   int batchSize = input.getBatchSize();
@@ -155,7 +137,10 @@ void MKLDNNFcLayer::resetFwd() {
   // change original output value to mkldnn output value
   output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
   if (!outputIsOnlyMKLDNN()) {
-    convertOutputToOtherDevice();
+    copyOutputInfoToOtherDevice();
+    // the fc cpu output value needs no conversion,
+    // it just shares the data pointer
+    getOutput(CPU_DEVICE).value->setData(output_.value->getData());
   }
 
   // create forward handle
@@ -235,13 +220,12 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdWgt_);
 
   /// backward data
-  device = inputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  const MatrixPtr& in = getInputGrad(0, device);
+  const MatrixPtr& in = inputLayers_[0]->getOutput().grad;
   if (in == nullptr) {
     return;
   }
-  if (getInput(0, device).getAllCount() > 1) {
-    // TODO(TJ): use outputMaps_ ways when merge outgrad done
+  if (getInput(0, MKLDNN_DEVICE).getAllCount() > 1) {
+    // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
   } else {
     inGrad_ = MKLDNNMatrix::create(in, inVal_->getPrimitiveDesc());
   }
@@ -258,13 +242,21 @@ void MKLDNNFcLayer::resetBwd() {
   pipelineBwd_.push_back(*bwdData_);
 }
 
+void MKLDNNFcLayer::updateInputData() {
+  // only a "data" input layer may change the address of its value data
+  const bool fedByDataLayer = inputLayers_[0]->getType() == "data";
+  if (fedByDataLayer) {
+    inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+  }
+}
+
 void MKLDNNFcLayer::forward(PassType passType) {
   Layer::forward(passType);
   reshape();
 
   {
     REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-    syncInputValue();
+    updateInputData();
 
     // just submit forward pipeline
     stream_->submit(pipelineFwd_);
@@ -286,7 +278,6 @@ void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
     REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
     resetBwd();
 
-    syncOutputGrad();
     // just sumbmit backward pipeline
     stream_->submit(pipelineBwd_);
   }
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index e138a6faf181c412949218458e7ecf800a0d6a07..4ad67a16e056a718c45a28babcf22a7cd571b15c 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -53,6 +53,8 @@ public:
 
   void backward(const UpdateCallback& callback) override;
 
+  void updateInputData() override;
+
 protected:
   /**
    * reshape the input image sizes
@@ -72,8 +74,6 @@ protected:
    * only would be called when needed
    */
   void resetBwd();
-
-  void convertOutputToOtherDevice() override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index b983b833d510b823c5d4cff0b9390173e4cefc89..543364edceff684bdcd002a8f4f10e7ce5e6953b 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -114,10 +114,10 @@ public:
   virtual void convertWeightsToPaddle() {}
 
   /**
-   * convert MKLDNN output to other device.
-   * only support CPU device yet
+   * Update the input value data when the input layer is of "data" type,
+   * since the address of its value data might have changed.
    */
-  virtual void convertOutputToOtherDevice() {}
+  virtual void updateInputData() {}
 
   /**
    * print info about sizes
@@ -155,6 +155,7 @@ protected:
    *        copy base info and do not copy data value
    */
   void copyOutputInfoToOtherDevice() {
+    int cnt = 0;
     for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
       outputOtherDevice_[i].setFrameHeight(output_.getFrameHeight());
       outputOtherDevice_[i].setFrameWidth(output_.getFrameWidth());
@@ -163,6 +164,12 @@ protected:
       outputOtherDevice_[i].subSequenceStartPositions =
           output_.subSequenceStartPositions;
       outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+        ++cnt;
+      }
+    }
+    if (cnt > 1) {
+      LOG(WARNING) << "should not have more than one CPU devie";
     }
   }
 
@@ -193,32 +200,6 @@ protected:
     return outputOtherDevice_.size() == 0;
   }
 
-  /**
-   * Sync input value data
-   */
-  void syncInputValue() {
-    if (inputIsOnlyMKLDNN()) {
-      return;
-    }
-    real* iData = getInputValue(0, CPU_DEVICE)->getData();
-    // update input data
-    // since it might be changed if this is after data layer
-    inVal_->updateData(iData);
-  }
-
-  /**
-   * Sync output grad data
-   */
-  void syncOutputGrad() {
-    if (outputIsOnlyMKLDNN()) {
-      return;
-    }
-
-    // update diff
-    real* oDiff = getOutput(CPU_DEVICE).grad->getData();
-    outGrad_->updateData(oDiff);
-  }
-
   /**
    * Set deviceId of this layer.
    */
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 0a355e2644cce572ce90ecf5c9d2a5b7b395bc61..c4063e5069854242d9f93886b66580385557ca73 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -33,14 +33,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
     size_t width = cnts / dims[0];
     m = Matrix::create(height, width, false, false);
   }
-
   CHECK(m) << " Matrix should not be empty";
+
   CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
   CHECK(cpuMatrix) << "Only support create from CPU matrix yet";
-
-  CHECK_EQ(cnts, m->getElementCnt()) << "Count size does not match";
-  return std::make_shared<MKLDNNMatrix>(
-      m->getData(), m->getHeight(), m->getWidth(), pd);
+  CHECK_EQ(cpuMatrix->getElementCnt(), cnts) << "Count size does not match";
+  return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
 }
 
 MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
@@ -138,7 +136,7 @@ void MKLDNNMatrix::downSpatial() {
       mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
       "could not create a memory primitive");
   reset(result);
-  set_data_handle(getData());
+  set_data_handle(data_);
 }
 
 }  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index e50f698b495713e6f15ab7a12a7ee7487662040f..eef3b429e6fa0087aeac3f5aed9dff983b06e826 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -30,11 +30,10 @@ typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
  */
 class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
 public:
-  MKLDNNMatrix(real* data,
-               size_t height,
-               size_t width,
-               mkldnn::memory::primitive_desc pd)
-      : CpuMatrix(data, height, width, false), mkldnn::memory(pd, data) {}
+  MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
+      : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
+        mkldnn::memory(pd, m->getData()),  // both bases view m's buffer
+        m_(m) {}  // keep a reference to m in case it is released outside
 
   ~MKLDNNMatrix() {}
 
@@ -81,11 +80,29 @@ public:
   void downSpatial();
 
   /**
-   * Update the memory data handle.
+   * set the memory data handle.
    * Caution: This will not check the buffer size of the data,
    *          it should be coverd by user.
    */
-  void updateData(void* data) { set_data_handle(data); }
+  void setData(real* data) {
+    set_data_handle(data);     // re-point the mkldnn::memory handle
+    CpuMatrix::setData(data);  // keep CpuMatrix's data_ equal to the handle
+    if (m_ && m_->getData() != data) m_.reset();  // keep holder of same buffer
+  }
+
+  /**
+   * override Matrix::getData
+   * check data before return
+   */
+  real* getData() override {
+    CHECK_EQ(static_cast<void*>(data_), get_data_handle());  // stay in sync
+    return data_;
+  }
+
+  const real* getData() const override {
+    CHECK_EQ(static_cast<void*>(data_), get_data_handle());  // stay in sync
+    return data_;
+  }
 
   /**
    * Get primitive descriptor.
@@ -143,6 +160,10 @@ protected:
                    memory::format srcFmt,
                    memory::format dstFmt,
                    memory::dims dm);
+
+private:
+  // hold the source CpuMatrixPtr in case its buffer is released outside
+  CpuMatrixPtr m_;
 };
 
 }  // namespace paddle