diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index ed7684f2b4ee8817d505c8e45a3ee33fd047be1a..7ef7ee494d7280f7bcdafee6f6509012b305b539 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "MKLDNNPoolLayer.h"
+#include "paddle/math/MathUtils.h"
 #include "paddle/utils/Logging.h"
 
 using namespace mkldnn;  // NOLINT
@@ -28,17 +29,49 @@ bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
     return false;
   }
 
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  ic_ = conf.channels();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  oc_ = ic_;
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  fh_ = conf.size_y();
+  fw_ = conf.size_x();
+  ph_ = conf.padding_y();
+  pw_ = conf.padding();
+  sh_ = conf.stride_y();
+  sw_ = conf.stride();
+
+  const std::string& type = conf.pool_type();
+  if (type == "max-projection") {
+    poolAlgo_ = algorithm::pooling_max;
+  } else if (type == "avg-projection") {
+    // TODO(TJ): support choosing exclude or include when paddle supports it;
+    // paddle only supports pooling_avg_exclude_padding yet
+    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
+  } else {
+    LOG(FATAL) << "unknown pooling type!";
+  }
+
   return true;
 }
 
 void MKLDNNPoolLayer::reshape(
     int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
+  // ic_ and oc can not be changed
+  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+      << "Input channel can not be changed";
 
   // cal output sizes
-  // oc can not be changed
-
+  // paddle uses caffeMode = false for pooling
+  oh = outputSize(ih, fh_, ph_, sh_, false);
+  ow = outputSize(iw, fw_, pw_, sw_, false);
   reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
 
   printSizeInfo();
@@ -81,40 +114,166 @@ void MKLDNNPoolLayer::updateInputData() {
 
 void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
   resetInValue(in);
+
   resetOutValue(out);
 }
 
-void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {}
+void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
+  if (inputIsOnlyMKLDNN()) {
+    const MatrixPtr& dnnIn = getInputValue(0);
+    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
+    CHECK(in) << "Input should be MKLDNNMatrix";
+  } else {
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
+    in = MKLDNNMatrix::create(
+        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
+  }
+}
 
-void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {}
+void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
+  CHECK(inVal_) << "Should reset input value first";
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  out = MKLDNNMatrix::create(
+      output_.value, outDims, inVal_->getFormat(), engine_);
+  output_.value = std::dynamic_pointer_cast<Matrix>(out);
+
+  // create a reorder if the output value is on CPU device and the pds do not match
+  cpuOutVal_ = nullptr;
+  cvtOutVal_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
+    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
+    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
+      CHECK(cvtOutVal_) << "should not be empty";
+    } else {
+      // CPU output shares the same data as the MKLDNN output
+      cpuOut->setData(out->getData());
+      cpuOutVal_ = out;
+    }
+  }
+}
 
 void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr in,
-                                 MKLDNNMatrixPtr out) {}
+                                 MKLDNNMatrixPtr out) {
+  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  padding_kind padKind = padding_kind::zero;
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = pool_fwd::desc(pk,
+                                poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padKind);
+  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
+
+  // prepare workspace if necessary
+  workspace_ =
+      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
 
 void MKLDNNPoolLayer::resetFwdPipeline(
-    std::vector<mkldnn::primitive>& pipeline,
+    std::vector<primitive>& pipeline,
     std::shared_ptr<pool_fwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {}
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  fwd_ = workspace_
+             ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
+             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+
+  if (cvtOutVal_) {
+    pipeline.push_back(*cvtOutVal_);
+  }
+}
 
 void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
   resetOutGrad(out);
+
   resetInGrad(in);
 }
 
-void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {}
+void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
+  CHECK(outVal_) << "Should have output value";
+  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+
+  // create a reorder if the output gradient is on CPU device and the pds do not match
+  cpuOutGrad_ = nullptr;
+  cvtOutGrad_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    cpuOutGrad_ = MKLDNNMatrix::create(
+        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
+    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
+      CHECK(cvtOutGrad_) << "should not be empty";
+    } else {
+      // share the same data as the CPU output
+      output_.grad->setData(cpuOut->getData());
+      out = cpuOutGrad_;
+    }
+  }
+}
 
-void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {}
+void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
+  in = nullptr;
+  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
+  if (inGrad == nullptr) {
+    return;
+  }
+  CHECK(inVal_);
+  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+}
 
 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr& in,
-                                 MKLDNNMatrixPtr& out) {}
+                                 MKLDNNMatrixPtr& out) {
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  CHECK(in);
+  CHECK(out);
+  auto bwdDesc = pool_bwd::desc(poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padding_kind::zero);
+  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
 
 void MKLDNNPoolLayer::resetBwdPipeline(
-    std::vector<mkldnn::primitive>& pipeline,
+    std::vector<primitive>& pipeline,
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
-    MKLDNNMatrixPtr& out) {}
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  if (cvtOutGrad_) {
+    pipeline.push_back(*cvtOutGrad_);
+  }
+
+  bwdData_ =
+      workspace_
+          ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
+          : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
+  pipeline.push_back(*bwdData_);
+}
 
 }  // namespace paddle
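Note on the reshape() change above: outputSize() comes from the newly included paddle/math/MathUtils.h. With caffeMode == false it rounds the output size up, while MKL-DNN always rounds down; that mismatch is what getPaddingR() in the header below compensates for. A minimal sketch of the helper, assuming the MathUtils.cpp definition (not part of this patch):

    // Sketch of paddle::outputSize (see paddle/math/MathUtils.cpp).
    // caffeMode == false: ceil division; caffeMode == true: floor division.
    int outputSize(int imageSize, int filterSize, int padding, int stride,
                   bool caffeMode) {
      return caffeMode
                 ? (imageSize - filterSize + 2 * padding) / stride + 1
                 : (imageSize - filterSize + 2 * padding + stride - 1) / stride + 1;
    }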
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
index d797ee955cdba3c9cf04e869371558670c39de30..891e15a7efcdd2e54f61352efc1ba7345b91c76b 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -28,8 +28,28 @@ typedef mkldnn::pooling_backward pool_bwd;
  */
 class MKLDNNPoolLayer : public MKLDNNLayer {
 protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // filter (kernel) height and width
+  int fh_, fw_;
+
+  // pooling_avg or pooling_max
+  mkldnn::algorithm poolAlgo_;
+
+  // MKLDNNMatrixPtr which should be created from CPU device
+  MKLDNNMatrixPtr cpuOutVal_;
+  MKLDNNMatrixPtr cpuOutGrad_;
+  // convert handle between CPU device and MKLDNN device
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+  // save forward primitive_desc, which can be reused in backward
   std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_pooling_forward.cpp, pooling needs a workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
 
 public:
   explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
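Note on the workspace_ member above: for max pooling in training mode, MKL-DNN records in the workspace which input position produced each maximum, and pooling_backward reads it back to route gradients. A minimal standalone sketch against the raw MKL-DNN 0.x API (the engine, sizes, and variable names are illustrative, not from this patch):

    #include "mkldnn.hpp"
    using namespace mkldnn;

    int main() {
      auto eng = engine(engine::cpu, 0);
      // hypothetical sizes: 1x8x4x4 input, 2x2 kernel, stride 2, no padding
      memory::desc srcMd({1, 8, 4, 4}, memory::data_type::f32, memory::format::nchw);
      memory::desc dstMd({1, 8, 2, 2}, memory::data_type::f32, memory::format::nchw);
      auto fwdDesc = pooling_forward::desc(prop_kind::forward_training,
                                           algorithm::pooling_max, srcMd, dstMd,
                                           {2, 2}, {2, 2}, {0, 0}, {0, 0},
                                           padding_kind::zero);
      auto fwdPD = pooling_forward::primitive_desc(fwdDesc, eng);
      auto src = memory({srcMd, eng});
      auto dst = memory(fwdPD.dst_primitive_desc());
      // the workspace keeps the argmax positions for the backward pass
      auto ws = memory(fwdPD.workspace_primitive_desc());
      auto fwd = pooling_forward(fwdPD, src, dst, ws);
      stream(stream::kind::eager).submit({fwd}).wait();
      return 0;
    }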
@@ -56,6 +76,13 @@ public:
 
   void updateInputData() override;
 
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_;
+  }
+
 protected:
   /**
    * Forward functions: reset buffers(input, output),
@@ -88,6 +115,24 @@ protected:
                       std::shared_ptr<pool_bwd::primitive_desc>& pd,
                       MKLDNNMatrixPtr& in,
                       MKLDNNMatrixPtr& out);
+
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_pooling_forward.cpp
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
+    for (int i = 0; i < 2; ++i) {
+      if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
+        ++padR[0];
+      }
+      if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
+        ++padR[1];
+      }
+    }
+    return padR;
+  }
 };
 
 }  // namespace paddle
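Worked example for getPaddingR() above, with hypothetical sizes (not from this patch): take ih = 5, fh = 2, ph = 0, sh = 2. Paddle's ceil-mode outputSize() gives oh = (5 - 2 + 0 + 2 - 1) / 2 + 1 = 3, but MKL-DNN floors, so with padR = 0 it would produce only (5 + 0 + 0 - 2) / 2 + 1 = 2 rows; one extra row of bottom padding restores (5 + 0 + 1 - 2) / 2 + 1 = 3. As a standalone check:

    #include <cassert>

    int main() {
      const int ih = 5, fh = 2, ph = 0, sh = 2;
      const int oh = (ih - fh + 2 * ph + sh - 1) / sh + 1;  // ceil mode: 3
      int padR = ph;
      // same adjustment loop as getPaddingR(), for one dimension
      for (int i = 0; i < 2; ++i) {
        if ((ih + ph + padR - fh) / sh + 1 < oh) {
          ++padR;
        }
      }
      assert(padR == 1);
      assert((ih + ph + padR - fh) / sh + 1 == oh);
      return 0;
    }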