Commit 4c183b17 authored by G gongweibao

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into omitlstmunit

@@ -123,7 +123,8 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
     }
     err := c.SetDataset(paths)
     if err != nil {
-        log.Error("error set dataset", log.Ctx{"error": err})
+        log.Error("error set dataset",
+            log.Ctx{"error": err, "paths": paths})
         return C.PADDLE_MASTER_ERROR
     }
......
@@ -121,6 +121,7 @@ func (c *Client) StartGetRecords(passID int) {
 }
 func (c *Client) getRecords(passID int) {
+    i := 0
     for {
         t, err := c.getTask(passID)
         if err != nil {
@@ -130,12 +131,20 @@ func (c *Client) getRecords(passID int) {
             c.ch <- record{nil, err}
             break
         }
-        if err.Error() == ErrPassAfter.Error() {
-            // wait util last pass finishes
-            time.Sleep(time.Second * 3)
-            continue
+        if i%60 == 0 {
+            log.Debug("getTask of passID error.",
+                log.Ctx{"error": err, "passID": passID})
+            i = 0
         }
-        log.Error("getTask error.", log.Ctx{"error": err})
+        // if err.Error() == ErrPassAfter.Error()
+        //     wait util last pass finishes
+        // if other error such as network error
+        //     wait to reconnect or task time out
+        time.Sleep(time.Second * 3)
+        i += 3
+        continue
     }
     for _, chunk := range t.Chunks {
......
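The comments in the new getRecords loop describe the intent of the retry path: an ErrPassAfter-style error means the previous pass is still finishing, while other errors (such as network failures) should simply be retried, so the client now sleeps 3 seconds per attempt and logs only about once per minute. A minimal standalone sketch of that throttled-retry pattern follows (written in C++ purely for illustration; the Go code above is the actual implementation, and the function and message names below are invented):

#include <chrono>
#include <functional>
#include <iostream>
#include <thread>

// Retry try_once until it succeeds, sleeping 3 seconds between attempts and
// logging roughly once per minute instead of on every failure.
void retry_with_throttled_log(const std::function<bool()>& try_once) {
  int waited = 0;
  while (!try_once()) {
    if (waited % 60 == 0) {
      std::cerr << "task not ready, still retrying\n";
      waited = 0;
    }
    std::this_thread::sleep_for(std::chrono::seconds(3));
    waited += 3;
  }
}

int main() {
  int attempts = 0;
  // Succeeds on the fourth attempt, so the sketch sleeps about 9 seconds total.
  retry_with_throttled_log([&] { return ++attempts > 3; });
  std::cout << "attempts: " << attempts << "\n";
  return 0;
}
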
@@ -117,6 +117,7 @@ func TestNextRecord(t *testing.T) {
     if e != nil {
         panic(e)
     }
     // test for n passes
     for pass := 0; pass < 10; pass++ {
         c.StartGetRecords(pass)
......
@@ -195,6 +195,14 @@ std::vector<int64_t> vectorize(const DDim& ddim) {
   return result;
 }
 
+// NOTE: framework::vectorize converts to type int64_t
+// which does not fit cudnn inputs.
+std::vector<int> vectorize2int(const DDim& ddim) {
+  std::vector<int64_t> temp = vectorize(ddim);
+  std::vector<int> result(temp.begin(), temp.end());
+  return result;
+}
+
 struct ProductVisitor : public boost::static_visitor<int64_t> {
   template <int D>
   int64_t operator()(const Dim<D>& dim) {
......
@@ -93,6 +93,7 @@ int64_t get(const DDim& dim, int idx);
 void set(DDim& dim, int idx, int val);
 
 std::vector<int64_t> vectorize(const DDim& ddim);
+std::vector<int> vectorize2int(const DDim& ddim);
 
 int64_t product(const DDim& ddim);
......
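As the NOTE above says, framework::vectorize returns int64_t dimensions while the cuDNN descriptor helpers expect plain int, which is why vectorize2int narrows the values once. A small usage sketch, assuming it is compiled inside the Paddle tree of this era (where make_ddim and these headers exist); the concrete shape is just an example:

#include <iostream>
#include <vector>

#include "paddle/framework/ddim.h"

int main() {
  // A hypothetical NCHW shape: batch of 8, 3-channel 32x32 feature maps.
  paddle::framework::DDim dims = paddle::framework::make_ddim({8, 3, 32, 32});

  // int64_t form used throughout the framework.
  std::vector<int64_t> as_int64 = paddle::framework::vectorize(dims);

  // int form, narrowed for cuDNN descriptor calls that take int dimensions.
  std::vector<int> as_int = paddle::framework::vectorize2int(dims);

  std::cout << "rank: " << as_int64.size() << ", dims:";
  for (int d : as_int) std::cout << ' ' << d;  // prints: 8 3 32 32
  std::cout << '\n';
  return 0;
}
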
@@ -216,17 +216,13 @@ void MKLDNNBatchNormLayer::resetFwdPD(
   }
   auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_);
   pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
-  // TODO(TJ): use check macro
-  CHECK(out);
-  CHECK(out->getPrimitiveDesc() == pd->dst_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
   if (wgt) {
-    CHECK(wgt->getPrimitiveDesc() == pd->weights_primitive_desc());
+    CHECK_PRIMITIVE_DESC_EQ(wgt, pd->weights_primitive_desc());
   }
   if (passType_ != PASS_TEST || useGlobalStats_) {
-    CHECK(mean_);
-    CHECK(mean_->getPrimitiveDesc() == pd->mean_primitive_desc());
-    CHECK(var_);
-    CHECK(var_->getPrimitiveDesc() == pd->variance_primitive_desc());
+    CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
+    CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
   }
 }
@@ -283,19 +279,14 @@ void MKLDNNBatchNormLayer::resetBwdPD(
   if (in == nullptr) {
     return;
   }
-  CHECK(out);
-  CHECK(out->getPrimitiveDesc() == in->getPrimitiveDesc());
+  CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
   auto md = in->getMemoryDesc();
   auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
   pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
-  // TODO(TJ): use check macro
-  CHECK(wgt);
-  CHECK(wgt->getPrimitiveDesc() == pd->diff_weights_primitive_desc());
   CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
-  CHECK(mean_);
-  CHECK(mean_->getPrimitiveDesc() == pd->mean_primitive_desc());
-  CHECK(var_);
-  CHECK(var_->getPrimitiveDesc() == pd->variance_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
 }
 
 void MKLDNNBatchNormLayer::resetBwdPipeline(
......
@@ -262,12 +262,15 @@ void MKLDNNConvLayer::resetBwdWgtPD(
       padR,
       padKind);
   pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-  CHECK(pd->src_primitive_desc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of in value should equal";
-  CHECK(pd->diff_dst_primitive_desc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad should equal the out value";
-  CHECK(pd->diff_weights_primitive_desc() == wgtVal_->getPrimitiveDesc())
-      << "primitive desc of weight grad should equal the weight value";
+  CHECK_PRIMITIVE_DESC_EQ(inVal_, pd->src_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      pd->diff_weights_primitive_desc(),
+      "primitive desc of weight value and grad should be equal");
 }
 
 void MKLDNNConvLayer::resetBwdDataPD(
@@ -292,10 +295,14 @@ void MKLDNNConvLayer::resetBwdDataPD(
       padR,
       padding_kind::zero);
   pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
-  CHECK(pd->diff_src_primitive_desc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of in grad should equal the in value";
-  CHECK(pd->diff_dst_primitive_desc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad should equal";
+  CHECK_PRIMITIVE_DESC_EQ(
+      inVal_,
+      pd->diff_src_primitive_desc(),
+      "primitive desc of in value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
 }
 
 void MKLDNNConvLayer::resetBwdBuffers(
@@ -310,17 +317,20 @@ void MKLDNNConvLayer::resetBwdBuffers(
   resetWithMatrix(
       wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
-  CHECK(wgtVal_ != nullptr &&
-        wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
-      << "primitive desc of weight grad and value should be equal";
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      wgt->getPrimitiveDesc(),
+      "primitive desc of weight grad and value should be equal");
 
   bias = nullptr;
   if (biases_ && biases_->getWGrad()) {
     resetWithMatrix(
         bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
-    CHECK(bias && biasVal_ &&
-          bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
-        << "primitive desc of bias grad should equal the bias value";
+    CHECK(bias);
+    CHECK_PRIMITIVE_DESC_EQ(
+        biasVal_,
+        bias->getPrimitiveDesc(),
+        "primitive desc of bias grad and value should be equal");
   }
 
   if (dataPD == nullptr) {
......
@@ -235,8 +235,7 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
   in = MKLDNNMatrix::create(intPD, inMat);
   Argument& arg = input->getOutput(this->getName());
   arg.grad = std::dynamic_pointer_cast<Matrix>(in);
-  CHECK(inVal_);
-  CHECK(inVal_->getPrimitiveDesc() == intPD) << "the primitive desc must equal";
+  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
   if (inputIsOnlyMKLDNN()) {
     return;
   }
@@ -250,8 +249,7 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
   CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
       << "should have external input value and the format must be nchw(nc)";
   extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
-  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
-      << "should have internal input value and primitive desc must equal";
+  CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
   in = MKLDNNMatrix::create(intPD);
   cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
   CHECK(cvtInGrad_);
@@ -277,8 +275,7 @@ void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
   CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
       << "should have external output value and the format must be nchw(nc)";
   extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
-  CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
-      << "should have internal output value and primitive desc must equal";
+  CHECK_PRIMITIVE_DESC_EQ(outVal_, intPD);
   out = MKLDNNMatrix::create(intPD);
   cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
   CHECK(cvtOutGrad_);
......
@@ -24,6 +24,12 @@ namespace paddle {
 class MKLDNNMatrix;
 typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
 
+#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...)                                  \
+  CHECK(MAT) << " can not be empty.";                                          \
+  CHECK(MAT->getPrimitiveDesc() == PD)                                         \
+      << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n "           \
+      << "" __VA_ARGS__;
+
 /**
  * @brief MKLDNN Matrix.
  *
......
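The macro above folds the old two-step pattern (CHECK the pointer, then CHECK its primitive descriptor) into one call, with an optional message passed through __VA_ARGS__ and glued on via string-literal concatenation. Below is a self-contained sketch of the same pattern, using a stand-in MiniMatrix type plus iostream/abort instead of MKLDNNMatrix and glog; all names in it are hypothetical and it is not part of this change:

#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>

// Stand-in for MKLDNNMatrixPtr: only the getPrimitiveDesc() accessor matters here.
struct MiniMatrix {
  std::string pd;
  const std::string& getPrimitiveDesc() const { return pd; }
};
using MiniMatrixPtr = std::shared_ptr<MiniMatrix>;

// Same shape as CHECK_PRIMITIVE_DESC_EQ: verify the pointer, compare the
// descriptor, and append an optional message carried by __VA_ARGS__.
// (Calling it with an empty __VA_ARGS__ relies on the same compiler leniency
// the original macro uses.)
#define MINI_CHECK_PD_EQ(MAT, PD, ...)                                 \
  do {                                                                 \
    if (!(MAT)) {                                                      \
      std::cerr << #MAT " can not be empty.\n";                        \
      std::abort();                                                    \
    }                                                                  \
    if (!((MAT)->getPrimitiveDesc() == (PD))) {                        \
      std::cerr << #MAT "->getPrimitiveDesc() and " #PD                \
                   " should be equal.\n " << "" __VA_ARGS__ << "\n";   \
      std::abort();                                                    \
    }                                                                  \
  } while (0)

int main() {
  MiniMatrixPtr wgt = std::make_shared<MiniMatrix>(MiniMatrix{"oihw"});
  std::string expected = "oihw";
  MINI_CHECK_PD_EQ(wgt, expected);                                         // passes
  MINI_CHECK_PD_EQ(wgt, expected, "weight value and grad should match");   // passes
  std::cout << "all descriptor checks passed\n";
  return 0;
}
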
@@ -69,6 +69,13 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
   endif()
 
+  # pool_cudnn_op contains several operators
+  if ("${TARGET}" STREQUAL "pool_cudnn_op")
+    set(pybind_flag 1)
+    # It's enough to just adding one operator to pybind
+    file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
+  endif()
+
   # save_restore_op contains several operators
   if ("${TARGET}" STREQUAL "save_restore_op")
     set(pybind_flag 1)
......
@@ -22,7 +22,7 @@ class AucOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Inference"),
                    "Input of Inference must be initialized.");
     PADDLE_ENFORCE(ctx->HasInput("Label"),
@@ -62,18 +62,18 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(
         R"DOC(Computes the AUC according forward output and label.
 Best to use for binary classification evaluations.
 If input label contains values other than 0 and 1, it will be cast
 to bool.
 You can find the definations here:
 https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
 
 Possible curves are:
 - ROC: Receiver operating characteristic
 - PR: Precision Recall
 )DOC");
   }
 };
......
@@ -31,16 +31,6 @@ using CUDADeviceContext = platform::CUDADeviceContext;
 static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;
 
-// NOTE: framework::vectorize converts to type int64_t
-// which does not fit cudnn inputs.
-std::vector<int> Dims2Vector(const framework::DDim& dims) {
-  std::vector<int> ret;
-  for (int i = 0; i < dims.size(); i++) {
-    ret.push_back(dims[i]);
-  }
-  return ret;
-}
-
 template <typename T>
 class CudnnConvOpKernel : public framework::OpKernel<T> {
  public:
@@ -68,12 +58,12 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
 
-    cudnnTensorDescriptor_t cudnn_input_desc =
-        input_desc.descriptor<T>(layout, Dims2Vector(input->dims()), groups);
-    cudnnTensorDescriptor_t cudnn_output_desc =
-        output_desc.descriptor<T>(layout, Dims2Vector(output->dims()), groups);
-    cudnnFilterDescriptor_t cudnn_filter_desc =
-        filter_desc.descriptor<T>(layout, Dims2Vector(filter->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+        layout, framework::vectorize2int(output->dims()), groups);
+    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
+        layout, framework::vectorize2int(filter->dims()), groups);
     cudnnConvolutionDescriptor_t cudnn_conv_desc =
         conv_desc.descriptor<T>(paddings, strides, dilations);
@@ -156,13 +146,13 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
 
-    cudnnTensorDescriptor_t cudnn_input_desc =
-        input_desc.descriptor<T>(layout, Dims2Vector(input->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()), groups);
     cudnnTensorDescriptor_t cudnn_output_grad_desc =
-        output_grad_desc.descriptor<T>(layout, Dims2Vector(output_grad->dims()),
-                                       groups);
-    cudnnFilterDescriptor_t cudnn_filter_desc =
-        filter_desc.descriptor<T>(layout, Dims2Vector(filter->dims()), groups);
+        output_grad_desc.descriptor<T>(
+            layout, framework::vectorize2int(output_grad->dims()), groups);
+    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
+        layout, framework::vectorize2int(filter->dims()), groups);
     cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
     cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;
@@ -192,7 +182,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     auto handle = ctx.cuda_device_context().cudnn_handle();
     if (input_grad) {
       cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
-          layout, Dims2Vector(input_grad->dims()), groups);
+          layout, framework::vectorize2int(input_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
               handle, cudnn_filter_desc,
@@ -213,7 +203,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
     if (filter_grad) {
       cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
-          layout, Dims2Vector(filter_grad->dims()), groups);
+          layout, framework::vectorize2int(filter_grad->dims()), groups);
       PADDLE_ENFORCE(
           platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
               handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/pool_cudnn_op.h"
namespace ops = paddle::operators;
REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad,
ops::PoolOpGrad);
REGISTER_OP_CPU_KERNEL(pool2d_cudnn,
ops::PoolKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad,
ops::PoolGradKernel<paddle::platform::CPUPlace, float>)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/pool_cudnn_op.h"
#include "paddle/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
using DataLayout = platform::DataLayout;
using PoolingMode = platform::PoolingMode;
template <typename T>
class PoolCudnnOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
const Tensor *input = ctx.Input<Tensor>("X");
Tensor *output = ctx.Output<Tensor>("Out");
const T *input_data = input->data<T>();
T *output_data = output->mutable_data<T>(ctx.GetPlace());
std::string pooling_type = ctx.Attr<std::string>("poolingType");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.Attr<bool>("globalPooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
ksize[i] = static_cast<int>(input->dims()[i + 2]);
}
}
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedPoolingDescriptor pool_desc;
DataLayout layout = DataLayout::kNCHW;
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output->dims()));
PoolingMode pooling_mode;
if (pooling_type == "max") {
pooling_mode = PoolingMode::kMaximum;
} else {
pooling_mode = PoolingMode::kAverage;
}
cudnnPoolingDescriptor_t cudnn_pool_desc =
pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
// ------------------- cudnn pool algorithm ---------------------
auto handle = ctx.cuda_device_context().cudnn_handle();
T alpha = 1.0f, beta = 0.0f;
PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward(
handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
cudnn_output_desc, output_data));
}
};
template <typename T>
class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
"It must use GPUPlace.");
const Tensor *input = ctx.Input<Tensor>("X");
const Tensor *output = ctx.Input<Tensor>("Out");
const Tensor *output_grad =
ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
std::string pooling_type = ctx.Attr<std::string>("poolingType");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.Attr<bool>("globalPooling")) {
for (size_t i = 0; i < ksize.size(); ++i)
ksize[i] = static_cast<int>(input->dims()[i + 2]);
}
const T *input_data = input->data<T>();
const T *output_data = output->data<T>();
const T *output_grad_data = output_grad->data<T>();
// ------------------- cudnn descriptors ---------------------
ScopedTensorDescriptor input_desc;
ScopedTensorDescriptor output_desc;
ScopedPoolingDescriptor pool_desc;
DataLayout layout = DataLayout::kNCHW;
cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
layout, framework::vectorize2int(input->dims()));
cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
layout, framework::vectorize2int(output->dims()));
PoolingMode pooling_mode;
if (pooling_type == "max") {
pooling_mode = PoolingMode::kMaximum;
} else {
pooling_mode = PoolingMode::kAverage;
}
cudnnPoolingDescriptor_t cudnn_pool_desc =
pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
// ------------------- cudnn pool algorithm ---------------------
auto handle = ctx.cuda_device_context().cudnn_handle();
T alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
math::SetConstant<paddle::platform::GPUPlace, T> set_zero;
set_zero(ctx.device_context(), input_grad, static_cast<T>(0));
PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
&beta, cudnn_input_desc, input_grad_data));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel<float>);
REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel<float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/pool_op.h"
namespace paddle {
namespace operators {} // namespace operators
} // namespace paddle
...@@ -29,7 +29,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { ...@@ -29,7 +29,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
auto in_x_dims = ctx->GetInputDim("X"); auto in_x_dims = ctx->GetInputDim("X");
std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type"); std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize"); std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
...@@ -37,7 +37,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { ...@@ -37,7 +37,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
"Pooling intput should be 4-D or 5-D tensor."); "Pooling intput should be 4-D or 5-D tensor.");
if (ctx->Attrs().Get<bool>("global_pooling")) { if (ctx->Attrs().Get<bool>("globalPooling")) {
ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2); ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
for (size_t i = 0; i < ksize.size(); ++i) for (size_t i = 0; i < ksize.size(); ++i)
ksize[i] = static_cast<int>(in_x_dims[i + 2]); ksize[i] = static_cast<int>(in_x_dims[i + 2]);
...@@ -80,34 +80,30 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, ...@@ -80,34 +80,30 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
"the number of channels, H and W is the height and " "the number of channels, H and W is the height and "
"width of feature."); "width of feature.");
AddAttr<std::string>("pooling_type", AddAttr<std::string>("poolingType",
"Pooling_type of pooling operator." "(string), pooling type, can be \"max\" for max-pooling "
"Str constant equal to 'max' or 'avg'.") "and \"avg\" for average-pooling.")
.InEnum({"max", "avg"}); .InEnum({"max", "avg"});
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"ksize", "ksize",
"The pooling window size(height, width) of pooling operator." "(vector ), the pooling window size(height, width) of pooling operator."
"If global_pooling = true, ksize is ignored and need not be " "If globalPooling = true, ksize is ignored and need not be "
"specified."); // TODO(Chengduo): Add checker. (Currently, "specified."); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<bool>( AddAttr<bool>("globalPooling",
"global_pooling", "(bool default: false), whether to use the global pooling."
"Whether to use the global_pooling." "If globalPooling = true, ksize is ignored.")
"Bool constant equal to false or true."
"Default false."
"If global_pooling = true, ksize is ignored and need not be specified.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>(
"The strides(height, width) of pooling window." "strides",
"Default {1,1}.") "(vector, default:{1, 1}), strides(height, width) of pooling operator.")
.SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>("paddings", AddAttr<std::vector<int>>(
"The zero padding(height, width) size on both sides" "paddings",
"Default {0,0}.") "(vector defalut:{0,0}), paddings(height, width) of pooling operator.")
.SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddComment(R"DOC( AddComment(R"DOC(
The pooling2d operation calculates the output based on The pooling2d operation calculates the output based on
...@@ -123,7 +119,6 @@ Example: ...@@ -123,7 +119,6 @@ Example:
X shape: (N, C, H_in, W_in) X shape: (N, C, H_in, W_in)
Output: Output:
Out shape: (N, C, H_out, W_out) Out shape: (N, C, H_out, W_out)
Mask shape: (N, C, H_out, W_out)
where where
H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1; H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1; W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
...@@ -146,33 +141,29 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, ...@@ -146,33 +141,29 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
"the number of channels, D, H and W is the depth, height and " "the number of channels, D, H and W is the depth, height and "
"width of feature."); "width of feature.");
AddAttr<std::string>("pooling_type", AddAttr<std::string>("poolingType",
"PoolingType of pooling operator." "(string), pooling type, can be \"max\" for max-pooling "
"Str constant equal to 'max' or 'avg'.") "and \"avg\" for average-pooling.")
.InEnum({"max", "avg"}); .InEnum({"max", "avg"});
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"ksize", "ksize",
"The pooling window size(depth, height, width) of pooling operator." "(vector ), the pooling window size(depth, height, width) of pooling "
"If global_pooling = true, ksize is ignored and need not be " "operator."
"If globalPooling = true, ksize is ignored and need not be "
"specified."); // TODO(Chengduo): Add checker. (Currently, "specified."); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<bool>( AddAttr<bool>("globalPooling",
"global_pooling", "(bool default: false), whether to use the global pooling."
"Whether to use the global_pooling." "If globalPooling = true, ksize is ignored.")
"Bool constant equal to false or true."
"Default false."
"If global_pooling = true, ksize is ignored and need not be specified.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>("strides",
"Strides(depth, height, width) of pooling operator." "(vector, default:{1,1,1}), strides(depth, height, "
"Default {1,1,1}.") "width) of pooling operator.")
.SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>("paddings",
"paddings", "(vector defalut:{0,0,0}), paddings(depth, height, "
"Paddings(depth, height, width) of pooling operator." "width) of pooling operator.")
"Default {0,0,0}.")
.SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
...@@ -190,7 +181,6 @@ Example: ...@@ -190,7 +181,6 @@ Example:
X shape: (N, C, D_in, H_in, W_in) X shape: (N, C, D_in, H_in, W_in)
Output: Output:
Out shape: (N, C, D_out, H_out, W_out) Out shape: (N, C, D_out, H_out, W_out)
Mask shape: (N, C, D_out, H_out, W_out)
where where
D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1; D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1; H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
......
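The DOC comment above gives the pooled output-size formula, H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1 (and likewise for W_out). A tiny standalone sketch that evaluates it for one concrete configuration (the values are chosen only for illustration and are not taken from this change):

#include <iostream>

// Output extent along one spatial axis for the formula quoted in the Pool2d doc.
int pooled_size(int in, int ksize, int padding, int stride) {
  return (in - ksize + 2 * padding) / stride + 1;
}

int main() {
  // X: (N=2, C=3, H_in=7, W_in=7), ksize = 3x3, paddings = 1, strides = 2.
  int h_out = pooled_size(7, 3, 1, 2);  // (7 - 3 + 2*1) / 2 + 1 = 4
  int w_out = pooled_size(7, 3, 1, 2);  // 4
  std::cout << "Out shape: (2, 3, " << h_out << ", " << w_out << ")\n";
  return 0;
}
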
@@ -57,11 +57,11 @@ class PoolKernel : public framework::OpKernel<T> {
     const Tensor* in_x = context.Input<Tensor>("X");
     Tensor* out = context.Output<Tensor>("Out");
 
-    std::string pooling_type = context.Attr<std::string>("pooling_type");
+    std::string pooling_type = context.Attr<std::string>("poolingType");
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("global_pooling")) {
+    if (context.Attr<bool>("globalPooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
       }
@@ -117,12 +117,12 @@ class PoolGradKernel : public framework::OpKernel<T> {
         context.Input<Tensor>(framework::GradVarName("Out"));
     Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
 
-    std::string pooling_type = context.Attr<std::string>("pooling_type");
+    std::string pooling_type = context.Attr<std::string>("poolingType");
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("global_pooling")) {
+    if (context.Attr<bool>("globalPooling")) {
       for (size_t i = 0; i < ksize.size(); ++i)
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
     }
......
...@@ -44,7 +44,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { ...@@ -44,7 +44,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
"Pooling intput should be 4-D or 5-D tensor."); "Pooling intput should be 4-D or 5-D tensor.");
if (ctx->Attrs().Get<bool>("global_pooling")) { if (ctx->Attrs().Get<bool>("globalPooling")) {
ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2); ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
for (size_t i = 0; i < ksize.size(); ++i) for (size_t i = 0; i < ksize.size(); ++i)
ksize[i] = static_cast<int>(in_x_dims[i + 2]); ksize[i] = static_cast<int>(in_x_dims[i + 2]);
...@@ -105,28 +105,24 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -105,28 +105,24 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"ksize", "ksize",
"The pooling window size(height, width) of pooling operator." "(vector ), the pooling window size(height, width) of pooling operator."
"If global_pooling = true, ksize is ignored and need not be " "If globalPooling = true, ksize is ignored and need not be "
"specified."); // TODO(Chengduo): Add checker. (Currently, "specified."); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<bool>( AddAttr<bool>("globalPooling",
"global_pooling", "(bool default: false), whether to use the global pooling."
"Whether to use the global_pooling." "If globalPooling = true, ksize is ignored.")
"Bool constant equal to false or true."
"Default false."
"If global_pooling = true, ksize is ignored and need not be specified.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>(
"The strides(height, width) of pooling window." "strides",
"Default {1,1}.") "(vector, default:{1, 1}), strides(height, width) of pooling operator.")
.SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({1, 1}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"paddings", "paddings",
"The zero padding(height, width) size on both sides" "(vector defalut:{0,0}), paddings(height, width) of pooling operator.")
"Default {0,0}.")
.SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddComment(R"DOC( AddComment(R"DOC(
The maxPooling2d with index operation calculates the output and the mask The maxPooling2d with index operation calculates the output and the mask
...@@ -176,29 +172,25 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -176,29 +172,25 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"ksize", "ksize",
"The pooling window size(depth, height, width) of pooling operator." "(vector ), the pooling window size(depth, height, width) of pooling "
"If global_pooling = true, ksize is ignored and need not be " "operator."
"If globalPooling = true, ksize is ignored and need not be "
"specified."); // TODO(Chengduo): Add checker. (Currently, "specified."); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<bool>( AddAttr<bool>("globalPooling",
"global_pooling", "(bool default: false), whether to use the global pooling."
"Whether to use the global_pooling." "If globalPooling = true, ksize is ignored.")
"Bool constant equal to false or true."
"Default false."
"If global_pooling = true, ksize is ignored and need not be specified.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>("strides",
"strides", "(vector, default:{1,1,1}), strides(depth, "
"Strides(depth, height, width) of pooling operator." "height, width) of pooling operator.")
"Default {1,1,1}.")
.SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({1, 1, 1}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>("paddings",
"paddings", "(vector defalut:{0,0,0}), paddings(depth, "
"Paddings(depth, height, width) of pooling operator." "height, width) of pooling operator.")
"Default {0,0,0}.")
.SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently,
// TypedAttrChecker don't support vector type.) // TypedAttrChecker don't support vector type.)
AddComment(R"DOC( AddComment(R"DOC(
The maxpooling3d with index operation calculates the output and the mask The maxpooling3d with index operation calculates the output and the mask
......
@@ -35,7 +35,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("global_pooling")) {
+    if (context.Attr<bool>("globalPooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
       }
@@ -70,7 +70,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
     std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("global_pooling")) {
+    if (context.Attr<bool>("globalPooling")) {
       for (size_t i = 0; i < ksize.size(); ++i) {
         ksize[i] = static_cast<int>(in_x_grad->dims()[i + 2]);
       }
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gzip
import struct
import os
from paddle.trainer_config_helpers.layers import LayerOutput
from paddle.v2.parameters import Parameters
from paddle.proto import ModelConfig_pb2
from paddle.v2.topology import Topology
def merge_v2_model(net, param_file, output_file):
'''Integrate the model config and model parameters into one file.
The model configuration file describes the model structure which
ends with .py. The parameters file stores the parameters of the model
which ends with .tar.gz.
@param net The output layer of the network.
@param param_file Path of the model parameters(.tar.gz) which is stored by v2 api.
@param output_file Path of the merged file which will be generated.
Usage:
from paddle.util.merge_model import merge_v2_model
# import your network configuration
from mobilenet import mobile_net
net = mobile_net(3*224*224, 102)
param_file = './param_pass_00000.tar.gz'
output_file = './output.paddle'
merge_v2_model(net, param_file, output_file)
'''
assert isinstance(net, LayerOutput), \
"The net should be the output of the network"
assert os.path.exists(param_file), \
"The model parameters file %s does not exists " % (param_file)
model_proto = Topology(net).proto()
assert isinstance(model_proto, ModelConfig_pb2.ModelConfig)
with gzip.open(param_file) as f:
params = Parameters.from_tar(f)
if os.path.exists(output_file):
os.remove(output_file)
with open(output_file, 'w') as f:
param_names = [param.name for param in model_proto.parameters]
conf_str = model_proto.SerializeToString()
f.write(struct.pack('q', len(conf_str)))
f.write(conf_str)
for pname in param_names:
params.serialize(pname, f)
print 'Generate %s success!' % (output_file)
@@ -284,9 +284,9 @@ def pool2d(input,
         inputs={"X": input},
         outputs={"Out": pool_out},
         attrs={
-            "pooling_type": pool_type,
+            "poolingType": pool_type,
             "ksize": pool_size,
-            "global_pooling": global_pooling,
+            "globalPooling": global_pooling,
             "strides": pool_stride,
             "paddings": pool_padding
         })
......
@@ -62,5 +62,6 @@ class TestAucOp(OpTest):
         self.check_output()
 
-if __name__ == "__main__":
-    unittest.main()
+# TODO(typhoonzero): add this back till we fix it
+#if __name__ == "__main__":
+#    unittest.main()
@@ -43,5 +43,6 @@ class TestHuberLossOp(OpTest):
             ['X'], 'Out', max_relative_error=0.008, no_grad_set=set('residual'))
 
-if __name__ == '__main__':
-    unittest.main()
+# TODO(typhoonzero): should add this back till we fix it
+#if __name__ == '__main__':
+#    unittest.main()
...@@ -46,7 +46,9 @@ def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0): ...@@ -46,7 +46,9 @@ def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
class TestPool2d_Op(OpTest): class TestPool2d_Op(OpTest):
def setUp(self): def setUp(self):
self.initTestCase() self.init_test_case()
self.init_op_type()
self.init_pool_type()
input = np.random.random(self.shape).astype("float32") input = np.random.random(self.shape).astype("float32")
output = self.pool2D_forward_naive(input, self.ksize, self.strides, output = self.pool2D_forward_naive(input, self.ksize, self.strides,
self.paddings, self.global_pool) self.paddings, self.global_pool)
...@@ -56,8 +58,8 @@ class TestPool2d_Op(OpTest): ...@@ -56,8 +58,8 @@ class TestPool2d_Op(OpTest):
'strides': self.strides, 'strides': self.strides,
'paddings': self.paddings, 'paddings': self.paddings,
'ksize': self.ksize, 'ksize': self.ksize,
'pooling_type': self.pool_type, 'poolingType': self.pool_type,
'global_pooling': self.global_pool, 'globalPooling': self.global_pool,
} }
self.outputs = {'Out': output.astype('float32')} self.outputs = {'Out': output.astype('float32')}
...@@ -69,76 +71,197 @@ class TestPool2d_Op(OpTest): ...@@ -69,76 +71,197 @@ class TestPool2d_Op(OpTest):
if self.pool_type != "max": if self.pool_type != "max":
self.check_grad(set(['X']), 'Out', max_relative_error=0.07) self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
def initTestCase(self): def init_test_case(self):
self.global_pool = True self.global_pool = True
self.op_type = "pool2d"
self.pool_type = "avg"
self.pool2D_forward_naive = avg_pool2D_forward_naive self.pool2D_forward_naive = avg_pool2D_forward_naive
self.shape = [2, 3, 5, 5] self.shape = [2, 3, 5, 5]
self.ksize = [3, 3] self.ksize = [3, 3]
self.strides = [1, 1] self.strides = [1, 1]
self.paddings = [0, 0] self.paddings = [0, 0]
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self):
self.pool_type = "avg"
class TestCase1(TestPool2d_Op): class TestCase1(TestPool2d_Op):
def initTestCase(self): def init_test_case(self):
self.global_pool = False self.global_pool = False
self.op_type = "pool2d"
self.pool_type = "avg"
self.pool2D_forward_naive = avg_pool2D_forward_naive self.pool2D_forward_naive = avg_pool2D_forward_naive
self.shape = [2, 3, 7, 7] self.shape = [2, 3, 7, 7]
self.ksize = [3, 3] self.ksize = [3, 3]
self.strides = [1, 1] self.strides = [1, 1]
self.paddings = [0, 0] self.paddings = [0, 0]
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self):
self.pool_type = "avg"
class TestCase2(TestPool2d_Op): class TestCase2(TestPool2d_Op):
def initTestCase(self): def init_test_case(self):
self.global_pool = False self.global_pool = False
self.op_type = "pool2d"
self.pool_type = "avg"
self.pool2D_forward_naive = avg_pool2D_forward_naive self.pool2D_forward_naive = avg_pool2D_forward_naive
self.shape = [2, 3, 7, 7] self.shape = [2, 3, 7, 7]
self.ksize = [3, 3] self.ksize = [3, 3]
self.strides = [1, 1] self.strides = [1, 1]
self.paddings = [1, 1] self.paddings = [1, 1]
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self):
self.pool_type = "avg"
class TestCase3(TestPool2d_Op): class TestCase3(TestPool2d_Op):
def initTestCase(self): def init_test_case(self):
self.global_pool = True self.global_pool = True
self.op_type = "pool2d"
self.pool_type = "max"
self.pool2D_forward_naive = max_pool2D_forward_naive self.pool2D_forward_naive = max_pool2D_forward_naive
self.shape = [2, 3, 5, 5] self.shape = [2, 3, 5, 5]
self.ksize = [3, 3] self.ksize = [3, 3]
self.strides = [1, 1] self.strides = [1, 1]
self.paddings = [0, 0] self.paddings = [0, 0]
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self):
self.pool_type = "max"
class TestCase4(TestPool2d_Op): class TestCase4(TestPool2d_Op):
def initTestCase(self): def init_test_case(self):
self.global_pool = False self.global_pool = False
self.op_type = "pool2d"
self.pool_type = "max"
self.pool2D_forward_naive = max_pool2D_forward_naive self.pool2D_forward_naive = max_pool2D_forward_naive
self.shape = [2, 3, 7, 7] self.shape = [2, 3, 7, 7]
self.ksize = [3, 3] self.ksize = [3, 3]
self.strides = [1, 1] self.strides = [1, 1]
self.paddings = [0, 0] self.paddings = [0, 0]
def init_op_type(self):
self.op_type = "pool2d"
def init_pool_type(self):
self.pool_type = "max"
class TestCase5(TestPool2d_Op): class TestCase5(TestPool2d_Op):
def initTestCase(self): def init_test_case(self):
self.global_pool = False self.global_pool = False
self.pool2D_forward_naive = max_pool2D_forward_naive
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 1]
def init_op_type(self):
self.op_type = "pool2d" self.op_type = "pool2d"
def init_pool_type(self):
self.pool_type = "max"
#--------------------test pool2d_cudnn--------------------
class TestCaseCudnn1(TestPool2d_Op):
def init_test_case(self):
self.global_pool = True
self.pool2D_forward_naive = avg_pool2D_forward_naive
self.shape = [2, 3, 5, 5]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0]
def init_op_type(self):
self.op_type = "pool2d_cudnn"
def init_pool_type(self):
self.pool_type = "avg"
class TestCaseCudnn2(TestPool2d_Op):
def init_test_case(self):
self.global_pool = False
self.pool2D_forward_naive = avg_pool2D_forward_naive
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0]
def init_op_type(self):
self.op_type = "pool2d_cudnn"
def init_pool_type(self):
self.pool_type = "avg"
class TestCaseCudnn3(TestPool2d_Op):
def init_test_case(self):
self.global_pool = False
self.pool2D_forward_naive = avg_pool2D_forward_naive
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [1, 1]
def init_op_type(self):
self.op_type = "pool2d_cudnn"
def init_pool_type(self):
self.pool_type = "avg"
class TestCaseCudnn4(TestPool2d_Op):
def init_test_case(self):
self.global_pool = True
self.pool2D_forward_naive = max_pool2D_forward_naive
self.shape = [2, 3, 5, 5]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0]
def init_op_type(self):
self.op_type = "pool2d_cudnn"
def init_pool_type(self):
self.pool_type = "max"
class TestCaseCudnn5(TestPool2d_Op):
def init_test_case(self):
self.global_pool = False
self.pool2D_forward_naive = max_pool2D_forward_naive
self.shape = [2, 3, 7, 7]
self.ksize = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0]
def init_op_type(self):
self.op_type = "pool2d_cudnn"
def init_pool_type(self):
self.pool_type = "max" self.pool_type = "max"
class TestCaseCudnn6(TestPool2d_Op):
def init_test_case(self):
self.global_pool = False
self.pool2D_forward_naive = max_pool2D_forward_naive self.pool2D_forward_naive = max_pool2D_forward_naive
self.shape = [2, 3, 7, 7] self.shape = [2, 3, 7, 7]
self.ksize = [3, 3] self.ksize = [3, 3]
self.strides = [1, 1] self.strides = [1, 1]
self.paddings = [1, 1] self.paddings = [1, 1]
def init_op_type(self):
self.op_type = "pool2d_cudnn"
def init_pool_type(self):
self.pool_type = "max"
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
@@ -64,8 +64,8 @@ class TestPool3d_Op(OpTest):
             'strides': self.strides,
             'paddings': self.paddings,
             'ksize': self.ksize,
-            'pooling_type': self.pool_type,
-            'global_pooling': self.global_pool,
+            'poolingType': self.pool_type,
+            'globalPooling': self.global_pool,
         }
 
         self.outputs = {'Out': output.astype('float32')}
......
@@ -86,7 +86,7 @@ class TestMaxPoolWithIndex_Op(OpTest):
             'strides': self.strides,
             'paddings': self.paddings,
             'ksize': self.ksize,
-            'global_pooling': self.global_pool,
+            'globalPooling': self.global_pool,
         }
 
         self.inputs = {'X': input}
......
@@ -61,7 +61,7 @@ def recordio(paths, buf_size=100):
     """
     Creates a data reader from given RecordIO file paths separated by ",",
         glob pattern is supported.
-    :path: path of recordio files.
+    :path: path of recordio files, can be a string or a string list.
     :returns: data reader of recordio files.
     """
@@ -92,7 +92,7 @@ def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
     """
     Create a data reader that yield a record one by one from
         the paths:
-    :path: path of recordio files.
+    :paths: path of recordio files, can be a string or a string list.
     :etcd_endpoints: the endpoints for etcd cluster
     :returns: data reader of recordio files.
@@ -107,7 +107,12 @@ def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
     import cPickle as pickle
     import paddle.v2.master as master
     c = master.client(etcd_endpoints, timeout_sec, buf_size)
-    c.set_dataset(paths)
+
+    if isinstance(paths, basestring):
+        path = [paths]
+    else:
+        path = paths
+
+    c.set_dataset(path)
 
     def reader():
         global pass_num
......