Merge branch 'develop' into image

f4729a24 · Luo Tao · 15133229 · 502d7daf · f4729a24 · f4729a24
18 changed files
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -13,7 +13,7 @@ define_py_data_sources2(
 settings(
    batch_size=batch_size,
-    learning_rate=0.01 / batch_size,
+    learning_rate=0.001 / batch_size,
    learning_method=MomentumOptimizer(0.9),
    regularization=L2Regularization(0.0005 * batch_size))

--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -62,16 +62,14 @@ void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
-  if (biases_) {
+  resetFwdBuffers(inVals_, bias, out);
-    LOG(FATAL) << "not implemented yet";
-  }
-  resetFwdBuffers(inVals_, out);
  in = inVals_[0];
  std::shared_ptr<sum::primitive_desc> fwdPD;
-  resetFwdPD(fwdPD, inVals_, out);
+  std::shared_ptr<sum::primitive_desc> biasPD;
+  resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
-  resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
 }
 void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -79,7 +77,7 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
                                MKLDNNMatrixPtr& wgt,
                                MKLDNNMatrixPtr& bias,
                                MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, out);
+  resetBwdBuffers(inGrads_, bias, out);
  in = inGrads_[0];
  // backward only need share output grad to input grad
@@ -89,6 +87,20 @@ void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
    }
  }
+  // backward bias
+  bwdBias_ = nullptr;
+  if (bias) {
+    std::vector<double> scales(bs_, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
+    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+    std::vector<primitive::at> srcs;
+    for (size_t i = 0; i < grads_.size(); ++i) {
+      srcs.push_back(*(grads_[i]));
+    }
+    bwdBias_.reset(new sum(biasPD, srcs, *bias));
+    pipeline.push_back(*bwdBias_);
+  }
 }
 void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
@@ -97,7 +109,25 @@ void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
  }
 }
+void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,  // out: biasMat wrapped as an MKLDNN matrix
+                                   const MatrixPtr& biasMat,  // in: bias value or bias grad, as passed by the reset*Buffers callers
+                                   const MKLDNNMatrixPtr& out,  // in: buffer holding bs_ * layerSize_ elements
+                                   std::vector<MKLDNNMatrixPtr>& outs) {  // out: rebuilt with one entry per sample of `out`
+  auto pd = MKLDNNMatrix::createPrimitiveDesc(
+      {(int)layerSize_}, memory::format::x, engine_);  // 1-D descriptor with layerSize_ elements
+  bias = MKLDNNMatrix::create(pd, biasMat);
+  outs.clear();  // start from an empty list; entries are appended in the loop below
+  real* data = out->getData();
+  CHECK_EQ(bs_ * layerSize_, out->getElementCnt());  // the slicing below assumes exactly bs_ rows of layerSize_ elements
+  for (int i = 0; i < bs_; ++i) {
+    MatrixPtr tmp =
+        Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);  // 1 x layerSize_ matrix built from the pointer to row i; NOTE(review): the two `false` flags are presumably trans/useGpu -- confirm
+    outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));  // each row reuses the bias primitive desc
+  }
+}
 void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
                                       MKLDNNMatrixPtr& out) {
  inputs.resize(inputLayers_.size());
  for (size_t i = 0; i < inputs.size(); i++) {
@@ -110,10 +140,18 @@ void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
  }
  resetOutValue(out, inputs[0]->getPrimitiveDesc());
+  if (biases_ && biases_->getW()) {
+    prepareBias(bias, biases_->getW(), out, vals_);
+  } else {
+    bias = nullptr;
+  }
 }
 void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
+                                  std::shared_ptr<sum::primitive_desc>& biasPD,
                                  std::vector<MKLDNNMatrixPtr>& inputs,
+                                  MKLDNNMatrixPtr bias,
                                  MKLDNNMatrixPtr out) {
  std::vector<double> scales(inputs.size(), 1.0);
  std::vector<memory::primitive_desc> srcPDs;
@@ -123,12 +161,23 @@ void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
  CHECK(out);
  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+  biasPD = nullptr;
+  if (bias) {
+    std::vector<double> scales(2, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
+    biasPD.reset(
+        new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
+    CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
+  }
 }
 void MKLDNNAddtoLayer::resetFwdPipeline(
    std::vector<primitive>& pipeline,
    std::shared_ptr<sum::primitive_desc>& pd,
+    std::shared_ptr<sum::primitive_desc>& biasPD,
    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& bias,
    MKLDNNMatrixPtr& out) {
  std::vector<primitive::at> srcs;
  for (size_t i = 0; i < inputs.size(); i++) {
@@ -136,9 +185,23 @@ void MKLDNNAddtoLayer::resetFwdPipeline(
  }
  fwd_.reset(new sum(*pd, srcs, *out));
  pipeline.push_back(*fwd_);
+  fwdBias_.clear();
+  if (biasPD == nullptr || bias == nullptr) {
+    return;
+  }
+  fwdBias_.resize(vals_.size());
+  for (size_t i = 0; i < vals_.size(); ++i) {
+    std::vector<primitive::at> srcs;
+    srcs.push_back(*(vals_[i]));
+    srcs.push_back(*bias);
+    fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
+    pipeline.push_back(*fwdBias_[i]);
+  }
 }
 void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
                                       MKLDNNMatrixPtr& out) {
  CHECK(outVal_);
  resetOutGrad(out, outVal_->getPrimitiveDesc());
@@ -149,6 +212,12 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
  }
+  if (biases_ && biases_->getWGrad()) {
+    prepareBias(bias, biases_->getWGrad(), out, grads_);
+  } else {
+    bias = nullptr;
+  }
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -32,9 +32,15 @@ protected:
  // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
  size_t layerSize_;
-  // TODO(TJ): this part has not been optimized by MKL-DNN
  std::unique_ptr<Weight> biases_;
+  // buffers for adding bias
+  std::vector<MKLDNNMatrixPtr> vals_;
+  std::vector<MKLDNNMatrixPtr> grads_;
+  // primitives for adding bias
+  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
+  std::shared_ptr<mkldnn::primitive> bwdBias_;
 public:
  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
@@ -91,20 +97,34 @@ protected:
   *                    reset pipeline.
   */
  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr bias,
                  MKLDNNMatrixPtr out);
  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
  /**
   * Backward functions: reset buffers(inputs, output, bias)
   */
  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
                       MKLDNNMatrixPtr& out);
+  /**
+   * prepare for bias
+   */
+  void prepareBias(MKLDNNMatrixPtr& bias,
+                   const MatrixPtr& biasMat,
+                   const MKLDNNMatrixPtr& out,
+                   std::vector<MKLDNNMatrixPtr>& outs);
 };
 }  // namespace paddle
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -300,13 +300,8 @@ void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
  TestConfig dnnConfig;
  getAddtoConfig(dnnConfig, pm, nInputs);
  dnnConfig.layerConfig.set_type("mkldnn_addto");
-  // TODO(TJ): test with bias
+  for (auto withBias : {false, true}) {
-  for (auto withBias : {false}) {
+    dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
-    if (withBias) {
-      dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
-    } else {
-      dnnConfig.biasSize = 0;
-    }
    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
  }
 }

--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -65,7 +65,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
    size_t num_samples = inference->dims()[0];
    size_t infer_width = inference->dims()[1];
-    cudaMemset((void**)&accuracy_data, 0, sizeof(float));
+    PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
    if (num_samples == 0) {
      return;

--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -75,10 +75,10 @@ class FillConstantBatchSizeLikeOpMaker
              "with the specified value");
    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
    AddAttr<int>("input_dim_idx",
-                 "(int, default 0) the index of input's batch size dimension")
+                 "(int, default 0) The index of input's batch size dimension")
        .SetDefault(0);
    AddAttr<int>("output_dim_idx",
-                 "(int, default 0) the index of output's batch size dimension")
+                 "(int, default 0) The index of output's batch size dimension")
        .SetDefault(0);
    AddAttr<float>("value", "(float, default 0) The value to be filled")
        .SetDefault(0.0f);

--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -34,9 +34,9 @@ class LstmUnitOp : public framework::OperatorWithKernel {
    auto c_prev_dims = ctx->GetInputDim("C_prev");
    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
-    PADDLE_ENFORCE(x_dims[0] == c_prev_dims[0],
+    PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0],
                      "Batch size of inputs and states must be equal");
-    PADDLE_ENFORCE(x_dims[1] == c_prev_dims[1] * 4,
+    PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4,
                      "Dimension of FC should equal to prev state * 4");
    int b_size = c_prev_dims[0];  // batch size

--- a/paddle/operators/pool_cudnn_op.cu
+++ b/paddle/operators/pool_cudnn_op.cu
@@ -37,11 +37,11 @@ class PoolCudnnOpKernel : public framework::OpKernel<T> {
    const T *input_data = input->data<T>();
    T *output_data = output->mutable_data<T>(ctx.GetPlace());
-    std::string pooling_type = ctx.Attr<std::string>("poolingType");
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    if (ctx.Attr<bool>("globalPooling")) {
+    if (ctx.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
        ksize[i] = static_cast<int>(input->dims()[i + 2]);
@@ -92,12 +92,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel<T> {
        ctx.Input<Tensor>(framework::GradVarName("Out"));
    Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
-    std::string pooling_type = ctx.Attr<std::string>("poolingType");
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
-    if (ctx.Attr<bool>("globalPooling")) {
+    if (ctx.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
        ksize[i] = static_cast<int>(input->dims()[i + 2]);

--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -29,7 +29,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
  auto in_x_dims = ctx->GetInputDim("X");
-  std::string pooling_type = ctx->Attrs().Get<std::string>("poolingType");
+  std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
  std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
@@ -37,7 +37,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
  PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                 "Pooling intput should be 4-D or 5-D tensor.");
-  if (ctx->Attrs().Get<bool>("globalPooling")) {
+  if (ctx->Attrs().Get<bool>("global_pooling")) {
    ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
    for (size_t i = 0; i < ksize.size(); ++i) {
      paddings[i] = 0;
@@ -83,20 +83,20 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
            "H is the height of the feature, "
            "and W is the width of the feature.");
-  AddAttr<std::string>("poolingType",
+  AddAttr<std::string>("pooling_type",
                       "(string), pooling type, can be \"max\" for max-pooling "
                       "and \"avg\" for average-pooling.")
      .InEnum({"max", "avg"});
  AddAttr<std::vector<int>>("ksize",
                            "(vector<int>) The pooling window "
                            "size(height, width) of the pooling operator. "
-                            "If globalPooling = true, ksize and paddings will "
+                            "If global_pooling = true, ksize and paddings will "
                            "be ignored.");  // TODO(Chengduo): Add checker.
                                             // (Currently,
  // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>("globalPooling",
+  AddAttr<bool>("global_pooling",
                "(bool, default false) Whether to use the global pooling. "
-                "If globalPooling = true, ksize and paddings will be ignored.")
+                "If global_pooling = true, ksize and paddings will be ignored.")
      .SetDefault(false);
  AddAttr<std::vector<int>>("strides",
                            "(vector<int>, default {1, 1}), strides(height, "
@@ -107,7 +107,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
      "paddings",
      "(vector<int>, defalut {0,0}), paddings(height, width) of pooling "
      "operator."
-      "If globalPooling = true, paddings and ksize will be ignored.")
+      "If global_pooling = true, paddings and ksize will be ignored.")
      .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
  // TypedAttrChecker don't support vector type.)
@@ -115,7 +115,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
 Pool2d Operator.
 The pooling2d operation calculates the output based on
-the input, poolingType and ksize, strides, paddings parameters.
+the input, pooling_type and ksize, strides, paddings parameters.
 Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
 number of channels, H is the height of the feature, and W is the width of the feature.
 Parameters(ksize, strides, paddings) are two elements.
@@ -152,7 +152,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
            "the number of channels, and D, H and W is the depth, height and "
            "width of the feature, respectively.");
-  AddAttr<std::string>("poolingType",
+  AddAttr<std::string>("pooling_type",
                       "(string) Pooling type, can be \"max\" for max-pooling "
                       "and \"avg\" for average-pooling.")
      .InEnum({"max", "avg"});
@@ -160,13 +160,14 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
      "ksize",
      "(vector<int>) The pooling window size(depth, height, "
      "width) of pooling operator. "
-      "If globalPooling = true, ksize and paddings will "
+      "If global_pooling = true, ksize and paddings will "
      "be ignored.");  // TODO(Chengduo): Add checker.
                       // (Currently,
  // TypedAttrChecker don't support vector type.)
-  AddAttr<bool>("globalPooling",
+  AddAttr<bool>(
+      "global_pooling",
      "(bool, default false) Whether to use the global pooling. "
-                "If globalPooling = true, ksize and paddings wille be ignored.")
+      "If global_pooling = true, ksize and paddings wille be ignored.")
      .SetDefault(false);
  AddAttr<std::vector<int>>(
      "strides",
@@ -178,7 +179,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
      "paddings",
      "(vector<int>, defalut {0,0,0}), paddings(depth, height, "
      "width) of pooling operator. "
-      "If globalPooling = true, ksize and paddings will be ignored.")
+      "If global_pooling = true, ksize and paddings will be ignored.")
      .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
                               // TypedAttrChecker don't support vector type.)
@@ -186,7 +187,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
 Pool3d Operator.
 The pooling3d operation calculates the output based on
-the input, poolingType, ksize, strides, and paddings parameters.
+the input, pooling_type, ksize, strides, and paddings parameters.
 Input(X) and output(Out) are in NCDHW format, where N is batch
 size, C is the number of channels, and D, H and W are the depth, height and
 width of the feature, respectively. Parameters(ksize, strides, paddings) 

--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -57,11 +57,11 @@ class PoolKernel : public framework::OpKernel<T> {
    const Tensor* in_x = context.Input<Tensor>("X");
    Tensor* out = context.Output<Tensor>("Out");
-    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
@@ -119,12 +119,12 @@ class PoolGradKernel : public framework::OpKernel<T> {
        context.Input<Tensor>(framework::GradVarName("Out"));
    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
-    std::string pooling_type = context.Attr<std::string>("poolingType");
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);

--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -44,7 +44,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                   "Pooling intput should be 4-D or 5-D tensor.");
-    if (ctx->Attrs().Get<bool>("globalPooling")) {
+    if (ctx->Attrs().Get<bool>("global_pooling")) {
      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
@@ -110,14 +110,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<std::vector<int>>("ksize",
                              "(vector<int>) The pooling window size(height, "
                              "width) of pooling operator. "
-                              "If globalPooling = true, ksize and paddings "
+                              "If global_pooling = true, ksize and paddings "
                              "will be ignored.");  // TODO(Chengduo): Add
                                                    // checker. (Currently,
    // TypedAttrChecker don't support vector type.)
    AddAttr<bool>(
-        "globalPooling",
+        "global_pooling",
        "(bool, default false) Whether to use the global pooling. "
-        "If globalPooling = true, ksize and paddings will be ignored.")
+        "If global_pooling = true, ksize and paddings will be ignored.")
        .SetDefault(false);
    AddAttr<std::vector<int>>("strides",
                              "(vector<int>, default {1, 1}), strides(height, "
@@ -128,7 +128,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
        "paddings",
        "(vector<int>, defalut {0, 0}), paddings(height, width) of pooling "
        "operator. "
-        "If globalPooling = true, paddings and will be ignored.")
+        "If global_pooling = true, paddings and will be ignored.")
        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
    // TypedAttrChecker don't support vector type.)
@@ -188,14 +188,14 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<std::vector<int>>("ksize",
                              "(vector<int>) The pooling window size(depth, "
                              "height, width) of pooling operator. "
-                              "If globalPooling = true, ksize and paddings "
+                              "If global_pooling = true, ksize and paddings "
                              "will be ignored.");  // TODO(Chengduo): Add
                                                    // checker. (Currently,
    // TypedAttrChecker don't support vector type.)
    AddAttr<bool>(
-        "globalPooling",
+        "global_pooling",
        "(bool, default false) Whether to use the global pooling. "
-        "If globalPooling = true, ksize and paddings will be ignored.")
+        "If global_pooling = true, ksize and paddings will be ignored.")
        .SetDefault(false);
    AddAttr<std::vector<int>>("strides",
                              "(vector<int>, default {1,1,1}), strides(depth, "
@@ -206,7 +206,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
        "paddings",
        "(vector, defalut {0,0,0}), paddings(depth, "
        "height, width) of pooling operator. "
-        "If globalPooling = true, paddings and ksize will be ignored.")
+        "If global_pooling = true, paddings and ksize will be ignored.")
        .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
    // TypedAttrChecker don't support vector type.)

--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
@@ -35,7 +35,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
@@ -72,7 +72,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    if (context.Attr<bool>("globalPooling")) {
+    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
        ksize[i] = static_cast<int>(in_x_grad->dims()[i + 2]);

--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
@@ -134,9 +134,7 @@ def _create_op_func_(op_type):
    o_name = not_intermediate_outputs[0].name
    intermediate_output_names = [output.name for output in intermediate_outputs]
-    def func(**kwargs):
+    def infer_and_check_data_type(op_proto, **kwargs):
-        helper = LayerHelper(op_type, **kwargs)
-        inputs = dict()
        dtype = None
        for ipt in op_proto.inputs:
            name = _convert_(ipt.name)
@@ -153,6 +151,20 @@ def _create_op_func_(op_type):
                elif dtype != each.data_type:
                    raise ValueError(
                        "operator {0} must input same dtype".format(op_type))
+        return dtype
+    def func(**kwargs):
+        helper = LayerHelper(op_type, **kwargs)
+        dtype = infer_and_check_data_type(op_proto, **kwargs)
+        inputs = dict()
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
            inputs[ipt.name] = val
        outputs = dict()
@@ -178,6 +190,20 @@ _create_op_func_('reshape')
 _create_op_func_('elementwise_add')
 _create_op_func_('sigmoid')
 _create_op_func_('scale')
+_create_op_func_('reshape')
+_create_op_func_('transpose')
+def fill_constant(data_type, shape, value=None, program=None):  # appends a 'fill_constant' op and returns its output variable
+    helper = LayerHelper('fill_constant', **locals())  # NOTE(review): forwards every argument (incl. `program`) as a kwarg; other layers in this file pass `main_program` -- confirm LayerHelper reads `program`
+    out = helper.create_tmp_variable(dtype=data_type)  # output variable typed with the requested data_type
+    helper.append_op(
+        type='fill_constant',
+        outputs={'Out': [out]},  # no inputs are given; the op is configured through attrs only
+        attrs={'data_type': data_type,
+               'shape': shape,
+               'value': value})  # NOTE(review): `value` defaults to None and is passed through as-is -- confirm the op accepts that
+    return out
 def cast(x, data_type, main_program=None):
@@ -414,9 +440,9 @@ def pool2d(input,
        inputs={"X": input},
        outputs={"Out": pool_out},
        attrs={
-            "poolingType": pool_type,
+            "pooling_type": pool_type,
            "ksize": pool_size,
-            "globalPooling": global_pooling,
+            "global_pooling": global_pooling,
            "strides": pool_stride,
            "paddings": pool_padding
        })
@@ -762,6 +788,46 @@ class StaticRNN(object):
            })
+def lstm(x,  # sequence input, consumed one step at a time via rnn.step_input
+         c_pre_init,  # initial value of the cell-state memory
+         hidden_dim,  # the fc below produces hidden_dim * 4 outputs
+         forget_bias=None,  # NOTE(review): passed to the op unchanged, even when None -- confirm the op accepts that
+         main_program=None,
+         startup_program=None):
+    helper = LayerHelper('lstm_unit', **locals())
+    rnn = StaticRNN()  # NOTE(review): created without main_program/startup_program, unlike concat/fc below -- confirm this is intended
+    with rnn.step():
+        c_pre = rnn.memory(init=c_pre_init)  # memory carried across steps, seeded from c_pre_init
+        x_t = rnn.step_input(x)
+        before_fc = concat(
+            input=[x_t, c_pre],  # the step input is concatenated with the previous cell state (not a hidden state)
+            axis=1,
+            main_program=main_program,
+            startup_program=startup_program)
+        after_fc = fc(input=before_fc,
+                      size=hidden_dim * 4,
+                      main_program=main_program,
+                      startup_program=startup_program)
+        data_type = x.data_type
+        c = helper.create_tmp_variable(data_type)  # new cell state
+        h = helper.create_tmp_variable(data_type)  # step output
+        helper.append_op(
+            type='lstm_unit',
+            inputs={"X": after_fc,
+                    "C_prev": c_pre},
+            outputs={"C": c,
+                     "H": h},
+            attrs={"forget_bias": forget_bias})
+        rnn.update_memory(c_pre, c)  # the next step sees this step's C as its memory
+        rnn.output(h)
+    return rnn()  # calling the StaticRNN yields the collected step outputs
 def lod_rank_table(x, level=0, main_program=None):
    helper = LayerHelper("lod_rank_table", **locals())
    table = helper.create_variable(

--- a/python/paddle/v2/framework/tests/test_accuracy_op.py
+++ b/python/paddle/v2/framework/tests/test_accuracy_op.py
@@ -26,5 +26,4 @@ class TestAccuracyOp(OpTest):
 if __name__ == '__main__':
-    exit(0)
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
@@ -61,8 +61,8 @@ class TestPool2d_Op(OpTest):
            'strides': self.strides,
            'paddings': self.paddings,
            'ksize': self.ksize,
-            'poolingType': self.pool_type,
+            'pooling_type': self.pool_type,
-            'globalPooling': self.global_pool,
+            'global_pooling': self.global_pool,
        }
        self.outputs = {'Out': output.astype('float32')}

--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
@@ -67,8 +67,8 @@ class TestPool3d_Op(OpTest):
            'strides': self.strides,
            'paddings': self.paddings,
            'ksize': self.ksize,
-            'poolingType': self.pool_type,
+            'pooling_type': self.pool_type,
-            'globalPooling': self.global_pool,
+            'global_pooling': self.global_pool,
        }
        self.outputs = {'Out': output.astype('float32')}

--- a/python/paddle/v2/framework/tests/test_pool_max_op.py
+++ b/python/paddle/v2/framework/tests/test_pool_max_op.py
@@ -86,7 +86,7 @@ class TestMaxPoolWithIndex_Op(OpTest):
            'strides': self.strides,
            'paddings': self.paddings,
            'ksize': self.ksize,
-            'globalPooling': self.global_pool,
+            'global_pooling': self.global_pool,
        }
        self.inputs = {'X': input}

--- a/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
+import paddle.v2 as paddle
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.core as core
+import paddle.v2.framework.optimizer as optimizer
+from paddle.v2.framework.framework import g_main_program, g_startup_program
+from paddle.v2.framework.executor import Executor
+import numpy as np
+def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
+    data = layers.data(
+        name="words",
+        shape=[seq_len * batch_size, 1],
+        append_batch_size=False,
+        data_type="int64")
+    label = layers.data(
+        name="label",
+        shape=[batch_size, 1],
+        append_batch_size=False,
+        data_type="int64")
+    emb = layers.embedding(input=data, size=[dict_dim, emb_dim])
+    emb = layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
+    emb = layers.transpose(x=emb, axis=[1, 0, 2])
+    c_pre_init = layers.fill_constant(
+        dtype=emb.data_type, shape=[batch_size, emb_dim], value=0.0)
+    layer_1_out = layers.lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
+    layer_1_out = layers.transpose(x=layer_1_out, axis=[1, 0, 2])
+    prediction = layers.fc(input=layer_1_out, size=class_dim, act="softmax")
+    cost = layers.cross_entropy(input=prediction, label=label)
+    avg_cost = layers.mean(x=cost)
+    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
+    opts = adam_optimizer.minimize(avg_cost)
+    acc = layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc
+def to_lodtensor(data, place):  # pack a list of sequences into one LoDTensor with a single LoD level
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]  # cumulative offsets, starting at 0
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)  # end offset of this sequence in the flattened data
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])  # column vector: one row per element
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])  # the offsets are wrapped in a list, i.e. exactly one LoD level
+    return res
+def chop_data(data, chop_len=80, batch_len=50):  # returns at most batch_len samples, each cut to chop_len items
+    data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len]  # drop samples whose first element is shorter than chop_len; truncate the rest, keep x[1] unchanged
+    return data[:batch_len]  # may return fewer than batch_len samples if too few survive the filter
+def prepare_feed_data(data, place):
+    tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
+    label = np.array(map(lambda x: x[1], data)).astype("int64")
+    label = label.reshape([50, 1])
+    tensor_label = core.LoDTensor()
+    tensor_label.set(label, place)
+    return tensor_words, tensor_label
+def main():  # builds the LSTM net, then trains on one fixed batch until accuracy exceeds 0.9
+    word_dict = paddle.dataset.imdb.word_dict()
+    cost, acc = lstm_net(dict_dim=len(word_dict), class_dim=2)
+    batch_size = 100  # reader batch size; chop_data below keeps at most its default 50 samples
+    train_data = paddle.batch(
+        paddle.reader.buffered(
+            paddle.dataset.imdb.train(word_dict), size=batch_size * 10),
+        batch_size=batch_size)
+    data = chop_data(next(train_data()))  # only the first batch from the reader is ever used
+    place = core.CPUPlace()
+    tensor_words, tensor_label = prepare_feed_data(data, place)
+    exe = Executor(place)
+    exe.run(g_startup_program)  # the startup program is run once, before the training loop
+    while True:  # NOTE(review): no iteration cap -- loops forever if accuracy never exceeds 0.9
+        outs = exe.run(g_main_program,
+                       feed={"words": tensor_words,
+                             "label": tensor_label},  # the same two tensors are fed on every iteration
+                       fetch_list=[cost, acc])
+        cost_val = np.array(outs[0])
+        acc_val = np.array(outs[1])
+        print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+        if acc_val > 0.9:  # the only exit condition of the loop
+            break
+if __name__ == '__main__':
+    main()