Merge branch 'develop' into ProtoDataProvider

b7cb9561 · Luo Tao · cd6d69a9 · 374e1685 · b7cb9561 · b7cb9561
226 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -21,7 +21,7 @@ third_party/
 cmake-build-*

 # generated while compiling
-python/paddle/v2/framework/core.so
+python/paddle/v2/fluid/core.so
 paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake

--- a/doc/design/evaluator.md
+++ b/doc/design/evaluator.md
+## Evaluator Design
+
+### The Problem
+
+During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted.
+
+### Evaluator Design
+Currently, every operation is expressed in the graph. we divide the evaluator process into three steps.
+
+1. Initialize the metric state and add it into the block.
+
+2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once.
+
+
+3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
+
+### Implementation
+This design is shown in python API. 
+Each metric operator need to caculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass. 
+
+    
+```python
+class Evaluator(object):
+    """
+    Evaluator Base class.
+    """
+    def __init__(self, name, **kwargs):
+       """
+       Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts.
+       Auc need four variables, `true_positives`,
+         `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program
+
+       The initialization of Evaluator should be responsible for:
+       create metric states and append to the main_program
+       """ 
+       pass
+
+    def _update_ops(self, input, label, **kwargs)
+       """
+       Add mini-batch evaluator caculate operators to the main_program.
+       Add increment operator to accumulate the metric states.
+       """
+    
+
+    def reset(self, executor, reset_program=None):
+      """
+      Reset metric states at the begin of each pass/user specified batch number.
+      Execute the reset_program to reset the states.
+      """
+      
+
+    def eval(self, executor, eval_program=None):
+      """
+      Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
+      Execute the eval_program and return the result.
+      """
+      return eval_result
+```
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -121,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat,

 paddle_matrix paddle_matrix_create_sparse(
    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+#ifndef PADDLE_MOBILE_INFERENCE
  auto ptr = new paddle::capi::CMatrix();
  ptr->mat = paddle::Matrix::createSparseMatrix(
      height,
@@ -131,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse(
      false,
      useGpu);
  return ptr;
+#else
+  return nullptr;
+#endif
 }

 paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
@@ -140,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                            uint64_t colSize,
                                            float* valueArray,
                                            uint64_t valueSize) {
+#ifndef PADDLE_MOBILE_INFERENCE
  if (mat == nullptr) return kPD_NULLPTR;
  auto ptr = cast(mat);
  if (rowArray == nullptr || colArray == nullptr ||
@@ -160,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
  } else {
    return kPD_NOT_SUPPORTED;
  }
+#else
+  return kPD_NOT_SUPPORTED;
+#endif
 }
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height,
 * @param isBinary is binary (either 1 or 0 in matrix) or not.
 * @param useGpu is using GPU or not.
 * @return paddle_matrix.
+ * @note Mobile inference does not support this interface.
 */
 PD_API paddle_matrix paddle_matrix_create_sparse(
    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
@@ -129,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
 * NULL if the matrix is binary.
 * @param [in] valueSize length of value array. Zero if the matrix is binary.
 * @return paddle_error
+ * @note Mobile inference does not support this interface.
 */
 PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                                   int* rowArray,

--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -27,7 +27,9 @@ if(WITH_GPU)
    set_source_files_properties(${CUDA_CXX_SOURCES}
                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
 else()
+    if (NOT MOBILE_INFERENCE)
    set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
+    endif()
 endif()

 set(CUDA_CU_SOURCES

--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "hl_base.h"

 /**
- * @brief   Maximum pool forward.
+ * @brief   Maximum pool forward with Mask output.
 *
 * @param[in]   frameCnt    batch size of input image.
 * @param[in]   inputData   input data.
@@ -35,7 +35,7 @@ limitations under the License. */
 * @param[in]   paddingW    padding width.
 * @param[out]  tgtData     output data.
 * @param[in]   tgtStride   stride between output data samples.
- *
+ * @param[out]  maskData    the location indices of select max data.
 */
 extern void hl_maxpool_forward(const int frameCnt,
                               const real* inputData,
@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt,
                               const int paddingH,
                               const int paddingW,
                               real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               real* maskData = NULL);

 /**
 * @brief   Maximum pool backward.

--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt,
                               const int paddingH,
                               const int paddingW,
                               real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               real* MaskData) {}

 inline void hl_maxpool_backward(const int frameCnt,
                                const real* inputData,

--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads,
                                 const int offsetH,
                                 const int offsetW,
                                 real* tgtData,
-                                 const int tgtStride) {
+                                 const int tgtStride,
+                                 real* maskData) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    int pw = index % pooledW;
@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads,
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    real maxval = -FLT_MAX;
+    int max_index = -1;
    inputData += (frameNum * channels + c) * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
-        if (maxval < inputData[h * width + w])
-          maxval = inputData[h * width + w];
+        if (maxval < inputData[h * width + w]) {
+          max_index = h * width + w;
+          maxval = inputData[max_index];
+        }
      }
    }
    int tgtIndex =
        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
    tgtData[tgtIndex] = maxval;
+    if (maskData != NULL) {
+      maskData[tgtIndex] = max_index;
+    }
  }
 }

@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt,
                        const int paddingH,
                        const int paddingW,
                        real* tgtData,
-                        const int tgtStride) {
+                        const int tgtStride,
+                        real* maskData) {
  int num_kernels = pooledH * pooledW * channels * frameCnt;
  int blocks = (num_kernels + 1024 - 1) / 1024;
  dim3 threads(1024, 1);
@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt,
                                                         paddingH,
                                                         paddingW,
                                                         tgtData,
-                                                         tgtStride);
+                                                         tgtStride,
+                                                         maskData);
  CHECK_SYNC("hl_maxpool_forward failed");
 }


--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -61,6 +61,7 @@ public:
    // function arguments
    strides_ = config.get<std::vector<size_t>>("strides");
    paddings_ = config.get<std::vector<size_t>>("paddings");
+    dilations_ = config.get<std::vector<size_t>>("dilations");
    groups_ = config.get<size_t>("groups");

    // number of inputs and outputs
@@ -118,6 +119,7 @@ protected:

  std::vector<size_t> strides_;
  std::vector<size_t> paddings_;
+  std::vector<size_t> dilations_;

  /// Group size, refer to grouped convolution in
  /// Alex Krizhevsky's paper: when group=2, the first half of the
@@ -133,6 +135,10 @@ protected:

  inline int paddingW() const { return paddings_[1]; }

+  inline int dilationH() const { return dilations_[0]; }
+
+  inline int dilationW() const { return dilations_[1]; }
+
  // A temporary memory in convolution calculation.
  MemoryHandlePtr memory_;


--- a/paddle/function/ConvOpTest.h
+++ b/paddle/function/ConvOpTest.h
@@ -79,45 +79,59 @@ void Convolution(const std::string& conv1,
            if (outputChannels < inputChannels) continue;
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
-                if (padding >= filterSize) break;
+                for (size_t dilation : {1, 3}) {
+                  if (padding >= filterSize) break;
+                  size_t filterS = (filterSize - 1) * dilation + 1;

-                // NNPACK only supports stride = 1 if batchSize > 1
-                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
-                    batchSize > 1 && stride > 1)
-                  break;
+                  if (inputSize + 2 * padding < filterS) break;

-                size_t outputSize =
-                    (inputSize - filterSize + 2 * padding + stride) / stride;
-                VLOG(3) << " batchSize=" << batchSize
-                        << " inputChannels=" << inputChannels
-                        << " inputHeight=" << inputSize
-                        << " inputWidth=" << inputSize
-                        << " outputChannels=" << outputChannels
-                        << " filterHeight=" << filterSize
-                        << " filterWidth=" << filterSize
-                        << " outputHeight=" << outputSize
-                        << " outputWidth=" << outputSize << " stride=" << stride
-                        << " padding=" << padding;
+                  if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
+                       conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
+                      dilation > 1)
+                    break;

-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> strides = {stride, stride};
-                Compare2Function<DType1, DType2> test(
-                    conv1,
-                    conv2,
-                    FuncConfig()
-                        .set("paddings", paddings)
-                        .set("strides", strides)
-                        .set("groups", (size_t)1)
-                        .set("algo", (std::string) "auto"));
+                  // NNPACK only supports stride = 1 if batchSize > 1
+                  if ((conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
+                      batchSize > 1 && stride > 1)
+                    break;

-                TensorShape input{
-                    batchSize, inputChannels, inputSize, inputSize};
-                TensorShape filter{
-                    outputChannels, inputChannels, filterSize, filterSize};
-                TensorShape output{
-                    batchSize, outputChannels, outputSize, outputSize};
+                  size_t outputSize =
+                      (inputSize - filterS + 2 * padding + stride) / stride;
+                  VLOG(3) << " batchSize=" << batchSize
+                          << " inputChannels=" << inputChannels
+                          << " inputHeight=" << inputSize
+                          << " inputWidth=" << inputSize
+                          << " outputChannels=" << outputChannels
+                          << " filterHeight=" << filterSize
+                          << " filterWidth=" << filterSize
+                          << " outputHeight=" << outputSize
+                          << " outputWidth=" << outputSize
+                          << " stride=" << stride << " padding=" << padding;

-                function(test, input, filter, output);
+                  std::vector<size_t> paddings = {padding, padding};
+                  std::vector<size_t> strides = {stride, stride};
+                  std::vector<size_t> dilations = {dilation, dilation};
+                  Compare2Function<DType1, DType2> test(
+                      conv1,
+                      conv2,
+                      FuncConfig()
+                          .set("paddings", paddings)
+                          .set("strides", strides)
+                          .set("dilations", dilations)
+                          .set("groups", (size_t)1)
+                          .set("algo", (std::string) "auto"));
+
+                  TensorShape input{
+                      batchSize, inputChannels, inputSize, inputSize};
+                  TensorShape filter{
+                      outputChannels, inputChannels, filterSize, filterSize};
+                  TensorShape output{
+                      batchSize, outputChannels, outputSize, outputSize};
+
+                  function(test, input, filter, output);
+                }
              }
            }
          }
@@ -144,6 +158,7 @@ void Convolution2(const std::string& conv1,
              for (size_t outputChannels : {7}) {
                size_t stride = 1;
                size_t padding = 0;
+                size_t dilation = 1;
                size_t outputHeight =
                    (inputHeight - filterHeight + 2 * padding + stride) /
                    stride;
@@ -162,6 +177,7 @@ void Convolution2(const std::string& conv1,

                std::vector<size_t> paddings = {padding, padding};
                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {dilation, dilation};
                Compare2Function<DType1, DType2> test(
                    conv1,
                    conv2,
@@ -169,6 +185,7 @@ void Convolution2(const std::string& conv1,
                        .set("paddings", paddings)
                        .set("strides", strides)
                        .set("groups", (size_t)1)
+                        .set("dilations", dilations)
                        .set("algo", (std::string) "auto"));

                TensorShape input{
@@ -223,6 +240,7 @@ void DepthwiseConvolution(const std::string& conv1,

                std::vector<size_t> paddings = {padding, padding};
                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {1, 1};
                size_t groups = inputChannels;
                Compare2Function<DType1, DType2> test(
                    conv1,
@@ -231,6 +249,7 @@ void DepthwiseConvolution(const std::string& conv1,
                        .set("paddings", paddings)
                        .set("strides", strides)
                        .set("groups", groups)
+                        .set("dilations", dilations)
                        .set("algo", (std::string) "auto"));

                TensorShape input{

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -100,7 +100,9 @@ public:
                 strideH(),
                 strideW(),
                 paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
        } else {
          colData = inputData + g * inputOffset;
        }
@@ -223,7 +225,9 @@ public:
                 strideH(),
                 strideW(),
                 paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
        }
      }
      inputGrad += inputChannels * inputHeight * inputWidth;
@@ -310,7 +314,9 @@ public:
                 strideH(),
                 strideW(),
                 paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
        } else {
          colData = inputData + g * inputOffset;
        }

--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -78,7 +78,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth);
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
 };

 template <ColFormat Format, DeviceType Device, class T>
@@ -91,7 +93,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth);
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
 };

 }  // namespace paddle
--- a/paddle/function/Im2ColOp.cpp
+++ b/paddle/function/Im2ColOp.cpp
@@ -31,7 +31,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -47,8 +49,8 @@ public:
      int c_im = c / filterWidth / filterHeight;
      for (int h = 0; h < outputHeight; ++h) {
        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
          if ((imRowIdx - paddingHeight) < 0 ||
              (imRowIdx - paddingHeight) >= inputHeight ||
              (imColIdx - paddingWidth) < 0 ||
@@ -81,7 +83,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -97,8 +101,8 @@ public:
      int c_im = c / filterWidth / filterHeight;
      for (int h = 0; h < outputHeight; ++h) {
        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
          if ((imRowIdx - paddingHeight) >= 0 &&
              (imRowIdx - paddingHeight) < inputHeight &&
              (imColIdx - paddingWidth) >= 0 &&
@@ -134,7 +138,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -147,9 +153,10 @@ public:
        for (int channel = 0; channel < inputChannels; ++channel) {
          for (int filterH = 0; filterH < filterHeight; ++filterH) {
            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
              int colDataOffset =
                  (((outputH * outputWidth + outputW) * inputChannels +
                    channel) *
@@ -189,7 +196,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -202,9 +211,10 @@ public:
        for (int channel = 0; channel < inputChannels; ++channel) {
          for (int filterH = 0; filterH < filterHeight; ++filterH) {
            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
              int colDataOffset =
                  (((outputH * outputWidth + outputW) * inputChannels +
                    channel) *

--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -28,6 +28,8 @@ __global__ void im2col(const T* data_im,
                       int strideW,
                       int paddingH,
                       int paddingW,
+                       int dilationH,
+                       int dilationW,
                       int height_col,
                       int width_col,
                       T* data_col) {
@@ -44,8 +46,8 @@ __global__ void im2col(const T* data_im,
    data_col += (channel_out * height_col + h_out) * width_col + w_out;
    for (int i = 0; i < blockH; ++i) {
      for (int j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in + i);
-        int cIdx = int(w_in + j);
+        int rIdx = int(h_in + i * dilationH);
+        int cIdx = int(w_in + j * dilationW);
        if ((rIdx - (int)paddingH) >= (int)height ||
            (rIdx - (int)paddingH) < 0 ||
            (cIdx - (int)paddingW) >= (int)width ||
@@ -77,7 +79,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -102,6 +106,8 @@ public:
                                                    strideWidth,
                                                    paddingHeight,
                                                    paddingWidth,
+                                                    dilationHeight,
+                                                    dilationWidth,
                                                    outputHeight,
                                                    outputWidth,
                                                    colData);
@@ -121,6 +127,8 @@ __global__ void col2im(size_t n,
                       size_t strideW,
                       size_t paddingH,
                       size_t paddingW,
+                       size_t dilationH,
+                       size_t dilationW,
                       size_t height_col,
                       size_t width_col,
                       T* data_im) {
@@ -131,23 +139,34 @@ __global__ void col2im(size_t n,
    int w = int(index % width);
    int h = int((index / width) % height);
    int c = int(index / (width * height));
+    int filterH = (blockH - 1) * dilationH + 1;
+    int filterW = (blockW - 1) * dilationW + 1;
+
    if ((w - (int)paddingW) >= 0 &&
        (w - (int)paddingW) < (width - 2 * paddingW) &&
        (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
      // compute the start and end of the output
      int w_col_start =
-          (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
+          (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
      int h_col_start =
-          (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+          (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
      int h_col_end = min(int(h / strideH + 1), int(height_col));
+
      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH * blockW) +
-                      (h - h_col * (int)strideH) * (int)blockW +
-                      (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+          int h_k = (h - h_col * strideH);
+          int w_k = (w - w_col * strideW);
+          if (h_k % dilationH == 0 && w_k % dilationW == 0) {
+            h_k /= dilationH;
+            w_k /= dilationW;
+            int c_col =
+                (((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
+                    width_col +
+                w_col;
+            val += data_col[c_col];
+          }
        }
      }
      h -= paddingH;
@@ -173,7 +192,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -205,6 +226,8 @@ public:
        strideWidth,
        paddingHeight,
        paddingWidth,
+        dilationHeight,
+        dilationWidth,
        outputHeight,
        outputWidth,
        imData);
@@ -229,6 +252,8 @@ __global__ void im2colOCF(const T* imData,
                          int strideWidth,
                          int paddingHeight,
                          int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
                          int outputHeight,
                          int outputWidth) {
  int swId = blockIdx.x;
@@ -237,8 +262,10 @@ __global__ void im2colOCF(const T* imData,
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset = idx + swId * strideWidth - paddingWidth;
-        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int widthOffset =
+            idx * dilationHeight + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationWidth + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;

@@ -273,7 +300,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -312,6 +341,8 @@ public:
                                                       strideWidth,
                                                       paddingHeight,
                                                       paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
                                                       outputHeight,
                                                       outputWidth);
    CHECK_SYNC("Im2ColFunctor GPU failed");
@@ -330,6 +361,8 @@ __global__ void col2imOCF(T* imData,
                          int strideWidth,
                          int paddingHeight,
                          int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
                          int outputHeight,
                          int outputWidth) {
  int swId = blockIdx.x;
@@ -338,8 +371,10 @@ __global__ void col2imOCF(T* imData,
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset = idx + swId * strideWidth - paddingWidth;
-        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int widthOffset =
+            idx * dilationWidth + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationHeight + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;

@@ -372,7 +407,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -411,6 +448,8 @@ public:
                                                       strideWidth,
                                                       paddingHeight,
                                                       paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
                                                       outputHeight,
                                                       outputWidth);
    CHECK_SYNC("Col2ImFunctor GPU failed");

--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -29,82 +29,98 @@ void TestIm2ColFunctor() {
          for (size_t filterWidth : {3, 7}) {
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
-                if (inputHeight <= filterHeight || inputWidth <= filterWidth)
-                  break;
-                if (padding >= filterHeight || padding >= filterWidth) break;
-                size_t outputHeight =
-                    (inputHeight - filterHeight + 2 * padding + stride) /
-                    stride;
-                size_t outputWidth =
-                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
-
-                TensorShape imShape =
-                    TensorShape({channels, inputHeight, inputWidth});
-                TensorShape colShape1 = TensorShape({channels,
-                                                     filterHeight,
-                                                     filterWidth,
-                                                     outputHeight,
-                                                     outputWidth});
-                TensorShape colShape2 = TensorShape({outputHeight,
-                                                     outputWidth,
-                                                     channels,
-                                                     filterHeight,
-                                                     filterWidth});
-
-                size_t height = channels * filterHeight * filterWidth;
-                size_t width = outputHeight * outputWidth;
-                VectorPtr input1 = Vector::create(imShape.getElements(), false);
-                VectorPtr input2 = Vector::create(imShape.getElements(), false);
-                MatrixPtr output1 = Matrix::create(height, width, false, false);
-                MatrixPtr output2 = Matrix::create(width, height, false, false);
-                input1->uniform(0.001, 1);
-                input2->copyFrom(*input1);
-
-                Im2ColFunctor<kCFO, Device, T> im2Col1;
-                Im2ColFunctor<kOCF, Device, T> im2Col2;
-                im2Col1(input1->getData(),
-                        imShape,
-                        output1->getData(),
-                        colShape1,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-                im2Col2(input2->getData(),
-                        imShape,
-                        output2->getData(),
-                        colShape2,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-
-                // The transposition of the result of ColFormat == kCFO
-                // is equal to the result of ColFormat == kOCF.
-                MatrixPtr test;
-                output2->transpose(test, true);
-                autotest::TensorCheckErr(*output1, *test);
-
-                Col2ImFunctor<kCFO, Device, T> col2Im1;
-                Col2ImFunctor<kOCF, Device, T> col2Im2;
-                col2Im1(input1->getData(),
-                        imShape,
-                        output1->getData(),
-                        colShape1,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-                col2Im2(input2->getData(),
-                        imShape,
-                        output2->getData(),
-                        colShape2,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-
-                autotest::TensorCheckErr(*input1, *input2);
+                for (size_t dilation : {1, 3}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+                  TensorShape colShape2 = TensorShape({outputHeight,
+                                                       outputWidth,
+                                                       channels,
+                                                       filterHeight,
+                                                       filterWidth});
+
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(width, height, false, false);
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, Device, T> im2Col1;
+                  Im2ColFunctor<kOCF, Device, T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+
+                  // The transposition of the result of ColFormat == kCFO
+                  // is equal to the result of ColFormat == kOCF.
+                  MatrixPtr test;
+                  output2->transpose(test, true);
+                  autotest::TensorCheckErr(*output1, *test);
+
+                  Col2ImFunctor<kCFO, Device, T> col2Im1;
+                  Col2ImFunctor<kOCF, Device, T> col2Im2;
+
+                  col2Im1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  col2Im2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  autotest::TensorCheckErr(*input1, *input2);
+                }
              }
            }
          }

--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -84,9 +84,49 @@ if(MOBILE_INFERENCE)
         gradientmachines/GradientMachineMode.cpp
         gradientmachines/MultiGradientMachine.cpp)

-    # Remove useless layers
+    # Remove layers that used in training
    list(REMOVE_ITEM GSERVER_SOURCES
-    	 layers/RecurrentLayerGroup.cpp)
+    	 layers/RecurrentLayerGroup.cpp
+         layers/CostLayer.cpp
+         layers/MultiBoxLossLayer.cpp
+         layers/WarpCTCLayer.cpp
+         layers/CTCLayer.cpp
+         layers/LinearChainCTC.cpp
+         layers/PrintLayer.cpp)
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/OuterProdLayer.cpp
+         layers/SumToOneNormLayer.cpp
+         layers/ConvShiftLayer.cpp
+         layers/InterpolationLayer.cpp
+         layers/AgentLayer.cpp
+         layers/DotMulOperator.cpp
+         layers/GruStepLayer.cpp
+         layers/LstmStepLayer.cpp
+         layers/ConvexCombinationLayer.cpp
+         layers/Conv3DLayer.cpp
+         layers/DeConv3DLayer.cpp
+         layers/CropLayer.cpp
+         layers/CrossEntropyOverBeam.cpp
+         layers/DataNormLayer.cpp
+         layers/FeatureMapExpandLayer.cpp
+         layers/HierarchicalSigmoidLayer.cpp
+         layers/MultinomialSampler.cpp
+         layers/NCELayer.cpp
+         layers/KmaxSeqScoreLayer.cpp
+         layers/MDLstmLayer.cpp
+         layers/MultiplexLayer.cpp
+         layers/PadLayer.cpp
+         layers/Pool3DLayer.cpp
+         layers/ResizeLayer.cpp
+         layers/RotateLayer.cpp
+         layers/RowConvLayer.cpp
+         layers/RowL2NormLayer.cpp
+         layers/SamplingIdLayer.cpp
+         layers/ScaleShiftLayer.cpp
+         layers/SelectiveFullyConnectedLayer.cpp
+         layers/SpatialPyramidPoolLayer.cpp
+         layers/BilinearInterpLayer.cpp
+         layers/ClipLayer.cpp)
 endif()

 if(WITH_GPU)

--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -16,7 +16,6 @@ limitations under the License. */

 #include "NeuralNetwork.h"
 #include "hl_gpu.h"
-#include "paddle/gserver/layers/AgentLayer.h"
 #include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -28,6 +27,7 @@ limitations under the License. */
 #ifndef PADDLE_MOBILE_INFERENCE
 #include "MultiNetwork.h"
 #include "RecurrentGradientMachine.h"
+#include "paddle/gserver/layers/AgentLayer.h"
 #endif

 namespace paddle {
@@ -192,9 +192,11 @@ void NeuralNetwork::init(const ModelConfig& config,
 void NeuralNetwork::connect(LayerPtr agentLayer,
                            LayerPtr realLayer,
                            int height) {
+#ifndef PADDLE_MOBILE_INFERENCE
  AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
  CHECK_NOTNULL(agent);
  agent->setRealLayer(realLayer, height);
+#endif
 }

 void NeuralNetwork::connect(std::string agentLayerName,

--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -79,6 +79,10 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
  for (int i = 0; i < config_.inputs_size(); i++) {
    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
+    std::vector<size_t> dilations = {(size_t)dilationY_[i],
+                                     (size_t)dilation_[i]};
+
+    bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);

    // Convolution Layer uses the GemmConv function by default.
    convType = "GemmConv";
@@ -97,13 +101,14 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
      if ((filterSize_[i] == filterSizeY_[i]) &&
          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
-          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
+          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
+          !useDilation) {
        convType = "NeonDepthwiseConv";
      }
 #endif
    }

-    if (FLAGS_use_nnpack && !isDeconv_) {
+    if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
      createFunction(forward_,
                     "NNPACKConv",
                     FuncConfig()
@@ -117,6 +122,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
+                         .set("dilations", dilations)
                         .set("groups", (size_t)groups_[i]));

      createFunction(backward_,
@@ -124,6 +130,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
+                         .set("dilations", dilations)
                         .set("groups", (size_t)groups_[i]));

      createFunction(backward_,
@@ -131,6 +138,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
+                         .set("dilations", dilations)
                         .set("groups", (size_t)groups_[i]));
    }
  }

--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -98,6 +98,7 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
 LayerPtr Layer::create(const LayerConfig& config) {
  std::string type = config.type();

+#ifndef PADDLE_MOBILE_INFERENCE
  // NOTE: As following types have illegal character '-',
  // they can not use REGISTER_LAYER to registrar.
  // Besides, to fit with old training models,
@@ -106,7 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) {
    return LayerPtr(new MultiClassCrossEntropy(config));
  else if (type == "rank-cost")
    return LayerPtr(new RankingCost(config));
-#ifndef PADDLE_MOBILE_INFERENCE
  else if (type == "auc-validation")
    return LayerPtr(new AucValidation(config));
  else if (type == "pnpair-validation")

--- a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MaxPoolWithMaskLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  PoolLayer::init(layerMap, parameterMap);
+  setOutput("mask", &mask_);
+  return true;
+}
+
+size_t MaxPoolWithMaskLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t layerSize = 0;
+
+  outputY_ = outputSize(imgSizeY_,
+                        sizeY_,
+                        confPaddingY_,
+                        strideY_,
+                        /* caffeMode */ false);
+  outputX_ = outputSize(imgSize_,
+                        sizeX_,
+                        confPadding_,
+                        stride_,
+                        /* caffeMode */ false);
+
+  layerSize = outputX_ * outputY_ * channels_;
+  getOutput().setFrameHeight(outputY_);
+  getOutput().setFrameWidth(outputX_);
+
+  return layerSize;
+}
+
+void MaxPoolWithMaskLayer::forward(PassType passType) {
+  size_t size = getSize();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  int batchSize = inputV->getHeight();
+  resetOutput(batchSize, size);
+
+  MatrixPtr outV = getOutputValue();
+  CHECK_EQ(size, outV->getWidth());
+
+  resetSpecifyOutput(mask_,
+                     batchSize,
+                     size,
+                     /* isValueClean */ false,
+                     /* isGradClean */ true);
+
+  MatrixPtr maskV = mask_.value;
+  outV->maxPoolForward(*inputV,
+                       imgSizeY_,
+                       imgSize_,
+                       channels_,
+                       sizeX_,
+                       sizeY_,
+                       strideY_,
+                       stride_,
+                       outputY_,
+                       outputX_,
+                       confPaddingY_,
+                       confPadding_,
+                       maskV);
+}
+
+void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  if (NULL == getInputGrad(0)) {
+    return;
+  }
+
+  MatrixPtr outGrad = getOutputGrad();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
+
+  inputGrad->maxPoolBackward(*inputV,
+                             imgSizeY_,
+                             imgSize_,
+                             *outGrad,
+                             *outV,
+                             sizeX_,
+                             sizeY_,
+                             strideY_,
+                             stride_,
+                             outputY_,
+                             outputX_,
+                             1,
+                             1,
+                             confPaddingY_,
+                             confPadding_);
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/MaxPoolWithMaskLayer.h
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "PoolLayer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief Basic parent layer of different kinds of pooling
+ */
+class MaxPoolWithMaskLayer : public PoolLayer {
+protected:
+  Argument mask_;
+
+public:
+  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
+      : PoolLayer(config) {}
+
+  size_t getSize();
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "PoolLayer.h"
+#include "MaxPoolWithMaskLayer.h"
 #include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
 #ifdef PADDLE_WITH_CUDA
@@ -44,7 +45,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-
  return true;
 }

@@ -57,6 +57,8 @@ Layer* PoolLayer::create(const LayerConfig& config) {
  } else if (CudnnPoolLayer::typeCheck(pool)) {
    return new CudnnPoolLayer(config);
 #endif
+  } else if (pool == "max-pool-with-mask") {
+    return new MaxPoolWithMaskLayer(config);
  } else {
    LOG(FATAL) << "Unknown pool type: " << pool;
    return nullptr;

--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
 # gserver pacakge unittests

 add_simple_unittest(test_LinearChainCRF)
-add_simple_unittest(test_MultinomialSampler)
 add_simple_unittest(test_RecurrentLayer)

+if(NOT MOBILE_INFERENCE)
+  add_simple_unittest(test_MultinomialSampler)
+endif()
+
 function(gserver_test TARGET)
  add_unittest_without_exec(${TARGET}
      ${TARGET}.cpp
@@ -24,6 +27,7 @@ gserver_test(test_ConvUnify)
 gserver_test(test_BatchNorm)
 gserver_test(test_KmaxSeqScore)
 gserver_test(test_Expand)
+gserver_test(test_MaxPoolingWithMaskOutput)

 ########## test_Mkldnn layers and activations ##########
 if(WITH_MKLDNN)
@@ -48,7 +52,7 @@ if(WITH_PYTHON)
 endif()

 ############### test_WarpCTCLayer #######################
-if(NOT WITH_DOUBLE)
+if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
    add_unittest_without_exec(test_WarpCTCLayer
        test_WarpCTCLayer.cpp)


--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -434,7 +434,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
  config.layerConfig.set_partial_sum(1);
  config.layerConfig.set_shared_biases(true);

-  int dilation = 1;
+  int dilation = 2;
  if (type == "cudnn_conv") {
 #if CUDNN_VERSION >= 6000
    dilation = 2;
@@ -1234,6 +1234,7 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
 TEST(Layer, PoolLayer) {
  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);

 #ifdef PADDLE_WITH_CUDA
  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
@@ -1242,6 +1243,7 @@ TEST(Layer, PoolLayer) {
  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
 #endif
 }


--- a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
+++ b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;
+
+void setPoolConfig(TestConfig* config,
+                   PoolConfig* pool,
+                   const string& poolType) {
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool");
+  (*config).layerConfig.set_num_filters(1);
+
+  int kw = 3, kh = 3;
+  int pw = 0, ph = 0;
+  int sw = 2, sh = 2;
+  pool->set_pool_type(poolType);
+  pool->set_channels(1);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_start(0);
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+}
+
+void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
+                                       const string& poolType,
+                                       bool use_gpu,
+                                       MatrixPtr& maskMat) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_img_size(5);
+  pool->set_img_size_y(5);
+  setPoolConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  config.layerConfig.set_name("MaxPoolWithMask");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+
+  initDataLayer(config,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "MaxPoolWithMask",
+                1,
+                false,
+                use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
+
+  FLAGS_use_gpu = use_gpu;
+  std::vector<ParameterPtr> parameters;
+  LayerPtr maxPoolingWithMaskOutputLayer;
+  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
+  maxPoolingWithMaskOutputLayer->forward(PASS_GC);
+
+  checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
+                   maskMat);
+}
+
+TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
+  bool useGpu = false;
+  MatrixPtr inputMat;
+  MatrixPtr maskMat;
+  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
+                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
+                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
+  real maskData[] = {12, 4, 22, 24};
+
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->setData(inputData);
+  maskMat->setData(maskData);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#ifdef PADDLE_WITH_CUDA
+  useGpu = true;
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->copyFrom(inputData, 25);
+  maskMat->copyFrom(maskData, 4);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#endif
+}
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1902,5 +1902,52 @@ void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
 }

 template class BaseMatrixT<real>;
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 template class BaseMatrixT<int>;
+
+#else
+
+template <>
+void BaseMatrixT<int>::zero() {
+  applyUnary(unary::Zero<int>());
+}
+
+template <>
+void BaseMatrixT<int>::assign(int p) {
+  applyUnary(unary::Assign<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
+  applyBinary(binary::IsEqual<int>(value), b);
+}
+
+template <>
+void BaseMatrixT<int>::neg() {
+  applyUnary(unary::Neg<int>());
+}
+
+template <>
+void BaseMatrixT<int>::abs2() {
+  applyUnary(unary::Abs<int>());
+}
+
+template <>
+void BaseMatrixT<int>::add(int p) {
+  applyUnary(unary::Add<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::add(int p1, int p2) {
+  applyUnary(unary::Add2<int>(p1, p2));
+}
+
+template <>
+void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
+  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
+}
+
+#endif
 }  // namespace paddle
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -25,6 +25,19 @@ else()
    message(STATUS "Compile with MKLDNNMatrix")
 endif()

+if(MOBILE_INFERENCE)
+    list(REMOVE_ITEM MATH_SOURCES
+         ${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
+    # Remove sparse
+    list(REMOVE_ITEM MATH_HEADERS
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
+    list(REMOVE_ITEM MATH_SOURCES
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
+endif()
 set(MATH_SOURCES
    "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
    "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"

--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "Matrix.h"

@@ -309,3 +312,57 @@ private:
  using Matrix::subMatrix;
 };
 }  // namespace paddle
+
+#else
+
+#include "Matrix.h"
+
+namespace paddle {
+
+class CpuSparseMatrix : public Matrix {
+public:
+  CpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  CpuSparseMatrix(real* data,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  real* getValue() const { return nullptr; }
+  size_t getColStartIdx(size_t i) const { return 0; }
+  size_t getRowStartIdx(size_t i) const { return 0; }
+  size_t getColNum(size_t i) const { return 0; }
+  int* getRowCols(size_t i) const { return nullptr; }
+
+  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
+    return nullptr;
+  }
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
+};
+
+}  // namespace paddle
+
+#endif
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -451,6 +451,7 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) {
 }

 void GpuMatrix::collectBias(Matrix& a, real scale) {
+#ifdef PADDLE_WITH_CUDA
  CHECK_EQ(getHeight(), (size_t)1);
  CHECK_EQ(width_, a.getWidth());
  GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
@@ -461,6 +462,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
    hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
    hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
  }
+#endif
 }

 void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
@@ -552,6 +554,7 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                    const GpuMatrix& b,
                    real scaleAB,
                    real scaleT) {
+#ifdef PADDLE_WITH_CUDA
  CHECK(isContiguous());
  CHECK(b.isContiguous());
  CHECK(b.useGpu_ == true) << "Matrix type are not equal";
@@ -578,12 +581,14 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                          b.height_,
                          scaleAB,
                          scaleT);
+#endif
 }

 void GpuMatrix::mul(const GpuMatrix& a,
                    const GpuSparseMatrix& b,
                    real scaleAB,
                    real scaleT) {
+#ifdef PADDLE_WITH_CUDA
  CHECK(isContiguous());
  CHECK(a.isContiguous());
  CHECK(a.useGpu_ == true) << "Matrix type are not equal";
@@ -622,6 +627,7 @@ void GpuMatrix::mul(const GpuMatrix& a,
                            scaleAB,
                            scaleT);
  }
+#endif
 }

 /* this = a*b */
@@ -1028,15 +1034,23 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";

  real* inputData = inputMat.getData();
+  real* maskData = NULL;
  size_t frameNum = inputMat.getHeight();
  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
  CHECK(height_ == inputMat.getHeight());
  CHECK(width_ == outputH * outputW * channels);

+  if (maskMatP != NULL) {
+    CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
+    CHECK(outputH * outputW * channels == maskMatP->getWidth());
+    maskData = maskMatP->getData();
+  }
+
  hl_maxpool_forward(frameNum,
                     inputData,
                     channels,
@@ -1051,7 +1065,8 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                     paddingH,
                     paddingW,
                     data_,
-                     getStride());
+                     getStride(),
+                     maskData);
 }

 void GpuMatrix::maxPoolBackward(Matrix& inputMat,
@@ -1548,6 +1563,7 @@ void GpuMatrix::bilinearBackward(const Matrix& out,
 }

 void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);

@@ -1563,9 +1579,11 @@ void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
  hl_matrix_multi_binary_cross_entropy(
      output_d, entropy_d, mat_d, height_, outputPtr->width_);
+#endif
 }

 void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);

@@ -1581,6 +1599,7 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
  hl_matrix_multi_binary_cross_entropy_bp(
      output_d, grad_d, mat_d, height_, width_);
+#endif
 }

 void GpuMatrix::vol2Col(real* dataSrc,
@@ -1973,9 +1992,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
  real* inputData = inputMat.getData();
  real* outData = data_;
+  real* maskData = NULL;
  size_t num = inputMat.getHeight();
  size_t inLength = imgSizeH * imgSizeW;
  size_t outLength = outputH * outputW;
@@ -1984,6 +2005,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
  CHECK_EQ(channels * outLength, this->getWidth());
  size_t outStride = getStride();

+  if (maskMatP != NULL) {
+    maskData = maskMatP->getData();
+    CHECK_EQ(channels * outLength, maskMatP->getWidth());
+  }
+
  /* initialize the data_ */
  for (size_t i = 0; i < height_; i++) {
    for (size_t j = 0; j < width_; j++) {
@@ -2005,10 +2031,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
          int wstart = pw * strideW - paddingW;
          int wend = std::min(wstart + sizeX, imgSizeW);
          wstart = std::max(wstart, 0);
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              outData[ph * outputW + pw] = std::max(
-                  outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+          if (maskData == NULL) {
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                outData[ph * outputW + pw] = std::max(
+                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+              }
+            }
+          } else {
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) {
+                  outData[ph * outputW + pw] = inputData[h * imgSizeW + w];
+                  maskData[ph * outputW + pw] = h * imgSizeW + w;
+                }
+              }
            }
          }
        }
@@ -2016,6 +2053,8 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
      // compute offset
      inputData += inLength;
      outData += outLength;
+
+      if (maskData != NULL) maskData += outLength;
    }
  }
 }
@@ -3226,6 +3265,7 @@ template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
                                                           real scaleAB,
                                                           real scaleT);

+#ifndef PADDLE_MOBILE_INFERENCE
 void SharedCpuMatrix::mul(CpuSparseMatrix* a,
                          CpuMatrix* b,
                          real scaleAB,
@@ -3354,6 +3394,7 @@ void SharedCpuMatrix::initBlock(int blockNum) {
  }
 }

+#endif
 /* Add a (column) vector b to matrix a, column by column */
 void CpuMatrix::addColumnVector(const Matrix& b) {
  BaseMatrix::addColVector(const_cast<Matrix&>(b));

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -861,7 +861,8 @@ public:

  /**
   * Pooling forward operation, pick out the largest element
-   * in the sizeX of value
+   * in the sizeX of value, if the maskMatP is not NULL, it will
+   * also caculate the location indices.
   */
  virtual void maxPoolForward(Matrix& inputMat,
                              size_t imgSizeH,
@@ -874,7 +875,8 @@ public:
                              size_t outputH,
                              size_t outputW,
                              size_t paddingH,
-                              size_t paddingW) {
+                              size_t paddingW,
+                              MatrixPtr maskMatP = NULL) {
    LOG(FATAL) << "Not implemeted";
  }

@@ -1426,7 +1428,8 @@ public:
                      size_t outputH,
                      size_t outputW,
                      size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);

  void maxPoolBackward(Matrix& image,
                       size_t imgSizeH,
@@ -1697,7 +1700,8 @@ public:
                      size_t outputH,
                      size_t outputW,
                      size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);

  void maxPoolBackward(Matrix& image,
                       size_t imgSizeH,
@@ -2066,6 +2070,7 @@ public:

 class SharedCpuMatrix : public CpuMatrix {
 public:
+#ifndef PADDLE_MOBILE_INFERENCE
  /* blockNum is number of partitions of the matrix  */
  SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
      : CpuMatrix(height, width, trans) {
@@ -2111,6 +2116,7 @@ private:
  ThreadLocal<CpuMatrixPtr> localBuf_;
  ThreadLocal<std::vector<int>> localBufRows_;
  ThreadLocal<std::vector<int>> blockSeq_;
+#endif
 };

 typedef struct { unsigned int col; } sparse_non_value_t;

--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "CpuSparseMatrix.h"
 #include "Matrix.h"
@@ -237,3 +240,47 @@ private:
 };

 }  // namespace paddle
+
+#else
+
+#include "CpuSparseMatrix.h"
+
+namespace paddle {
+
+class GpuSparseMatrix : public Matrix {
+public:
+  GpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format_ = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  GpuSparseMatrix(real* value,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, true) {}
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
+};
+
+}  // namespace paddle
+
+#endif
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -14,6 +14,8 @@ limitations under the License. */

 #pragma once

+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <gflags/gflags.h>
 #include <string.h>
 #include <algorithm>
@@ -313,3 +315,27 @@ private:
 };

 }  // namespace paddle
+
+#else
+namespace paddle {
+
+class SparseRowCpuMatrix : public CpuMatrix {
+public:
+  void reserveStore() {}
+  void clearIndices() {}
+};
+
+class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
+public:
+  void setupIndices() {}
+  void addRows(MatrixPtr input) {}
+  void addRows(IVectorPtr ids) {}
+};
+
+class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
+class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
+class SparseRowIdsCpuMatrix : public CpuMatrix {};
+
+}  // namespace paddle
+
+#endif
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -3,8 +3,10 @@
 add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
-add_simple_unittest(test_SparseMatrix)
 add_simple_unittest(test_RowBuffer)
+if(NOT MOBILE_INFERENCE)
+    add_simple_unittest(test_SparseMatrix)
+endif()

 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare

--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -30,6 +30,10 @@ class AccuracyOp : public framework::OperatorWithKernel {
                   "Input (Label) of accuracy op should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Accuracy"),
                   "Output (Accuracy) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Correct"),
+                   "Output (Correct) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Total"),
+                   "Output (Total) of AccuracyOp should not be null.");

    auto inference_dim = ctx->GetInputDim("Out");
    auto label_dim = ctx->GetInputDim("Label");
@@ -43,6 +47,8 @@ class AccuracyOp : public framework::OperatorWithKernel {
                      " the same as label.");

    ctx->SetOutputDim("Accuracy", {1});
+    ctx->SetOutputDim("Correct", {1});
+    ctx->SetOutputDim("Total", {1});
    ctx->ShareLoD("Out", /*->*/ "Accuracy");
  }

@@ -66,6 +72,8 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Label", "Label of the training data");
    // TODO(typhoonzero): AddInput("Weight", ...
    AddOutput("Accuracy", "The accuracy of current batch");
+    AddOutput("Correct", "The correct samples count of current batch");
+    AddOutput("Total", "The samples count of current batch");

    AddComment(R"DOC(
 Accuracy Operator. 

--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -24,7 +24,8 @@ using platform::PADDLE_CUDA_NUM_THREADS;
 template <int BlockSize>
 __global__ void AccuracyCudaKernel(const int N, const int D,
                                   const int64_t* Xdata,
-                                   const int64_t* labeldata, float* accuracy) {
+                                   const int64_t* labeldata, int* correct_data,
+                                   float* accuracy) {
  int count = 0;
  __shared__ int total[BlockSize];

@@ -43,6 +44,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
  // reduce the count with init value 0, and output accuracy.
  int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
  if (threadIdx.x == 0) {
+    *correct_data = result;
    *accuracy = static_cast<float>(result) / static_cast<float>(N);
  }
 }
@@ -56,31 +58,48 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
    auto* inference = ctx.Input<Tensor>("Out");
    auto* indices = ctx.Input<Tensor>("Indices");
    auto* label = ctx.Input<Tensor>("Label");
+
    auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    auto* correct = ctx.Output<Tensor>("Correct");
+    auto* total = ctx.Output<Tensor>("Total");
    // FIXME(typhoonzero): only support indices currently
    // if add support for output values, how to detect the data type?
    const int64_t* indices_data = indices->data<int64_t>();
    const int64_t* label_data = label->data<int64_t>();
+
+    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
+    int* total_data = total->mutable_data<int>(ctx.GetPlace());
    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());

-    size_t num_samples = inference->dims()[0];
+    int num_samples = static_cast<int>(inference->dims()[0]);
    size_t infer_width = inference->dims()[1];
    PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
+    // cudaMemset((void**)&correct_data, 0, sizeof(float));

    if (num_samples == 0) {
      return;
    }
+    cudaMemcpy(total_data, &num_samples, sizeof(int), cudaMemcpyHostToDevice);

    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
        1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>(
-        num_samples, infer_width, indices_data, label_data, accuracy_data);
+        num_samples, infer_width, indices_data, label_data, correct_data,
+        accuracy_data);
+
+    int d_num_samples, d_num_correct;
+    float d_accuracy;
+    cudaMemcpy(&d_num_correct, correct_data, sizeof(int),
+               cudaMemcpyDeviceToHost);
+    cudaMemcpy(&d_num_samples, total_data, sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&d_accuracy, accuracy_data, sizeof(float),
+               cudaMemcpyDeviceToHost);
  }
 };

 }  // namespace operators
 }  // namespace paddle

-// FIXME(typhoonzero): types of T is for infernece data.
-// label data is always int
+// FIXME(typhoonzero): types of T is for inference data.
+// label data is always int64
 REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
                       paddle::operators::AccuracyOpCUDAKernel<double>);
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -29,7 +29,11 @@ class AccuracyKernel : public framework::OpKernel<T> {
    auto* indices = ctx.Input<Tensor>("Indices");
    auto* label = ctx.Input<Tensor>("Label");
    auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    auto* correct = ctx.Output<Tensor>("Correct");
+    auto* total = ctx.Output<Tensor>("Total");

+    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
+    int* total_data = total->mutable_data<int>(ctx.GetPlace());
    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());

    const int64_t* indices_data = indices->data<int64_t>();
@@ -55,7 +59,8 @@ class AccuracyKernel : public framework::OpKernel<T> {
      }
    }

-    // FIXME(typhoonzero): we don't accumulate the accuracy for now.
+    *correct_data = num_correct;
+    *total_data = num_samples;
    *accuracy_data =
        static_cast<float>(num_correct) / static_cast<float>(num_samples);
  }

--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -27,6 +27,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
  void Run(const framework::Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
    framework::ExecutionContext ctx(*this, scope, dev_ctx);
+
    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
    const size_t step_num = ids->size();

--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -94,5 +94,13 @@ class CompareOp : public framework::OperatorWithKernel {

 REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
 REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
+REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_OP(greater_than, "Out = X > Y");
+REGISTER_LOGICAL_KERNEL(greater_than, CPU,
+                        paddle::operators::GreaterThanFunctor);
+REGISTER_LOGICAL_OP(greater_equal, "Out = X >= Y");
+REGISTER_LOGICAL_KERNEL(greater_equal, CPU,
+                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_OP(equal, "Out = X == Y");
 REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
--- a/paddle/operators/compare_op.cu
+++ b/paddle/operators/compare_op.cu
@@ -15,4 +15,9 @@
 #include "paddle/operators/compare_op.h"

 REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_KERNEL(greater_than, GPU,
+                        paddle::operators::GreaterThanFunctor);
+REGISTER_LOGICAL_KERNEL(greater_equal, GPU,
+                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -27,6 +27,24 @@ struct LessThanFunctor {
  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
 };

+template <typename T>
+struct LessEqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; }
+};
+
+template <typename T>
+struct GreaterThanFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; }
+};
+
+template <typename T>
+struct GreaterEqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; }
+};
+
 template <typename T>
 struct EqualFunctor {
  using ELEM_TYPE = T;

--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
@@ -34,7 +34,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker,
            elementwise_add_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
    elementwise_add,
-    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
    elementwise_add_grad,
-    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
@@ -35,7 +35,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
            elementwise_div_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
    elementwise_div,
-    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
    elementwise_div_grad,
-    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -37,8 +37,12 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
 REGISTER_OP_CPU_KERNEL(
    elementwise_mul,
    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>);
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
    elementwise_mul_grad,
    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -34,7 +34,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
            elementwise_sub_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
    elementwise_sub,
-    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUPlace, int64_t>);
 REGISTER_OP_CPU_KERNEL(
    elementwise_sub_grad,
-    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, double>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, int>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
@@ -27,15 +27,15 @@ template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
    const int ksize_height = ksize[0];
    const int ksize_width = ksize[1];
    const int stride_height = strides[0];
@@ -47,7 +47,7 @@ class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
    const int output_stride = output_height * output_width;

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -87,11 +87,12 @@ template <typename PoolProcess, class T>
 class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process) {
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
@@ -110,7 +111,7 @@ class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -154,10 +155,11 @@ template <class T>
 class MaxPool2dGradFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
@@ -176,7 +178,7 @@ class MaxPool2dGradFunctor<platform::CPUPlace, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -240,17 +242,17 @@ template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
    const int ksize_depth = ksize[0];
    const int ksize_height = ksize[1];
    const int ksize_width = ksize[2];
@@ -265,7 +267,7 @@ class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
    const int output_stride = output_depth * output_height * output_width;

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -315,11 +317,12 @@ template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process) {
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
@@ -343,7 +346,7 @@ class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -398,10 +401,11 @@ template <class T>
 class MaxPool3dGradFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
@@ -425,7 +429,7 @@ class MaxPool3dGradFunctor<platform::CPUPlace, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -498,15 +502,15 @@ template <typename T>
 class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
    const int ksize_height = ksize[0];
    const int ksize_width = ksize[1];
    const int stride_height = strides[0];
@@ -517,8 +521,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
    const int output_stride = output_height * output_width;

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -563,13 +567,13 @@ template <typename T>
 class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_height = input_grad.dims()[2];
-    const int input_width = input_grad.dims()[3];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
    const int output_channels = output_grad.dims()[1];
    const int output_height = output_grad.dims()[2];
    const int output_width = output_grad.dims()[3];
@@ -578,7 +582,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {

    const T* mask_data = mask.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int n = 0; n < batch_size; ++n) {
      for (int c = 0; c < output_channels; ++c) {
@@ -612,17 +616,17 @@ template <typename T>
 class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
    const int ksize_depth = ksize[0];
    const int ksize_height = ksize[1];
    const int ksize_width = ksize[2];
@@ -636,8 +640,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
    const int output_stride = output_depth * output_height * output_width;

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -691,14 +695,14 @@ template <typename T>
 class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_depth = input_grad.dims()[2];
-    const int input_height = input_grad.dims()[3];
-    const int input_width = input_grad.dims()[4];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_depth = input_grad->dims()[2];
+    const int input_height = input_grad->dims()[3];
+    const int input_width = input_grad->dims()[4];
    const int output_channels = output_grad.dims()[1];
    const int output_depth = output_grad.dims()[2];
    const int output_height = output_grad.dims()[3];
@@ -708,7 +712,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {

    const T* mask_data = mask.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int n = 0; n < batch_size; ++n) {
      for (int c = 0; c < output_channels; ++c) {

--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
@@ -21,13 +21,13 @@ namespace math {

 template <typename PoolProcess, typename T>
 __global__ void KernelPool2D(const int nthreads, const T* input_data,
-                             T* output_data, const int channels,
-                             const int input_height, const int input_width,
-                             const int output_height, const int output_width,
-                             const int ksize_height, const int ksize_width,
-                             const int stride_height, const int stride_width,
-                             const int padding_height, const int padding_width,
-                             PoolProcess pool_process) {
+                             const int channels, const int input_height,
+                             const int input_width, const int output_height,
+                             const int output_width, const int ksize_height,
+                             const int ksize_width, const int stride_height,
+                             const int stride_width, const int padding_height,
+                             const int padding_width, PoolProcess pool_process,
+                             T* output_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
@@ -59,11 +59,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
 template <typename PoolProcess, typename T>
 __global__ void KernelPool2DGrad(
    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_height, const int input_width, const int output_height,
-    const int output_width, const int ksize_height, const int ksize_width,
-    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width, PoolProcess pool_process) {
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    PoolProcess pool_process, T* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int offsetW = index % input_width + padding_width;
@@ -107,11 +107,11 @@ __global__ void KernelPool2DGrad(
 template <typename T>
 __global__ void KernelMaxPool2DGrad(
    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_height, const int input_width, const int output_height,
-    const int output_width, const int ksize_height, const int ksize_width,
-    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width) {
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
@@ -158,16 +158,16 @@ template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
    const int ksize_height = ksize[0];
    const int ksize_width = ksize[1];
    const int stride_height = strides[0];
@@ -176,7 +176,7 @@ class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
    const int padding_width = paddings[1];

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());

    int nthreads = batch_size * output_channels * output_height * output_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
@@ -187,11 +187,10 @@ class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
        PoolProcess,
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_data, output_data, input_channels,
-                              input_height, input_width, output_height,
-                              output_width, ksize_height, ksize_width,
-                              stride_height, stride_width, padding_height,
-                              padding_width, pool_process);
+                 .stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, pool_process, output_data);
  }
 };

@@ -204,11 +203,11 @@ template <typename PoolProcess, typename T>
 class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process) {
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
@@ -225,7 +224,7 @@ class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int nthreads = batch_size * input_channels * input_height * input_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
@@ -237,10 +236,10 @@ class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, output_height, output_width,
-        ksize_height, ksize_width, stride_height, stride_width, padding_height,
-        padding_width, pool_process);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        pool_process, input_grad_data);
  }
 };

@@ -253,10 +252,11 @@ template <typename T>
 class MaxPool2dGradFunctor<platform::GPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
@@ -274,7 +274,7 @@ class MaxPool2dGradFunctor<platform::GPUPlace, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int nthreads = batch_size * output_channels * output_height * output_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
@@ -285,10 +285,10 @@ class MaxPool2dGradFunctor<platform::GPUPlace, T> {
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, output_height, output_width,
-        ksize_height, ksize_width, stride_height, stride_width, padding_height,
-        padding_width);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
  }
 };

@@ -313,14 +313,16 @@ template class Pool2dGradFunctor<
    platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;

 template <typename PoolProcess, typename T>
-__global__ void KernelPool3D(
-    const int nthreads, const T* input_data, T* output_data, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    PoolProcess pool_process) {
+__global__ void KernelPool3D(const int nthreads, const T* input_data,
+                             const int channels, const int input_depth,
+                             const int input_height, const int input_width,
+                             const int output_depth, const int output_height,
+                             const int output_width, const int ksize_depth,
+                             const int ksize_height, const int ksize_width,
+                             const int stride_depth, const int stride_height,
+                             const int stride_width, const int padding_depth,
+                             const int padding_height, const int padding_width,
+                             PoolProcess pool_process, T* output_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
@@ -358,13 +360,13 @@ __global__ void KernelPool3D(
 template <typename PoolProcess, typename T>
 __global__ void KernelPool3DGrad(
    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    PoolProcess pool_process) {
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, PoolProcess pool_process,
+    T* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int offsetW = index % input_width + padding_width;
@@ -422,13 +424,12 @@ __global__ void KernelPool3DGrad(
 template <typename T>
 __global__ void KernelMaxPool3DGrad(
    const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height,
-    const int padding_width) {
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, T* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
@@ -480,18 +481,18 @@ template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
    const int ksize_depth = ksize[0];
    const int ksize_height = ksize[1];
    const int ksize_width = ksize[2];
@@ -503,7 +504,7 @@ class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
    const int padding_width = paddings[2];

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());

    int nthreads = batch_size * output_channels * output_depth * output_height *
                   output_width;
@@ -516,11 +517,11 @@ class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(
-        nthreads, input_data, output_data, input_channels, input_depth,
-        input_height, input_width, output_depth, output_height, output_width,
-        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
-        stride_width, padding_depth, padding_height, padding_width,
-        pool_process);
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, pool_process,
+        output_data);
  }
 };

@@ -533,11 +534,11 @@ template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process) {
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
@@ -560,7 +561,7 @@ class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int nthreads =
        batch_size * input_channels * input_depth * input_height * input_width;
@@ -573,11 +574,11 @@ class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_depth, input_height, input_width, output_depth,
-        output_height, output_width, ksize_depth, ksize_height, ksize_width,
-        stride_depth, stride_height, stride_width, padding_depth,
-        padding_height, padding_width, pool_process);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, pool_process, input_grad_data);
  }
 };

@@ -590,10 +591,11 @@ template <class T>
 class MaxPool3dGradFunctor<platform::GPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
@@ -616,7 +618,7 @@ class MaxPool3dGradFunctor<platform::GPUPlace, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int nthreads = batch_size * output_channels * output_depth * output_height *
                   output_width;
@@ -628,11 +630,11 @@ class MaxPool3dGradFunctor<platform::GPUPlace, T> {
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_depth, input_height, input_width, output_depth,
-        output_height, output_width, ksize_depth, ksize_height, ksize_width,
-        stride_depth, stride_height, stride_width, padding_depth,
-        padding_height, padding_width);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, input_grad_data);
  }
 };

@@ -658,11 +660,11 @@ template class Pool3dGradFunctor<

 template <typename T>
 __global__ void KernelMaxPool2dWithIdx(
-    const int nthreads, const T* input_data, T* output_data, T* mask_data,
-    const int channels, const int input_height, const int input_width,
-    const int output_height, const int output_width, const int ksize_height,
-    const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width) {
+    const int nthreads, const T* input_data, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width, T* output_data, T* mask_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
@@ -697,11 +699,11 @@ __global__ void KernelMaxPool2dWithIdx(

 template <typename T>
 __global__ void KernelMaxPool2DWithIdxGrad(
-    const int nthreads, T* input_grad, const T* output_grad, const T* mask_data,
+    const int nthreads, const T* output_grad, const T* mask_data,
    const int channels, const int input_height, const int input_width,
    const int output_height, const int output_width, const int ksize_height,
    const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width) {
+    const int padding_height, const int padding_width, T* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int w_offset = index % input_width;
@@ -748,16 +750,16 @@ template <typename T>
 class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
    const int ksize_height = ksize[0];
    const int ksize_width = ksize[1];
    const int stride_height = strides[0];
@@ -766,8 +768,8 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
    const int padding_width = paddings[1];

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());

    int nthreads = batch_size * output_channels * output_height * output_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
@@ -777,11 +779,10 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
    KernelMaxPool2dWithIdx<
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_data, output_data, mask_data,
-                              input_channels, input_height, input_width,
-                              output_height, output_width, ksize_height,
-                              ksize_width, stride_height, stride_width,
-                              padding_height, padding_width);
+                 .stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, output_data, mask_data);
  }
 };

@@ -794,14 +795,14 @@ template <typename T>
 class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_channels = input_grad.dims()[1];
-    const int input_height = input_grad.dims()[2];
-    const int input_width = input_grad.dims()[3];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_channels = input_grad->dims()[1];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
    const int output_height = output_grad.dims()[2];
    const int output_width = output_grad.dims()[3];
    const int ksize_height = ksize[0];
@@ -813,7 +814,7 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {

    const T* mask_data = mask.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int nthreads = batch_size * input_channels * input_height * input_width;
    int blocks = (nthreads + 1024 - 1) / 1024;
@@ -823,11 +824,11 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
    KernelMaxPool2DWithIdxGrad<
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_grad_data, output_grad_data,
-                              mask_data, input_channels, input_height,
-                              input_width, output_height, output_width,
-                              ksize_height, ksize_width, stride_height,
-                              stride_width, padding_height, padding_width);
+                 .stream()>>>(nthreads, output_grad_data, mask_data,
+                              input_channels, input_height, input_width,
+                              output_height, output_width, ksize_height,
+                              ksize_width, stride_height, stride_width,
+                              padding_height, padding_width, input_grad_data);
  }
 };

@@ -838,13 +839,13 @@ template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double>;

 template <typename T>
 __global__ void KernelMaxPool3DWithIdx(
-    const int nthreads, const T* input_data, T* output_data, T* mask_data,
-    const int channels, const int input_depth, const int input_height,
-    const int input_width, const int output_depth, const int output_height,
-    const int output_width, const int ksize_depth, const int ksize_height,
-    const int ksize_width, const int stride_depth, const int stride_height,
-    const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width) {
+    const int nthreads, const T* input_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    T* output_data, T* mask_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
@@ -886,13 +887,13 @@ __global__ void KernelMaxPool3DWithIdx(

 template <typename T>
 __global__ void KernelMaxPool3DWithIdxGrad(
-    const int nthreads, T* input_grad, const T* output_grad, const T* mask,
-    const int channels, const int input_depth, const int input_height,
-    const int input_width, const int output_depth, const int output_height,
-    const int output_width, const int ksize_depth, const int ksize_height,
-    const int ksize_width, const int stride_depth, const int stride_height,
-    const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width) {
+    const int nthreads, const T* output_grad, const T* mask, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    T* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int w_offset = index % input_width;
@@ -952,18 +953,18 @@ template <typename T>
 class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
    const int ksize_depth = ksize[0];
    const int ksize_height = ksize[1];
    const int ksize_width = ksize[2];
@@ -975,8 +976,8 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
    const int padding_width = paddings[2];

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());

    int nthreads = batch_size * output_channels * output_depth * output_height *
                   output_width;
@@ -988,11 +989,10 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(
-        nthreads, input_data, output_data, mask_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width);
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, output_data, mask_data);
  }
 };

@@ -1005,15 +1005,15 @@ template <typename T>
 class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_channels = input_grad.dims()[1];
-    const int input_depth = input_grad.dims()[2];
-    const int input_height = input_grad.dims()[3];
-    const int input_width = input_grad.dims()[4];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_channels = input_grad->dims()[1];
+    const int input_depth = input_grad->dims()[2];
+    const int input_height = input_grad->dims()[3];
+    const int input_width = input_grad->dims()[4];
    const int output_depth = output_grad.dims()[2];
    const int output_height = output_grad.dims()[3];
    const int output_width = output_grad.dims()[4];
@@ -1029,7 +1029,7 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {

    const T* output_grad_data = output_grad.data<T>();
    const T* mask_data = mask.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int nthreads =
        batch_size * input_channels * input_depth * input_height * input_width;
@@ -1041,11 +1041,11 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
        T><<<grid, threads, 0,
             reinterpret_cast<const platform::CUDADeviceContext&>(context)
                 .stream()>>>(
-        nthreads, input_grad_data, output_grad_data, mask_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width);
+        nthreads, output_grad_data, mask_data, input_channels, input_depth,
+        input_height, input_width, output_depth, output_height, output_width,
+        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        input_grad_data);
  }
 };


--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
@@ -88,60 +88,62 @@ template <typename Place, typename PoolProcess, typename T>
 class Pool2dFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* output);
 };

 template <typename Place, typename PoolProcess, typename T>
 class Pool2dGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute);
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
 };

 template <typename Place, class T>
 class MaxPool2dGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };

 template <typename Place, typename PoolProcess, typename T>
 class Pool3dFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* output);
 };

 template <typename Place, typename PoolProcess, typename T>
 class Pool3dGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute);
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
 };

 template <typename Place, class T>
 class MaxPool3dGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };

 /*
@@ -155,38 +157,38 @@ template <typename Place, typename T>
 class MaxPool2dWithIndexFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask);
 };

 template <typename Place, typename T>
 class MaxPool2dWithIndexGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };

 template <typename Place, typename T>
 class MaxPool3dWithIndexFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask);
 };

 template <typename Place, typename T>
 class MaxPool3dWithIndexGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };

 }  // namespace math

--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -75,16 +75,16 @@ class PoolKernel : public framework::OpKernel<T> {
              Place, paddle::operators::math::MaxPool<T>, T>
              pool2d_forward;
          paddle::operators::math::MaxPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);

        } else if (pooling_type == "avg") {
          paddle::operators::math::Pool2dFunctor<
              Place, paddle::operators::math::AvgPool<T>, T>
              pool2d_forward;
          paddle::operators::math::AvgPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
        }
      } break;
      case 3: {
@@ -93,15 +93,15 @@ class PoolKernel : public framework::OpKernel<T> {
              Place, paddle::operators::math::MaxPool<T>, T>
              pool3d_forward;
          paddle::operators::math::MaxPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
        } else if (pooling_type == "avg") {
          paddle::operators::math::Pool3dFunctor<
              Place, paddle::operators::math::AvgPool<T>, T>
              pool3d_forward;
          paddle::operators::math::AvgPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
        }
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
@@ -142,30 +142,30 @@ class PoolGradKernel : public framework::OpKernel<T> {
          if (pooling_type == "max") {
            paddle::operators::math::MaxPool2dGradFunctor<Place, T>
                pool2d_backward;
-            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings);
+            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, in_x_grad);
          } else if (pooling_type == "avg") {
            paddle::operators::math::Pool2dGradFunctor<
                Place, paddle::operators::math::AvgPoolGrad<T>, T>
                pool2d_backward;
            paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings, pool_process);
+            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, pool_process, in_x_grad);
          }
        } break;
        case 3: {
          if (pooling_type == "max") {
            paddle::operators::math::MaxPool3dGradFunctor<Place, T>
                pool3d_backward;
-            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings);
+            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, in_x_grad);
          } else if (pooling_type == "avg") {
            paddle::operators::math::Pool3dGradFunctor<
                Place, paddle::operators::math::AvgPoolGrad<T>, T>
                pool3d_backward;
            paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings, pool_process);
+            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, pool_process, in_x_grad);
          }
        } break;
        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }

--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
@@ -46,14 +46,14 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
      case 2: {
        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
            pool2d_forward;
-        pool2d_forward(context.device_context(), *in_x, *out, *mask, ksize,
-                       strides, paddings);
+        pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                       paddings, out, mask);
      } break;
      case 3: {
        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
            pool3d_forward;
-        pool3d_forward(context.device_context(), *in_x, *out, *mask, ksize,
-                       strides, paddings);
+        pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                       paddings, out, mask);
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
    }
@@ -89,14 +89,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
        case 2: {
          paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
              pool2d_backward;
-          pool2d_backward(context.device_context(), *in_x_grad, *out_grad,
-                          *mask, ksize, strides, paddings);
+          pool2d_backward(context.device_context(), *out_grad, *mask, ksize,
+                          strides, paddings, in_x_grad);
        } break;
        case 3: {
          paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
              pool3d_backward;
-          pool3d_backward(context.device_context(), *in_x_grad, *out_grad,
-                          *mask, ksize, strides, paddings);
+          pool3d_backward(context.device_context(), *out_grad, *mask, ksize,
+                          strides, paddings, in_x_grad);
        } break;
        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
      }

--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
@@ -14,6 +14,7 @@

 #pragma once

+#include "glog/logging.h"
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"

@@ -26,6 +27,10 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;

+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+
 struct SumFunctor {
  template <typename Place, typename X, typename Y, typename Dim>
  void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
@@ -133,10 +138,17 @@ class ReduceKernel : public framework::OpKernel<T> {
      dims_vector.erase(dims_vector.begin() + dim);
      dims = framework::make_ddim(dims_vector);
    }
-    auto out = EigenTensor < T, D == 1 ? 1 : (D - 1) > ::From(*output, dims);
+
    auto& place = context.GetEigenDevice<Place>();
    Functor functor;
-    functor(place, x, out, reduce_dim);
+
+    if (D == 1) {
+      auto out = EigenScalar<T>::From(*output);
+      functor(place, x, out, reduce_dim);
+    } else {
+      auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
+      functor(place, x, out, reduce_dim);
+    }
  }
 };

@@ -186,13 +198,13 @@ class ReduceGradKernel : public framework::OpKernel<T> {
    auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);

-    Eigen::array<int, D> braodcast_dim;
-    for (size_t i = 0; i < D; ++i) braodcast_dim[i] = 1;
-    braodcast_dim[dim] = input0->dims()[dim];
+    Eigen::array<int, D> broadcast_dim;
+    for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
+    broadcast_dim[dim] = input0->dims()[dim];
    auto& place = context.GetEigenDevice<Place>();
    Functor functor;
-    functor(place, x, x_reduce, x_grad, x_reduce_grad, braodcast_dim,
-            braodcast_dim[dim]);
+    functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+            broadcast_dim[dim]);
  }
 };


--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -200,7 +200,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
                                     false,
                                     useGpu_);
    }
-  } else if (matType == MAT_NORMAL_SHARED) {
+  }
+#ifndef PADDLE_MOBILE_INFERENCE
+  // NOLINTNEXTLINE
+  else if (matType == MAT_NORMAL_SHARED) {
    CHECK_EQ(height * width, bufs_[pType]->getSize());
    size_t blockNum = 0;
    CHECK(isGradShared(&blockNum));
@@ -259,7 +262,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
  } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
    CHECK(isGradSparseUpdate());
    mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
-  } else {
+  }
+#endif
+  // NOLINTNEXTLINE
+  else {
    LOG(FATAL) << "Unsupported mat type" << matType;
  }
 }

--- a/paddle/testing/TestUtil.cpp
+++ b/paddle/testing/TestUtil.cpp
@@ -33,6 +33,7 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
                                 bool withValue,
                                 bool useGpu,
                                 bool equalNnzPerSample) {
+#ifndef PADDLE_MOBILE_INFERENCE
  std::vector<int64_t> ids(height);
  std::vector<int64_t> indices(height + 1);
  indices[0] = 0;
@@ -84,6 +85,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
    }
    return mat;
  }
+#endif
+  return nullptr;
 }

 void generateSequenceStartPositions(size_t batchSize,

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -37,10 +37,10 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)


-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
        DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so)


 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
@@ -66,7 +66,7 @@ if (WITH_TESTING)
    add_subdirectory(paddle/v2/tests)
    add_subdirectory(paddle/v2/reader/tests)
    add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/v2/framework/tests)
+    add_subdirectory(paddle/v2/fluid/tests)
  endif()
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1200,8 +1200,14 @@ def TestData(data_config, async_load_data=None):

 #caffe_mode: compute the output size using floor instead of ceil,
 #            which is consistent of caffe and CuDNN's convention.
-def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
-    output = (2 * padding + img_size - filter_size) / float(stride)
+def cnn_output_size(img_size,
+                    filter_size,
+                    padding,
+                    stride,
+                    caffe_mode,
+                    dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    output = (2 * padding + img_size - filter_s) / float(stride)
    if caffe_mode:
        return 1 + int(math.floor(output))
    else:
@@ -1210,8 +1216,14 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):

 #calcualte image_size based on output_size for de-convolution (ConvTransLayer).
 #It is the reverse function of cnn_output_size
-def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
-    img_size = (output_size - 1) * stride + filter_size - 2 * padding
+def cnn_image_size(output_size,
+                   filter_size,
+                   padding,
+                   stride,
+                   caffe_mode,
+                   dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    img_size = (output_size - 1) * stride + filter_s - 2 * padding
    if not caffe_mode:
        img_size = img_size + 1
    return img_size
@@ -1253,9 +1265,9 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
 def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
    pool_conf.pool_type = pool.pool_type
    config_assert(pool.pool_type in [
-        'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
-    ], "pool-type %s is not in "
-                  "['max-projection', 'avg-projection', "
+        'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
+    ], "pool-type %s is not in " \
+              "['max-projection', 'avg-projection', 'max-pool-with-mask'," \
                  "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)

    pool_conf.channels = pool.channels
@@ -1376,6 +1388,12 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
    conv_conf.stride_y = conv.stride_y
    conv_conf.groups = conv.groups
    conv_conf.caffe_mode = conv.caffe_mode
+    if not conv.dilation:
+        conv.dilation = 1
+        conv.dilation_y = 1
+    else:
+        conv_conf.dilation = conv.dilation
+        conv_conf.dilation_y = conv.dilation_y

    if not trans:
        conv_conf.filter_channels = conv.channels / conv.groups
@@ -1383,20 +1401,20 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
            get_img_size(input_layer_name, conv.channels)
        conv_conf.output_x = cnn_output_size(
            conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
        conv_conf.output_y = cnn_output_size(
            conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
    else:
        conv_conf.filter_channels = num_filters / conv.groups
        conv_conf.output_x, conv_conf.output_y = \
            get_img_size(input_layer_name, conv.channels)
        conv_conf.img_size = cnn_image_size(
            conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
        conv_conf.img_size_y = cnn_image_size(
            conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)


 #caffe_mode: compute the output size using floor instead of ceil,

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -20,7 +20,7 @@ from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, BasePoolingType, \
+from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
    CudnnAvgPooling, CudnnMaxPooling
 from .attrs import *
 from .default_decorators import *
@@ -888,7 +888,7 @@ def mixed_layer(size=0,
    :type size: int
    :param input: The input of this layer. It is an optional parameter. If set,
                  then this function will just return layer's name.
-    :param act: Activation Type. LinearActivation is the default.
+    :param act: Activation Type. LinearActivation is the default activation.
    :type act: BaseActivation
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the
@@ -1030,7 +1030,7 @@ def fc_layer(input,
    :type input: LayerOutput | list | tuple
    :param size: The layer dimension.
    :type size: int
-    :param act: Activation Type. TanhActivation is the default.
+    :param act: Activation Type. TanhActivation is the default activation.
    :type act: BaseActivation
    :param param_attr: The Parameter Attribute|list.
    :type param_attr: ParameterAttribute
@@ -1527,7 +1527,7 @@ def lstmemory(input,
    :type input: LayerOutput
    :param reverse: is sequence process reversed or not.
    :type reverse: bool
-    :param act: Activation type. TanhActivation is the default. :math:`h_t`
+    :param act: Activation type. TanhActivation is the default activation.
    :type act: BaseActivation
    :param gate_act: gate activation type, SigmoidActivation by default.
    :type gate_act: BaseActivation
@@ -1920,7 +1920,7 @@ def repeat_layer(input,
                          False for treating input as column vector and repeating
                          in the row direction.
    :type as_row_vector: bool
-    :param act: Activation type. IdentityActivation is the default.
+    :param act: Activation type. IdentityActivation is the default activation.
    :type act: BaseActivation
    :type name: basestring
    :param layer_attr: extra layer attributes.
@@ -1974,7 +1974,7 @@ def seq_reshape_layer(input,
    :type reshape_size: int
    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param act: Activation type. IdentityActivation is the default.
+    :param act: Activation type. IdentityActivation is the default activation.
    :type act: BaseActivation
    :param layer_attr: extra layer attributes.
    :type layer_attr: ExtraLayerAttribute.
@@ -2487,7 +2487,7 @@ def img_conv_layer(input,
                        shape will be (filter_size, filter_size_y).
    :type filter_size_y: int | None
    :param num_filters: Each filter group's number of filter
-    :param act: Activation type. ReluActivation is the default.
+    :param act: Activation type. ReluActivation is the default activation.
    :type act: BaseActivation
    :param groups: Group size of filters.
    :type groups: int
@@ -2571,7 +2571,9 @@ def img_conv_layer(input,

    if layer_type:
        if dilation > 1 or dilation_y > 1:
-            assert layer_type in ["cudnn_conv", "cudnn_convt"]
+            assert layer_type in [
+                "cudnn_conv", "cudnn_convt", "exconv", "exconvt"
+            ]
        if trans:
            assert layer_type in ["exconvt", "cudnn_convt"]
        else:
@@ -2699,9 +2701,9 @@ def img_pool_layer(input,
    elif isinstance(pool_type, AvgPooling):
        pool_type.name = 'avg'

-    assert type(pool_type) in [AvgPooling, MaxPooling, CudnnAvgPooling,
+    assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
                               CudnnMaxPooling], \
-        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling are supported"
+        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"

    type_name = pool_type.name + '-projection' \
        if (
@@ -3253,7 +3255,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
    :param input: Input layers. It could be a LayerOutput or list/tuple of
                 LayerOutput.
    :type input: LayerOutput | list | tuple
-    :param act: Activation Type. LinearActivation is the default.
+    :param act: Activation Type. LinearActivation is the default activation.
    :type act: BaseActivation
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the
@@ -3311,7 +3313,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
    :type name: basestring
    :param input: input layers or projections
    :type input: list | tuple | collections.Sequence
-    :param act: Activation type. IdentityActivation is the default.
+    :param act: Activation type. IdentityActivation is the default activation.
    :type act: BaseActivation
    :param layer_attr: Extra Layer Attribute.
    :type layer_attr: ExtraLayerAttribute
@@ -3406,7 +3408,7 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
    :type a: LayerOutput
    :param b: input sequence layer
    :type b: LayerOutput
-    :param act: Activation type. IdentityActivation is the default.
+    :param act: Activation type. IdentityActivation is the default activation.
    :type act: BaseActivation
    :param layer_attr: Extra Layer Attribute.
    :type layer_attr: ExtraLayerAttribute
@@ -3572,31 +3574,32 @@ def lstm_step_layer(input,
        ...


-    This layer has two outputs. Default output is :math:`h_t`. The other
-    output is :math:`o_t`, whose name is 'state' and can use
+    This layer has two outputs. The default output is :math:`h_t`. The other
+    output is :math:`o_t`, whose name is 'state' and users can use
    :code:`get_output_layer` to extract this output.

    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param size: Layer's size. NOTE: lstm layer's size, should be equal to
-                 :code:`input.size/4`, and should be equal to
-                 :code:`state.size`.
+    :param size: The dimension of this layer's output, which must be
+                 equal to the dimension of the state.
    :type size: int
-    :param input: input layer. :math:`Wx_t + Wh_{t-1}`
+    :param input: The input of this layer.
    :type input: LayerOutput
-    :param state: State Layer. :math:`c_{t-1}`
+    :param state: The state of the LSTM unit.
    :type state: LayerOutput
-    :param act: Activation type. TanhActivation is the default.
+    :param act: Activation type. TanhActivation is the default activation.
    :type act: BaseActivation
-    :param gate_act: Gate Activation Type. SigmoidActivation is the default.
+    :param gate_act: Activation type of the gate. SigmoidActivation is the
+                     default activation.
    :type gate_act: BaseActivation
-    :param state_act: State Activation Type. TanhActivation is the default.
+    :param state_act: Activation type of the state. TanhActivation is the
+                      default activation.
    :type state_act: BaseActivation
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the
                      parameter is set to True, the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -3641,22 +3644,31 @@ def gru_step_layer(input,
                   layer_attr=None):
    """

-    :param input:
+    :param input: The input of this layer, whose dimension can be divided by 3.
    :type input: LayerOutput
-    :param output_mem:
-    :param size:
-    :param act:
+    :param output_mem: A memory which memorizes the output of this layer at previous
+                       time step.
+    :type output_mem: LayerOutput
+    :param size: The dimension of this layer's output. If it is not set or set to None,
+                 it will be set to one-third of the dimension of the input automatically.
+    :type size: int
+    :param act: Activation type of this layer's output. TanhActivation
+                is the default activation.
    :type act: BaseActivation
    :param name: The name of this layer. It is optional.
-    :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
+    :type name: basestring
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is
+                     the default activation.
    :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: the parameter_attribute for transforming the output_mem
-                       from previous step.
-    :param layer_attr:
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3701,24 +3713,34 @@ def gru_step_naive_layer(input,
                         param_attr=None,
                         layer_attr=None):
    """
-    GRU Step Layer, but using MixedLayer to generate. It support ERROR_CLIPPING
+    GRU Step Layer, which is realized using PaddlePaddle API. It supports ERROR_CLIPPING
    and DROPOUT.

-    :param input:
-    :param output_mem:
-    :param size:
+    :param input: The input of this layer, whose dimensionality can be divided by 3.
+    :param output_mem: A memory which memorizes the output of this layer at previous
+                       time step.
+    :type output_mem: LayerOutput
+    :param size: The dimension of this layer's output. If it is not set or set to None,
+                 it will be set to one-third of the dimension of the input automatically.
+    :type size: int
    :param name: The name of this layer. It is optional.
-    :param act:
+    :type name: basestring
+    :param act: Activation type of this layer's output. TanhActivation
+                is the default activation.
    :type act: BaseActivation
-    :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation
+                     is the default activation.
    :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr:
-    :param layer_attr:
-    :return:
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    if input.size % 3 != 0:
@@ -3780,12 +3802,13 @@ def get_output_layer(input, arg_name, name=None, layer_attr=None):

    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param input: get output layer's input. And this layer should contains
+    :param input: The input layer. And this layer should contain
                   multiple outputs.
    :type input: LayerOutput
-    :param arg_name: Output name from input.
+    :param arg_name: The name of the output to be extracted from the input layer.
    :type arg_name: basestring
-    :param layer_attr: Layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3842,17 +3865,20 @@ def recurrent_layer(input,

    :param input: The input of this layer.
    :type input: LayerOutput
-    :param act: Activation type. TanhActivation is the default.
+    :param act: Activation type. TanhActivation is the default activation.
    :type act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to 
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If the parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param param_attr: parameter attribute.
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
    :type param_attr: ParameterAttribute
    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param layer_attr: Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -3877,7 +3903,7 @@ def recurrent_layer(input,
 class StaticInput(object):
    """
    StaticInput is only used in recurrent_group which defines a read-only memory
-    that can be a sequence or non-sequence.
+    and can be a sequence or non-sequence.
    :param size: DEPRECATED
    :param is_seq: DEPRECATED
    """
@@ -3910,8 +3936,8 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
    Recurrent layer group is an extremely flexible recurrent unit in
    PaddlePaddle. As long as the user defines the calculation done within a
    time step, PaddlePaddle will iterate such a recurrent calculation over
-    sequence input. This is extremely usefull for attention based model, or
-    Neural Turning Machine like models.
+    sequence input. This is useful for attention-based models, or Neural
+    Turning Machine like models.

    The basic usage (time steps) is:

@@ -3933,18 +3959,17 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
                  demo/seqToseq/seqToseq_net.py
    - sequence steps: paddle/gserver/tests/sequence_nest_layer_group.conf

-    :param step: recurrent one time step function.The input of this function is
-                 input of the group. The return of this function will be
-                 recurrent group's return value.
+    :param step: A step function which takes the input of recurrent_group as its own
+                 input and returns values as recurrent_group's output every time step.

-                 The recurrent group scatter a sequence into time steps. And
-                 for each time step, will invoke step function, and return
-                 a time step result. Then gather each time step of output into
+                 The recurrent group scatters a sequence into time steps. And
+                 for each time step, it will invoke step function, and return
+                 a time step result. Then gather outputs of each time step into
                 layer group's output.

    :type step: callable

-    :param name: recurrent_group's name.
+    :param name: The recurrent_group's name. It is optional.
    :type name: basestring

    :param input: Input links array.
@@ -3952,11 +3977,11 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
                  LayerOutput will be scattered into time steps.
                  SubsequenceInput will be scattered into sequence steps.
                  StaticInput will be imported to each time step, and doesn't change
-                  through time. It's a mechanism to access layer outside step function.
+                  over time. It's a mechanism to access layer outside step function.

    :type input: LayerOutput | StaticInput | SubsequenceInput | list | tuple

-    :param reverse: If reverse is set true, the recurrent unit will process the
+    :param reverse: If reverse is set to True, the recurrent unit will process the
                    input sequence in a reverse order.
    :type reverse: bool

@@ -4091,7 +4116,8 @@ def maxid_layer(input, name=None, layer_attr=None):
    :type input: LayerOutput
    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute.
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -4124,11 +4150,12 @@ def out_prod_layer(input1, input2, name=None, layer_attr=None):

    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param input1: The first input layer name.
+    :param input1: The first input layer.
    :type input: LayerOutput
-    :param input2: The second input layer name.
+    :param input2: The second input layer.
    :type input2: LayerOutput
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute.
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -4167,9 +4194,10 @@ def eos_layer(input, eos_id, name=None, layer_attr=None):
    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput
-    :param eos_id: end id of sequence
+    :param eos_id: End id of sequence
    :type eos_id: int
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute.
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -4230,8 +4258,9 @@ def beam_search(step,
    - machine translation : demo/seqToseq/translation/gen.conf \
                            demo/seqToseq/seqToseq_net.py

-    :param name: Name of the recurrent unit that generates sequences.
-    :type name: base string
+    :param name: The name of the recurrent unit that is responsible for
+                 generating sequences. It is optional.
+    :type name: basestring
    :param step: A callable function that defines the calculation in a time
                 step, and it is applied to sequences with arbitrary length by
                 sharing a same set of weights.
@@ -4356,16 +4385,18 @@ def square_error_cost(input,

    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param input: Network prediction.
+    :param input: The first input layer.
    :type input: LayerOutput
-    :param label: Data label.
+    :param label: The input label.
    :type label: LayerOutput
-    :param weight: The weight affects the cost, namely the scale of cost.
-                   It is an optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
    :type weight: LayerOutput
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
    :type coeff: float
-    :param layer_attr: layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -4398,17 +4429,20 @@ def classification_cost(input,

    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param input: input layer name. network output.
+    :param input: The first input layer.
    :type input: LayerOutput
-    :param label: label layer name. data_layer often.
+    :param label: The input label.
    :type label: LayerOutput
-    :param weight: The weight affects the cost, namely the scale of cost.
-                   It is an optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
    :type weight: LayerOutput
-    :param evaluator: Evaluator method.
-    :param layer_attr: layer's extra attribute.
+    :param evaluator: Evaluator method. classification_error_evaluator is the default.
+    :type evaluator: Evaluator method
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
    :type coeff: float
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -4461,7 +4495,7 @@ def conv_operator(img,
    Different from img_conv_layer, conv_op is an Operator, which can be used
    in mixed_layer. And conv_op takes two inputs to perform convolution.
    The first input is the image and the second is filter kernel. It only
-    support GPU mode.
+    supports GPU mode.

    The example usage is:

@@ -4473,27 +4507,31 @@ def conv_operator(img,
                          num_filters=64,
                          num_channels=64)

-    :param img: input image
+    :param img: The input image.
    :type img: LayerOutput
-    :param filter: input filter
+    :param filter: The input filter.
    :type filter: LayerOutput
-    :param filter_size: The x dimension of a filter kernel.
+    :param filter_size: The dimension of the filter kernel on the x axis.
    :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since
-                        PaddlePaddle now supports rectangular filters,
-                        the filter's shape can be (filter_size, filter_size_y).
+    :param filter_size_y: The dimension of the filter kernel on the y axis.
+                          If the parameter is not set or set to None, it will
+                          set to 'filter_size' automatically.
    :type filter_size_y: int
-    :param num_filters: channel of output data.
+    :param num_filters: The number of the output channels.
    :type num_filters: int
-    :param num_channels: channel of input data.
+    :param num_channels: The number of the input channels. If the parameter is not set
+                         or set to None, it will be automatically set to the channel
+                         number of the 'img'.
    :type num_channels: int
-    :param stride: The x dimension of the stride.
+    :param stride: The stride on the x axis.
    :type stride: int
-    :param stride_y: The y dimension of the stride.
+    :param stride_y: The stride on the y axis. If the parameter is not set or
+                     set to None, it will be set to 'stride' automatically.
    :type stride_y: int
-    :param padding: The x dimension of padding.
+    :param padding: The padding size on the x axis.
    :type padding: int
-    :param padding_y: The y dimension of padding.
+    :param padding_y: The padding size on the y axis. If the parameter is not set
+                      or set to None, it will be set to 'padding' automatically.
    :type padding_y: int
    :return: A ConvOperator Object.
    :rtype: ConvOperator
@@ -4544,9 +4582,9 @@ def conv_projection(input,
                    param_attr=None,
                    trans=False):
    """
-    Different from img_conv_layer and conv_op, conv_projection is an Projection,
-    which can be used in mixed_layer and conat_layer. It use cudnn to implement
-    conv and only support GPU mode.
+    Different from img_conv_layer and conv_op, conv_projection is a Projection,
+    which can be used in mixed_layer and concat_layer. It uses cudnn to implement
+    convolution and only supports GPU mode.

    The example usage is:

@@ -4559,32 +4597,45 @@ def conv_projection(input,

    :param input: The input of this layer.
    :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel.
-    :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since
-                          PaddlePaddle now supports rectangular filters,
-                          the filter's shape can be (filter_size, filter_size_y).
+    :param filter_size: The dimensions of the filter kernel. If the parameter is
+                        set to one integer, the two dimensions on x and y axises
+                        will be same when filter_size_y is not set. If it is set
+                        to a list, the first element indicates the dimension on
+                        the x axis, and the second is used to specify the dimension
+                        on the y axis when filter_size is not provided.
+    :type filter_size: int | tuple | list
+    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
+                          is not set, it will be set automatically according to filter_size.
    :type filter_size_y: int
-    :param num_filters: channel of output data.
+    :param num_filters: The number of filters.
    :type num_filters: int
-    :param num_channels: channel of input data.
+    :param num_channels: The number of the input channels.
    :type num_channels: int
-    :param stride: The x dimension of the stride.
-    :type stride: int
-    :param stride_y: The y dimension of the stride.
+    :param stride: The strides. If the parameter is set to one integer, the strides
+                   on x and y axises will be same when stride_y is not set. If it is
+                   set to a list, the first element indicates the stride on the x axis,
+                   and the second is used to specify the stride on the y axis when
+                   stride_y is not provided.
+    :type stride: int | tuple | list
+    :param stride_y: The stride on the y axis.
    :type stride_y: int
-    :param padding: The x dimension of padding.
-    :type padding: int
-    :param padding_y: The y dimension of padding.
+    :param padding: The padding sizes. If the parameter is set to one integer, the padding
+                    sizes on x and y axises will be same when padding_y is not set. If it
+                    is set to a list, the first element indicates the padding size on the
+                    x axis, and the second is used to specify the padding size on the y axis
+                    when padding_y is not provided.
+    :type padding: int | tuple | list
+    :param padding_y: The padding size on the y axis.
    :type padding_y: int
    :param groups: The group number.
    :type groups: int
-    :param param_attr: Convolution param attribute. None means default attribute
+    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
+                       details.
    :type param_attr: ParameterAttribute
-    :param trans: whether it is convTrans or conv
+    :param trans: Whether it is ConvTransProjection or ConvProjection
    :type trans: bool
-    :return: A DotMulProjection Object.
-    :rtype: DotMulProjection
+    :return: A Projection Object.
+    :rtype: ConvTransProjection | ConvProjection
    """
    if num_channels is None:
        assert input.num_filters is not None
@@ -4649,13 +4700,13 @@ def pad_layer(input,
              layer_attr=None):
    """
    This operation pads zeros to the input data according to pad_c,pad_h
-    and pad_w. pad_c, pad_h, pad_w specifies the which dimension and size
-    of padding. And the input data shape is NCHW.
+    and pad_w. pad_c, pad_h, pad_w specify the size in the corresponding
+    dimension. And the input data shape is NCHW.

-    For example, pad_c=[2,3] means padding 2 zeros before the
-    input data and 3 zeros after the input data in channel dimension.
-    pad_h means padding zeros in height dimension. pad_w means padding zeros
-    in width dimension.
+    For example, pad_c=[2,3] means padding 2 zeros before the input data
+    and 3 zeros after the input data in the channel dimension. pad_h means
+    padding zeros in the height dimension. pad_w means padding zeros in the
+    width dimension.

    For example,

@@ -4692,13 +4743,14 @@ def pad_layer(input,

    :param input: The input of this layer.
    :type input: LayerOutput
-    :param pad_c: padding size in channel dimension.
+    :param pad_c: The padding size in the channel dimension.
    :type pad_c: list | None
-    :param pad_h: padding size in height dimension.
+    :param pad_h: The padding size in the height dimension.
    :type pad_h: list | None
-    :param pad_w: padding size in width dimension.
+    :param pad_w: The padding size in the width dimension.
    :type pad_w: list | None
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
    :param name: The name of this layer. It is optional.
    :type name: basestring
@@ -4747,7 +4799,7 @@ def pad_layer(input,
 @layer_support()
 def conv_shift_layer(a, b, name=None, layer_attr=None):
    """
-    This layer performs cyclic convolution for two input. For example:
+    This layer performs cyclic convolution on two inputs. For example:
      - a[in]: contains M elements.
      - b[in]: contains N elements (N should be odd).
      - c[out]: contains M elements.
@@ -4756,7 +4808,7 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):

        c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}

-    In this formular:
+    In this formula:
     - a's index is computed modulo M. When it is negative, then get item from
       the right side (which is the end of array) to the left.
     - b's index is computed modulo N. When it is negative, then get item from
@@ -4770,11 +4822,12 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):

    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param a: Input layer a.
+    :param a: The first input of this layer.
    :type a: LayerOutput
-    :param b: input layer b.
+    :param b: The second input of this layer.
    :type b: LayerOutput
-    :param layer_attr: layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -4805,8 +4858,8 @@ def tensor_layer(a,
                 bias_attr=None,
                 layer_attr=None):
    """
-    This layer performs tensor operation for two input.
-    For example, each sample:
+    This layer performs tensor operation on two inputs.
+    For example:

    .. math::
       y_{i} = a * W_{i} * {b^\mathrm{T}}, i=0,1,...,K-1
@@ -4826,21 +4879,24 @@ def tensor_layer(a,

    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param a: Input layer a.
+    :param a: The first input of this layer.
    :type a: LayerOutput
-    :param b: input layer b.
+    :param b: The second input of this layer.
    :type b: LayerOutput
-    :param size: the layer dimension.
-    :type size: int.
-    :param act: Activation type. LinearActivation is the default.
+    :param size: The dimension of this layer.
+    :type size: int
+    :param act: Activation type. LinearActivation is the default activation.
    :type act: BaseActivation
-    :param param_attr: The Parameter Attribute.
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
    :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: Extra Layer config.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute | None
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -4876,7 +4932,7 @@ def selective_fc_layer(input,
                       layer_attr=None):
    """
    Selectived fully connected layer. Different from fc_layer, the output
-    of this layer maybe sparse. It requires an additional input to indicate
+    of this layer can be sparse. It requires an additional input to indicate
    several selected columns for output. If the selected columns is not
    specified, selective_fc_layer acts exactly like fc_layer.

@@ -4890,21 +4946,34 @@ def selective_fc_layer(input,
    :type name: basestring
    :param input: The input of this layer.
    :type input: LayerOutput | list | tuple
-    :param select: The select layer. The output of select layer should be a
-                   sparse binary matrix, and treat as the mask of selective fc.
-                   If is None, acts exactly like fc_layer.
+    :param select: The layer to select columns to output. It should be a sparse
+                   binary matrix, and is treated as the mask of selective fc. If
+                   it is not set or set to None, selective_fc_layer acts exactly
+                   like fc_layer.
    :type select: LayerOutput
-    :param size: The layer dimension.
+    :param size: The dimension of this layer, which should be equal to that of
+                 the layer 'select'.
    :type size: int
-    :param act: Activation type. TanhActivation is the default.
+    :param act: Activation type. TanhActivation is the default activation.
    :type act: BaseActivation
-    :param param_attr: The Parameter Attribute.
+    :param pass_generation: The flag which indicates whether it is during generation.
+    :type pass_generation: bool
+    :param has_selected_colums: The flag which indicates whether the parameter 'select'
+                                has been set. True is the default.
+    :type has_selected_colums: bool
+    :param mul_ratio: A ratio helps to judge how sparse the output is and determine
+                      the computation method for speed consideration.
+    :type mul_ratio: float
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
    :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: Extra Layer config.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute | None
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -4955,7 +5024,7 @@ def selective_fc_layer(input,
 @layer_support()
 def sampling_id_layer(input, name=None, layer_attr=None):
    """
-    A layer for sampling id from multinomial distribution from the input layer.
+    A layer for sampling id from a multinomial distribution from the input layer.
    Sampling one id for one sample.

    The simple usage is:
@@ -4968,8 +5037,9 @@ def sampling_id_layer(input, name=None, layer_attr=None):
    :type input: LayerOutput
    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -4990,8 +5060,7 @@ def slope_intercept_layer(input,
                          intercept=0.0,
                          layer_attr=None):
    """
-    This layer for applying a slope and an intercept to the input
-    element-wise. There is no activation and weight.
+    This layer for applying a slope and an intercept to the input.

    ..  math::
        y = slope * x + intercept
@@ -5006,12 +5075,13 @@ def slope_intercept_layer(input,
    :type input: LayerOutput
    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param slope: the scale factor.
-    :type slope: float.
-    :param intercept: the offset.
-    :type intercept: float.
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param slope: The scale factor.
+    :type slope: float
+    :param intercept: The offset.
+    :type intercept: float
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -5066,12 +5136,13 @@ def linear_comb_layer(weights, vectors, size=None, name=None, layer_attr=None):
    :type weights: LayerOutput
    :param vectors: The vector layer.
    :type vectors: LayerOutput
-    :param size: the dimension of this layer.
+    :param size: The dimension of this layer.
    :type size: int
    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -5118,11 +5189,11 @@ def block_expand_layer(input,

       outputW = 1 + (2 * padding_x + imgSizeW - block_x + stride_x - 1) / stride_x

-    The expand method is the same with ExpandConvLayer, but saved the transposed
+    The expanding method is the same with ExpandConvLayer, but saved the transposed
    value. After expanding, output.sequenceStartPositions will store timeline.
-    The number of time steps are outputH * outputW and the dimension of each
+    The number of time steps is outputH * outputW and the dimension of each
    time step is block_y * block_x * num_channels. This layer can be used after
-    convolution neural network, and before recurrent neural network.
+    convolutional neural network, and before recurrent neural network.

    The simple usage is:

@@ -5137,8 +5208,10 @@ def block_expand_layer(input,

    :param input: The input of this layer.
    :type input: LayerOutput
-    :param num_channels: The channel number of input layer.
-    :type num_channels: int | None
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
+    :type num_channels: int
    :param block_x: The width of sub block.
    :type block_x: int
    :param block_y: The width of sub block.
@@ -5152,9 +5225,10 @@ def block_expand_layer(input,
    :param padding_y: The padding size in vertical direction.
    :type padding_y: int
    :param name: The name of this layer. It is optional.
-    :type name: None | basestring.
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :type name: basestring.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -5184,12 +5258,19 @@ def block_expand_layer(input,
 @layer_support()
 def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
    """
-    A layer to do max out on conv layer output.
-      - Input: output of a conv layer.
-      - Output: feature map size same as input. Channel is (input channel) / groups.
+    A layer to do max out on convolutional layer output.
+      - Input: the output of a convolutional layer.
+      - Output: feature map size same as the input's, and its channel number is
+        (input channel) / groups.

    So groups should be larger than 1, and the num of channels should be able
-    to devided by groups.
+    to be devided by groups.
+
+    Reference:
+        Maxout Networks
+        http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+        Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
+        https://arxiv.org/pdf/1312.6082v4.pdf

    .. math::
       y_{si+j} = \max_k x_{gsi + sk + j}
@@ -5199,12 +5280,6 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
       0 \le j < s
       0 \le k < groups

-    Please refer to Paper:
-      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
-      - Multi-digit Number Recognition from Street View \
-        Imagery using Deep Convolutional Neural Networks: \
-        https://arxiv.org/pdf/1312.6082v4.pdf
-
    The simple usage is:

    .. code-block:: python
@@ -5215,14 +5290,16 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):

    :param input: The input of this layer.
    :type input: LayerOutput
-    :param num_channels: The channel number of input layer. If None will be set
-                     automatically from previous output.
-    :type num_channels: int | None
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
+    :type num_channels: int
    :param groups: The group number of input layer.
    :type groups: int
    :param name: The name of this layer. It is optional.
-    :type name: None | basestring.
-    :param layer_attr: Extra Layer attribute.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -5254,20 +5331,20 @@ def ctc_layer(input,
              layer_attr=None):
    """
    Connectionist Temporal Classification (CTC) is designed for temporal
-    classication task. That is, for sequence labeling problems where the
+    classication task. e.g. sequence labeling problems where the
    alignment between the inputs and the target labels is unknown.

-    More details can be found by referring to `Connectionist Temporal
-    Classification: Labelling Unsegmented Sequence Data with Recurrent
-    Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
-    icml2006_GravesFGS06.pdf>`_
+    Reference:
+        Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        with Recurrent Neural Networks
+        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf

    Note:
-        Considering the 'blank' label needed by CTC, you need to use
-        (num_classes + 1) as the input size. num_classes is the category number.
-        And the 'blank' is the last category index. So the size of 'input' layer, such as
-        fc_layer with softmax activation, should be num_classes + 1. The size of ctc_layer
-        should also be num_classes + 1.
+        Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
+        as the size of the input, where num_classes is the category number.
+        And the 'blank' is the last category index. So the size of 'input' layer (e.g.
+        fc_layer with softmax activation) should be (num_classes + 1). The size of
+        ctc_layer should also be (num_classes + 1).

    The example usage is:

@@ -5280,16 +5357,17 @@ def ctc_layer(input,

    :param input: The input of this layer.
    :type input: LayerOutput
-    :param label: The data layer of label with variable length.
+    :param label: The input label.
    :type label: LayerOutput
-    :param size: category numbers + 1.
+    :param size: The dimension of this layer, which must be equal to (category number + 1).
    :type size: int
    :param name: The name of this layer. It is optional.
-    :type name: basestring | None
-    :param norm_by_times: Whether to normalization by times. False by default.
+    :type name: basestring
+    :param norm_by_times: Whether to do normalization by times. False is the default.
    :type norm_by_times: bool
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -5330,20 +5408,19 @@ def warp_ctc_layer(input,
    building process, PaddlePaddle will clone the source codes, build and
    install it to :code:`third_party/install/warpctc` directory.

-    More details of CTC can be found by referring to `Connectionist Temporal
-    Classification: Labelling Unsegmented Sequence Data with Recurrent
-    Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
-    icml2006_GravesFGS06.pdf>`_.
+    Reference:
+        Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        with Recurrent Neural Networks
+        http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf

    Note:
-        - Let num_classes represent the category number. Considering the 'blank'
-          label needed by CTC, you need to use (num_classes + 1) as the input size.
-          Thus, the size of both warp_ctc layer and 'input' layer should be set to
-          num_classes + 1.
+        - Let num_classes represents the category number. Considering the 'blank'
+          label needed by CTC, you need to use (num_classes + 1) as the size of
+          warp_ctc layer.
        - You can set 'blank' to any value ranged in [0, num_classes], which
-          should be consistent as that used in your labels.
+          should be consistent with those used in your labels.
        - As a native 'softmax' activation is interated to the warp-ctc library,
-          'linear' activation is expected instead in the 'input' layer.
+          'linear' activation is expected to be used instead in the 'input' layer.

    The example usage is:

@@ -5357,18 +5434,19 @@ def warp_ctc_layer(input,

    :param input: The input of this layer.
    :type input: LayerOutput
-    :param label: The data layer of label with variable length.
+    :param label: The input label.
    :type label: LayerOutput
-    :param size: category numbers + 1.
+    :param size: The dimension of this layer, which must be equal to (category number + 1).
    :type size: int
    :param name: The name of this layer. It is optional.
-    :type name: basestring | None
-    :param blank: the 'blank' label used in ctc
+    :type name: basestring
+    :param blank: The 'blank' label used in ctc.
    :type blank: int
-    :param norm_by_times: Whether to normalization by times. False by default.
+    :param norm_by_times: Whether to do normalization by times. False is the default.
    :type norm_by_times: bool
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -5414,23 +5492,26 @@ def crf_layer(input,
                      label=label,
                      size=label_dim)

-    :param input: The first input layer is the feature.
+    :param input: The first input layer.
    :type input: LayerOutput
-    :param label: The second input layer is label.
+    :param label: The input label.
    :type label: LayerOutput
    :param size: The category number.
    :type size: int
-    :param weight: The third layer is "weight" of each sample, which is an
-                  optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
    :type weight: LayerOutput
-    :param param_attr: Parameter attribute. None means default attribute
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
    :type param_attr: ParameterAttribute
    :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
    :type coeff: float
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -5476,9 +5557,9 @@ def crf_decoding_layer(input,
    """
    A layer for calculating the decoding sequence of sequential conditional
    random field model. The decoding sequence is stored in output.ids.
-    If a second input is provided, it is treated as the ground-truth label, and
-    this layer will also calculate error. output.value[i] is 1 for incorrect
-    decoding or 0 for correct decoding.
+    If the input 'label' is provided, it is treated as the ground-truth label, and
+    this layer will also calculate error. output.value[i] is 1 for an incorrect
+    decoding and 0 for the correct.

    The example usage is:

@@ -5489,16 +5570,18 @@ def crf_decoding_layer(input,

    :param input: The first input layer.
    :type input: LayerOutput
-    :param size: size of this layer.
+    :param size: The dimension of this layer.
    :type size: int
-    :param label: None or ground-truth label.
-    :type label: LayerOutput or None
-    :param param_attr: Parameter attribute. None means default attribute
+    :param label: The input label.
+    :type label: LayerOutput | None
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
    :type param_attr: ParameterAttribute
    :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute | None
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -5545,8 +5628,7 @@ def nce_layer(input,
              bias_attr=None,
              layer_attr=None):
    """
-    Noise-contrastive estimation. This layer implements the method in the
-    following paper:
+    Noise-contrastive estimation.

    Reference:
        A fast and simple algorithm for training neural probabilistic language
@@ -5562,37 +5644,40 @@ def nce_layer(input,

    :param name: The name of this layer. It is optional.
    :type name: basestring
-    :param input: The input layers. It should be a LayerOutput or a list/tuple
-                  of LayerOutput.
+    :param input: The first input of this layer.
    :type input: LayerOutput | list | tuple | collections.Sequence
-    :param label: The ground truth.
+    :param label: The input label.
    :type label: LayerOutput
    :param weight: The weight layer defines a weight for each sample in the
-                   mini-batch. The default value is None.
+                   mini-batch. It is optional.
    :type weight: LayerOutput
-    :param num_classes: The class number.
+    :param num_classes: The number of classes.
    :type num_classes: int
-    :param param_attr: The parameter attributes.
-    :type param_attr: ParameterAttribute|list
-    :param num_neg_samples: The number of sampled negative labels. The default
-                            value is 10.
+    :param act: Activation type. SigmoidActivation is the default activation.
+    :type act: BaseActivation
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
+    :type param_attr: ParameterAttribute
+    :param num_neg_samples: The number of sampled negative labels. 10 is the
+                            default value.
    :type num_neg_samples: int
    :param neg_distribution: The discrete noisy distribution over the output
                             space from which num_neg_samples negative labels
                             are sampled. If this parameter is not set, a
-                             uniform distribution will be used. A user defined
+                             uniform distribution will be used. A user-defined
                             distribution is a list whose length must be equal
                             to the num_classes. Each member of the list defines
                             the probability of a class given input x.
    :type neg_distribution: list | tuple | collections.Sequence | None
-    :param bias_attr: The attribute for bias. If this parameter is set False or
-                      any object whose type is not ParameterAttribute, no bias
-                      is added. If this parameter is set True, the bias is
-                      initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
-    :return: The LayerOutput object.
+    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    if isinstance(input, LayerOutput):
@@ -5659,11 +5744,11 @@ def rank_cost(left,
              coeff=1.0,
              layer_attr=None):
    """
-    A cost Layer for learning to rank using gradient descent. Details can refer
-    to `papers <http://research.microsoft.com/en-us/um/people/cburges/papers/
-    ICML_ranking.pdf>`_.
-    This layer contains at least three inputs. The weight is an optional
-    argument, which affects the cost.
+    A cost Layer for learning to rank using gradient descent.
+
+    Reference:
+        Learning to Rank using Gradient Descent
+        http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf

    .. math::

@@ -5694,14 +5779,16 @@ def rank_cost(left,
    :type right: LayerOutput
    :param label: Label is 1 or 0, means positive order and reverse order.
    :type label: LayerOutput
-    :param weight: The weight affects the cost, namely the scale of cost.
-                   It is an optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
    :type weight: LayerOutput
    :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
    :type coeff: float
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -5746,25 +5833,25 @@ def lambda_cost(input,
                         NDCG_num=8,
                         max_sort_size=-1)

-    :param input: Samples of the same query should be loaded as sequence.
+    :param input: The first input of this layer, which is often a document
+                  samples list of the same query and whose type must be sequence.
    :type input: LayerOutput
-    :param score: The 2nd input. Score of each sample.
+    :param score: The scores of the samples.
    :type input: LayerOutput
    :param NDCG_num: The size of NDCG (Normalized Discounted Cumulative Gain),
                     e.g., 5 for NDCG@5. It must be less than or equal to the
-                     minimum size of lists.
+                     minimum size of the list.
    :type NDCG_num: int
-    :param max_sort_size: The size of partial sorting in calculating gradient.
-                          If max_sort_size = -1, then for each list, the
-                          algorithm will sort the entire list to get gradient.
-                          In other cases, max_sort_size must be greater than or
-                          equal to NDCG_num. And if max_sort_size is greater
-                          than the size of a list, the algorithm will sort the
-                          entire list of get gradient.
+    :param max_sort_size: The size of partial sorting in calculating gradient. If
+                          max_sort_size is equal to -1 or greater than the number
+                          of the samples in the list, then the algorithm will sort
+                          the entire list to compute the gradient. In other cases,
+                          max_sort_size must be greater than or equal to NDCG_num.
    :type max_sort_size: int
    :param name: The name of this layer. It is optional.
-    :type name: None | basestring
-    :param layer_attr: Extra Layer Attribute.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -5809,11 +5896,10 @@ def cross_entropy(input,
    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default.
+                  1.0 is the default value.
    :type coeff: float
-    :param weight: The cost of each sample is multiplied with each weight.
-                   The weight should be a layer with size=1. Note that gradient
-                   will not be calculated for weight.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
    :type weight: LayerOutout
    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
                       details.
@@ -5858,7 +5944,7 @@ def cross_entropy_with_selfnorm(input,
    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default.
+                  1.0 is the default value.
    :type coeff: float
    :param softmax_selfnorm_alpha: The scale factor affects the cost.
    :type softmax_selfnorm_alpha: float
@@ -5948,7 +6034,7 @@ def huber_regression_cost(input,
    :param delta: The difference between the observed and predicted values.
    :type delta: float
    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default.
+                  1.0 is the default value.
    :type coeff: float
    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
                       details.
@@ -5998,7 +6084,7 @@ def huber_classification_cost(input,
    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default.
+                  1.0 is the default value.
    :type coeff: float
    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
                       details.
@@ -6043,7 +6129,7 @@ def multi_binary_label_cross_entropy(input,
    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default.
+                  1.0 is the default value.
    :type coeff: float
    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
                       details.
@@ -6214,7 +6300,7 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
    :param name: The name of this layer. It is optional.
    :type name: basestring
    :param coeff: The weight of the gradient in the back propagation.
-                  1.0 is the default.
+                  1.0 is the default value.
    :type coeff: float
    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
                       details.
@@ -6366,7 +6452,7 @@ def row_conv_layer(input,
    :param context_len: The context length equals the lookahead step number
                        plus one.
    :type context_len: int
-    :param act: Activation Type. LinearActivation is the default.
+    :param act: Activation Type. LinearActivation is the default activation.
    :type act: BaseActivation
    :param param_attr: The parameter attribute. See ParameterAttribute for
                       details.
@@ -6488,7 +6574,8 @@ def gated_unit_layer(input,
    :type input: LayerOutput
    :param size: The dimension of this layer's output.
    :type size: int
-    :param act: Activation type of the projection. LinearActivation is the default.
+    :param act: Activation type of the projection. LinearActivation is the default
+                activation.
    :type act: BaseActivation
    :param name: The name of this layer. It is optional.
    :type name: basestring
@@ -6498,9 +6585,9 @@ def gated_unit_layer(input,
    :param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
                            for details.
    :type gate_param_attr: ParameterAttribute
-    :param gate_bias_attr: The bias attribute of the gate. If the parameter is set to False or
+    :param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
                           an object whose type is not ParameterAttribute, no bias is defined.
-                           If the parameter is set to True, the bias is initialized to zero.
+                           If this parameter is set to True, the bias is initialized to zero.
    :type gate_bias_attr: ParameterAttribute | bool | None | Any
    :param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
                        details.
@@ -6508,9 +6595,9 @@ def gated_unit_layer(input,
    :param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
                              for details.
    :type inproj_param_attr: ParameterAttribute
-    :param inproj_bias_attr: The bias attribute of the projection. If the parameter is set to False
+    :param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
                             or an object whose type is not ParameterAttribute, no bias is defined.
-                             If the parameter is set to True, the bias is initialized to zero.
+                             If this parameter is set to True, the bias is initialized to zero.
    :type inproj_bias_attr: ParameterAttribute | bool | None | Any
    :param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
                       details.
@@ -6869,7 +6956,7 @@ def img_conv3d_layer(input,
    :type filter_size: int | tuple | list
    :param num_filters: The number of filters in each group.
    :type num_filters: int
-    :param act: Activation type. ReluActivation is the default.
+    :param act: Activation type. ReluActivation is the default activation.
    :type act: BaseActivation
    :param groups: The number of the filter groups.
    :type groups: int
@@ -6884,8 +6971,8 @@ def img_conv3d_layer(input,
                      parameter is set to True, the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
    :param num_channels: The number of input channels. If the parameter is not set or
-                         set to None,  its actual value will be automatically set to
-                         the channels number of the input .
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
    :type num_channels: int
    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
                       details.
@@ -7061,7 +7148,7 @@ def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
    :type offsets: LayerOutput
    :param sizes: The sizes of the sub-sequences, which should be sequence type.
    :type sizes: LayerOutput
-    :param act: Activation type, LinearActivation is the default.
+    :param act: Activation type, LinearActivation is the default activation.
    :type act: BaseActivation.
    :param bias_attr: The bias attribute. If the parameter is set to False or an object
                      whose type is not ParameterAttribute, no bias is defined. If the

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -681,34 +681,42 @@ def lstmemory_unit(input,
                                   state_act=TanhActivation())


-    :param input: input layer.
+    :param input: Input layer.
    :type input: LayerOutput
-    :param out_memory: output of previous time step
+    :param out_memory: The output of previous time step.
    :type out_memory: LayerOutput | None
-    :param name: lstmemory unit name.
+    :param name: The lstmemory unit name.
    :type name: basestring
-    :param size: lstmemory unit size.
+    :param size: The lstmemory unit size.
    :type size: int
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
    :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activiation type of lstm.
    :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activiation type of lstm.
    :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activiation type of lstm.
    :type state_act: BaseActivation
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
    :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param lstm_layer_attr: extra attribute of lstm layer.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|True|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
    :type lstm_layer_attr: ExtraLayerAttribute
-    :return: lstmemory unit name.
+    :return: The lstmemory unit name.
    :rtype: LayerOutput
    """
    if size is None:
@@ -786,34 +794,42 @@ def lstmemory_group(input,
                                    gate_act=SigmoidActivation(),
                                    state_act=TanhActivation())

-    :param input: input layer.
+    :param input: Input layer.
    :type input: LayerOutput
-    :param size: lstmemory group size.
+    :param size: The lstmemory group size.
    :type size: int
-    :param name: name of lstmemory group.
+    :param name: The name of lstmemory group.
    :type name: basestring
-    :param out_memory: output of previous time step.
+    :param out_memory: The output of previous time step.
    :type out_memory: LayerOutput | None
-    :param reverse: process the input in a reverse order or not.
+    :param reverse: Process the input in a reverse order or not.
    :type reverse: bool
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
    :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activiation type of lstm.
    :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activiation type of lstm.
    :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activiation type of lstm.
    :type state_act: BaseActivation
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                           False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
    :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_layer_attr: lstm layer's extra attribute.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|True|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
    :type lstm_layer_attr: ExtraLayerAttribute
    :return: the lstmemory group.
    :rtype: LayerOutput

--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
@@ -15,8 +15,8 @@
 """

 __all__ = [
-    "BasePoolingType", "MaxPooling", "AvgPooling", "CudnnMaxPooling",
-    "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
+    "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
+    "CudnnMaxPooling", "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
 ]


@@ -55,6 +55,19 @@ class MaxPooling(BasePoolingType):
        self.output_max_index = output_max_index


+class MaxWithMaskPooling(BasePoolingType):
+    """
+    MaxWithMask pooling.
+
+    Not only return the very large values for each dimension in sequence or time steps,
+    but also the location indices of found maxinum values.
+
+    """
+
+    def __init__(self):
+        BasePoolingType.__init__(self, "max-pool-with-mask")
+
+
 class CudnnMaxPooling(BasePoolingType):
    """
    Cudnn max pooling only support GPU. Return the maxinum value in the

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
@@ -28,6 +28,8 @@ layers {
      stride_y: 1
      output_y: 227
      img_size_y: 256
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -28,6 +28,8 @@ layers {
      stride_y: 1
      output_y: 227
      img_size_y: 256
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
@@ -28,6 +28,8 @@ layers {
      stride_y: 1
      output_y: 48
      img_size_y: 48
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
@@ -30,6 +30,8 @@ layers {
      stride_y: 1
      output_y: 48
      img_size_y: 48
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"
@@ -105,6 +107,8 @@ layers {
      stride_y: 1
      output_y: 24
      img_size_y: 24
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_1__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
@@ -30,6 +30,8 @@ layers {
      stride_y: 1
      output_y: 48
      img_size_y: 48
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
@@ -36,6 +36,8 @@ layers {
      stride_y: 1
      output_y: 14
      img_size_y: 14
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -37,6 +37,8 @@ import model
 import paddle.trainer.config_parser as cp

 __all__ = [
+    'default_startup_program',
+    'default_main_program',
    'optimizer',
    'layer',
    'activation',

--- a/python/paddle/v2/framework/.gitignore
+++ b/python/paddle/v2/framework/.gitignore
--- a/python/paddle/v2/framework/__init__.py
+++ b/python/paddle/v2/framework/__init__.py
--- a/python/paddle/v2/framework/backward.py
+++ b/python/paddle/v2/framework/backward.py
-from paddle.v2.framework import framework as framework
+from paddle.v2.fluid import framework as framework

 __all__ = ['append_backward_ops']


--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/framework/default_scope_funcs.py
@@ -13,7 +13,7 @@ A `scoped_function` will take a `function` as input. That function will be
 invoked in a new local scope. 
 """

-import paddle.v2.framework.core
+import paddle.v2.fluid.core
 import threading

 __tl_scope__ = threading.local()
@@ -27,13 +27,13 @@ __all__ = [
 def get_cur_scope():
    """
    Get current scope.
-    :rtype: paddle.v2.framework.core.Scope
+    :rtype: paddle.v2.fluid.core.Scope
    """
    cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
    if cur_scope_stack is None:
        __tl_scope__.cur_scope = list()
    if len(__tl_scope__.cur_scope) == 0:
-        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope())
+        __tl_scope__.cur_scope.append(paddle.v2.fluid.core.Scope())
    return __tl_scope__.cur_scope[-1]



--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
+import numpy as np
+from paddle.v2.fluid.framework import Program, g_main_program, unique_name, Variable
+import paddle.v2.fluid.core as core
+
+
+def _clone_var_in_block_(block, var):
+    assert isinstance(var, Variable)
+    return block.create_var(
+        name=var.name,
+        shape=var.shape,
+        dtype=var.data_type,
+        type=var.type,
+        lod_level=var.lod_level,
+        persistable=True)
+
+
+class Evaluator(object):
+    """
+    Evalutor Base class.
+
+    create metric states
+    add mini-batch evaluator caculate operator
+    add increment operator to accumulate the metric states
+    """
+
+    def __init__(self, name, **kwargs):
+        """
+        init the global states
+        """
+        self._states = {}
+        if kwargs.has_key("main_program"):
+            self._main_program = kwargs.get("main_program")
+        else:
+            self._main_program = g_main_program
+
+    def _update_ops(self, *args, **kwargs):
+        """
+        append update ops to the global states
+        """
+        raise NotImplementedError()
+
+    def reset(self, executor, reset_program=None):
+        """
+        Clear metric states at the begin of each pass/user specified batch
+        """
+        if reset_program == None:
+            reset_program = Program()
+        else:
+            reset_program = program
+        block = reset_program.global_block()
+        for k, var in self._states.iteritems():
+            g_var = _clone_var_in_block_(block, var)
+            zeros = block.create_var(dtype="float32", persistable=True)
+            block.append_op(
+                type="fill_constant",
+                outputs={"Out": [zeros]},
+                attrs={
+                    "shape": g_var.shape,
+                    "value": .0,
+                    "data_type": 5,
+                })
+            block.append_op(
+                type="scale", inputs={"X": zeros}, outputs={"Out": g_var})
+        executor.run(reset_program, fetch_list=self._states.values())
+
+    def eval(self, executor, eval_program=None):
+        """
+        Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
+        """
+        raise NotImplementedError()
+
+
+class Accuracy(Evaluator):
+    """
+    Accuracy need two state variable Total, Correct
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(Accuracy, self).__init__("accuracy", **kwargs)
+        block = self._main_program.global_block()
+        g_total = block.create_var(
+            name=unique_name("Total"),
+            persistable=True,
+            dtype="int64",
+            shape=[1])
+        g_correct = block.create_var(
+            name=unique_name("Correct"),
+            persistable=True,
+            dtype="int64",
+            shape=[1])
+        self._states["Total"] = g_total
+        self._states["Correct"] = g_correct
+
+    def _update_ops(self, input, label, k=1, **kwargs):
+        block = self._main_program.global_block()
+        topk_out = block.create_var(dtype=input.data_type)
+        topk_indices = block.create_var(dtype="int64")
+        block.append_op(
+            type="top_k",
+            inputs={"X": [input]},
+            outputs={"Out": [topk_out],
+                     "Indices": [topk_indices]},
+            attrs={"k": k})
+        acc_out = block.create_var(dtype=kwargs.get("out_dtype", "float32"))
+        correct = block.create_var(dtype="int64", persistable=True)
+        total = block.create_var(dtype="int64", persistable=True)
+        block.append_op(
+            type="accuracy",
+            inputs={
+                "Out": [topk_out],
+                "Indices": [topk_indices],
+                "Label": [label]
+            },
+            outputs={
+                "Accuracy": [acc_out],
+                "Correct": [correct],
+                "Total": [total],
+            })
+
+        block.append_op(
+            type="cast",
+            inputs={"X": [self._states["Total"]]},
+            outputs={"Out": [self._states["Total"]]},
+            attrs={
+                "in_data_type": 5,  # float32
+                "out_data_type": 2,  #int32
+            })
+        block.append_op(
+            type="cast",
+            inputs={"X": [self._states["Correct"]]},
+            outputs={"Out": [self._states["Correct"]]},
+            attrs={
+                "in_data_type": 5,
+                "out_data_type": 2,
+            })
+
+        block.append_op(
+            type="elementwise_add",
+            inputs={"X": [self._states["Total"]],
+                    "Y": [total]},
+            outputs={"Out": [self._states["Total"]]})
+        block.append_op(
+            type="elementwise_add",
+            inputs={"X": [self._states["Correct"]],
+                    "Y": [correct]},
+            outputs={"Out": [self._states["Correct"]]})
+
+        return acc_out
+
+    def eval(self, executor, eval_program=None):
+        if eval_program != None:
+            eval_program = eval_program
+        else:
+            eval_program = Program()
+        block = eval_program.global_block()
+        eval_out = block.create_var(dtype=self._states["Total"].data_type)
+        e_total = _clone_var_in_block_(block, self._states["Total"])
+        e_correct = _clone_var_in_block_(block, self._states["Correct"])
+        block.append_op(
+            type="cast",
+            inputs={"X": [e_total]},
+            outputs={"Out": [e_total]},
+            attrs={
+                "in_data_type": 2,  #int32
+                "out_data_type": 5,  #float32
+            })
+        block.append_op(
+            type="cast",
+            inputs={"X": [e_correct]},
+            outputs={"Out": [e_correct]},
+            attrs={
+                "in_data_type": 2,
+                "out_data_type": 5,
+            })
+        block.append_op(
+            type="elementwise_div",
+            inputs={"X": e_correct,
+                    "Y": e_total},
+            outputs={"Out": eval_out})
+        out = executor.run(eval_program, fetch_list=[eval_out])
+        return np.array(out[0])
+
+
+def accuracy(*args, **kwargs):
+    cls = Accuracy(*args, **kwargs)
+    out = cls._update_ops(*args, **kwargs)
+    return cls, out
--- a/python/paddle/v2/framework/executor.py
+++ b/python/paddle/v2/framework/executor.py
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Block, Program, g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import Block, Program, g_main_program

 g_scope = core.Scope()


--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 import collections
 import numpy as np
 import copy

-__all__ = ['Block', 'Variable', 'Program', 'Operator']
+__all__ = ['Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'default_main_program']


 def unique_name(prefix):
@@ -562,3 +562,9 @@ class Parameter(Variable):
 # program is a global instance.
 g_main_program = Program()
 g_startup_program = Program()
+
+def default_startup_program():
+    return g_startup_program
+
+def default_main_program():
+    return g_main_program
--- a/python/paddle/v2/framework/initializer.py
+++ b/python/paddle/v2/framework/initializer.py
-import paddle.v2.framework.framework as framework
+import paddle.v2.fluid.framework as framework
 import numpy as np

 __all__ = [

--- a/python/paddle/v2/framework/io.py
+++ b/python/paddle/v2/framework/io.py
 import os
 import cPickle as pickle

-from paddle.v2.framework.framework import Program, Parameter, g_main_program, \
+from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
    Variable

 __all__ = [

--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/framework/layer_helper.py
 import copy
 import itertools

-from paddle.v2.framework.framework import Variable, g_main_program, \
+from paddle.v2.fluid.framework import Variable, g_main_program, \
    g_startup_program, unique_name, Program
-from paddle.v2.framework.initializer import ConstantInitializer, \
+from paddle.v2.fluid.initializer import ConstantInitializer, \
    UniformInitializer, XavierInitializer



--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
-from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, \
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
+from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \
    Operator
-from paddle.v2.framework.initializer import ConstantInitializer, \
+from paddle.v2.fluid.initializer import ConstantInitializer, \
    NormalInitializer
-from paddle.v2.framework.layer_helper import LayerHelper, unique_name
+from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 import re
 import cStringIO

@@ -574,7 +574,9 @@ def accuracy(input, label, k=1, **kwargs):
                 "Indices": [topk_indices]},
        attrs={"k": k})
    acc_out_dtype = kwargs.get("out_dtype", "float32")
-    acc_out = helper.create_tmp_variable(dtype=acc_out_dtype)
+    acc_out = helper.create_tmp_variable(dtype="float32")
+    correct = helper.create_tmp_variable(dtype="int64")
+    total = helper.create_tmp_variable(dtype="int64")
    helper.append_op(
        type="accuracy",
        inputs={
@@ -582,7 +584,11 @@ def accuracy(input, label, k=1, **kwargs):
            "Indices": [topk_indices],
            "Label": [label]
        },
-        outputs={"Accuracy": [acc_out]})
+        outputs={
+            "Accuracy": [acc_out],
+            "Correct": [correct],
+            "Total": [total],
+        })
    return acc_out


@@ -839,6 +845,23 @@ def batch_norm(input,
    return helper.append_activation(batch_norm_out)


+def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+    helper = LayerHelper('beam_search_decode', **locals())
+    sentence_ids = helper.create_tmp_variable(dtype=ids.data_type)
+    sentence_scores = helper.create_tmp_variable(dtype=ids.data_type)
+
+    helper.append_op(
+        type="beam_search_decode",
+        inputs={"Ids": ids,
+                "Scores": scores},
+        outputs={
+            "SentenceIds": sentence_ids,
+            "SentenceScores": sentence_scores
+        })
+
+    return sentence_ids, sentence_scores
+
+
 class BlockGuard(object):
    """
    BlockGuard class.

--- a/python/paddle/v2/framework/net_drawer.py
+++ b/python/paddle/v2/framework/net_drawer.py
@@ -3,8 +3,8 @@ import json
 import logging
 from collections import defaultdict

-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2

 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)

--- a/python/paddle/v2/framework/nets.py
+++ b/python/paddle/v2/framework/nets.py
-import paddle.v2.framework.layers as layers
+import paddle.v2.fluid.layers as layers

 __all__ = ["simple_img_conv_pool", "sequence_conv_pool"]


--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2


 def get_all_op_protos():

--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/framework/optimizer.py
 from collections import defaultdict

-import paddle.v2.framework.framework as framework
-from paddle.v2.framework.framework import unique_name, Program
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.initializer import ConstantInitializer
-from paddle.v2.framework.regularizer import append_regularization_ops
-from paddle.v2.framework.layer_helper import LayerHelper
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.framework import unique_name, Program
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.initializer import ConstantInitializer
+from paddle.v2.fluid.regularizer import append_regularization_ops
+from paddle.v2.fluid.layer_helper import LayerHelper

 __all__ = [
    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',

--- a/python/paddle/v2/framework/regularizer.py
+++ b/python/paddle/v2/framework/regularizer.py
-import paddle.v2.framework.framework as framework
+import paddle.v2.fluid.framework as framework

 __all__ = [
    'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer'

--- a/python/paddle/v2/framework/tests/.gitignore
+++ b/python/paddle/v2/framework/tests/.gitignore
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
--- a/python/paddle/v2/framework/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/book/CMakeLists.txt
--- a/python/paddle/v2/framework/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/framework/tests/book/test_fit_a_line.py
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.io import save_persistables, load_persistables
-from paddle.v2.framework.executor import Executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.io import save_persistables, load_persistables
+from paddle.v2.fluid.executor import Executor

 import numpy as np

-startup_program = Program()
-main_program = Program()
 x = layers.data(
    name='x',
    shape=[13],
-    data_type='float32',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='float32')

 y_predict = layers.fc(input=x,
                      size=1,
-                      act=None,
-                      main_program=main_program,
-                      startup_program=startup_program)
+                      act=None)

 y = layers.data(
    name='y',
    shape=[1],
-    data_type='float32',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='float32')

 cost = layers.square_error_cost(
    input=y_predict,
-    label=y,
-    main_program=main_program,
-    startup_program=startup_program)
-avg_cost = layers.mean(
-    x=cost, main_program=main_program, startup_program=startup_program)
+    label=y)
+avg_cost = layers.mean(x=cost)

 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost, startup_program)
+opts = sgd_optimizer.minimize(avg_cost)

 BATCH_SIZE = 20

@@ -52,12 +40,12 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)

-exe.run(startup_program, feed={}, fetch_list=[])
+exe.run(framework.default_startup_program())

 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
-    save_persistables(exe, "./fit_a_line.model/", main_program=main_program)
-    load_persistables(exe, "./fit_a_line.model/", main_program=main_program)
+    save_persistables(exe, "./fit_a_line.model/")
+    load_persistables(exe, "./fit_a_line.model/")
    for data in train_reader():
        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
        y_data = np.array(map(lambda x: x[1], data)).astype("float32")
@@ -69,7 +57,7 @@ for pass_id in range(PASS_NUM):
        tensor_y = core.LoDTensor()
        tensor_y.set(y_data, place)
        # print tensor_y.get_dims()
-        outs = exe.run(main_program,
+        outs = exe.run(framework.default_main_program(),
                       feed={'x': tensor_x,
                             'y': tensor_y},
                       fetch_list=[avg_cost])

--- a/python/paddle/v2/framework/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/framework/tests/book/test_image_classification_train.py
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.framework.core as core
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_startup_program, g_main_program
-from paddle.v2.framework.initializer import XavierInitializer
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.initializer import XavierInitializer


-def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
+def resnet_cifar10(input, depth=32):
    def conv_bn_layer(input,
                      ch_out,
                      filter_size,
                      stride,
                      padding,
-                      act='relu',
-                      main_program=None,
-                      startup_program=None):
+                      act='relu'):
        tmp = layers.conv2d(
            input=input,
            filter_size=filter_size,
@@ -25,14 +23,10 @@ def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
            stride=stride,
            padding=padding,
            act=None,
-            bias_attr=False,
-            main_program=main_program,
-            startup_program=startup_program)
+            bias_attr=False)
        return layers.batch_norm(
            input=tmp,
-            act=act,
-            main_program=main_program,
-            startup_program=startup_program)
+            act=act)

    def shortcut(input, ch_in, ch_out, stride, program, init_program):
        if ch_in != ch_out:
@@ -44,40 +38,30 @@ def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
    def basicblock(input,
                   ch_in,
                   ch_out,
-                   stride,
-                   main_program=main_program,
-                   startup_program=startup_program):
+                   stride):
        tmp = conv_bn_layer(
            input,
            ch_out,
            3,
            stride,
-            1,
-            main_program=main_program,
-            startup_program=startup_program)
+            1)
        tmp = conv_bn_layer(
            tmp,
            ch_out,
            3,
            1,
            1,
-            act=None,
-            main_program=main_program,
-            startup_program=startup_program)
-        short = shortcut(input, ch_in, ch_out, stride, main_program,
-                         startup_program)
+            act=None)
+        short = shortcut(input, ch_in, ch_out, stride)
        return layers.elementwise_add(
            x=tmp,
            y=short,
-            act='relu',
-            main_program=main_program,
-            startup_program=startup_program)
+            act='relu')

-    def layer_warp(block_func, input, ch_in, ch_out, count, stride, program,
-                   startup_program):
-        tmp = block_func(input, ch_in, ch_out, stride, program, startup_program)
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
        for i in range(1, count):
-            tmp = block_func(tmp, ch_out, ch_out, 1, program, startup_program)
+            tmp = block_func(tmp, ch_out, ch_out, 1)
        return tmp

    assert (depth - 2) % 6 == 0
@@ -87,53 +71,41 @@ def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
        ch_out=16,
        filter_size=3,
        stride=1,
-        padding=1,
-        main_program=main_program,
-        startup_program=startup_program)
+        padding=1)
    res1 = layer_warp(
        basicblock,
        conv1,
        16,
        16,
        n,
-        1,
-        main_program=main_program,
-        startup_program=startup_program)
+        1)
    res2 = layer_warp(
        basicblock,
        res1,
        16,
        32,
        n,
-        2,
-        main_program=main_program,
-        startup_program=startup_program)
+        2)
    res3 = layer_warp(
        basicblock,
        res2,
        32,
        64,
        n,
-        2,
-        main_program=main_program,
-        startup_program=startup_program)
+        2)
    pool = layers.pool2d(
        input=res3,
        pool_size=8,
        pool_type='avg',
-        pool_stride=1,
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_stride=1)
    return pool


-def vgg16_bn_drop(input, main_program=None, startup_program=None):
+def vgg16_bn_drop(input):
    def conv_block(input,
                   num_filter,
                   groups,
-                   dropouts,
-                   main_program=None,
-                   startup_program=None):
+                   dropouts):
        return nets.img_conv_group(
            input=input,
            pool_size=2,
@@ -143,51 +115,34 @@ def vgg16_bn_drop(input, main_program=None, startup_program=None):
            conv_act='relu',
            conv_with_batchnorm=True,
            conv_batchnorm_drop_rate=dropouts,
-            pool_type='max',
-            main_program=main_program,
-            startup_program=startup_program)
+            pool_type='max')

-    conv1 = conv_block(input, 64, 2, [0.3, 0], main_program, startup_program)
-    conv2 = conv_block(conv1, 128, 2, [0.4, 0], main_program, startup_program)
-    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0], main_program,
-                       startup_program)
-    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0], main_program,
-                       startup_program)
-    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0], main_program,
-                       startup_program)
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])

    drop = layers.dropout(
        x=conv5,
-        dropout_prob=0.5,
-        main_program=main_program,
-        startup_program=startup_program)
+        dropout_prob=0.5)
    fc1 = layers.fc(input=drop,
                    size=512,
                    act=None,
-                    param_attr={"initializer": XavierInitializer()},
-                    main_program=main_program,
-                    startup_program=startup_program)
+                    param_attr={"initializer": XavierInitializer()})
    reshape1 = layers.reshape(
        x=fc1,
-        shape=list(fc1.shape + (1, 1)),
-        main_program=main_program,
-        startup_program=startup_program)
+        shape=list(fc1.shape + (1, 1)))
    bn = layers.batch_norm(
        input=reshape1,
-        act='relu',
-        main_program=main_program,
-        startup_program=startup_program)
+        act='relu')
    drop2 = layers.dropout(
        x=bn,
-        dropout_prob=0.5,
-        main_program=main_program,
-        startup_program=startup_program)
+        dropout_prob=0.5)
    fc2 = layers.fc(input=drop2,
                    size=512,
                    act=None,
-                    param_attr={"initializer": XavierInitializer()},
-                    main_program=main_program,
-                    startup_program=startup_program)
+                    param_attr={"initializer": XavierInitializer()})
    return fc2


@@ -225,7 +180,7 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)

-exe.run(g_startup_program, feed={}, fetch_list=[])
+exe.run(framework.default_startup_program())

 for pass_id in range(PASS_NUM):
    batch_id = 0
@@ -243,7 +198,7 @@ for pass_id in range(PASS_NUM):
        tensor_img.set(img_data, place)
        tensor_y.set(y_data, place)

-        outs = exe.run(g_main_program,
+        outs = exe.run(framework.default_main_program(),
                       feed={"pixel": tensor_img,
                             "label": tensor_y},
                       fetch_list=[avg_cost, accuracy])

--- a/python/paddle/v2/framework/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/framework/tests/book/test_recognize_digits_conv.py
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.evaluator as evaluator
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.executor import Executor

 import numpy as np

-startup_program = Program()
-main_program = Program()
-
 images = layers.data(
    name='pixel',
    shape=[1, 28, 28],
-    data_type='float32',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='float32')
 label = layers.data(
    name='label',
    shape=[1],
-    data_type='int64',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='int64')
 conv_pool_1 = nets.simple_img_conv_pool(
    input=images,
    filter_size=5,
    num_filters=20,
    pool_size=2,
    pool_stride=2,
-    act="relu",
-    main_program=main_program,
-    startup_program=startup_program)
+    act="relu")
 conv_pool_2 = nets.simple_img_conv_pool(
    input=conv_pool_1,
    filter_size=5,
    num_filters=50,
    pool_size=2,
    pool_stride=2,
-    act="relu",
-    main_program=main_program,
-    startup_program=startup_program)
+    act="relu")

 predict = layers.fc(input=conv_pool_2,
                    size=10,
-                    act="softmax",
-                    main_program=main_program,
-                    startup_program=startup_program)
-cost = layers.cross_entropy(
-    input=predict,
-    label=label,
-    main_program=main_program,
-    startup_program=startup_program)
-avg_cost = layers.mean(x=cost, main_program=main_program)
-accuracy = layers.accuracy(
-    input=predict,
-    label=label,
-    main_program=main_program,
-    startup_program=startup_program)
-
-# optimizer = optimizer.MomentumOptimizer(learning_rate=0.1 / 128.0,
-# momentum=0.9)
+                    act="softmax")
+cost = layers.cross_entropy(input=predict, label=label)
+avg_cost = layers.mean(x=cost)
 optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
-opts = optimizer.minimize(avg_cost, startup_program)
+opts = optimizer.minimize(avg_cost)
+
+accuracy, acc_out = evaluator.accuracy(
+    input=predict,
+    label=label)

 BATCH_SIZE = 50
 PASS_NUM = 3
@@ -75,10 +54,11 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)

-exe.run(startup_program, feed={}, fetch_list=[])
+exe.run(framework.default_startup_program())

 for pass_id in range(PASS_NUM):
    count = 0
+    accuracy.reset(exe)
    for data in train_reader():
        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
                                data)).astype("float32")
@@ -90,14 +70,20 @@ for pass_id in range(PASS_NUM):
        tensor_img.set(img_data, place)
        tensor_y.set(y_data, place)

-        outs = exe.run(main_program,
+        outs = exe.run(framework.default_main_program(),
                       feed={"pixel": tensor_img,
                             "label": tensor_y},
-                       fetch_list=[avg_cost, accuracy])
+                       fetch_list=[avg_cost, acc_out])
        loss = np.array(outs[0])
        acc = np.array(outs[1])
-
+        pass_acc = accuracy.eval(exe)
+        print "pass id : ", pass_id, pass_acc
+        # print loss, acc
        if loss < 10.0 and acc > 0.9:
            # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
            exit(0)
+
+    pass_acc = accuracy.eval(exe)
+    print "pass id : ", pass_id, pass_acc
+
 exit(1)
--- a/python/paddle/v2/framework/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/framework/tests/book/test_recognize_digits_mlp.py
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.regularizer import L2DecayRegularizer
-from paddle.v2.framework.initializer import UniformInitializer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.regularizer import L2DecayRegularizer
+from paddle.v2.fluid.initializer import UniformInitializer

 import numpy as np

 BATCH_SIZE = 128
-startup_program = Program()
-main_program = Program()
 image = layers.data(
    name='x',
    shape=[784],
-    data_type='float32',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='float32')

 param_attr = {
    'name': None,
@@ -30,45 +25,30 @@ param_attr = {
 hidden1 = layers.fc(input=image,
                    size=128,
                    act='relu',
-                    main_program=main_program,
-                    startup_program=startup_program,
                    param_attr=param_attr)
 hidden2 = layers.fc(input=hidden1,
                    size=64,
                    act='relu',
-                    main_program=main_program,
-                    startup_program=startup_program,
                    param_attr=param_attr)

 predict = layers.fc(input=hidden2,
                    size=10,
                    act='softmax',
-                    main_program=main_program,
-                    startup_program=startup_program,
                    param_attr=param_attr)

 label = layers.data(
    name='y',
    shape=[1],
-    data_type='int64',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='int64')

-cost = layers.cross_entropy(
-    input=predict,
-    label=label,
-    main_program=main_program,
-    startup_program=startup_program)
-avg_cost = layers.mean(
-    x=cost, main_program=main_program, startup_program=startup_program)
+cost = layers.cross_entropy(input=predict, label=label)
+avg_cost = layers.mean(x=cost)
 accuracy = layers.accuracy(
    input=predict,
-    label=label,
-    main_program=main_program,
-    startup_program=startup_program)
+    label=label)

 optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
-opts = optimizer.minimize(avg_cost, startup_program)
+opts = optimizer.minimize(avg_cost)

 train_reader = paddle.batch(
    paddle.reader.shuffle(
@@ -78,7 +58,7 @@ train_reader = paddle.batch(
 place = core.CPUPlace()
 exe = Executor(place)

-exe.run(startup_program, feed={}, fetch_list=[])
+exe.run(framework.default_startup_program())

 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
@@ -93,7 +73,7 @@ for pass_id in range(PASS_NUM):
        tensor_y = core.LoDTensor()
        tensor_y.set(y_data, place)

-        outs = exe.run(main_program,
+        outs = exe.run(framework.default_main_program(),
                       feed={'x': tensor_x,
                             'y': tensor_y},
                       fetch_list=[avg_cost, accuracy])

--- a/python/paddle/v2/framework/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/framework/tests/book/test_recommender_system.py
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.executor import Executor

 import numpy as np

-startup_program = Program()
-main_program = Program()
-is_sparse = True
-use_gpu = False
+IS_SPARSE = True
+USE_GPU = False
 BATCH_SIZE = 256


@@ -25,99 +22,71 @@ def get_usr_combined_features():
    uid = layers.data(
        name='user_id',
        shape=[1],
-        data_type='int64',
-        main_program=main_program,
-        startup_program=startup_program)
+        data_type='int64')

    usr_emb = layers.embedding(
        input=uid,
        data_type='float32',
        size=[USR_DICT_SIZE, 32],
        param_attr={'name': 'user_table'},
-        is_sparse=is_sparse,
-        main_program=main_program,
-        startup_program=startup_program)
+        is_sparse=IS_SPARSE)

    usr_fc = layers.fc(input=usr_emb,
-                       size=32,
-                       main_program=main_program,
-                       startup_program=startup_program)
+                       size=32)

    USR_GENDER_DICT_SIZE = 2

    usr_gender_id = layers.data(
        name='gender_id',
        shape=[1],
-        data_type='int64',
-        main_program=main_program,
-        startup_program=startup_program)
+        data_type='int64')

    usr_gender_emb = layers.embedding(
        input=usr_gender_id,
        size=[USR_GENDER_DICT_SIZE, 16],
        param_attr={'name': 'gender_table'},
-        is_sparse=is_sparse,
-        main_program=main_program,
-        startup_program=startup_program)
+        is_sparse=IS_SPARSE)

    usr_gender_fc = layers.fc(input=usr_gender_emb,
-                              size=16,
-                              main_program=main_program,
-                              startup_program=startup_program)
+                              size=16)

    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
    usr_age_id = layers.data(
        name='age_id',
        shape=[1],
-        data_type="int64",
-        main_program=main_program,
-        startup_program=startup_program)
+        data_type="int64")

    usr_age_emb = layers.embedding(
        input=usr_age_id,
        size=[USR_AGE_DICT_SIZE, 16],
-        is_sparse=is_sparse,
-        param_attr={'name': 'age_table'},
-        main_program=main_program,
-        startup_program=startup_program)
+        is_sparse=IS_SPARSE,
+        param_attr={'name': 'age_table'})

    usr_age_fc = layers.fc(input=usr_age_emb,
-                           size=16,
-                           main_program=main_program,
-                           startup_program=startup_program)
+                           size=16)

    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
    usr_job_id = layers.data(
        name='job_id',
        shape=[1],
-        data_type="int64",
-        main_program=main_program,
-        startup_program=startup_program)
+        data_type="int64")

    usr_job_emb = layers.embedding(
        input=usr_job_id,
        size=[USR_JOB_DICT_SIZE, 16],
        param_attr={'name': 'job_table'},
-        is_sparse=is_sparse,
-        main_program=main_program,
-        startup_program=startup_program)
+        is_sparse=IS_SPARSE)

    usr_job_fc = layers.fc(input=usr_job_emb,
-                           size=16,
-                           main_program=main_program,
-                           startup_program=startup_program)
+                           size=16)

    concat_embed = layers.concat(
        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc],
-        axis=1,
-        main_program=main_program,
-        startup_program=startup_program)
+        axis=1)

    usr_combined_features = layers.fc(input=concat_embed,
                                      size=200,
-                                      act="tanh",
-                                      main_program=main_program,
-                                      startup_program=startup_program)
+                                      act="tanh")

    return usr_combined_features

@@ -129,83 +98,61 @@ def get_mov_combined_features():
    mov_id = layers.data(
        name='movie_id',
        shape=[1],
-        data_type='int64',
-        main_program=main_program,
-        startup_program=startup_program)
+        data_type='int64')

    mov_emb = layers.embedding(
        input=mov_id,
        data_type='float32',
        size=[MOV_DICT_SIZE, 32],
        param_attr={'name': 'movie_table'},
-        is_sparse=is_sparse,
-        main_program=main_program,
-        startup_program=startup_program)
+        is_sparse=IS_SPARSE)

    mov_fc = layers.fc(input=mov_emb,
-                       size=32,
-                       main_program=main_program,
-                       startup_program=startup_program)
+                       size=32)

    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

    category_id = layers.data(
        name='category_id',
        shape=[1],
-        data_type='int64',
-        main_program=main_program,
-        startup_program=startup_program)
+        data_type='int64')

    mov_categories_emb = layers.embedding(
        input=category_id,
        size=[CATEGORY_DICT_SIZE, 32],
-        is_sparse=is_sparse,
-        main_program=main_program,
-        startup_program=startup_program)
+        is_sparse=IS_SPARSE)

    mov_categories_hidden = layers.sequence_pool(
        input=mov_categories_emb,
-        pool_type="sum",
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_type="sum")

    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

    mov_title_id = layers.data(
        name='movie_title',
        shape=[1],
-        data_type='int64',
-        main_program=main_program,
-        startup_program=startup_program)
+        data_type='int64')

    mov_title_emb = layers.embedding(
        input=mov_title_id,
        size=[MOV_TITLE_DICT_SIZE, 32],
-        is_sparse=is_sparse,
-        main_program=main_program,
-        startup_program=startup_program)
+        is_sparse=IS_SPARSE)

    mov_title_conv = nets.sequence_conv_pool(
        input=mov_title_emb,
        num_filters=32,
        filter_size=3,
        act="tanh",
-        pool_type="sum",
-        main_program=main_program,
-        startup_program=startup_program)
+        pool_type="sum")

    concat_embed = layers.concat(
        input=[mov_fc, mov_categories_hidden, mov_title_conv],
-        axis=1,
-        main_program=main_program,
-        startup_program=startup_program)
+        axis=1)

    # FIXME(dzh) : need tanh operator
    mov_combined_features = layers.fc(input=concat_embed,
                                      size=200,
-                                      act="tanh",
-                                      main_program=main_program,
-                                      startup_program=startup_program)
+                                      act="tanh")

    return mov_combined_features

@@ -217,27 +164,18 @@ def model():
    # need cos sim
    inference = layers.cos_sim(
        X=usr_combined_features,
-        Y=mov_combined_features,
-        main_program=main_program,
-        startup_program=startup_program)
+        Y=mov_combined_features)

    label = layers.data(
        name='score',
        shape=[1],
-        data_type='float32',
-        main_program=main_program,
-        startup_program=startup_program)
+        data_type='float32')

    square_cost = layers.square_error_cost(
        input=inference,
-        label=label,
-        main_program=main_program,
-        startup_program=startup_program)
+        label=label)

-    avg_cost = layers.mean(
-        x=square_cost,
-        main_program=main_program,
-        startup_program=startup_program)
+    avg_cost = layers.mean(x=square_cost)

    return avg_cost

@@ -245,16 +183,15 @@ def model():
 def main():
    cost = model()
    sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
-    opts = sgd_optimizer.minimize(cost, startup_program=startup_program)
-    block = main_program.block(0)
+    opts = sgd_optimizer.minimize(cost)

-    if use_gpu:
+    if USE_GPU:
        place = core.GPUPlace(0)
    else:
        place = core.CPUPlace()

    exe = Executor(place)
-    exe.run(startup_program, feed={}, fetch_list=[])
+    exe.run(framework.default_startup_program())

    train_reader = paddle.batch(
        paddle.reader.shuffle(
@@ -303,7 +240,7 @@ def main():
    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for data in train_reader():
-            outs = exe.run(main_program,
+            outs = exe.run(framework.default_main_program(),
                           feed=func_feed(feeding, data),
                           fetch_list=[cost])
            out = np.array(outs[0])

--- a/python/paddle/v2/framework/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/framework/tests/book/test_understand_sentiment_conv.py
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program, g_main_program, g_startup_program
-from paddle.v2.framework.executor import Executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.executor import Executor

 import numpy as np

@@ -70,7 +69,7 @@ def main():
    place = core.CPUPlace()
    exe = Executor(place)

-    exe.run(g_startup_program)
+    exe.run(framework.default_startup_program())

    for pass_id in xrange(PASS_NUM):
        for data in train_data():
@@ -82,7 +81,7 @@ def main():
            tensor_label = core.LoDTensor()
            tensor_label.set(label, place)

-            outs = exe.run(g_main_program,
+            outs = exe.run(framework.default_main_program(),
                           feed={"words": tensor_words,
                                 "label": tensor_label},
                           fetch_list=[cost, acc])

--- a/python/paddle/v2/framework/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/framework/tests/book/test_understand_sentiment_dynamic_lstm.py
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program, g_main_program, g_startup_program
-from paddle.v2.framework.executor import Executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.executor import Executor

 import numpy as np

@@ -81,7 +80,7 @@ def main():
    place = core.CPUPlace()
    exe = Executor(place)

-    exe.run(g_startup_program)
+    exe.run(framework.default_startup_program())

    for pass_id in xrange(PASS_NUM):
        for data in train_data():
@@ -93,7 +92,7 @@ def main():
            tensor_label = core.LoDTensor()
            tensor_label.set(label, place)

-            outs = exe.run(g_main_program,
+            outs = exe.run(framework.default_main_program(),
                           feed={"words": tensor_words,
                                 "label": tensor_label},
                           fetch_list=[cost, acc])

--- a/python/paddle/v2/framework/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/framework/tests/book/test_understand_sentiment_lstm.py
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import g_main_program, g_startup_program
-from paddle.v2.framework.executor import Executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.executor import Executor

 import numpy as np

@@ -88,10 +87,10 @@ def main():
    place = core.CPUPlace()
    tensor_words, tensor_label = prepare_feed_data(data, place)
    exe = Executor(place)
-    exe.run(g_startup_program)
+    exe.run(framework.default_startup_program())

    while True:
-        outs = exe.run(g_main_program,
+        outs = exe.run(framework.default_main_program(),
                       feed={"words": tensor_words,
                             "label": tensor_label},
                       fetch_list=[cost, acc])

--- a/python/paddle/v2/framework/tests/book/test_word2vec.py
+++ b/python/paddle/v2/framework/tests/book/test_word2vec.py
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.executor import Executor

 import numpy as np

-startup_program = Program()
-main_program = Program()
-
-embed_size = 32
-hidden_size = 256
+PASS_NUM = 100
+EMBED_SIZE = 32
+HIDDEN_SIZE = 256
 N = 5
-batch_size = 32
-is_sparse = True
+BATCH_SIZE = 32
+IS_SPARSE = True

 word_dict = paddle.dataset.imikolov.build_dict()
 dict_size = len(word_dict)
@@ -23,97 +20,67 @@ dict_size = len(word_dict)
 first_word = layers.data(
    name='firstw',
    shape=[1],
-    data_type='int64',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='int64')
 second_word = layers.data(
    name='secondw',
    shape=[1],
-    data_type='int64',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='int64')
 third_word = layers.data(
    name='thirdw',
    shape=[1],
-    data_type='int64',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='int64')
 forth_word = layers.data(
    name='forthw',
    shape=[1],
-    data_type='int64',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='int64')
 next_word = layers.data(
    name='nextw',
    shape=[1],
-    data_type='int64',
-    main_program=main_program,
-    startup_program=startup_program)
+    data_type='int64')

 embed_first = layers.embedding(
    input=first_word,
-    size=[dict_size, embed_size],
+    size=[dict_size, EMBED_SIZE],
    data_type='float32',
-    is_sparse=is_sparse,
-    param_attr={'name': 'shared_w'},
-    main_program=main_program,
-    startup_program=startup_program)
+    is_sparse=IS_SPARSE,
+    param_attr={'name': 'shared_w'})
 embed_second = layers.embedding(
    input=second_word,
-    size=[dict_size, embed_size],
+    size=[dict_size, EMBED_SIZE],
    data_type='float32',
-    is_sparse=is_sparse,
-    param_attr={'name': 'shared_w'},
-    main_program=main_program,
-    startup_program=startup_program)
-
+    is_sparse=IS_SPARSE,
+    param_attr={'name': 'shared_w'})
 embed_third = layers.embedding(
    input=third_word,
-    size=[dict_size, embed_size],
+    size=[dict_size, EMBED_SIZE],
    data_type='float32',
-    is_sparse=is_sparse,
-    param_attr={'name': 'shared_w'},
-    main_program=main_program,
-    startup_program=startup_program)
+    is_sparse=IS_SPARSE,
+    param_attr={'name': 'shared_w'})
 embed_forth = layers.embedding(
    input=forth_word,
-    size=[dict_size, embed_size],
+    size=[dict_size, EMBED_SIZE],
    data_type='float32',
-    is_sparse=is_sparse,
-    param_attr={'name': 'shared_w'},
-    main_program=main_program,
-    startup_program=startup_program)
+    is_sparse=IS_SPARSE,
+    param_attr={'name': 'shared_w'})

 concat_embed = layers.concat(
    input=[embed_first, embed_second, embed_third, embed_forth],
-    axis=1,
-    main_program=main_program,
-    startup_program=startup_program)
-
+    axis=1)
 hidden1 = layers.fc(input=concat_embed,
-                    size=hidden_size,
-                    act='sigmoid',
-                    main_program=main_program,
-                    startup_program=startup_program)
+                    size=HIDDEN_SIZE,
+                    act='sigmoid')
 predict_word = layers.fc(input=hidden1,
                         size=dict_size,
-                         act='softmax',
-                         main_program=main_program,
-                         startup_program=startup_program)
+                         act='softmax')
 cost = layers.cross_entropy(
    input=predict_word,
-    label=next_word,
-    main_program=main_program,
-    startup_program=startup_program)
-avg_cost = layers.mean(
-    x=cost, main_program=main_program, startup_program=startup_program)
-
+    label=next_word)
+avg_cost = layers.mean(x=cost)
 sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
-opts = sgd_optimizer.minimize(avg_cost, startup_program)
+opts = sgd_optimizer.minimize(avg_cost)

 train_reader = paddle.batch(
-    paddle.dataset.imikolov.train(word_dict, N), batch_size)
+    paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)

 place = core.CPUPlace()
 exe = Executor(place)
@@ -122,8 +89,8 @@ exe = Executor(place)
 # below exit line.
 exit(0)

-exe.run(startup_program, feed={}, fetch_list=[])
-PASS_NUM = 100
+exe.run(framework.default_startup_program())
+
 for pass_id in range(PASS_NUM):
    for data in train_reader():
        input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)]
@@ -150,7 +117,7 @@ for pass_id in range(PASS_NUM):
        next_tensor = core.LoDTensor()
        next_tensor.set(next_data, place)

-        outs = exe.run(main_program,
+        outs = exe.run(framework.default_main_program(),
                       feed={
                           'firstw': first_tensor,
                           'secondw': second_tensor,

--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -2,12 +2,12 @@ import unittest
 import numpy as np
 import random
 import itertools
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import collections
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.op import Operator
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import Program, OpProtoHolder
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import Program, OpProtoHolder


 def randomize_probability(batch_size, class_num, dtype='float32'):

--- a/python/paddle/v2/framework/tests/test_accuracy_op.py
+++ b/python/paddle/v2/framework/tests/test_accuracy_op.py
@@ -18,7 +18,9 @@ class TestAccuracyOp(OpTest):
                    num_correct += 1
                    break
        self.outputs = {
-            'Accuracy': np.array([num_correct / float(n)]).astype("float32")
+            'Accuracy': np.array([num_correct / float(n)]).astype("float32"),
+            'Correct': np.array([num_correct]).astype("int32"),
+            'Total': np.array([n]).astype("int32")
        }

    def test_check_output(self):

--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
--- a/python/paddle/v2/framework/tests/test_adadelta_op.py
+++ b/python/paddle/v2/framework/tests/test_adadelta_op.py
--- a/python/paddle/v2/framework/tests/test_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_adam_op.py
+++ b/python/paddle/v2/framework/tests/test_adam_op.py
--- a/python/paddle/v2/framework/tests/test_adamax_op.py
+++ b/python/paddle/v2/framework/tests/test_adamax_op.py
--- a/python/paddle/v2/framework/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/framework/tests/test_array_read_write_op.py
 import unittest
-import paddle.v2.framework.core as core
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.framework import g_main_program
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.framework import g_main_program
 import numpy



--- a/python/paddle/v2/framework/tests/test_assign_op.py
+++ b/python/paddle/v2/framework/tests/test_assign_op.py
--- a/python/paddle/v2/framework/tests/test_auc_op.py
+++ b/python/paddle/v2/framework/tests/test_auc_op.py
--- a/python/paddle/v2/framework/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_batch_norm_op.py
 import unittest
 import numpy as np
 from op_test import OpTest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator


 def grad_var_name(var_name):

--- a/python/paddle/v2/framework/tests/test_bilinear_tensor_product_op.py
+++ b/python/paddle/v2/framework/tests/test_bilinear_tensor_product_op.py
--- a/python/paddle/v2/framework/tests/test_cast_op.py
+++ b/python/paddle/v2/framework/tests/test_cast_op.py
 import op_test
 import unittest
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core


 class TestCastOp(op_test.OpTest):

--- a/python/paddle/v2/framework/tests/test_chunk_eval_op.py
+++ b/python/paddle/v2/framework/tests/test_chunk_eval_op.py
--- a/python/paddle/v2/framework/tests/test_clip_by_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_clip_by_norm_op.py
--- a/python/paddle/v2/framework/tests/test_clip_op.py
+++ b/python/paddle/v2/framework/tests/test_clip_op.py
--- a/python/paddle/v2/framework/tests/test_compare_op.py
+++ b/python/paddle/v2/framework/tests/test_compare_op.py
@@ -23,6 +23,9 @@ def create_test_class(op_type, typename, callback):

 for _type_name in {'float32', 'float64', 'int32', 'int64'}:
    create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
+    create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b)
+    create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b)
+    create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b)
    create_test_class('equal', _type_name, lambda _a, _b: _a == _b)

 if __name__ == '__main__':

--- a/python/paddle/v2/framework/tests/test_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_concat_op.py
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
-from paddle.v2.framework.op import Operator, CondOp
+from paddle.v2.fluid.op import Operator, CondOp


 class PySimpleCond(object):

--- a/python/paddle/v2/framework/tests/test_conditional_block.py
+++ b/python/paddle/v2/framework/tests/test_conditional_block.py
 import unittest
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import g_startup_program, g_main_program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import g_startup_program, g_main_program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy



--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
--- a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
--- a/python/paddle/v2/framework/tests/test_conv3d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv3d_op.py
--- a/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
--- a/python/paddle/v2/framework/tests/test_conv_shift_op.py
+++ b/python/paddle/v2/framework/tests/test_conv_shift_op.py
--- a/python/paddle/v2/framework/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
--- a/python/paddle/v2/framework/tests/test_create_op_doc_string.py
+++ b/python/paddle/v2/framework/tests/test_create_op_doc_string.py
 import unittest
-import paddle.v2.framework.layers as layers
+import paddle.v2.fluid.layers as layers


 class TestDocString(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_crf_decoding_op.py
+++ b/python/paddle/v2/framework/tests/test_crf_decoding_op.py
--- a/python/paddle/v2/framework/tests/test_crop_op.py
+++ b/python/paddle/v2/framework/tests/test_crop_op.py
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
-from paddle.v2.framework.default_scope_funcs import *
+from paddle.v2.fluid.default_scope_funcs import *
 import unittest



--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/framework/tests/test_dropout_op.py
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
-from paddle.v2.framework.op import Operator, DynamicRecurrentOp
+from paddle.v2.fluid.op import Operator, DynamicRecurrentOp
 import numpy as np

 # for siplicity, just one level LoD

--- a/python/paddle/v2/framework/tests/test_elementwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_div_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_div_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
--- a/python/paddle/v2/framework/tests/test_exception.py
+++ b/python/paddle/v2/framework/tests/test_exception.py
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest



--- a/python/paddle/v2/framework/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/framework/tests/test_executor_and_mul.py
 import unittest
-from paddle.v2.framework.layers import mul, data
-import paddle.v2.framework.core as core
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_main_program
+from paddle.v2.fluid.layers import mul, data
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import g_main_program
 import numpy



--- a/python/paddle/v2/framework/tests/test_expand_op.py
+++ b/python/paddle/v2/framework/tests/test_expand_op.py
--- a/python/paddle/v2/framework/tests/test_feed_fetch_method.py
+++ b/python/paddle/v2/framework/tests/test_feed_fetch_method.py
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np


--- a/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
--- a/python/paddle/v2/framework/tests/test_fill_constant_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_constant_op.py
--- a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
--- a/python/paddle/v2/framework/tests/test_framework_debug_str.py
+++ b/python/paddle/v2/framework/tests/test_framework_debug_str.py
 import unittest
-from paddle.v2.framework.framework import Program
+from paddle.v2.fluid.framework import Program


 class TestDebugStringFramework(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_gather_op.py
+++ b/python/paddle/v2/framework/tests/test_gather_op.py
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
 import unittest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 import numpy



--- a/python/paddle/v2/framework/tests/test_gru_op.py
+++ b/python/paddle/v2/framework/tests/test_gru_op.py
--- a/python/paddle/v2/framework/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_gru_unit_op.py
--- a/python/paddle/v2/framework/tests/test_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_huber_loss_op.py
--- a/python/paddle/v2/framework/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/framework/tests/test_image_classification_layer.py
 import unittest

-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-from paddle.v2.framework.framework import Program
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program


 def conv_block(input,

--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
 import unittest

-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core


 class TestInferShape(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_inference_model_io.py
+++ b/python/paddle/v2/framework/tests/test_inference_model_io.py
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer

-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.io import save_inference_model, load_inference_model
-import paddle.v2.framework.executor as executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.io import save_inference_model, load_inference_model
+import paddle.v2.fluid.executor as executor
 import unittest
 import numpy as np


--- a/python/paddle/v2/framework/tests/test_initializer.py
+++ b/python/paddle/v2/framework/tests/test_initializer.py
 import numpy as np
 import unittest

-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.initializer as initializer
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.initializer as initializer

 DELTA = 0.00001


--- a/python/paddle/v2/framework/tests/test_l1_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_l1_norm_op.py
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ b/python/paddle/v2/framework/tests/test_layers.py
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-from paddle.v2.framework.framework import Program
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program
+import paddle.v2.fluid.core as core
 import unittest



--- a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
--- a/python/paddle/v2/framework/tests/test_lod_array_length_op.py
+++ b/python/paddle/v2/framework/tests/test_lod_array_length_op.py
 import unittest
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.executor import Executor
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
 import numpy



--- a/python/paddle/v2/framework/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/framework/tests/test_lod_rank_table.py
-from paddle.v2.framework.layers import lod_rank_table, data
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_main_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.layers import lod_rank_table, data
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import g_main_program
+import paddle.v2.fluid.core as core
 import numpy
 import unittest


--- a/python/paddle/v2/framework/tests/test_lod_reset_op.py
+++ b/python/paddle/v2/framework/tests/test_lod_reset_op.py
--- a/python/paddle/v2/framework/tests/test_lod_tensor_array.py
+++ b/python/paddle/v2/framework/tests/test_lod_tensor_array.py
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import numpy



--- a/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import numpy
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops


 class TestCPULoDTensorArrayOps(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_lookup_table_op.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table_op.py
--- a/python/paddle/v2/framework/tests/test_lrn_op.py
+++ b/python/paddle/v2/framework/tests/test_lrn_op.py
--- a/python/paddle/v2/framework/tests/test_lstm_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_op.py
--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
--- a/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
--- a/python/paddle/v2/framework/tests/test_matmul_op.py
+++ b/python/paddle/v2/framework/tests/test_matmul_op.py
--- a/python/paddle/v2/framework/tests/test_mean_op.py
+++ b/python/paddle/v2/framework/tests/test_mean_op.py
--- a/python/paddle/v2/framework/tests/test_minus_op.py
+++ b/python/paddle/v2/framework/tests/test_minus_op.py
--- a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
--- a/python/paddle/v2/framework/tests/test_momentum_op.py
+++ b/python/paddle/v2/framework/tests/test_momentum_op.py
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
--- a/python/paddle/v2/framework/tests/test_multiplex_op.py
+++ b/python/paddle/v2/framework/tests/test_multiplex_op.py
--- a/python/paddle/v2/framework/tests/test_nccl_init_op.py
+++ b/python/paddle/v2/framework/tests/test_nccl_init_op.py
 import unittest, os
 import numpy as np
 import paddle.v2 as paddle
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 from op_test import OpTest, create_op, set_input

 if not core.is_compile_gpu():

--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 import unittest



--- a/python/paddle/v2/framework/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/framework/tests/test_op_support_gpu.py
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core


 class TestOpSupportGPU(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/framework/tests/test_operator.py
 import unittest
-import paddle.v2.framework.op as op
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.op as op
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2


 class TestGetAllProtos(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_operator_desc.py
+++ b/python/paddle/v2/framework/tests/test_operator_desc.py
 import unittest
-from paddle.v2.framework.framework import Variable, Program, g_main_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import Variable, Program, g_main_program
+import paddle.v2.fluid.core as core


 class TestOperator(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/framework/tests/test_optimizer.py
 import unittest

-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.backward import append_backward_ops


 class TestOptimizer(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_pad_op.py
+++ b/python/paddle/v2/framework/tests/test_pad_op.py
--- a/python/paddle/v2/framework/tests/test_parameter.py
+++ b/python/paddle/v2/framework/tests/test_parameter.py
 import unittest
-from paddle.v2.framework.framework import g_main_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import g_main_program
+import paddle.v2.fluid.core as core


 class TestParameter(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
--- a/python/paddle/v2/framework/tests/test_pool_max_op.py
+++ b/python/paddle/v2/framework/tests/test_pool_max_op.py
--- a/python/paddle/v2/framework/tests/test_positive_negative_pair_op.py
+++ b/python/paddle/v2/framework/tests/test_positive_negative_pair_op.py
--- a/python/paddle/v2/framework/tests/test_precision_recall_op.py
+++ b/python/paddle/v2/framework/tests/test_precision_recall_op.py
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/framework/tests/test_program.py
 import unittest

-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.framework import g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import g_main_program


 class TestProgram(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/framework/tests/test_protobuf.py
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 import unittest



--- a/python/paddle/v2/framework/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core


 class TestOpDesc(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_proximal_gd_op.py
+++ b/python/paddle/v2/framework/tests/test_proximal_gd_op.py
--- a/python/paddle/v2/framework/tests/test_rank_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_rank_loss_op.py
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
 import unittest

-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core


 class PyRNNBase(object):

--- a/python/paddle/v2/framework/tests/test_reduce_op.py
+++ b/python/paddle/v2/framework/tests/test_reduce_op.py
--- a/python/paddle/v2/framework/tests/test_regularizer.py
+++ b/python/paddle/v2/framework/tests/test_regularizer.py
 import unittest

-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.optimizer as optimizer
-import paddle.v2.framework.regularizer as regularizer
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.regularizer as regularizer
+from paddle.v2.fluid.backward import append_backward_ops


 class TestL2DecayRegularizer(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_reshape_op.py
+++ b/python/paddle/v2/framework/tests/test_reshape_op.py
--- a/python/paddle/v2/framework/tests/test_rmsprop_op.py
+++ b/python/paddle/v2/framework/tests/test_rmsprop_op.py
--- a/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
 import unittest

-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core


 def create_tensor(np_data, place):

--- a/python/paddle/v2/framework/tests/test_scale_op.py
+++ b/python/paddle/v2/framework/tests/test_scale_op.py
--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
-import paddle.v2.framework.core
+import paddle.v2.fluid.core
 import unittest


 class TestScope(unittest.TestCase):
    def test_create_destroy(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
        scope = paddle_c.Scope()
        self.assertIsNotNone(scope)
        scope_with_parent = scope.new_scope()
        self.assertIsNotNone(scope_with_parent)

    def test_none_variable(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
        scope = paddle_c.Scope()
        self.assertIsNone(scope.find_var("test"))

    def test_create_var_get_var(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
        scope = paddle_c.Scope()
        var_a = scope.var("var_a")
        self.assertIsNotNone(var_a)
@@ -25,7 +25,7 @@ class TestScope(unittest.TestCase):
        self.assertIsNotNone(scope2.find_var('var_a'))

    def test_var_get_int(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
        scope = paddle_c.Scope()
        var = scope.var("test_int")
        var.set_int(10)

--- a/python/paddle/v2/framework/tests/test_selected_rows.py
+++ b/python/paddle/v2/framework/tests/test_selected_rows.py
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np


--- a/python/paddle/v2/framework/tests/test_seq_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_seq_concat_op.py
--- a/python/paddle/v2/framework/tests/test_seq_conv.py
+++ b/python/paddle/v2/framework/tests/test_seq_conv.py
--- a/python/paddle/v2/framework/tests/test_seq_expand.py
+++ b/python/paddle/v2/framework/tests/test_seq_expand.py
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
--- a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
 import unittest
 import numpy as np
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 from op_test import OpTest



--- a/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
 import unittest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.executor import Executor
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.framework import g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.framework import g_main_program
 import numpy



--- a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
--- a/python/paddle/v2/framework/tests/test_sign_op.py
+++ b/python/paddle/v2/framework/tests/test_sign_op.py
--- a/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/framework/tests/test_split_and_merge_lod_tensor_op.py
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import numpy as np
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops


 class TestCPULoDTensorArrayOps(unittest.TestCase):

--- a/python/paddle/v2/framework/tests/test_split_op.py
+++ b/python/paddle/v2/framework/tests/test_split_op.py
--- a/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py
+++ b/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py
--- a/python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
--- a/python/paddle/v2/framework/tests/test_sum_op.py
+++ b/python/paddle/v2/framework/tests/test_sum_op.py
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy


--- a/python/paddle/v2/framework/tests/test_tensor_array.py
+++ b/python/paddle/v2/framework/tests/test_tensor_array.py
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np


--- a/python/paddle/v2/framework/tests/test_top_k_op.py
+++ b/python/paddle/v2/framework/tests/test_top_k_op.py
--- a/python/paddle/v2/framework/tests/test_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_transpose_op.py
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
 import unittest
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 import numpy



--- a/python/paddle/v2/framework/tests/test_variable.py
+++ b/python/paddle/v2/framework/tests/test_variable.py
 import unittest
-from paddle.v2.framework.framework import Variable, g_main_program, Program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import Variable, g_main_program, Program
+import paddle.v2.fluid.core as core
 import numpy as np



--- a/python/paddle/v2/framework/tests/test_while_op.py
+++ b/python/paddle/v2/framework/tests/test_while_op.py
 import unittest
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.executor import Executor
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
 import numpy



--- a/python/paddle/v2/framework/evaluator.py
+++ b/python/paddle/v2/framework/evaluator.py
-import paddle.v2.framework.op as op
-import numpy as np
-import paddle.v2.framework.core as core
-
-
-def avg_accumulate(accumulated_var, per_eval, num_batches, place):
-    t = np.array(accumulated_var.get_tensor())
-    t[0] += per_eval[0]
-    accumulated_var.get_tensor().set([t[0] / float(num_batches)], place)
-
-
-class Evaluator(object):
-    def __init__(self,
-                 scope,
-                 operator='accuracy',
-                 input='Inference',
-                 label='Label',
-                 output='Output',
-                 place=core.CPUPlace()):
-        """
-        create an evaluator for evaluating the inference.
-        NOTE: default run on CPUPlace(), running on GPUPlace doesn't improve performance much.
-
-        :param scope: the scope instance contains the input.
-        :type scope: paddle.v2.framework.core.scope
-        :param operator: operator name for caculating the evaluation for each mini-batch.
-        :type operator: string
-        :param input: output variable name of forward network.
-        :type input: string
-        :param label: variable name of label
-        :type label: string
-        """
-        self.scope = scope
-        self.place = place
-        self.output_name = output
-        self.num_batches = 0
-        # create variable to store accumulated evaluator output
-        eval_name = ''.join([operator, "@Eval"])
-        if scope.find_var(eval_name):
-            raise Exception("evaluator already exist in scope: %s" % eval_name)
-        self.accumulated_var = scope.var(eval_name)
-        t = self.accumulated_var.get_tensor()
-        t.set_dims((1, ))
-        t.set([0.0], place)
-        # self.accumulated_var = block.create_var(block, name=eval_name, shape=(1,))
-        # self.accumulated_var.get_tensor().set([0.0])
-        # create operator of evaluation
-        var_map = dict()  # var name -> variable
-        var_map[input] = [input]
-        var_map[label] = [label]
-        var_map[output] = [output]
-        self.op = op.Operator(operator, **var_map)
-
-    def evaluate(self, ctx, accumulator=avg_accumulate):
-        self.op.run(self.scope, ctx)
-        per_eval = np.array(self.scope.find_var(self.output_name).get_tensor())
-        self.num_batches += 1
-        accumulator(self.accumulated_var, per_eval, self.num_batches,
-                    self.place)
--- a/python/paddle/v2/framework/math_ops.py
+++ b/python/paddle/v2/framework/math_ops.py
+import paddle.v2.framework.core as core
+from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, \
+    Operator
--- a/python/paddle/v2/framework/tests/test_beam_search_decode_op.py
+++ b/python/paddle/v2/framework/tests/test_beam_search_decode_op.py
+import unittest
+
+import numpy as np
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+
+
+class TestBeamSearchDecodeOp(unittest.TestCase):
+    def setUp(self):
+        self.scope = core.Scope()
+        self.cpu_place = core.CPUPlace()
+
+    def append_lod_tensor(self, tensor_array, lod, data):
+        lod_tensor = core.LoDTensor()
+        lod_tensor.set_lod(lod)
+        lod_tensor.set(data, self.cpu_place)
+        tensor_array.append(lod_tensor)
+
+    def test_get_set(self):
+        ids = self.scope.var("ids").get_lod_tensor_array()
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="int64"))
+
+        scores = self.scope.var("scores").get_lod_tensor_array()
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="float32"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="float32"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="float32"))
+
+        sentence_ids = self.scope.var("sentence_ids").get_tensor()
+        sentence_scores = self.scope.var("sentence_scores").get_tensor()
+
+        beam_search_decode_op = Operator(
+            "beam_search_decode",
+            # inputs
+            Ids="ids",
+            Scores="scores",
+            # outputs
+            SentenceIds="sentence_ids",
+            SentenceScores="sentence_scores")
+
+        ctx = core.DeviceContext.create(self.cpu_place)
+        beam_search_decode_op.run(self.scope, ctx)
+
+        expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
+        self.assertEqual(sentence_ids.lod(), expected_lod)
+        self.assertEqual(sentence_scores.lod(), expected_lod)
+
+        expected_data = np.array(
+            [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64")
+        self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
+        self.assertTrue(
+            np.array_equal(np.array(sentence_scores), expected_data))
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_evaluator.py
+++ b/python/paddle/v2/framework/tests/test_evaluator.py
-from paddle.v2.framework.evaluator import Evaluator
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
-import unittest
-import op_test
-import numpy as np
-
-
-class TestEvaluator(unittest.TestCase):
-    def setup(self, scope, inputs, outputs):
-        def __create_var__(var_name, arr):
-            np_arr = np.array(arr)
-            scope.var(var_name)
-            # tensor = var.get_tensor()
-            # tensor.set_dims(np_arr.shape)
-
-        for var_name, arr in inputs.iteritems():
-            __create_var__(var_name, arr)
-
-        for var_name, arr in outputs.iteritems():
-            __create_var__(var_name, arr)
-
-    def test_evaluator(self):
-
-        inputs = {
-            'Inference': np.array([[1, 1, 1, 1, 1, 0, 0, 0, 0, 1]]).T,
-            'Label': np.array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])
-        }
-        outputs = {'Accuracy': np.array([0.9])}
-        out_name = 'Accuracy'
-
-        places = [core.CPUPlace()]
-        if core.is_compile_gpu():
-            places.append(core.GPUPlace(0))
-
-        for place in places:
-            scope = core.Scope()
-            self.setup(scope, inputs, outputs)
-
-            evaluator = Evaluator(
-                scope,
-                operator='accuracy',
-                input='Inference',
-                label='Label',
-                output=out_name,
-                place=place)
-            op_test.set_input(scope, evaluator.op, inputs, place)
-            ctx = core.DeviceContext.create(place)
-
-            for i in range(10):  # simulate 10 mini-batches
-                evaluator.evaluate(ctx)
-
-            actual = np.array(scope.find_var(out_name).get_tensor())
-            print actual
-
-            self.assertTrue(
-                np.allclose(
-                    actual, outputs[out_name], atol=1e-5),
-                "output name: " + out_name + " has diff.")
-
-
-if __name__ == '__main__':
-    exit(0)
-    unittest.main()
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -13,8 +13,8 @@ packages=['paddle',
          'paddle.v2.reader',
          'paddle.v2.master',
          'paddle.v2.plot',
-          'paddle.v2.framework',
-          'paddle.v2.framework.proto',
+          'paddle.v2.fluid',
+          'paddle.v2.fluid.proto',
          'py_paddle']

 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
@@ -44,14 +44,14 @@ setup(name='paddlepaddle',
      ext_modules=[Extension('_foo', ['stub.cc'])],
      package_data={
        'paddle.v2.master': ['libpaddle_master.so'],
-        'paddle.v2.framework': ['core.so'],
+        'paddle.v2.fluid': ['core.so'],
        'py_paddle':['*.py','_swig_paddle.so']
      },
      package_dir={
          '': '${CMAKE_CURRENT_SOURCE_DIR}',
-          # The paddle.v2.framework.proto will be generated while compiling.
+          # The paddle.v2.fluid.proto will be generated while compiling.
          # So that package points to other directory.
-          'paddle.v2.framework.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
+          'paddle.v2.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
      },
      scripts=paddle_bins,