Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into design_doc_edit

d7cd400a · Kavya Srinet · b341636f · c3a61349 · d7cd400a · d7cd400a
231 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -21,7 +21,7 @@ third_party/
 cmake-build-*

 # generated while compiling
-python/paddle/v2/framework/core.so
+python/paddle/v2/fluid/core.so
 paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake

--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -121,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat,

 paddle_matrix paddle_matrix_create_sparse(
    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+#ifndef PADDLE_MOBILE_INFERENCE
  auto ptr = new paddle::capi::CMatrix();
  ptr->mat = paddle::Matrix::createSparseMatrix(
      height,
@@ -131,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse(
      false,
      useGpu);
  return ptr;
+#else
+  return nullptr;
+#endif
 }

 paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
@@ -140,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                            uint64_t colSize,
                                            float* valueArray,
                                            uint64_t valueSize) {
+#ifndef PADDLE_MOBILE_INFERENCE
  if (mat == nullptr) return kPD_NULLPTR;
  auto ptr = cast(mat);
  if (rowArray == nullptr || colArray == nullptr ||
@@ -160,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
  } else {
    return kPD_NOT_SUPPORTED;
  }
+#else
+  return kPD_NOT_SUPPORTED;
+#endif
 }
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height,
 * @param isBinary is binary (either 1 or 0 in matrix) or not.
 * @param useGpu is using GPU or not.
 * @return paddle_matrix.
+ * @note Mobile inference does not support this interface.
 */
 PD_API paddle_matrix paddle_matrix_create_sparse(
    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
@@ -129,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
 * NULL if the matrix is binary.
 * @param [in] valueSize length of value array. Zero if the matrix is binary.
 * @return paddle_error
+ * @note Mobile inference does not support this interface.
 */
 PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                                   int* rowArray,

--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -27,7 +27,9 @@ if(WITH_GPU)
    set_source_files_properties(${CUDA_CXX_SOURCES}
                                PROPERTIES COMPILE_FLAGS "-D__NVCC__")
 else()
+    if (NOT MOBILE_INFERENCE)
    set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
+    endif()
 endif()

 set(CUDA_CU_SOURCES

--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "hl_base.h"

 /**
- * @brief   Maximum pool forward.
+ * @brief   Maximum pool forward with Mask output.
 *
 * @param[in]   frameCnt    batch size of input image.
 * @param[in]   inputData   input data.
@@ -35,7 +35,7 @@ limitations under the License. */
 * @param[in]   paddingW    padding width.
 * @param[out]  tgtData     output data.
 * @param[in]   tgtStride   stride between output data samples.
- *
+ * @param[out]  maskData    location indices of the selected max (may be NULL).
 */
 extern void hl_maxpool_forward(const int frameCnt,
                               const real* inputData,
@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt,
                               const int paddingH,
                               const int paddingW,
                               real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               real* maskData = NULL);

 /**
 * @brief   Maximum pool backward.

--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt,
                               const int paddingH,
                               const int paddingW,
                               real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               real* MaskData) {}

 inline void hl_maxpool_backward(const int frameCnt,
                                const real* inputData,

--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads,
                                 const int offsetH,
                                 const int offsetW,
                                 real* tgtData,
-                                 const int tgtStride) {
+                                 const int tgtStride,
+                                 real* maskData) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    int pw = index % pooledW;
@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads,
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
    real maxval = -FLT_MAX;
+    int max_index = -1;
    inputData += (frameNum * channels + c) * height * width;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
-        if (maxval < inputData[h * width + w])
-          maxval = inputData[h * width + w];
+        if (maxval < inputData[h * width + w]) {
+          max_index = h * width + w;
+          maxval = inputData[max_index];
+        }
      }
    }
    int tgtIndex =
        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
    tgtData[tgtIndex] = maxval;
+    if (maskData != NULL) {
+      maskData[tgtIndex] = max_index;
+    }
  }
 }

@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt,
                        const int paddingH,
                        const int paddingW,
                        real* tgtData,
-                        const int tgtStride) {
+                        const int tgtStride,
+                        real* maskData) {
  int num_kernels = pooledH * pooledW * channels * frameCnt;
  int blocks = (num_kernels + 1024 - 1) / 1024;
  dim3 threads(1024, 1);
@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt,
                                                         paddingH,
                                                         paddingW,
                                                         tgtData,
-                                                         tgtStride);
+                                                         tgtStride,
+                                                         maskData);
  CHECK_SYNC("hl_maxpool_forward failed");
 }


--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -377,6 +377,12 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
  return grad_op_descs;
 }

+static BlockDescBind* CreateStepBlock(
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx);
+
 std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
    ProgramDescBind& program_desc, int block_idx,
    std::unordered_set<std::string>* no_grad_vars,
@@ -392,13 +398,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(

    if ((*it)->Type() == "recurrent") {
      int step_block_idx = (*it)->GetBlockAttr("step_block");
-      auto backward_block_op_descs = MakeBlockBackward(
-          program_desc, step_block_idx, no_grad_vars, grad_to_var);
+      BlockDescBind* backward_block = CreateStepBlock(
+          program_desc, no_grad_vars, grad_to_var, step_block_idx);
+      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
+    } else if ((*it)->Type() == "conditional_block") {
      BlockDescBind* backward_block =
-          program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
-      for (auto& ptr : backward_block_op_descs) {
-        backward_block->AppendAllocatedOp(std::move(ptr));
-      }
+          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
+                          (*it)->GetBlockAttr("block"));
      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
    } else {
      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
@@ -449,6 +455,21 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
  return backward_descs;
 }

+static BlockDescBind* CreateStepBlock(  // backward block of a sub-block
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx) {
+  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
+                                                   no_grad_vars, grad_to_var);
+  BlockDescBind* backward_block =  // appended from the forward sub-block
+      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
+  for (auto& ptr : backward_block_op_descs) {
+    backward_block->AppendAllocatedOp(std::move(ptr));  // transfer ownership
+  }
+  return backward_block;  // block now holding the sub-block's gradient ops
+}
+
 ParamGradInfoMap AppendBackward(
    ProgramDescBind& program_desc, const VarDescBind& target,
    const std::unordered_set<std::string>& no_grad_vars) {

--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -27,10 +27,32 @@ inline VarDesc::VarType ToVarType(std::type_index type) {
    return VarDesc_VarType_LOD_RANK_TABLE;
  } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
    return VarDesc_VarType_LOD_TENSOR_ARRAY;
+  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
+    return VarDesc_VarType_SELECTED_ROWS;
  } else {
    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
  }
 }

+// Dispatches `visitor` on the concrete payload held by `var`: the variable's
+// runtime type is mapped to a VarDesc type tag by ToVarType, and the result
+// of the matching var.Get<T>() call is handed to the visitor.
+template <typename Visitor>
+inline void VisitVarType(const Variable& var, Visitor visitor) {
+  const auto kind = ToVarType(var.Type());
+  if (kind == VarDesc_VarType_LOD_TENSOR) {
+    visitor(var.Get<framework::LoDTensor>());
+  } else if (kind == VarDesc_VarType_LOD_RANK_TABLE) {
+    visitor(var.Get<LoDRankTable>());
+  } else if (kind == VarDesc_VarType_LOD_TENSOR_ARRAY) {
+    visitor(var.Get<LoDTensorArray>());
+  } else if (kind == VarDesc_VarType_SELECTED_ROWS) {
+    visitor(var.Get<SelectedRows>());
+  } else {
+    // Tag returned by ToVarType that this dispatcher has no branch for.
+    PADDLE_THROW("Not supported visit type, %d", kind);
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -61,6 +61,7 @@ public:
    // function arguments
    strides_ = config.get<std::vector<size_t>>("strides");
    paddings_ = config.get<std::vector<size_t>>("paddings");
+    dilations_ = config.get<std::vector<size_t>>("dilations");
    groups_ = config.get<size_t>("groups");

    // number of inputs and outputs
@@ -118,6 +119,7 @@ protected:

  std::vector<size_t> strides_;
  std::vector<size_t> paddings_;
+  std::vector<size_t> dilations_;

  /// Group size, refer to grouped convolution in
  /// Alex Krizhevsky's paper: when group=2, the first half of the
@@ -133,6 +135,10 @@ protected:

  inline int paddingW() const { return paddings_[1]; }

+  inline int dilationH() const { return dilations_[0]; }  // vertical dilation
+
+  inline int dilationW() const { return dilations_[1]; }  // horizontal dilation
+
  // A temporary memory in convolution calculation.
  MemoryHandlePtr memory_;


--- a/paddle/function/ConvOpTest.h
+++ b/paddle/function/ConvOpTest.h
@@ -79,15 +79,26 @@ void Convolution(const std::string& conv1,
            if (outputChannels < inputChannels) continue;
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
+                for (size_t dilation : {1, 3}) {
                  if (padding >= filterSize) break;
+                  size_t filterS = (filterSize - 1) * dilation + 1;
+
+                  if (inputSize + 2 * padding < filterS) break;
+
+                  if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
+                       conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
+                      dilation > 1)
+                    break;

                  // NNPACK only supports stride = 1 if batchSize > 1
-                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
+                  if ((conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
                      batchSize > 1 && stride > 1)
                    break;

                  size_t outputSize =
-                    (inputSize - filterSize + 2 * padding + stride) / stride;
+                      (inputSize - filterS + 2 * padding + stride) / stride;
                  VLOG(3) << " batchSize=" << batchSize
                          << " inputChannels=" << inputChannels
                          << " inputHeight=" << inputSize
@@ -96,17 +107,19 @@ void Convolution(const std::string& conv1,
                          << " filterHeight=" << filterSize
                          << " filterWidth=" << filterSize
                          << " outputHeight=" << outputSize
-                        << " outputWidth=" << outputSize << " stride=" << stride
-                        << " padding=" << padding;
+                          << " outputWidth=" << outputSize
+                          << " stride=" << stride << " padding=" << padding;

                  std::vector<size_t> paddings = {padding, padding};
                  std::vector<size_t> strides = {stride, stride};
+                  std::vector<size_t> dilations = {dilation, dilation};
                  Compare2Function<DType1, DType2> test(
                      conv1,
                      conv2,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
+                          .set("dilations", dilations)
                          .set("groups", (size_t)1)
                          .set("algo", (std::string) "auto"));

@@ -125,6 +138,7 @@ void Convolution(const std::string& conv1,
        }
      }
    }
+  }
 }

 /**
@@ -144,6 +158,7 @@ void Convolution2(const std::string& conv1,
              for (size_t outputChannels : {7}) {
                size_t stride = 1;
                size_t padding = 0;
+                size_t dilation = 1;
                size_t outputHeight =
                    (inputHeight - filterHeight + 2 * padding + stride) /
                    stride;
@@ -162,6 +177,7 @@ void Convolution2(const std::string& conv1,

                std::vector<size_t> paddings = {padding, padding};
                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {dilation, dilation};
                Compare2Function<DType1, DType2> test(
                    conv1,
                    conv2,
@@ -169,6 +185,7 @@ void Convolution2(const std::string& conv1,
                        .set("paddings", paddings)
                        .set("strides", strides)
                        .set("groups", (size_t)1)
+                        .set("dilations", dilations)
                        .set("algo", (std::string) "auto"));

                TensorShape input{
@@ -223,6 +240,7 @@ void DepthwiseConvolution(const std::string& conv1,

                std::vector<size_t> paddings = {padding, padding};
                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {1, 1};
                size_t groups = inputChannels;
                Compare2Function<DType1, DType2> test(
                    conv1,
@@ -231,6 +249,7 @@ void DepthwiseConvolution(const std::string& conv1,
                        .set("paddings", paddings)
                        .set("strides", strides)
                        .set("groups", groups)
+                        .set("dilations", dilations)
                        .set("algo", (std::string) "auto"));

                TensorShape input{

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -100,7 +100,9 @@ public:
                 strideH(),
                 strideW(),
                 paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
        } else {
          colData = inputData + g * inputOffset;
        }
@@ -223,7 +225,9 @@ public:
                 strideH(),
                 strideW(),
                 paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
        }
      }
      inputGrad += inputChannels * inputHeight * inputWidth;
@@ -310,7 +314,9 @@ public:
                 strideH(),
                 strideW(),
                 paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
        } else {
          colData = inputData + g * inputOffset;
        }

--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -78,7 +78,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth);
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
 };

 template <ColFormat Format, DeviceType Device, class T>
@@ -91,7 +93,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth);
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
 };

 }  // namespace paddle
--- a/paddle/function/Im2ColOp.cpp
+++ b/paddle/function/Im2ColOp.cpp
@@ -31,7 +31,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -47,8 +49,8 @@ public:
      int c_im = c / filterWidth / filterHeight;
      for (int h = 0; h < outputHeight; ++h) {
        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
          if ((imRowIdx - paddingHeight) < 0 ||
              (imRowIdx - paddingHeight) >= inputHeight ||
              (imColIdx - paddingWidth) < 0 ||
@@ -81,7 +83,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -97,8 +101,8 @@ public:
      int c_im = c / filterWidth / filterHeight;
      for (int h = 0; h < outputHeight; ++h) {
        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
          if ((imRowIdx - paddingHeight) >= 0 &&
              (imRowIdx - paddingHeight) < inputHeight &&
              (imColIdx - paddingWidth) >= 0 &&
@@ -134,7 +138,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -147,9 +153,10 @@ public:
        for (int channel = 0; channel < inputChannels; ++channel) {
          for (int filterH = 0; filterH < filterHeight; ++filterH) {
            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
              int colDataOffset =
                  (((outputH * outputWidth + outputW) * inputChannels +
                    channel) *
@@ -189,7 +196,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -202,9 +211,10 @@ public:
        for (int channel = 0; channel < inputChannels; ++channel) {
          for (int filterH = 0; filterH < filterHeight; ++filterH) {
            for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
              int colDataOffset =
                  (((outputH * outputWidth + outputW) * inputChannels +
                    channel) *

--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -28,6 +28,8 @@ __global__ void im2col(const T* data_im,
                       int strideW,
                       int paddingH,
                       int paddingW,
+                       int dilationH,
+                       int dilationW,
                       int height_col,
                       int width_col,
                       T* data_col) {
@@ -44,8 +46,8 @@ __global__ void im2col(const T* data_im,
    data_col += (channel_out * height_col + h_out) * width_col + w_out;
    for (int i = 0; i < blockH; ++i) {
      for (int j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in + i);
-        int cIdx = int(w_in + j);
+        int rIdx = int(h_in + i * dilationH);
+        int cIdx = int(w_in + j * dilationW);
        if ((rIdx - (int)paddingH) >= (int)height ||
            (rIdx - (int)paddingH) < 0 ||
            (cIdx - (int)paddingW) >= (int)width ||
@@ -77,7 +79,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -102,6 +106,8 @@ public:
                                                    strideWidth,
                                                    paddingHeight,
                                                    paddingWidth,
+                                                    dilationHeight,
+                                                    dilationWidth,
                                                    outputHeight,
                                                    outputWidth,
                                                    colData);
@@ -121,6 +127,8 @@ __global__ void col2im(size_t n,
                       size_t strideW,
                       size_t paddingH,
                       size_t paddingW,
+                       size_t dilationH,
+                       size_t dilationW,
                       size_t height_col,
                       size_t width_col,
                       T* data_im) {
@@ -131,23 +139,34 @@ __global__ void col2im(size_t n,
    int w = int(index % width);
    int h = int((index / width) % height);
    int c = int(index / (width * height));
+    int filterH = (blockH - 1) * dilationH + 1;
+    int filterW = (blockW - 1) * dilationW + 1;
+
    if ((w - (int)paddingW) >= 0 &&
        (w - (int)paddingW) < (width - 2 * paddingW) &&
        (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
      // compute the start and end of the output
      int w_col_start =
-          (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
+          (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
      int h_col_start =
-          (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+          (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
      int h_col_end = min(int(h / strideH + 1), int(height_col));
+
      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
          // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH * blockW) +
-                      (h - h_col * (int)strideH) * (int)blockW +
-                      (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+          int h_k = (h - h_col * strideH);
+          int w_k = (w - w_col * strideW);
+          if (h_k % dilationH == 0 && w_k % dilationW == 0) {
+            h_k /= dilationH;
+            w_k /= dilationW;
+            int c_col =
+                (((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
+                    width_col +
+                w_col;
+            val += data_col[c_col];
+          }
        }
      }
      h -= paddingH;
@@ -173,7 +192,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -205,6 +226,8 @@ public:
        strideWidth,
        paddingHeight,
        paddingWidth,
+        dilationHeight,
+        dilationWidth,
        outputHeight,
        outputWidth,
        imData);
@@ -229,6 +252,8 @@ __global__ void im2colOCF(const T* imData,
                          int strideWidth,
                          int paddingHeight,
                          int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
                          int outputHeight,
                          int outputWidth) {
  int swId = blockIdx.x;
@@ -237,8 +262,10 @@ __global__ void im2colOCF(const T* imData,
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset = idx + swId * strideWidth - paddingWidth;
-        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int widthOffset =
+            idx * dilationWidth + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationHeight + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;

@@ -273,7 +300,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -312,6 +341,8 @@ public:
                                                       strideWidth,
                                                       paddingHeight,
                                                       paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
                                                       outputHeight,
                                                       outputWidth);
    CHECK_SYNC("Im2ColFunctor GPU failed");
@@ -330,6 +361,8 @@ __global__ void col2imOCF(T* imData,
                          int strideWidth,
                          int paddingHeight,
                          int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
                          int outputHeight,
                          int outputWidth) {
  int swId = blockIdx.x;
@@ -338,8 +371,10 @@ __global__ void col2imOCF(T* imData,
       channelId += blockDim.z) {
    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset = idx + swId * strideWidth - paddingWidth;
-        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int widthOffset =
+            idx * dilationWidth + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationHeight + shId * strideHeight - paddingHeight;
        int imOffset = widthOffset + heightOffset * inputWidth +
                       channelId * inputHeight * inputWidth;

@@ -372,7 +407,9 @@ public:
                  int strideHeight,
                  int strideWidth,
                  int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
    int inputChannels = imShape[0];
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
@@ -411,6 +448,8 @@ public:
                                                       strideWidth,
                                                       paddingHeight,
                                                       paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
                                                       outputHeight,
                                                       outputWidth);
    CHECK_SYNC("Col2ImFunctor GPU failed");

--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -29,14 +29,17 @@ void TestIm2ColFunctor() {
          for (size_t filterWidth : {3, 7}) {
            for (size_t stride : {1, 2}) {
              for (size_t padding : {0, 1}) {
-                if (inputHeight <= filterHeight || inputWidth <= filterWidth)
+                for (size_t dilation : {1, 3}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
                    break;
-                if (padding >= filterHeight || padding >= filterWidth) break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
                  size_t outputHeight =
-                    (inputHeight - filterHeight + 2 * padding + stride) /
-                    stride;
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
                  size_t outputWidth =
-                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;

                  TensorShape imShape =
                      TensorShape({channels, inputHeight, inputWidth});
@@ -53,10 +56,14 @@ void TestIm2ColFunctor() {

                  size_t height = channels * filterHeight * filterWidth;
                  size_t width = outputHeight * outputWidth;
-                VectorPtr input1 = Vector::create(imShape.getElements(), false);
-                VectorPtr input2 = Vector::create(imShape.getElements(), false);
-                MatrixPtr output1 = Matrix::create(height, width, false, false);
-                MatrixPtr output2 = Matrix::create(width, height, false, false);
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(width, height, false, false);
                  input1->uniform(0.001, 1);
                  input2->copyFrom(*input1);

@@ -69,7 +76,9 @@ void TestIm2ColFunctor() {
                          stride,
                          stride,
                          padding,
-                        padding);
+                          padding,
+                          dilation,
+                          dilation);
                  im2Col2(input2->getData(),
                          imShape,
                          output2->getData(),
@@ -77,7 +86,9 @@ void TestIm2ColFunctor() {
                          stride,
                          stride,
                          padding,
-                        padding);
+                          padding,
+                          dilation,
+                          dilation);

                  // The transposition of the result of ColFormat == kCFO
                  // is equal to the result of ColFormat == kOCF.
@@ -87,6 +98,7 @@ void TestIm2ColFunctor() {

                  Col2ImFunctor<kCFO, Device, T> col2Im1;
                  Col2ImFunctor<kOCF, Device, T> col2Im2;
+
                  col2Im1(input1->getData(),
                          imShape,
                          output1->getData(),
@@ -94,7 +106,9 @@ void TestIm2ColFunctor() {
                          stride,
                          stride,
                          padding,
-                        padding);
+                          padding,
+                          dilation,
+                          dilation);
                  col2Im2(input2->getData(),
                          imShape,
                          output2->getData(),
@@ -102,8 +116,9 @@ void TestIm2ColFunctor() {
                          stride,
                          stride,
                          padding,
-                        padding);
-
+                          padding,
+                          dilation,
+                          dilation);
                  autotest::TensorCheckErr(*input1, *input2);
                }
              }
@@ -112,6 +127,7 @@ void TestIm2ColFunctor() {
        }
      }
    }
+  }
 }

 TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }

--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -85,9 +85,49 @@ if(MOBILE_INFERENCE)
         gradientmachines/GradientMachineMode.cpp
         gradientmachines/MultiGradientMachine.cpp)

-    # Remove useless layers
+    # Remove layers that are used in training
    list(REMOVE_ITEM GSERVER_SOURCES
-    	 layers/RecurrentLayerGroup.cpp)
+    	 layers/RecurrentLayerGroup.cpp
+         layers/CostLayer.cpp
+         layers/MultiBoxLossLayer.cpp
+         layers/WarpCTCLayer.cpp
+         layers/CTCLayer.cpp
+         layers/LinearChainCTC.cpp
+         layers/PrintLayer.cpp)
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/OuterProdLayer.cpp
+         layers/SumToOneNormLayer.cpp
+         layers/ConvShiftLayer.cpp
+         layers/InterpolationLayer.cpp
+         layers/AgentLayer.cpp
+         layers/DotMulOperator.cpp
+         layers/GruStepLayer.cpp
+         layers/LstmStepLayer.cpp
+         layers/ConvexCombinationLayer.cpp
+         layers/Conv3DLayer.cpp
+         layers/DeConv3DLayer.cpp
+         layers/CropLayer.cpp
+         layers/CrossEntropyOverBeam.cpp
+         layers/DataNormLayer.cpp
+         layers/FeatureMapExpandLayer.cpp
+         layers/HierarchicalSigmoidLayer.cpp
+         layers/MultinomialSampler.cpp
+         layers/NCELayer.cpp
+         layers/KmaxSeqScoreLayer.cpp
+         layers/MDLstmLayer.cpp
+         layers/MultiplexLayer.cpp
+         layers/PadLayer.cpp
+         layers/Pool3DLayer.cpp
+         layers/ResizeLayer.cpp
+         layers/RotateLayer.cpp
+         layers/RowConvLayer.cpp
+         layers/RowL2NormLayer.cpp
+         layers/SamplingIdLayer.cpp
+         layers/ScaleShiftLayer.cpp
+         layers/SelectiveFullyConnectedLayer.cpp
+         layers/SpatialPyramidPoolLayer.cpp
+         layers/BilinearInterpLayer.cpp
+         layers/ClipLayer.cpp)
 endif()

 if(WITH_GPU)

--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -16,7 +16,6 @@ limitations under the License. */

 #include "NeuralNetwork.h"
 #include "hl_gpu.h"
-#include "paddle/gserver/layers/AgentLayer.h"
 #include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -28,6 +27,7 @@ limitations under the License. */
 #ifndef PADDLE_MOBILE_INFERENCE
 #include "MultiNetwork.h"
 #include "RecurrentGradientMachine.h"
+#include "paddle/gserver/layers/AgentLayer.h"
 #endif

 namespace paddle {
@@ -192,9 +192,11 @@ void NeuralNetwork::init(const ModelConfig& config,
 void NeuralNetwork::connect(LayerPtr agentLayer,
                             LayerPtr realLayer,
                             int height) {
+#ifndef PADDLE_MOBILE_INFERENCE  // AgentLayer.h is only included in this case
   AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
   CHECK_NOTNULL(agent);
   agent->setRealLayer(realLayer, height);
+#endif  // PADDLE_MOBILE_INFERENCE (otherwise this function body is empty)
 }

 void NeuralNetwork::connect(std::string agentLayerName,

--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -79,6 +79,10 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
  for (int i = 0; i < config_.inputs_size(); i++) {
    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
+    std::vector<size_t> dilations = {(size_t)dilationY_[i],
+                                     (size_t)dilation_[i]};
+
+    bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);

    // Convolution Layer uses the GemmConv function by default.
    convType = "GemmConv";
@@ -97,13 +101,14 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
      if ((filterSize_[i] == filterSizeY_[i]) &&
          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
-          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
+          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
+          !useDilation) {
        convType = "NeonDepthwiseConv";
      }
 #endif
    }

-    if (FLAGS_use_nnpack && !isDeconv_) {
+    if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
      createFunction(forward_,
                     "NNPACKConv",
                     FuncConfig()
@@ -117,6 +122,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
+                         .set("dilations", dilations)
                         .set("groups", (size_t)groups_[i]));

      createFunction(backward_,
@@ -124,6 +130,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
+                         .set("dilations", dilations)
                         .set("groups", (size_t)groups_[i]));

      createFunction(backward_,
@@ -131,6 +138,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
+                         .set("dilations", dilations)
                         .set("groups", (size_t)groups_[i]));
    }
  }

--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -98,6 +98,7 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
 LayerPtr Layer::create(const LayerConfig& config) {
  std::string type = config.type();

+#ifndef PADDLE_MOBILE_INFERENCE
  // NOTE: As following types have illegal character '-',
  // they can not use REGISTER_LAYER to registrar.
  // Besides, to fit with old training models,
@@ -106,7 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) {
    return LayerPtr(new MultiClassCrossEntropy(config));
  else if (type == "rank-cost")
    return LayerPtr(new RankingCost(config));
-#ifndef PADDLE_MOBILE_INFERENCE
  else if (type == "auc-validation")
    return LayerPtr(new AucValidation(config));
  else if (type == "pnpair-validation")

--- a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MaxPoolWithMaskLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * Initialize the layer: run the PoolLayer initialization and register
+ * mask_ as an additional output named "mask".
+ *
+ * @param layerMap     forwarded unchanged to PoolLayer::init.
+ * @param parameterMap forwarded unchanged to PoolLayer::init.
+ * @return false if PoolLayer::init reports failure, true otherwise.
+ */
+bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  // Do not register the extra output when the parent failed to initialize.
+  if (!PoolLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  setOutput("mask", &mask_);
+  return true;
+}
+
+/**
+ * Recompute the pooled frame dimensions and return the per-sample output
+ * size, outputX_ * outputY_ * channels_.
+ *
+ * Side effects: updates outputY_ / outputX_ and stores them as the frame
+ * height / width of the layer output. Exactly one input layer is required.
+ */
+size_t MaxPoolWithMaskLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+
+  // Both dimensions are derived with caffeMode disabled.
+  outputY_ = outputSize(
+      imgSizeY_, sizeY_, confPaddingY_, strideY_, /* caffeMode */ false);
+  outputX_ = outputSize(
+      imgSize_, sizeX_, confPadding_, stride_, /* caffeMode */ false);
+
+  size_t layerSize = outputX_ * outputY_ * channels_;
+
+  Argument& out = getOutput();
+  out.setFrameHeight(outputY_);
+  out.setFrameWidth(outputX_);
+
+  return layerSize;
+}
+
+/**
+ * Forward pass: max-pools the output value of the single input layer into
+ * this layer's output and hands mask_.value to Matrix::maxPoolForward as
+ * its trailing mask argument.
+ *
+ * passType is not used by this implementation.
+ */
+void MaxPoolWithMaskLayer::forward(PassType passType) {
+  size_t outWidth = getSize();
+  MatrixPtr inValue = inputLayers_[0]->getOutputValue();
+  int numRows = inValue->getHeight();
+  resetOutput(numRows, outWidth);
+
+  MatrixPtr outValue = getOutputValue();
+  CHECK_EQ(outWidth, outValue->getWidth());
+
+  // mask_ is reset with the same row count and width as the pooled output.
+  resetSpecifyOutput(mask_,
+                     numRows,
+                     outWidth,
+                     /* isValueClean */ false,
+                     /* isGradClean */ true);
+
+  // Window sizes are passed X first; image size, stride, output size and
+  // padding are passed with the Y (height) value first.
+  outValue->maxPoolForward(*inValue,
+                           imgSizeY_,
+                           imgSize_,
+                           channels_,
+                           sizeX_,
+                           sizeY_,
+                           strideY_,
+                           stride_,
+                           outputY_,
+                           outputX_,
+                           confPaddingY_,
+                           confPadding_,
+                           mask_.value);
+}
+
+/**
+ * Backward pass: accumulates the max-pooling gradient into the gradient of
+ * the single input layer via Matrix::maxPoolBackward.
+ *
+ * Returns immediately when the input has no gradient. The callback is
+ * intentionally unused.
+ */
+void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  // Nothing to do when no gradient is requested for the input.
+  if (!getInputGrad(0)) {
+    return;
+  }
+
+  MatrixPtr gradOut = getOutputGrad();
+  MatrixPtr valueIn = inputLayers_[0]->getOutputValue();
+  MatrixPtr valueOut = getOutputValue();
+  MatrixPtr gradIn = inputLayers_[0]->getOutputGrad();
+
+  gradIn->maxPoolBackward(*valueIn,
+                          imgSizeY_,
+                          imgSize_,
+                          *gradOut,
+                          *valueOut,
+                          sizeX_,
+                          sizeY_,
+                          strideY_,
+                          stride_,
+                          outputY_,
+                          outputX_,
+                          // NOTE(review): presumably the two scale factors
+                          // of Matrix::maxPoolBackward -- confirm.
+                          1,
+                          1,
+                          confPaddingY_,
+                          confPadding_);
+}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/MaxPoolWithMaskLayer.h
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "PoolLayer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief Max pooling layer that additionally exposes an output named "mask"
+ * (registered in init() and filled by forward() through the maskMatP
+ * argument of Matrix::maxPoolForward).
+ */
+class MaxPoolWithMaskLayer : public PoolLayer {
+protected:
+  Argument mask_;  // backing storage for the "mask" output
+
+public:
+  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
+      : PoolLayer(config) {}
+
+  size_t getSize();
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "PoolLayer.h"
+#include "MaxPoolWithMaskLayer.h"
 #include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
 #ifdef PADDLE_WITH_CUDA
@@ -44,7 +45,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-
  return true;
 }

@@ -57,6 +57,8 @@ Layer* PoolLayer::create(const LayerConfig& config) {
  } else if (CudnnPoolLayer::typeCheck(pool)) {
    return new CudnnPoolLayer(config);
 #endif
+  } else if (pool == "max-pool-with-mask") {
+    return new MaxPoolWithMaskLayer(config);
  } else {
    LOG(FATAL) << "Unknown pool type: " << pool;
    return nullptr;

--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
 # gserver pacakge unittests

 add_simple_unittest(test_LinearChainCRF)
-add_simple_unittest(test_MultinomialSampler)
 add_simple_unittest(test_RecurrentLayer)

+if(NOT MOBILE_INFERENCE)
+  add_simple_unittest(test_MultinomialSampler)
+endif()
+
 function(gserver_test TARGET)
  add_unittest_without_exec(${TARGET}
      ${TARGET}.cpp
@@ -24,6 +27,7 @@ gserver_test(test_ConvUnify)
 gserver_test(test_BatchNorm)
 gserver_test(test_KmaxSeqScore)
 gserver_test(test_Expand)
+gserver_test(test_MaxPoolingWithMaskOutput)

 ########## test_Mkldnn layers and activations ##########
 if(WITH_MKLDNN)
@@ -48,7 +52,7 @@ if(WITH_PYTHON)
 endif()

 ############### test_WarpCTCLayer #######################
-if(NOT WITH_DOUBLE)
+if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
    add_unittest_without_exec(test_WarpCTCLayer
        test_WarpCTCLayer.cpp)


--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -434,7 +434,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
  config.layerConfig.set_partial_sum(1);
  config.layerConfig.set_shared_biases(true);

-  int dilation = 1;
+  int dilation = 2;
  if (type == "cudnn_conv") {
 #if CUDNN_VERSION >= 6000
    dilation = 2;
@@ -1234,6 +1234,7 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
 TEST(Layer, PoolLayer) {
  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);

 #ifdef PADDLE_WITH_CUDA
  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
@@ -1242,6 +1243,7 @@ TEST(Layer, PoolLayer) {
  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
 #endif
 }


--- a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
+++ b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;
+
+// Fill `config` and `pool` for a single-channel pooling layer of type
+// `poolType` with a 3x3 window, stride 2 and zero padding.
+// pool->img_size() / pool->img_size_y() must already be set: they are read
+// here to derive output_x / output_y.
+void setPoolConfig(TestConfig* config,
+                   PoolConfig* pool,
+                   const string& poolType) {
+  config->biasSize = 0;
+  config->layerConfig.set_type("pool");
+  config->layerConfig.set_num_filters(1);
+
+  int kernelW = 3, kernelH = 3;
+  int padW = 0, padH = 0;
+  int strideW = 2, strideH = 2;
+
+  pool->set_pool_type(poolType);
+  pool->set_channels(1);
+  pool->set_size_x(kernelW);
+  pool->set_size_y(kernelH);
+  pool->set_start(0);
+  pool->set_padding(padW);
+  pool->set_padding_y(padH);
+  pool->set_stride(strideW);
+  pool->set_stride_y(strideH);
+
+  // Output extents are computed with caffeMode disabled.
+  int outW = outputSize(
+      pool->img_size(), kernelW, padW, strideW, /* caffeMode */ false);
+  int outH = outputSize(
+      pool->img_size_y(), kernelH, padH, strideH, /* caffeMode */ false);
+  pool->set_output_x(outW);
+  pool->set_output_y(outH);
+}
+
+// Build a pooling layer of type `poolType` over a 5x5 input (25 values),
+// feed it `inputMat`, run one forward pass and check that the layer's
+// "mask" output equals `maskMat`.
+void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
+                                       const string& poolType,
+                                       bool use_gpu,
+                                       MatrixPtr& maskMat) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
+  LayerInputConfig* inputConf = config.layerConfig.add_inputs();
+  PoolConfig* poolConf = inputConf->mutable_pool_conf();
+
+  // The image size has to be set before setPoolConfig derives output sizes.
+  poolConf->set_img_size(5);
+  poolConf->set_img_size_y(5);
+  setPoolConfig(&config, poolConf, poolType);
+  config.layerConfig.set_size(poolConf->output_x() * poolConf->output_y() *
+                              poolConf->channels());
+
+  config.layerConfig.set_name("MaxPoolWithMask");
+
+  std::vector<DataLayerPtr> inputLayers;
+  LayerMap layers;
+  vector<Argument> inputArgs;
+
+  initDataLayer(config,
+                &inputLayers,
+                &inputArgs,
+                &layers,
+                "MaxPoolWithMask",
+                1,
+                false,
+                use_gpu);
+
+  inputLayers[0]->getOutputValue()->copyFrom(*inputMat);
+
+  FLAGS_use_gpu = use_gpu;
+  std::vector<ParameterPtr> params;
+  LayerPtr poolLayer;
+  initTestLayer(config, &layers, &params, &poolLayer);
+  poolLayer->forward(PASS_GC);
+
+  checkMatrixEqual(poolLayer->getOutput("mask").value, maskMat);
+}
+
+// Forward-only check on a 5x5 single-channel input: the "mask" output of a
+// "max-pool-with-mask" layer must match the expected values {12, 4, 22, 24}.
+// The same data is re-checked on the GPU when PADDLE_WITH_CUDA is defined.
+TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
+  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
+                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
+                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
+  real maskData[] = {12, 4, 22, 24};
+
+  // CPU run: the matrices wrap the local arrays directly.
+  bool useGpu = false;
+  MatrixPtr inputMat = Matrix::create(1, 25, false, useGpu);
+  MatrixPtr maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->setData(inputData);
+  maskMat->setData(maskData);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#ifdef PADDLE_WITH_CUDA
+  // GPU run: the host arrays are copied into freshly created matrices.
+  useGpu = true;
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->copyFrom(inputData, 25);
+  maskMat->copyFrom(maskData, 4);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#endif
+}
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1902,5 +1902,52 @@ void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
 }

 template class BaseMatrixT<real>;
+
+#ifndef PADDLE_MOBILE_INFERENCE  // instantiate the whole int class
+
 template class BaseMatrixT<int>;
+
+#else  // mobile inference: specialize only the members listed below
+
+template <>
+void BaseMatrixT<int>::zero() {
+  applyUnary(unary::Zero<int>());
+}
+
+template <>
+void BaseMatrixT<int>::assign(int p) {
+  applyUnary(unary::Assign<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
+  applyBinary(binary::IsEqual<int>(value), b);
+}
+
+template <>
+void BaseMatrixT<int>::neg() {
+  applyUnary(unary::Neg<int>());
+}
+
+template <>
+void BaseMatrixT<int>::abs2() {
+  applyUnary(unary::Abs<int>());
+}
+
+template <>
+void BaseMatrixT<int>::add(int p) {
+  applyUnary(unary::Add<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::add(int p1, int p2) {
+  applyUnary(unary::Add2<int>(p1, p2));
+}
+
+template <>
+void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
+  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
+}
+
+#endif  // PADDLE_MOBILE_INFERENCE
 }  // namespace paddle
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -25,6 +25,19 @@ else()
    message(STATUS "Compile with MKLDNNMatrix")
 endif()

+if(MOBILE_INFERENCE)
+    list(REMOVE_ITEM MATH_SOURCES
+         ${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
+    # Remove sparse
+    list(REMOVE_ITEM MATH_HEADERS
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
+    list(REMOVE_ITEM MATH_SOURCES
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
+endif()
 set(MATH_SOURCES
    "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
    "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"

--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "Matrix.h"

@@ -309,3 +312,57 @@ private:
  using Matrix::subMatrix;
 };
 }  // namespace paddle
+
+#else
+
+#include "Matrix.h"
+
+namespace paddle {
+
+class CpuSparseMatrix : public Matrix {  // stub for PADDLE_MOBILE_INFERENCE
+public:
+  CpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}  // no data is attached
+
+  CpuSparseMatrix(real* data,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, false) {}  // data/rows/cols unused
+
+  real* getValue() const { return nullptr; }  // stub: always nullptr
+  size_t getColStartIdx(size_t i) const { return 0; }  // stub: always 0
+  size_t getRowStartIdx(size_t i) const { return 0; }  // stub: always 0
+  size_t getColNum(size_t i) const { return 0; }  // stub: always 0
+  int* getRowCols(size_t i) const { return nullptr; }  // stub: always nullptr
+
+  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
+    return nullptr;
+  }
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}  // stub: does nothing
+  void resize(size_t newHeight, size_t newWidth) {}  // stub: does nothing
+  MatrixPtr getTranspose() { return nullptr; }  // stub: always nullptr
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}  // stub: does nothing
+};
+
+}  // namespace paddle
+
+#endif
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -451,6 +451,7 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) {
 }

 void GpuMatrix::collectBias(Matrix& a, real scale) {
+#ifdef PADDLE_WITH_CUDA
  CHECK_EQ(getHeight(), (size_t)1);
  CHECK_EQ(width_, a.getWidth());
  GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
@@ -461,6 +462,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
    hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
    hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
  }
+#endif
 }

 void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
@@ -552,6 +554,7 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                    const GpuMatrix& b,
                    real scaleAB,
                    real scaleT) {
+#ifdef PADDLE_WITH_CUDA
  CHECK(isContiguous());
  CHECK(b.isContiguous());
  CHECK(b.useGpu_ == true) << "Matrix type are not equal";
@@ -578,12 +581,14 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                          b.height_,
                          scaleAB,
                          scaleT);
+#endif
 }

 void GpuMatrix::mul(const GpuMatrix& a,
                    const GpuSparseMatrix& b,
                    real scaleAB,
                    real scaleT) {
+#ifdef PADDLE_WITH_CUDA
  CHECK(isContiguous());
  CHECK(a.isContiguous());
  CHECK(a.useGpu_ == true) << "Matrix type are not equal";
@@ -622,6 +627,7 @@ void GpuMatrix::mul(const GpuMatrix& a,
                            scaleAB,
                            scaleT);
  }
+#endif
 }

 /* this = a*b */
@@ -1028,15 +1034,23 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";

  real* inputData = inputMat.getData();
+  real* maskData = NULL;
  size_t frameNum = inputMat.getHeight();
  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
  CHECK(height_ == inputMat.getHeight());
  CHECK(width_ == outputH * outputW * channels);

+  if (maskMatP != NULL) {
+    CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
+    CHECK(outputH * outputW * channels == maskMatP->getWidth());
+    maskData = maskMatP->getData();
+  }
+
  hl_maxpool_forward(frameNum,
                     inputData,
                     channels,
@@ -1051,7 +1065,8 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                     paddingH,
                     paddingW,
                     data_,
-                     getStride());
+                     getStride(),
+                     maskData);
 }

 void GpuMatrix::maxPoolBackward(Matrix& inputMat,
@@ -1548,6 +1563,7 @@ void GpuMatrix::bilinearBackward(const Matrix& out,
 }

 void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);

@@ -1563,9 +1579,11 @@ void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
  hl_matrix_multi_binary_cross_entropy(
      output_d, entropy_d, mat_d, height_, outputPtr->width_);
+#endif
 }

 void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
  GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
  auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);

@@ -1581,6 +1599,7 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
  hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
  hl_matrix_multi_binary_cross_entropy_bp(
      output_d, grad_d, mat_d, height_, width_);
+#endif
 }

 void GpuMatrix::vol2Col(real* dataSrc,
@@ -1973,9 +1992,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
  real* inputData = inputMat.getData();
  real* outData = data_;
+  real* maskData = NULL;
  size_t num = inputMat.getHeight();
  size_t inLength = imgSizeH * imgSizeW;
  size_t outLength = outputH * outputW;
@@ -1984,6 +2005,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
  CHECK_EQ(channels * outLength, this->getWidth());
  size_t outStride = getStride();

+  if (maskMatP != NULL) {
+    maskData = maskMatP->getData();
+    CHECK_EQ(channels * outLength, maskMatP->getWidth());
+  }
+
  /* initialize the data_ */
  for (size_t i = 0; i < height_; i++) {
    for (size_t j = 0; j < width_; j++) {
@@ -2005,17 +2031,30 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
          int wstart = pw * strideW - paddingW;
          int wend = std::min(wstart + sizeX, imgSizeW);
          wstart = std::max(wstart, 0);
+          if (maskData == NULL) {
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
                outData[ph * outputW + pw] = std::max(
                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
              }
            }
+          } else {
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) {
+                  outData[ph * outputW + pw] = inputData[h * imgSizeW + w];
+                  maskData[ph * outputW + pw] = h * imgSizeW + w;
+                }
+              }
+            }
+          }
        }
      }
      // compute offset
      inputData += inLength;
      outData += outLength;
+
+      if (maskData != NULL) maskData += outLength;
    }
  }
 }
@@ -3226,6 +3265,7 @@ template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
                                                           real scaleAB,
                                                           real scaleT);

+#ifndef PADDLE_MOBILE_INFERENCE
 void SharedCpuMatrix::mul(CpuSparseMatrix* a,
                          CpuMatrix* b,
                          real scaleAB,
@@ -3354,6 +3394,7 @@ void SharedCpuMatrix::initBlock(int blockNum) {
  }
 }

+#endif
 /* Add a (column) vector b to matrix a, column by column */
 void CpuMatrix::addColumnVector(const Matrix& b) {
  BaseMatrix::addColVector(const_cast<Matrix&>(b));

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -861,7 +861,8 @@ public:

  /**
   * Pooling forward operation, pick out the largest element
-   * in the sizeX of value
+   * in the sizeX of value. If maskMatP is not NULL, it will
+   * also calculate the location indices.
   */
  virtual void maxPoolForward(Matrix& inputMat,
                              size_t imgSizeH,
@@ -874,7 +875,8 @@ public:
                              size_t outputH,
                              size_t outputW,
                              size_t paddingH,
-                              size_t paddingW) {
+                              size_t paddingW,
+                              MatrixPtr maskMatP = NULL) {
    LOG(FATAL) << "Not implemeted";
  }

@@ -1426,7 +1428,8 @@ public:
                      size_t outputH,
                      size_t outputW,
                      size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);

  void maxPoolBackward(Matrix& image,
                       size_t imgSizeH,
@@ -1697,7 +1700,8 @@ public:
                      size_t outputH,
                      size_t outputW,
                      size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);

  void maxPoolBackward(Matrix& image,
                       size_t imgSizeH,
@@ -2066,6 +2070,7 @@ public:

 class SharedCpuMatrix : public CpuMatrix {
 public:
+#ifndef PADDLE_MOBILE_INFERENCE
  /* blockNum is number of partitions of the matrix  */
  SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
      : CpuMatrix(height, width, trans) {
@@ -2111,6 +2116,7 @@ private:
  ThreadLocal<CpuMatrixPtr> localBuf_;
  ThreadLocal<std::vector<int>> localBufRows_;
  ThreadLocal<std::vector<int>> blockSeq_;
+#endif
 };

 typedef struct { unsigned int col; } sparse_non_value_t;

--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "CpuSparseMatrix.h"
 #include "Matrix.h"
@@ -237,3 +240,47 @@ private:
 };

 }  // namespace paddle
+
+#else
+
+#include "CpuSparseMatrix.h"
+
+namespace paddle {
+
+class GpuSparseMatrix : public Matrix {  // stub used when PADDLE_MOBILE_INFERENCE is defined
+public:
+  GpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format_ = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}  // nnz, valueType, format_ are ignored
+
+  GpuSparseMatrix(real* value,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, true) {}  // NOTE(review): last base argument is true here, false above -- confirm intended
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}  // no-op in the stub
+  void resize(size_t newHeight, size_t newWidth) {}  // no-op in the stub
+  MatrixPtr getTranspose() { return nullptr; }  // stub: always a null pointer
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}  // no-op in the stub
+};
+
+}  // namespace paddle
+
+#endif
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -14,6 +14,8 @@ limitations under the License. */

 #pragma once

+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <gflags/gflags.h>
 #include <string.h>
 #include <algorithm>
@@ -313,3 +315,27 @@ private:
 };

 }  // namespace paddle
+
+#else
+namespace paddle {
+
+class SparseRowCpuMatrix : public CpuMatrix {  // stub used when PADDLE_MOBILE_INFERENCE is defined
+public:
+  void reserveStore() {}  // no-op in the stub
+  void clearIndices() {}  // no-op in the stub
+};
+
+class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {  // stub; all methods are empty
+public:
+  void setupIndices() {}  // no-op in the stub
+  void addRows(MatrixPtr input) {}  // no-op; input is ignored
+  void addRows(IVectorPtr ids) {}  // no-op; ids is ignored
+};
+
+class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};  // empty stub type
+class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};  // empty stub type
+class SparseRowIdsCpuMatrix : public CpuMatrix {};  // empty stub type
+
+}  // namespace paddle
+
+#endif
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -3,8 +3,10 @@
 add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
-add_simple_unittest(test_SparseMatrix)
 add_simple_unittest(test_RowBuffer)
+if(NOT MOBILE_INFERENCE)
+    add_simple_unittest(test_SparseMatrix)
+endif()

 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare

--- a/paddle/operators/assign_op.cc
+++ b/paddle/operators/assign_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/var_type.h"
+
+namespace paddle {
+namespace operators {
+// Visitor handed to framework::VisitVarType: copies the visited variable
+// into the output variable given to the constructor.
+class AssignFunctor {
+ public:
+  AssignFunctor(framework::Variable *out,
+                const platform::DeviceContext &dev_ctx)
+      : out_(out), dev_ctx_(dev_ctx) {}
+
+  // LoDTensor: copy the data and the LoD into the output tensor.
+  void operator()(const framework::LoDTensor &lod_tensor) const {
+    copy_tensor(lod_tensor, out_->GetMutable<framework::LoDTensor>());
+  }
+
+  // LoDTensorArray: size the output array to match the input, then copy
+  // the elements one by one.
+  void operator()(const framework::LoDTensorArray &array) const {
+    auto *dst_array = out_->GetMutable<framework::LoDTensorArray>();
+    dst_array->resize(array.size());
+    for (size_t idx = 0; idx < array.size(); ++idx) {
+      copy_tensor(array[idx], &(*dst_array)[idx]);
+    }
+  }
+
+  // SelectedRows: copy the row list, the height and the value tensor.
+  void operator()(const framework::SelectedRows &rows) const {
+    auto *dst_rows = out_->GetMutable<framework::SelectedRows>();
+    dst_rows->set_rows(rows.rows());
+    dst_rows->set_height(rows.height());
+    const auto &src_value = rows.value();
+    dst_rows->mutable_value()->CopyFrom(src_value, src_value.place(),
+                                        dev_ctx_);
+  }
+
+  // Any other variable type is rejected.
+  template <typename T>
+  void operator()(const T &v) const {
+    PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
+  }
+
+ private:
+  // Copies tensor data (keeping the source place) and then the LoD.
+  void copy_tensor(const framework::LoDTensor &lod_tensor,
+                   framework::LoDTensor *out) const {
+    out->CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_);
+    out->set_lod(lod_tensor.lod());
+  }
+
+  framework::Variable *out_;
+  const platform::DeviceContext &dev_ctx_;
+};
+
+// Operator that copies Input(X) to Output(Out). Nothing happens when X is
+// not found in the scope.
+class AssignOp : public framework::OperatorBase {
+ public:
+  AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
+           const framework::VariableNameMap &outputs,
+           const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *in_var = scope.FindVar(Input("X"));
+    if (in_var != nullptr) {
+      // Once X exists, a missing Out is an error.
+      auto *out_var = scope.FindVar(Output("Out"));
+      PADDLE_ENFORCE(
+          out_var != nullptr,
+          "The Output(Out) should not be null if the Input(X) is set.");
+      framework::VisitVarType(*in_var, AssignFunctor(out_var, dev_ctx));
+    }
+  }
+};
+
+// Declares the proto of the assign operator: input X, output Out and the
+// operator's documentation string.
+class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AssignOpProtoMaker(framework::OpProto *proto,
+                     framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // X is marked dispensable.
+    AddInput("X",
+             "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
+             "could be LoDTensor, SelectedRows or LoDTensorArray.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
+              "is the same as input X.");
+    AddComment(R"DOC(Assign Operator
+
+Out = X,  when type in [LoDTensor/SelectedRows/LoDTensorArray]
+raise error if the type is not listed above.
+)DOC");
+  }
+};
+
+// Shape inference for assign: Out takes X's dims when X is present and is a
+// SELECTED_ROWS or LOD_TENSOR variable; otherwise nothing is set.
+class AssignInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    if (!context->HasInput("X")) {
+      return;
+    }
+    auto var_type = context->GetInputsVarType("X")[0];
+    const bool carries_dim =
+        var_type == framework::VarDesc_VarType_SELECTED_ROWS ||
+        var_type == framework::VarDesc_VarType_LOD_TENSOR;
+    if (carries_dim) {
+      context->SetOutputDim("Out", context->GetInputDim("X"));
+    }
+  }
+};
+
+// Builds the gradient op of assign, which is itself an "assign" op that
+// copies Out@GRAD into X@GRAD.
+class AssignGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  // Returns the descriptor of the gradient op.
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    // Hold the descriptor in a unique_ptr from the start so it is released
+    // if any of the setters below throws.
+    std::unique_ptr<framework::OpDescBind> grad_op(
+        new framework::OpDescBind());
+    grad_op->SetType("assign");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    return grad_op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register the assign operator together with its gradient-op maker, its
+// shape-inference functor and its proto maker.
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker,
+                  ops::AssignInferShape, ops::AssignOpProtoMaker);
--- a/paddle/operators/beam_search_decode_op.cc
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -27,6 +27,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase {
  void Run(const framework::Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {
    framework::ExecutionContext ctx(*this, scope, dev_ctx);
+
    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
    const size_t step_num = ids->size();

--- a/paddle/operators/bilinear_tensor_product_op.cc
+++ b/paddle/operators/bilinear_tensor_product_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+// Operator definition for bilinear_tensor_product. InferShape validates the
+// input shapes and sets Out to [X.dims[0], Weight.dims[0]].
+class BilinearTensorProductOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Requires X, Y, Weight and Out to be present; X and Y must be 2-D and
+  // Weight 3-D with matching sizes. The optional Bias is validated when
+  // given. Finally sets the shape of Out and shares X's LoD with it.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL,
+                      "The input(Weight) must be a 3D tensor.");
+    PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
+                      "The first dimension(batch_size) of input(X) must be "
+                      "equal to the first dimension of the input(Y).");
+    PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1],
+                      "The second dimension of input(X) must be equal to "
+                      "the second dimension of the input(Weight).");
+    PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2],
+                      "The second dimension of input(Y) must be equal to "
+                      "the third dimension of the input(Weight).");
+
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      // Bias must have shape [1, Weight.dims[0]]; the message names the
+      // dimension that is actually checked (index 0).
+      PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL,
+                     "The Input(Bias) must be a 2-D tensor with "
+                     "the 1st dimension fixed to 1 (a row vector).");
+      PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0],
+                        "The second dimension of input(Bias) must be equal "
+                        "to the first dimension of the input(Weight).");
+    }
+
+    ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// Declares the proto of bilinear_tensor_product: inputs X, Y, Weight and a
+// dispensable Bias, output Out, and the operator's documentation string.
+class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BilinearTensorProductOpMaker(framework::OpProto* proto,
+                               framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of bilinear_tensor_product operator.");
+    AddInput("Y", "The second input of bilinear_tensor_product operator.");
+    AddInput("Weight",
+             "The learnable parameters of bilinear_tensor_product operator.");
+    AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.")
+        .AsDispensable();
+    AddOutput("Out", "The output of bilinear_tensor_product operator.");
+    // The summation runs over the columns of M (index j), not over the
+    // slice index i, so the two indices are kept distinct in the formula.
+    AddComment(R"DOC(
+Bilinear Tensor Product operator.
+Given input X and Y, a 3D tensor weight, and bias. Each column of the
+output is computed by one slice i = 1, . . . , k of the tensor:
+
+    M =  (X W_i) \cdot Y
+    Out_i = \sum_j {M_j} + Bias_i
+
+where W_i is the i-th slice of Input(Weight), \cdot is the element-wise
+product, M_j is the j-th column of M, Out_i is the i-th column of
+Output(Out) and Bias_i is the i-th element of Input(Bias).
+
+)DOC");
+  }
+};
+
+// Gradient operator definition for bilinear_tensor_product. InferShape
+// validates Out@GRAD against the forward inputs and gives every gradient
+// output that is present the shape of its forward input.
+class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Requires X, Y, Weight and Out@GRAD. Out@GRAD must be 2-D with shape
+  // [X.dims[0], Weight.dims[0]]. Gradient shapes are set only for the
+  // outputs that exist.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(out_dims.size(), 2UL,
+                      "The input(Out@GRAD) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[0], out_dims[0],
+        "The first dimension(batch_size) of input(Out@GRAD) must be "
+        "equal to the first dimension of the Input(X).");
+    // The check compares against Weight's first dimension (index 0), and the
+    // message says so.
+    PADDLE_ENFORCE_EQ(
+        weight_dims[0], out_dims[1],
+        "The second dimension of input(Out@GRAD) must be equal to "
+        "the first dimension of the Input(Weight).");
+
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      PADDLE_ENFORCE_EQ(
+          bias_dims[1], out_dims[1],
+          "The second dimension of input(Out@GRAD) must be equal to "
+          "the second dimension of the Input(Bias).");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    auto weight_grad_name = framework::GradVarName("Weight");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+    if (ctx->HasOutput(weight_grad_name)) {
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Register bilinear_tensor_product with its gradient op, followed by the
+// CPU kernels of both ops for float and double.
+namespace ops = paddle::operators;
+REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
+            ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
+            ops::BilinearTensorProductOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/bilinear_tensor_product_op.cu
+++ b/paddle/operators/bilinear_tensor_product_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+// Register the GPU kernels of bilinear_tensor_product and of its gradient
+// op for float and double.
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/bilinear_tensor_product_op.h
+++ b/paddle/operators/bilinear_tensor_product_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+// Local alias for framework::EigenMatrix that defaults to row-major storage
+// and Eigen::DenseIndex indices.
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+// Forward kernel of bilinear_tensor_product. For every slice i of
+// Input(Weight), column i of Output(Out) is the per-row sum of
+// (X Weight_i) multiplied element-wise with Y. When Input(Bias) is given it
+// is broadcast over the batch and added to the output.
+template <typename Place, typename T>
+class BilinearTensorProductKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    // Tested for null below before it is used.
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto output_mat = EigenMatrix<T>::From(*out);
+
+    // Sizes are read from X and Weight: Weight is indexed as
+    // [out_dim, x_dim, y_dim].
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // Create the intermediate variable to calculate the result of
+    // Input(X) multiplied by Input(Weight_i), the formula is:
+    // left_mul = X Weight_i.
+    Tensor left_mul;
+    left_mul.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                             ctx.GetPlace());
+    auto left_mul_mat = EigenMatrix<T>::From(left_mul);
+
+    for (int i = 0; i < out_dim; ++i) {
+      // Column i of the output.
+      auto output_col_vec = output_mat.chip(i, 1);
+      // Slice i of Weight, viewed as an [x_dim, y_dim] matrix.
+      Tensor weight_mat =
+          weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
+      math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+                           batch_size, y_dim, x_dim, 1, x->data<T>(),
+                           weight_mat.data<T>(), 0, left_mul.data<T>());
+      // Element-wise product with Y, reduced over dimension 1.
+      output_col_vec.device(place) =
+          (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
+    }
+    if (bias) {
+      // Repeat the bias batch_size times along dimension 0 and add it.
+      auto bias_vec = EigenMatrix<T>::From(*bias);
+      Eigen::DSizes<int, 2> bcast(batch_size, 1);
+      output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
+    }
+  }
+};
+
+// Backward kernel of bilinear_tensor_product. Computes the gradients of X,
+// Y, Weight and Bias from Out@GRAD; each gradient is produced only when its
+// output tensor pointer is non-null.
+template <typename Place, typename T>
+class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor* weight = ctx.Input<Tensor>("Weight");
+    // Each gradient output below is tested for null before it is written.
+    Tensor* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* d_y = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    Tensor* d_weight = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    Tensor* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    const Tensor* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    // Sizes are read from X and Weight: Weight is indexed as
+    // [out_dim, x_dim, y_dim].
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+
+    auto x_mat = EigenMatrix<T>::From(*x);
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto d_out_mat = EigenMatrix<T>::From(*d_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // Create the intermediate variable to calculate the Output(Y@Grad).
+    // It is also reused for the Weight gradient further down. Both scratch
+    // tensors are allocated regardless of which gradients are requested.
+    Tensor x_scale;
+    x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
+                            ctx.GetPlace());
+    auto x_scale_mat = EigenMatrix<T>::From(x_scale);
+
+    // Create the intermediate variable to calculate the Output(X@Grad).
+    Tensor y_scale;
+    y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                            ctx.GetPlace());
+    auto y_scale_mat = EigenMatrix<T>::From(y_scale);
+
+    math::SetConstant<Place, T> set_zero;
+
+    // Set Output(X@Grad) be zero.
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      set_zero(ctx.device_context(), d_x, static_cast<T>(0));
+    }
+
+    // Set Output(Y@Grad) be zero.
+    if (d_y) {
+      d_y->mutable_data<T>(ctx.GetPlace());
+      set_zero(ctx.device_context(), d_y, static_cast<T>(0));
+    }
+
+    // Calculate the Output(X@Grad) and Output(Y@Grad).
+    if (d_x || d_y) {
+      Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
+      Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        // Slice i of Weight, viewed as an [x_dim, y_dim] matrix.
+        Tensor weight_i = weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        // Column i of Out@GRAD.
+        auto output_vec = d_out_mat.chip(i, 1);
+        if (d_x) {
+          // y_scale = Y with every row scaled by that row's entry of
+          // column i of Out@GRAD.
+          y_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_x) *
+              y_mat;
+          // NOTE(review): the scalar 1 before d_x is presumably gemm's
+          // beta, so each slice accumulates into the zeroed d_x -- confirm
+          // against math::gemm.
+          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasTrans,
+                               batch_size, x_dim, y_dim, 1, y_scale.data<T>(),
+                               weight_i.data<T>(), 1, d_x->data<T>());
+        }
+        if (d_y) {
+          // x_scale = X with every row scaled by that row's entry of
+          // column i of Out@GRAD.
+          x_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_y) *
+              x_mat;
+          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+                               batch_size, y_dim, x_dim, 1, x_scale.data<T>(),
+                               weight_i.data<T>(), 1, d_y->data<T>());
+        }
+      }
+    }
+
+    // Calculate the gradient of Input(Weight).
+    if (d_weight) {
+      d_weight->mutable_data<T>(ctx.GetPlace());
+      Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        // Slice i of the Weight gradient, viewed as [x_dim, y_dim].
+        Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        auto output_vec = d_out_mat.chip(i, 1);
+        x_scale_mat.device(place) =
+            output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                .broadcast(bcast_for_weight) *
+            x_mat;
+        // The first operand is passed with CblasTrans; the scalar before
+        // the output is 0 here, unlike the X/Y gradient calls above.
+        math::gemm<Place, T>(ctx.device_context(), CblasTrans, CblasNoTrans,
+                             x_dim, y_dim, batch_size, 1, x_scale.data<T>(),
+                             y->data<T>(), 0, d_weight_i.data<T>());
+      }
+    }
+
+    // Calculate the gradient of Input(Bias).
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      auto d_bias_mat = EigenMatrix<T>::From(*d_bias);
+      // Reduce Out@GRAD over dimension 0.
+      d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -94,5 +94,13 @@ class CompareOp : public framework::OperatorWithKernel {

 REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
 REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
+REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_OP(greater_than, "Out = X > Y");
+REGISTER_LOGICAL_KERNEL(greater_than, CPU,
+                        paddle::operators::GreaterThanFunctor);
+REGISTER_LOGICAL_OP(greater_equal, "Out = X >= Y");
+REGISTER_LOGICAL_KERNEL(greater_equal, CPU,
+                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_OP(equal, "Out = X == Y");
 REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
--- a/paddle/operators/compare_op.cu
+++ b/paddle/operators/compare_op.cu
@@ -15,4 +15,9 @@
 #include "paddle/operators/compare_op.h"

 REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_KERNEL(greater_than, GPU,
+                        paddle::operators::GreaterThanFunctor);
+REGISTER_LOGICAL_KERNEL(greater_equal, GPU,
+                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -27,6 +27,24 @@ struct LessThanFunctor {
  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
 };

+template <typename T>
+struct LessEqualFunctor {  // Binary predicate: true when x <= y.
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& x, const T& y) const { return x <= y; }
+};
+
+template <typename T>
+struct GreaterThanFunctor {  // Binary predicate: true when x > y.
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& x, const T& y) const { return x > y; }
+};
+
+template <typename T>
+struct GreaterEqualFunctor {  // Binary predicate: true when x >= y.
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& x, const T& y) const { return x >= y; }
+};
+
 template <typename T>
 struct EqualFunctor {
  using ELEM_TYPE = T;

--- a/paddle/operators/conditional_block_op.cc
+++ b/paddle/operators/conditional_block_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <algorithm>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Shared base of the conditional block operators. It forwards the constructor
+// arguments to OperatorBase and provides a helper that resolves the "X" inputs
+// to tensors.
+class ConditionalOp : public framework::OperatorBase {
+ public:
+  ConditionalOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  // Finds every variable named by Input "X" in `scope` and returns pointers to
+  // their LoDTensors, in the same order as the names. Enforces that each
+  // variable can be found.
+  std::vector<const framework::LoDTensor *> InputTensors(
+      const framework::Scope &scope) const {
+    auto cond_names = Inputs("X");
+    std::vector<const framework::LoDTensor *> tensors;
+    tensors.reserve(cond_names.size());
+    for (const std::string &name : cond_names) {
+      auto *var = scope.FindVar(name);
+      PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+      tensors.push_back(&var->Get<framework::LoDTensor>());
+    }
+    return tensors;
+  }
+};
+
+// Forward operator: runs the sub-block stored in the "block" attribute unless
+// one of the "X" input tensors has zero elements.
+class ConditionalBlockOp : public ConditionalOp {
+ public:
+  ConditionalBlockOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto conds = InputTensors(scope);
+    // A single empty condition tensor is enough to skip the sub-block.
+    const bool has_empty_cond = std::any_of(
+        conds.begin(), conds.end(),
+        [](const framework::LoDTensor *t) { return t->numel() == 0; });
+    if (has_empty_cond) {
+      return;
+    }
+
+    auto *scope_var = scope.FindVar(Output("Scope"));
+    PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+    auto *step_scopes =
+        scope_var->GetMutable<std::vector<framework::Scope *>>();
+    // Exactly one step scope is stored: the one obtained from
+    // scope.NewScope(), which the sub-block is executed in.
+    step_scopes->resize(1);
+    framework::Scope &sub_scope = scope.NewScope();
+    step_scopes->front() = &sub_scope;
+
+    auto *block = Attr<framework::BlockDescBind *>("block");
+    framework::Executor exec(dev_ctx);
+    exec.Run(*block->Program(), &sub_scope, block->ID(), false);
+  }
+};
+
+// Declares the interface of the conditional_block operator: the duplicable
+// inputs "X" and "Params", the duplicable output "Out", the "Scope" output,
+// the "block" attribute and the operator's documentation string.
+class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConditionalBlockOpProtoMaker(framework::OpProto *proto,
+                               framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The conditional variable of this operator. If X is empty, the "
+             "whole sub-block will not be executed.")
+        .AsDuplicable();
+    AddInput("Params", "The input variables of the sub-block.").AsDuplicable();
+    AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
+    AddOutput("Scope",
+              "(std::vector<Scope*>) The step scope of conditional block. To "
+              "unify the conditional block, rnn and while op, the type of "
+              "scope is std::vector<Scope*>");
+    // The sub-block to execute is carried as an attribute, not as an input.
+    AddAttr<framework::BlockDescBind *>(
+        "block", "The step block of conditional block operator");
+    AddComment(R"DOC(Conditional block operator
+
+Run the sub-block if X is not empty. Params is the other inputs and Out is the
+outputs of the sub-block.
+)DOC");
+  }
+};
+
+// Backward operator of conditional_block. When none of the "X" tensors is
+// empty it runs the gradient block inside the step scope recorded in
+// Input("Scope") and then copies the gradients computed there to the
+// Params@GRAD and X@GRAD outputs.
+class ConditionalBlockGradOp : public ConditionalOp {
+ public:
+  ConditionalBlockGradOp(const std::string &type,
+                         const framework::VariableNameMap &inputs,
+                         const framework::VariableNameMap &outputs,
+                         const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto xs = this->InputTensors(scope);
+    bool need_run = std::all_of(
+        xs.begin(), xs.end(),
+        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    if (need_run) {
+      auto *scope_var = scope.FindVar(Input("Scope"));
+      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+      auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
+      // Fail with a clear error instead of indexing an empty vector when the
+      // step scope has not been stored.
+      PADDLE_ENFORCE(!scopes.empty(),
+                     "The step scope of conditional block is not set");
+      framework::Scope &cur_scope = *scopes[0];
+
+      auto *block = Attr<framework::BlockDescBind *>("block");
+      framework::Executor exec(dev_ctx);
+      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
+
+      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"),
+                                  Outputs(framework::GradVarName("Params")));
+
+      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"),
+                                  Outputs(framework::GradVarName("X")));
+    }
+  }
+
+ private:
+  // For every name p_names[i], looks up its gradient variable in `cur_scope`
+  // and, if it exists, runs an "assign" op that copies it to pg_names[i].
+  // Names whose gradient variable is not found are skipped.
+  // p_names and pg_names must have the same length.
+  void AssignLocalGradientToGlobal(
+      const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope,
+      const std::vector<std::string> &p_names,
+      const std::vector<std::string> &pg_names) const {
+    // pg_names is indexed with indices of p_names below, so a shorter
+    // pg_names would be read out of bounds.
+    PADDLE_ENFORCE(p_names.size() == pg_names.size(),
+                   "The number of gradient outputs must be equal to the "
+                   "number of inputs");
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      auto out_grad_name = pg_names[i];
+      auto in_grad_name = framework::GradVarName(p_names[i]);
+      auto *in_var = cur_scope.FindVar(in_grad_name);
+      if (in_var == nullptr) {
+        continue;
+      }
+      // The local gradient is renamed while the assign op runs and gets its
+      // original name back afterwards.
+      auto new_in_grad_name = cur_scope.Rename(in_grad_name);
+      auto assign =
+          framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}},
+                                          {{"Out", {out_grad_name}}}, {});
+      assign->Run(cur_scope, dev_ctx);
+      cur_scope.Rename(new_in_grad_name, in_grad_name);
+    }
+  }
+};
+
+// Shape inference of conditional_block_grad: every gradient output gets the
+// dims of the forward input it corresponds to.
+class ConditionalBlockGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    const auto x_grad = framework::GradVarName("X");
+    const auto params_grad = framework::GradVarName("Params");
+
+    PADDLE_ENFORCE(context->HasInputs("X"));
+    // "Params" may be absent; its gradient is only required when it is set.
+    if (context->HasInputs("Params")) {
+      PADDLE_ENFORCE(context->HasOutputs(params_grad));
+      context->SetOutputsDim(params_grad, context->GetInputsDim("Params"));
+    }
+    PADDLE_ENFORCE(context->HasOutputs(x_grad));
+    context->SetOutputsDim(x_grad, context->GetInputsDim("X"));
+  }
+};
+
+// Builds the description of the conditional_block_grad operator from the
+// forward operator: forwards X, Params, Out, Out@GRAD and Scope as inputs,
+// declares X@GRAD and Params@GRAD as outputs and attaches the first gradient
+// block as the "block" attribute.
+class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    // Owned by unique_ptr from the start so the description is released if
+    // any of the setters below throws.
+    std::unique_ptr<framework::OpDescBind> grad_op(
+        new framework::OpDescBind());
+    grad_op->SetType("conditional_block_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput("Params", Input("Params"));
+    grad_op->SetInput("Out", Output("Out"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetInput("Scope", Output("Scope"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params"));
+    grad_op->SetBlockAttr("block", *this->grad_block_[0]);
+    return grad_op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Forward operator, registered together with its proto maker and the maker
+// that produces the conditional_block_grad description.
+REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp,
+                  ops::ConditionalBlockOpProtoMaker,
+                  ops::ConditionalBlockGradMaker);
+// Backward operator, registered together with its shape inference functor.
+REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp,
+                  ops::ConditionalBlockGradInferShape);
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
@@ -29,7 +29,7 @@ class L1NormKernel : public framework::OpKernel<T> {
    Out->mutable_data<T>(context.GetPlace());

    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto out = framework::EigenScalar<T>::From(*Out);
    auto place = context.GetEigenDevice<Place>();

    out.device(place) = x.abs().sum();

--- a/paddle/operators/lod_reset_op.cc
+++ b/paddle/operators/lod_reset_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition of lod_reset: validates its inputs, gives Out the dims
+// of X and selects the kernel from the data type of X.
+class LoDResetOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires Input(X) and Output(Out), requires a target LoD from either
+  // Input(TargetLoD) or Attr(target_lod), then copies the dims of X to Out.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LoDResetOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LoDResetOp should not be null.");
+
+    // If the target LoD is not set from Input(), it must be set from Attr().
+    const bool lod_from_input = ctx->HasInput("TargetLoD");
+    if (!lod_from_input) {
+      const auto attr_lod = ctx->Attrs().Get<std::vector<int>>("target_lod");
+      PADDLE_ENFORCE(attr_lod.size() > 1,
+                     "Target LoD is not found, should be set to be a valid one "
+                     "through Input() or Attr().");
+    }
+
+    const auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+  }
+
+ protected:
+  // The kernel type is built from the data type of X and the device context.
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<framework::LoDTensor>("X");
+    return framework::OpKernelType(framework::ToDataType(x->type()),
+                                   ctx.device_context());
+  }
+};
+
+// Declares the interface of the lod_reset operator: Input(X), the dispensable
+// Input(TargetLoD), Output(Out), the "target_lod" attribute (empty by default)
+// and the operator's documentation string.
+class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDResetOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
+    AddInput("TargetLoD",
+             "(Tensor, optional) The target level 0 LoD from Input().")
+        .AsDispensable();
+    AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator.");
+    // The default is an empty vector, i.e. no target LoD given by attribute.
+    AddAttr<std::vector<int>>("target_lod",
+                              "The target level 0 LoD from Attr().")
+        .SetDefault(std::vector<int>{});
+    AddComment(R"DOC(LoDReset operator
+
+Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or
+Attr(target_lod), or set LoD for Input(X) if it doesn't have one.
+Currently the lod_reset operator only supports the reset of level 0 LoD.
+At least one of Input(TargetLoD) and Attr(target_lod) must be set,
+and if both of them are set, Input(TargetLoD) will be chosen as the
+target LoD.
+
+An example:
+Given a float LoDTensor X with shape (6, 1), its transpose form represents
+
+    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+
+with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like
+
+    [1.0, 2.0], [3.0, 4.0, 5.0], [6.0].
+
+If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and
+the sequences that the LoDTensor Output(Out) contains becomes:
+
+    [1.0, 2.0, 3.0, 4.0], [5.0, 6.0].
+
+)DOC");
+  }
+};
+
+// Operator definition of lod_reset_grad: X@GRAD gets the dims of X and the
+// kernel is selected from the data type of X.
+class LoDResetGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires Input(X) and Input(Out@GRAD), then copies the dims of X to
+  // X@GRAD.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    const auto out_grad = framework::GradVarName("Out");
+    const auto x_grad = framework::GradVarName("X");
+
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(out_grad),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(x_grad, ctx->GetInputDim("X"));
+  }
+
+ protected:
+  // The kernel type is built from the data type of X and the device context.
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<framework::LoDTensor>("X");
+    return framework::OpKernelType(framework::ToDataType(x->type()),
+                                   ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Registers lod_reset together with its gradient operator lod_reset_grad.
+REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
+            ops::LoDResetGradOp);
+// CPU kernels of both operators, instantiated for float and double.
+REGISTER_OP_CPU_KERNEL(lod_reset,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/lod_reset_op.cu
+++ b/paddle/operators/lod_reset_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace ops = paddle::operators;
+
+// GPU kernels of lod_reset and lod_reset_grad, instantiated for float and
+// double.
+REGISTER_OP_GPU_KERNEL(lod_reset,
+                       ops::LoDResetKernel<paddle::platform::GPUPlace, float>,
+                       ops::LoDResetKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::GPUPlace, float>,
+    ops::LoDResetGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/lod_reset_op.h
+++ b/paddle/operators/lod_reset_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Kernel of lod_reset. Out shares the data of X and receives a new level 0
+// LoD, read from Input(TargetLoD) when it is given and from
+// Attr(target_lod) otherwise. The target LoD must have more than one entry,
+// start at 0, be strictly ascending and end with the first dimension of X.
+template <typename Place, typename T>
+class LoDResetKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    auto* in = ctx.Input<framework::LoDTensor>("X");
+    auto* lod_t = ctx.Input<framework::Tensor>("TargetLoD");
+
+    std::vector<int> level0;
+    if (lod_t) {
+      auto* lod = lod_t->data<int>();
+      // The CPU copy is declared in this scope, not inside the GPU branch:
+      // `lod` may point into its buffer and is read after the branch, so the
+      // tensor has to stay alive until level0 has been filled.
+      framework::Tensor lod_cpu;
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        lod_cpu.CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context());
+        lod = lod_cpu.data<int>();
+      }
+      level0 = std::vector<int>(lod, lod + lod_t->numel());
+    } else {
+      level0 = ctx.Attr<std::vector<int>>("target_lod");
+    }
+
+    PADDLE_ENFORCE(level0.size() > 1UL,
+                   "The size of target LoD should be greater than 1.");
+    PADDLE_ENFORCE(level0[0] == 0,
+                   "Target LoD should be a vector starting from 0.");
+    PADDLE_ENFORCE(level0.back() == in->dims()[0],
+                   "Target LoD should be a vector end with the "
+                   "first dimension of Input(X).");
+    // size() > 1 was enforced above, so size() - 1 cannot wrap around.
+    for (size_t i = 0; i < level0.size() - 1; ++i) {
+      PADDLE_ENFORCE(level0[i + 1] > level0[i],
+                     "Target LoD should be an ascending vector.");
+    }
+
+    out->ShareDataWith(*in);
+    // Convert level0 to size_t. Every entry is non-negative here because the
+    // vector starts at 0 and is strictly ascending.
+    std::vector<size_t> ulevel0(level0.begin(), level0.end());
+    framework::LoD target_lod;
+    target_lod.push_back(ulevel0);
+    out->set_lod(target_lod);
+  }
+};
+
+// Gradient kernel of lod_reset: X@GRAD shares the data of Out@GRAD.
+template <typename Place, typename T>
+class LoDResetGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    const auto out_grad_name = framework::GradVarName("Out");
+    const auto x_grad_name = framework::GradVarName("X");
+
+    auto* out_grad = ctx.Input<framework::Tensor>(out_grad_name);
+    auto* x_grad = ctx.Output<framework::Tensor>(x_grad_name);
+
+    // No copy is made; the gradient tensor reuses the incoming buffer.
+    x_grad->ShareDataWith(*out_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
@@ -27,15 +27,15 @@ template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
    const int ksize_height = ksize[0];
    const int ksize_width = ksize[1];
    const int stride_height = strides[0];
@@ -47,7 +47,7 @@ class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
    const int output_stride = output_height * output_width;

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -87,11 +87,12 @@ template <typename PoolProcess, class T>
 class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process) {
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
@@ -110,7 +111,7 @@ class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -154,10 +155,11 @@ template <class T>
 class MaxPool2dGradFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
@@ -176,7 +178,7 @@ class MaxPool2dGradFunctor<platform::CPUPlace, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -240,17 +242,17 @@ template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
    const int ksize_depth = ksize[0];
    const int ksize_height = ksize[1];
    const int ksize_width = ksize[2];
@@ -265,7 +267,7 @@ class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
    const int output_stride = output_depth * output_height * output_width;

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -315,11 +317,12 @@ template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process) {
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
@@ -343,7 +346,7 @@ class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -398,10 +401,11 @@ template <class T>
 class MaxPool3dGradFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
@@ -425,7 +429,7 @@ class MaxPool3dGradFunctor<platform::CPUPlace, T> {
    const T* input_data = input.data<T>();
    const T* output_data = output.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -498,15 +502,15 @@ template <typename T>
 class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
    const int ksize_height = ksize[0];
    const int ksize_width = ksize[1];
    const int stride_height = strides[0];
@@ -517,8 +521,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
    const int output_stride = output_height * output_width;

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -563,13 +567,13 @@ template <typename T>
 class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_height = input_grad.dims()[2];
-    const int input_width = input_grad.dims()[3];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
    const int output_channels = output_grad.dims()[1];
    const int output_height = output_grad.dims()[2];
    const int output_width = output_grad.dims()[3];
@@ -578,7 +582,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {

    const T* mask_data = mask.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int n = 0; n < batch_size; ++n) {
      for (int c = 0; c < output_channels; ++c) {
@@ -612,17 +616,17 @@ template <typename T>
 class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
    const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
    const int ksize_depth = ksize[0];
    const int ksize_height = ksize[1];
    const int ksize_width = ksize[2];
@@ -636,8 +640,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
    const int output_stride = output_depth * output_height * output_width;

    const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());

    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
@@ -691,14 +695,14 @@ template <typename T>
 class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_depth = input_grad.dims()[2];
-    const int input_height = input_grad.dims()[3];
-    const int input_width = input_grad.dims()[4];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_depth = input_grad->dims()[2];
+    const int input_height = input_grad->dims()[3];
+    const int input_width = input_grad->dims()[4];
    const int output_channels = output_grad.dims()[1];
    const int output_depth = output_grad.dims()[2];
    const int output_height = output_grad.dims()[3];
@@ -708,7 +712,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {

    const T* mask_data = mask.data<T>();
    const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    for (int n = 0; n < batch_size; ++n) {
      for (int c = 0; c < output_channels; ++c) {

--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
@@ -88,60 +88,62 @@ template <typename Place, typename PoolProcess, typename T>
 class Pool2dFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* output);
 };

 template <typename Place, typename PoolProcess, typename T>
 class Pool2dGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute);
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
 };

 template <typename Place, class T>
 class MaxPool2dGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };

 template <typename Place, typename PoolProcess, typename T>
 class Pool3dFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* output);
 };

 template <typename Place, typename PoolProcess, typename T>
 class Pool3dGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
                  std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute);
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
 };

 template <typename Place, class T>
 class MaxPool3dGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };

 /*
@@ -155,38 +157,38 @@ template <typename Place, typename T>
 class MaxPool2dWithIndexFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask);
 };

 template <typename Place, typename T>
 class MaxPool2dWithIndexGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };

 template <typename Place, typename T>
 class MaxPool3dWithIndexFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask);
 };

 template <typename Place, typename T>
 class MaxPool3dWithIndexGradFunctor {
 public:
  void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };

 }  // namespace math

--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
@@ -74,11 +74,10 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
  Tensor output;
  auto in_dims = input.dims();
  if (in_dims.size() == 3) {
-    output.Resize(in_dims);
+    output.Resize({in_dims[1], in_dims[0], in_dims[2]});
    output.mutable_data<T>(context.GetPlace());
    EigenTranspose<Place, T, 3>(context, input, output, {1, 0, 2});
-    std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
-    output.Resize(make_ddim(out_dims));
+    output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
  } else {
    output.ShareDataWith(input);
  }

--- a/paddle/operators/merge_lod_tensor_op.cc
+++ b/paddle/operators/merge_lod_tensor_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+// Operator that merges the sequences held in `InTrue` and `InFalse` back
+// into one LoDTensor `Out`. For every row of `Mask` the next unread sequence
+// is taken from `InTrue` (mask != 0) or `InFalse` (mask == 0) and appended to
+// `Out`; the first `level` LoD levels of `X` are then prepended to the
+// output LoD.
+class MergeLoDTensorOp : public framework::OperatorBase {
+ public:
+  MergeLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  // Reads X / Mask / InTrue / InFalse from `scope` and writes Out.
+  // `dev_ctx` is used for the mask device-to-host copy and for the row
+  // copies into Out.
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
+    auto &in_false =
+        scope.FindVar(Input("InFalse"))->Get<framework::LoDTensor>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+
+    auto &mask_dim = mask.dims();
+
+    // The mask is inspected element by element on the host, so obtain a
+    // CPU-resident view of it (shared when already on CPU, copied otherwise).
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+
+    // Out has the trailing dims, place and element type of InTrue; its first
+    // dim is the total number of rows in both branches.
+    int rank = in_true.dims().size();
+    platform::Place place = in_true.place();
+    std::type_index data_type = in_true.type();
+    framework::DDim in_true_dims =
+        framework::slice_ddim(in_true.dims(), 1, rank);
+
+    int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
+
+    auto in_true_dim_vec = framework::vectorize(in_true_dims);
+    in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size);
+
+    framework::DDim out_dims = framework::make_ddim(in_true_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+
+    // Build LoDTensor `out`
+
+    size_t in_true_idx = 0;
+    size_t in_false_idx = 0;
+    for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+      const framework::LoDTensor *input = nullptr;
+      size_t *in_idx = nullptr;
+      if (static_cast<int>(mask_data[i]) == 0) {
+        input = &in_false;
+        in_idx = &in_false_idx;
+      } else {
+        input = &in_true;
+        in_idx = &in_true_idx;
+      }
+      // NOTE(review): the branch tensors are always indexed at LoD level 0
+      // here, presumably because the split op already stripped the upper
+      // levels -- confirm against SplitLoDTensorOp.
+      auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+          input->lod(), *in_idx, (*in_idx) + 1, 0);
+      auto &lod_length = lod_and_offset.first;
+
+      framework::AppendLoD(out_lod, lod_length);
+
+      size_t start_offset = lod_and_offset.second.first;
+      size_t end_offset = lod_and_offset.second.second;
+
+      PADDLE_ENFORCE_GE(end_offset, start_offset);
+      size_t len = end_offset - start_offset;
+      if (len != 0) {
+        out->Slice(out_offset, out_offset + len)
+            .CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx);
+        out_offset += len;
+      }
+      // Always consume the sequence, even when it is empty: its LoD entry
+      // has already been appended above, and leaving the index in place
+      // would make the next entry of this branch re-read the same (empty)
+      // sequence and misalign everything after it.
+      (*in_idx) += 1;
+    }
+
+    // Prepend the upper LoD levels of X. Each insert goes to the front, so
+    // walk the levels from `level - 1` down to 0 to keep them in their
+    // original (coarse-to-fine) order.
+    for (size_t i = level; i > 0; i--) {
+      out_lod->insert(out_lod->begin(), x.lod()[i - 1]);
+    }
+  }
+};
+
+// Declares the interface of the merge_lod_tensor operator: four inputs
+// (X, Mask, InTrue, InFalse), one output (Out) and the integer attribute
+// `level`, which defaults to 0 and is checked to be >= 0.
+class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MergeLoDTensorOpProtoMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input LoDTensor, contains complete lod information to "
+             "construct the output");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddInput("InTrue", "The True branch to be merged");
+    AddInput("InFalse", "The False branch to be merged");
+    AddOutput("Out", "The merged output LoDTensor");
+    // Non-negative LoD level; MergeLoDTensorOp::Run prepends that many LoD
+    // levels of X to the output LoD.
+    AddAttr<int>("level", "(int) the specific lod level to rank.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Merge True and False branches of LoDTensor into a single Output,
+        with a mask at certain lod level. X is used to obtain complete
+        lod information. Please refer to SplitLoDTensorOp.)DOC");
+  }
+};
+
+// Shape inference for merge_lod_tensor.
+//
+// Verifies that X, Mask, InTrue, InFalse and Out are all present, requires
+// Mask to be rank 2 with a second dimension of 1 (a column vector), and
+// sets the dims of Out to the dims of InTrue.
+//
+// Note that MergeLoDTensorOp::Run resizes Out again, with a first dimension
+// equal to the row count of InTrue plus that of InFalse, so the first
+// dimension set here is only a placeholder.
+class MergeLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "MergeLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "MergeLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasInput("InTrue"),
+                   "MergeLoDTensorOp must has input InTrue.");
+    PADDLE_ENFORCE(context->HasInput("InFalse"),
+                   "MergeLoDTensorOp must has input InFalse.");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "MergeLoDTensorOp must has output Out");
+
+    // Mask must be an (N, 1) column vector.
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("Out", context->GetInputDim("InTrue"));
+  }
+};
+
+// Builds the gradient op description for merge_lod_tensor: a
+// split_lod_tensor op that splits the gradient of Out, using the forward
+// Mask, into the gradients of InTrue and InFalse. The forward attributes
+// are forwarded unchanged.
+class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  // Returns the owning pointer to the newly built grad op description.
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    // Own the description from the start so it is released if any of the
+    // setters below throws.
+    std::unique_ptr<framework::OpDescBind> grad_op(
+        new framework::OpDescBind());
+    grad_op->SetType("split_lod_tensor");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
+    grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
+    grad_op->SetAttrMap(Attrs());
+    return grad_op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp,
+                  ops::MergeLoDTensorOpProtoMaker,
+                  ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker);
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -75,16 +75,16 @@ class PoolKernel : public framework::OpKernel<T> {
              Place, paddle::operators::math::MaxPool<T>, T>
              pool2d_forward;
          paddle::operators::math::MaxPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);

        } else if (pooling_type == "avg") {
          paddle::operators::math::Pool2dFunctor<
              Place, paddle::operators::math::AvgPool<T>, T>
              pool2d_forward;
          paddle::operators::math::AvgPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
        }
      } break;
      case 3: {
@@ -93,15 +93,15 @@ class PoolKernel : public framework::OpKernel<T> {
              Place, paddle::operators::math::MaxPool<T>, T>
              pool3d_forward;
          paddle::operators::math::MaxPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
        } else if (pooling_type == "avg") {
          paddle::operators::math::Pool3dFunctor<
              Place, paddle::operators::math::AvgPool<T>, T>
              pool3d_forward;
          paddle::operators::math::AvgPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
        }
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
@@ -142,30 +142,30 @@ class PoolGradKernel : public framework::OpKernel<T> {
          if (pooling_type == "max") {
            paddle::operators::math::MaxPool2dGradFunctor<Place, T>
                pool2d_backward;
-            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings);
+            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, in_x_grad);
          } else if (pooling_type == "avg") {
            paddle::operators::math::Pool2dGradFunctor<
                Place, paddle::operators::math::AvgPoolGrad<T>, T>
                pool2d_backward;
            paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings, pool_process);
+            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, pool_process, in_x_grad);
          }
        } break;
        case 3: {
          if (pooling_type == "max") {
            paddle::operators::math::MaxPool3dGradFunctor<Place, T>
                pool3d_backward;
-            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings);
+            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, in_x_grad);
          } else if (pooling_type == "avg") {
            paddle::operators::math::Pool3dGradFunctor<
                Place, paddle::operators::math::AvgPoolGrad<T>, T>
                pool3d_backward;
            paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings, pool_process);
+            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, pool_process, in_x_grad);
          }
        } break;
        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }

--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
@@ -46,14 +46,14 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
      case 2: {
        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
            pool2d_forward;
-        pool2d_forward(context.device_context(), *in_x, *out, *mask, ksize,
-                       strides, paddings);
+        pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                       paddings, out, mask);
      } break;
      case 3: {
        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
            pool3d_forward;
-        pool3d_forward(context.device_context(), *in_x, *out, *mask, ksize,
-                       strides, paddings);
+        pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                       paddings, out, mask);
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
    }
@@ -89,14 +89,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
        case 2: {
          paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
              pool2d_backward;
-          pool2d_backward(context.device_context(), *in_x_grad, *out_grad,
-                          *mask, ksize, strides, paddings);
+          pool2d_backward(context.device_context(), *out_grad, *mask, ksize,
+                          strides, paddings, in_x_grad);
        } break;
        case 3: {
          paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
              pool3d_backward;
-          pool3d_backward(context.device_context(), *in_x_grad, *out_grad,
-                          *mask, ksize, strides, paddings);
+          pool3d_backward(context.device_context(), *out_grad, *mask, ksize,
+                          strides, paddings, in_x_grad);
        } break;
        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
      }

--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -126,6 +126,7 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
      auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
      auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
+      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
      Eigen::DSizes<int, 2> bcast(h, 1);

      if (pooltype == "AVERAGE") {
@@ -136,9 +137,9 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
        in_g_e.device(place) =
            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
      } else if (pooltype == "LAST") {
-        in_g_e.chip(h - 1, 0).device(place) = out_g_e;
+        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
      } else if (pooltype == "FIRST") {
-        in_g_e.chip(0, 0).device(place) = out_g_e;
+        in_g_e.chip(0, 0).device(place) = out_g_e_v;
      } else {
        PADDLE_THROW("unsupported pooling pooltype");
      }

--- a/paddle/operators/split_lod_tensor_op.cc
+++ b/paddle/operators/split_lod_tensor_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+// A range of rows of the input tensor that has to be copied to one of the
+// outputs. SplitLoDTensorOp::Run passes (begin, end) to Tensor::Slice and
+// computes the number of rows as `end - begin`.
+struct CopyRange {
+  size_t begin;  // first row offset of the range
+  size_t end;    // end row offset of the range
+};
+
+using LoD = framework::LoD;
+
+// Operator that splits the sequences of LoDTensor `X` at LoD level `level`
+// into two outputs according to the boolean `Mask`: sequences whose mask
+// entry is 0 go to `OutFalse`, the others go to `OutTrue`.
+class SplitLoDTensorOp : public framework::OperatorBase {
+ public:
+  SplitLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  // Reads X / Mask from `scope` and writes OutTrue / OutFalse. `dev_ctx` is
+  // used for the mask device-to-host copy and for the row copies.
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto *out_true =
+        scope.FindVar(Output("OutTrue"))->GetMutable<framework::LoDTensor>();
+    auto *out_false =
+        scope.FindVar(Output("OutFalse"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+    auto &x_lod = x.lod();
+    auto &mask_dim = mask.dims();
+
+    // The mask is inspected element by element on the host, so obtain a
+    // CPU-resident view of it (shared when already on CPU, copied otherwise).
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+
+    // copy_ranges is indexed by branch (0 = false, 1 = true), not by mask
+    // row, so it needs exactly two slots whatever the mask length is.
+    // Sizing it by mask_dim[0] would make copy_ranges[1] out of bounds for
+    // a mask with fewer than two rows.
+    std::vector<std::vector<CopyRange>> copy_ranges(2);
+
+    // set out_true/out_false lod
+    for (size_t t = 0; t < 2; t++) {
+      LoD *lod = nullptr;
+      if (t == 0) {
+        lod = out_false->mutable_lod();
+      } else {
+        lod = out_true->mutable_lod();
+      }
+      lod->clear();
+      for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+        if (static_cast<size_t>(mask_data[i]) == t) {
+          size_t start_idx = i;
+          auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+              x_lod, start_idx, start_idx + 1, level);
+
+          auto &lod_length = lod_and_offset.first;
+          framework::AppendLoD(lod, lod_length);
+
+          size_t start_offset = lod_and_offset.second.first;
+          size_t end_offset = lod_and_offset.second.second;
+          copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
+        }
+      }
+    }
+
+    // Allocate each output and copy the selected row ranges into it.
+    for (size_t t = 0; t < 2; ++t) {
+      framework::LoDTensor *out;
+      if (t == 0) {
+        out = out_false;
+      } else {
+        out = out_true;
+      }
+      auto &ranges = copy_ranges[t];
+      // Total number of rows routed to this branch. Summed with a plain
+      // loop so the file does not rely on <numeric> being pulled in by
+      // another header.
+      size_t height = 0;
+      for (auto &each_range : ranges) {
+        height += each_range.end - each_range.begin;
+      }
+      auto x_dim = x.dims();
+      x_dim[0] = static_cast<int64_t>(height);
+      out->Resize(x_dim);
+      out->mutable_data(x.place(), x.type());
+      size_t offset = 0;
+      for (auto &each_range : ranges) {
+        size_t len = each_range.end - each_range.begin;
+        if (len == 0) {
+          continue;
+        }
+        // out[offset: offset+len] = x[each_range.begin: each_range.end]
+        out->Slice(static_cast<int>(offset), static_cast<int>(offset + len))
+            .CopyFrom(x.Slice(static_cast<int>(each_range.begin),
+                              static_cast<int>(each_range.end)),
+                      x.place(), dev_ctx);
+        offset += len;
+      }
+    }
+  }
+};
+
+// Declares the interface of the split_lod_tensor operator: two inputs
+// (X, Mask), two outputs (OutTrue, OutFalse) and the integer attribute
+// `level`, which defaults to 0 and is checked to be >= 0.
+class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitLoDTensorOpProtoMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input LoDTensor");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddOutput("OutTrue", "True branch of input LoDTensor");
+    AddOutput("OutFalse", "False branch of input LoDTensor");
+    // Non-negative LoD level; SplitLoDTensorOp::Run passes it to
+    // GetSubLoDAndAbsoluteOffset to locate each sequence in X.
+    AddAttr<int>("level", "(int) the specific lod level to split.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Split a LoDTensor with a Mask at certain level. The input LoDTensor
+        has 3 sequence at certain lod level. The Mask is a bool column vector,
+        such as [0, 1, 0] at the same level. The first and third sequence will
+        be send to False Output LoDTensor; whereas the second sequence will
+        be send to True Output LoDTensor. Please refer to MergeLoDTensorOp.)DOC");
+  }
+};
+
+// Shape inference for split_lod_tensor.
+//
+// Verifies that X, Mask, OutTrue and OutFalse are all present, requires
+// Mask to be rank 2 with a second dimension of 1 (a column vector), and
+// sets the dims of both outputs to the dims of X.
+//
+// Note that SplitLoDTensorOp::Run resizes each output again, with a first
+// dimension equal to the number of rows actually routed to that branch.
+class SplitLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "SplitLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "SplitLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasOutput("OutTrue"),
+                   "SplitLoDTensorOp must has output OutTrue.");
+    PADDLE_ENFORCE(context->HasOutput("OutFalse"),
+                   "SplitLoDTensorOp must has output OutFalse.");
+
+    // Mask must be an (N, 1) column vector.
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("OutTrue", context->GetInputDim("X"));
+    context->SetOutputDim("OutFalse", context->GetInputDim("X"));
+  }
+};
+
+// Builds the gradient op description for split_lod_tensor: a
+// merge_lod_tensor op that merges the gradients of OutTrue and OutFalse,
+// using the forward Mask and X, into the gradient of X. The forward
+// attributes are forwarded unchanged.
+class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  // Returns the owning pointer to the newly built grad op description.
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    // Own the description from the start so it is released if any of the
+    // setters below throws.
+    std::unique_ptr<framework::OpDescBind> grad_op(
+        new framework::OpDescBind());
+    grad_op->SetType("merge_lod_tensor");
+    grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
+    grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return grad_op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp,
+                  ops::SplitLoDTensorOpProtoMaker,
+                  ops::SplitLoDTensorInferShape,
+                  ops::SplitLoDTensorArrayGradMaker);
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/operators/squared_l2_norm_op.h
@@ -29,7 +29,7 @@ class SquaredL2NormKernel : public framework::OpKernel<T> {
    Out->mutable_data<T>(context.GetPlace());

    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto out = framework::EigenScalar<T>::From(*Out);
    auto place = context.GetEigenDevice<Place>();

    out.device(place) = x.square().sum();

--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -200,7 +200,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
                                     false,
                                     useGpu_);
    }
-  } else if (matType == MAT_NORMAL_SHARED) {
+  }
+#ifndef PADDLE_MOBILE_INFERENCE
+  // NOLINTNEXTLINE
+  else if (matType == MAT_NORMAL_SHARED) {
    CHECK_EQ(height * width, bufs_[pType]->getSize());
    size_t blockNum = 0;
    CHECK(isGradShared(&blockNum));
@@ -259,7 +262,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
  } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
    CHECK(isGradSparseUpdate());
    mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
-  } else {
+  }
+#endif
+  // NOLINTNEXTLINE
+  else {
    LOG(FATAL) << "Unsupported mat type" << matType;
  }
 }

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -42,6 +42,9 @@ limitations under the License. */
 #include "paddle/platform/gpu_info.h"
 #endif

+// disable auto conversion to list in Python
+PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
+
 namespace paddle {
 namespace pybind {
 static size_t UniqueIntegerGenerator(const std::string &prefix) {

--- a/paddle/testing/TestUtil.cpp
+++ b/paddle/testing/TestUtil.cpp
@@ -33,6 +33,7 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
                                 bool withValue,
                                 bool useGpu,
                                 bool equalNnzPerSample) {
+#ifndef PADDLE_MOBILE_INFERENCE
  std::vector<int64_t> ids(height);
  std::vector<int64_t> indices(height + 1);
  indices[0] = 0;
@@ -84,6 +85,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
    }
    return mat;
  }
+#endif
+  return nullptr;
 }

 void generateSequenceStartPositions(size_t batchSize,

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -37,10 +37,10 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)


-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
        DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so)


 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
@@ -66,7 +66,7 @@ if (WITH_TESTING)
    add_subdirectory(paddle/v2/tests)
    add_subdirectory(paddle/v2/reader/tests)
    add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/v2/framework/tests)
+    add_subdirectory(paddle/v2/fluid/tests)
  endif()
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1200,8 +1200,14 @@ def TestData(data_config, async_load_data=None):

 #caffe_mode: compute the output size using floor instead of ceil,
 #            which is consistent of caffe and CuDNN's convention.
-def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
-    output = (2 * padding + img_size - filter_size) / float(stride)
+def cnn_output_size(img_size,
+                    filter_size,
+                    padding,
+                    stride,
+                    caffe_mode,
+                    dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    output = (2 * padding + img_size - filter_s) / float(stride)
    if caffe_mode:
        return 1 + int(math.floor(output))
    else:
@@ -1210,8 +1216,14 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):

 #calcualte image_size based on output_size for de-convolution (ConvTransLayer).
 #It is the reverse function of cnn_output_size
-def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
-    img_size = (output_size - 1) * stride + filter_size - 2 * padding
+def cnn_image_size(output_size,
+                   filter_size,
+                   padding,
+                   stride,
+                   caffe_mode,
+                   dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    img_size = (output_size - 1) * stride + filter_s - 2 * padding
    if not caffe_mode:
        img_size = img_size + 1
    return img_size
@@ -1253,9 +1265,9 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
 def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
    pool_conf.pool_type = pool.pool_type
    config_assert(pool.pool_type in [
-        'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
-    ], "pool-type %s is not in "
-                  "['max-projection', 'avg-projection', "
+        'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
+    ], "pool-type %s is not in " \
+              "['max-projection', 'avg-projection', 'max-pool-with-mask'," \
                  "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)

    pool_conf.channels = pool.channels
@@ -1376,6 +1388,12 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
    conv_conf.stride_y = conv.stride_y
    conv_conf.groups = conv.groups
    conv_conf.caffe_mode = conv.caffe_mode
+    if not conv.dilation:
+        conv.dilation = 1
+        conv.dilation_y = 1
+    else:
+        conv_conf.dilation = conv.dilation
+        conv_conf.dilation_y = conv.dilation_y

    if not trans:
        conv_conf.filter_channels = conv.channels / conv.groups
@@ -1383,20 +1401,20 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
            get_img_size(input_layer_name, conv.channels)
        conv_conf.output_x = cnn_output_size(
            conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
        conv_conf.output_y = cnn_output_size(
            conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
    else:
        conv_conf.filter_channels = num_filters / conv.groups
        conv_conf.output_x, conv_conf.output_y = \
            get_img_size(input_layer_name, conv.channels)
        conv_conf.img_size = cnn_image_size(
            conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
        conv_conf.img_size_y = cnn_image_size(
            conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)


 #caffe_mode: compute the output size using floor instead of ceil,

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -20,7 +20,7 @@ from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, BasePoolingType, \
+from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
    CudnnAvgPooling, CudnnMaxPooling
 from .attrs import *
 from .default_decorators import *
@@ -2571,7 +2571,9 @@ def img_conv_layer(input,

    if layer_type:
        if dilation > 1 or dilation_y > 1:
-            assert layer_type in ["cudnn_conv", "cudnn_convt"]
+            assert layer_type in [
+                "cudnn_conv", "cudnn_convt", "exconv", "exconvt"
+            ]
        if trans:
            assert layer_type in ["exconvt", "cudnn_convt"]
        else:
@@ -2699,9 +2701,9 @@ def img_pool_layer(input,
    elif isinstance(pool_type, AvgPooling):
        pool_type.name = 'avg'

-    assert type(pool_type) in [AvgPooling, MaxPooling, CudnnAvgPooling,
+    assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
                               CudnnMaxPooling], \
-        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling are supported"
+        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"

    type_name = pool_type.name + '-projection' \
        if (
@@ -3592,10 +3594,9 @@ def lstm_step_layer(input,
    :type gate_act: BaseActivation
    :param state_act: State Activation Type. TanhActivation is the default.
    :type state_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param bias_attr: The parameter attribute for bias. If this parameter is
+                     set to True or None, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | True
    :param layer_attr: layer's extra attribute.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
@@ -3650,9 +3651,10 @@ def gru_step_layer(input,
    :param name: The name of this layer. It is optional.
    :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
    :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
    :param param_attr: the parameter_attribute for transforming the output_mem
                       from previous step.
@@ -3712,9 +3714,10 @@ def gru_step_naive_layer(input,
    :type act: BaseActivation
    :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
    :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
    :param param_attr:
    :param layer_attr:
@@ -3844,9 +3847,10 @@ def recurrent_layer(input,
    :type input: LayerOutput
    :param act: Activation type. TanhActivation is the default.
    :type act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
    :param param_attr: parameter attribute.
    :type param_attr: ParameterAttribute
@@ -4836,9 +4840,10 @@ def tensor_layer(a,
    :type act: BaseActivation
    :param param_attr: The Parameter Attribute.
    :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
    :param layer_attr: Extra Layer config.
    :type layer_attr: ExtraLayerAttribute | None
@@ -4900,9 +4905,10 @@ def selective_fc_layer(input,
    :type act: BaseActivation
    :param param_attr: The Parameter Attribute.
    :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
    :param layer_attr: Extra Layer config.
    :type layer_attr: ExtraLayerAttribute | None
@@ -5585,10 +5591,10 @@ def nce_layer(input,
                             to the num_classes. Each member of the list defines
                             the probability of a class given input x.
    :type neg_distribution: list | tuple | collections.Sequence | None
-    :param bias_attr: The attribute for bias. If this parameter is set False or
-                      any object whose type is not ParameterAttribute, no bias
-                      is added. If this parameter is set True, the bias is
-                      initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
    :type bias_attr: ParameterAttribute | None | bool | Any
    :param layer_attr: Extra Layer Attribute.
    :type layer_attr: ExtraLayerAttribute
@@ -6498,9 +6504,9 @@ def gated_unit_layer(input,
    :param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
                            for details.
    :type gate_param_attr: ParameterAttribute
-    :param gate_bias_attr: The bias attribute of the gate. If the parameter is set to False or
+    :param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
                           an object whose type is not ParameterAttribute, no bias is defined.
-                           If the parameter is set to True, the bias is initialized to zero.
+                           If this parameter is set to True, the bias is initialized to zero.
    :type gate_bias_attr: ParameterAttribute | bool | None | Any
    :param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
                        details.
@@ -6508,9 +6514,9 @@ def gated_unit_layer(input,
    :param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
                              for details.
    :type inproj_param_attr: ParameterAttribute
-    :param inproj_bias_attr: The bias attribute of the projection. If the parameter is set to False
+    :param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
                             or an object whose type is not ParameterAttribute, no bias is defined.
-                             If the parameter is set to True, the bias is initialized to zero.
+                             If this parameter is set to True, the bias is initialized to zero.
    :type inproj_bias_attr: ParameterAttribute | bool | None | Any
    :param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
                       details.

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -681,34 +681,42 @@ def lstmemory_unit(input,
                                   state_act=TanhActivation())


-    :param input: input layer.
+    :param input: Input layer.
    :type input: LayerOutput
-    :param out_memory: output of previous time step
+    :param out_memory: The output of previous time step.
    :type out_memory: LayerOutput | None
-    :param name: lstmemory unit name.
+    :param name: The lstmemory unit name.
    :type name: basestring
-    :param size: lstmemory unit size.
+    :param size: The lstmemory unit size.
    :type size: int
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
    :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activation type of lstm.
    :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activation type of lstm.
    :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activation type of lstm.
    :type state_act: BaseActivation
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
    :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param lstm_layer_attr: extra attribute of lstm layer.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|bool|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
    :type lstm_layer_attr: ExtraLayerAttribute
-    :return: lstmemory unit name.
+    :return: The lstmemory unit name.
    :rtype: LayerOutput
    """
    if size is None:
@@ -786,34 +794,42 @@ def lstmemory_group(input,
                                    gate_act=SigmoidActivation(),
                                    state_act=TanhActivation())

-    :param input: input layer.
+    :param input: Input layer.
    :type input: LayerOutput
-    :param size: lstmemory group size.
+    :param size: The lstmemory group size.
    :type size: int
-    :param name: name of lstmemory group.
+    :param name: The name of lstmemory group.
    :type name: basestring
-    :param out_memory: output of previous time step.
+    :param out_memory: The output of previous time step.
    :type out_memory: LayerOutput | None
-    :param reverse: process the input in a reverse order or not.
+    :param reverse: Process the input in a reverse order or not.
    :type reverse: bool
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
    :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activation type of lstm.
    :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activation type of lstm.
    :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activation type of lstm.
    :type state_act: BaseActivation
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                           False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
    :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_layer_attr: lstm layer's extra attribute.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|bool|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
    :type lstm_layer_attr: ExtraLayerAttribute
    :return: the lstmemory group.
    :rtype: LayerOutput

--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
@@ -15,8 +15,8 @@
 """

 __all__ = [
-    "BasePoolingType", "MaxPooling", "AvgPooling", "CudnnMaxPooling",
-    "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
+    "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
+    "CudnnMaxPooling", "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
 ]


@@ -55,6 +55,19 @@ class MaxPooling(BasePoolingType):
        self.output_max_index = output_max_index


+class MaxWithMaskPooling(BasePoolingType):
+    """
+    MaxWithMask pooling.
+
+    Returns not only the maximum value for each dimension in sequence or time steps,
+    but also the location indices of the maximum values found.
+
+    """
+
+    def __init__(self):
+        BasePoolingType.__init__(self, "max-pool-with-mask")
+
+
 class CudnnMaxPooling(BasePoolingType):
    """
    Cudnn max pooling only support GPU. Return the maxinum value in the

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
@@ -28,6 +28,8 @@ layers {
      stride_y: 1
      output_y: 227
      img_size_y: 256
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -28,6 +28,8 @@ layers {
      stride_y: 1
      output_y: 227
      img_size_y: 256
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
@@ -28,6 +28,8 @@ layers {
      stride_y: 1
      output_y: 48
      img_size_y: 48
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
@@ -30,6 +30,8 @@ layers {
      stride_y: 1
      output_y: 48
      img_size_y: 48
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"
@@ -105,6 +107,8 @@ layers {
      stride_y: 1
      output_y: 24
      img_size_y: 24
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_1__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
@@ -30,6 +30,8 @@ layers {
      stride_y: 1
      output_y: 48
      img_size_y: 48
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
@@ -36,6 +36,8 @@ layers {
      stride_y: 1
      output_y: 14
      img_size_y: 14
+      dilation: 1
+      dilation_y: 1
    }
  }
  bias_parameter_name: "___conv_0__.wbias"

--- a/python/paddle/v2/framework/.gitignore
+++ b/python/paddle/v2/framework/.gitignore
--- a/python/paddle/v2/framework/__init__.py
+++ b/python/paddle/v2/framework/__init__.py
--- a/python/paddle/v2/framework/backward.py
+++ b/python/paddle/v2/framework/backward.py
-from paddle.v2.framework import framework as framework
+from paddle.v2.fluid import framework as framework

 __all__ = ['append_backward_ops']


--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/framework/default_scope_funcs.py
@@ -13,7 +13,7 @@ A `scoped_function` will take a `function` as input. That function will be
 invoked in a new local scope. 
 """

-import paddle.v2.framework.core
+import paddle.v2.fluid.core
 import threading

 __tl_scope__ = threading.local()
@@ -27,13 +27,13 @@ __all__ = [
 def get_cur_scope():
    """
    Get current scope.
-    :rtype: paddle.v2.framework.core.Scope
+    :rtype: paddle.v2.fluid.core.Scope
    """
    cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
    if cur_scope_stack is None:
        __tl_scope__.cur_scope = list()
    if len(__tl_scope__.cur_scope) == 0:
-        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope())
+        __tl_scope__.cur_scope.append(paddle.v2.fluid.core.Scope())
    return __tl_scope__.cur_scope[-1]



--- a/python/paddle/v2/framework/evaluator.py
+++ b/python/paddle/v2/framework/evaluator.py
-import paddle.v2.framework.op as op
+import paddle.v2.fluid.op as op
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core


 def avg_accumulate(accumulated_var, per_eval, num_batches, place):
@@ -22,7 +22,7 @@ class Evaluator(object):
        NOTE: default run on CPUPlace(), running on GPUPlace doesn't improve performance much.

        :param scope: the scope instance contains the input.
-        :type scope: paddle.v2.framework.core.scope
+        :type scope: paddle.v2.fluid.core.scope
        :param operator: operator name for caculating the evaluation for each mini-batch.
        :type operator: string
        :param input: output variable name of forward network.

--- a/python/paddle/v2/framework/executor.py
+++ b/python/paddle/v2/framework/executor.py
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Block, Program, g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import Block, Program, g_main_program

 g_scope = core.Scope()


--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/framework/framework.py
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 import collections
 import numpy as np
 import copy
@@ -285,7 +285,7 @@ class Operator(object):
        self.desc.check_attrs()
        no_kernel_op_set = {
            'feed', 'fetch', 'save', 'load', 'recurrent',
-            'rnn_memory_helper_grad', 'while'
+            'rnn_memory_helper_grad', 'conditional_block', 'while'
        }
        if type not in no_kernel_op_set:
            self.desc.infer_var_type(self.block.desc)

--- a/python/paddle/v2/framework/initializer.py
+++ b/python/paddle/v2/framework/initializer.py
-import paddle.v2.framework.framework as framework
+import paddle.v2.fluid.framework as framework
 import numpy as np

 __all__ = [

--- a/python/paddle/v2/framework/io.py
+++ b/python/paddle/v2/framework/io.py
 import os
 import cPickle as pickle

-from paddle.v2.framework.framework import Program, Parameter, g_main_program, \
+from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
    Variable

 __all__ = [

--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/framework/layer_helper.py
 import copy
 import itertools

-from paddle.v2.framework.framework import Variable, g_main_program, \
+from paddle.v2.fluid.framework import Variable, g_main_program, \
    g_startup_program, unique_name, Program
-from paddle.v2.framework.initializer import ConstantInitializer, \
+from paddle.v2.fluid.initializer import ConstantInitializer, \
    UniformInitializer, XavierInitializer



--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/framework/layers.py
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
-from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, \
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
+from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \
    Operator
-from paddle.v2.framework.initializer import ConstantInitializer, \
+from paddle.v2.fluid.initializer import ConstantInitializer, \
    NormalInitializer
-from paddle.v2.framework.layer_helper import LayerHelper, unique_name
+from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 import re
 import cStringIO

 __all__ = [
    'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
    'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim',
-    'batch_norm', 'accuracy'
+    'batch_norm', 'accuracy', 'split_lod_tensor'
 ]


@@ -226,6 +226,11 @@ def data(name,
        stop_gradient=stop_gradient)


+def create_tensor(dtype, name=None, main_program=None):  # Creates a variable of `dtype` through a 'create_tensor' LayerHelper.
+    helper = LayerHelper("create_tensor", **locals())
+    return helper.create_variable(name=helper.name, dtype=dtype)  # The variable's name is taken from helper.name; no op is appended here.
+
+
 def _convert_(name):
    """
    Formatting.
@@ -451,6 +456,56 @@ def sums(input, main_program=None, startup_program=None):
    return out


+def assign(input, output, main_program=None):  # Appends an op that writes `input` into `output`; returns `output`.
+    helper = LayerHelper('assign', **locals())
+    helper.append_op(
+        type='scale',  # Built on the 'scale' op with a factor of 1.0 (see attrs below).
+        inputs={'X': [input]},
+        outputs={'Out': [output]},
+        attrs={'scale': 1.0})
+    return output
+
+
+def split_lod_tensor(input,  # Appends a 'split_lod_tensor' op on `input` and returns its two outputs.
+                     mask,  # Fed to the op as 'Mask'.
+                     level,  # Passed through as the op's 'level' attribute.
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('split_lod_tensor', **locals())
+    out_true = helper.create_tmp_variable(dtype=input.data_type)  # Both outputs are declared with the input's data type.
+    out_false = helper.create_tmp_variable(dtype=input.data_type)
+    helper.append_op(
+        type='split_lod_tensor',
+        inputs={
+            'X': input,
+            'Mask': mask,
+        },
+        outputs={'OutTrue': out_true,
+                 'OutFalse': out_false},
+        attrs={'level': level})
+    return out_true, out_false  # (OutTrue, OutFalse); presumably the parts selected by true/false mask entries -- confirm against the op.
+
+
+def merge_lod_tensor(in_true,  # Appends a 'merge_lod_tensor' op and returns its single output; fed as 'InTrue'.
+                     in_false,  # Fed to the op as 'InFalse'.
+                     x,  # Fed to the op as 'X'; also supplies the output's data type.
+                     mask,  # Fed to the op as 'Mask'.
+                     level,  # Passed through as the op's 'level' attribute.
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('merge_lod_tensor', **locals())
+    out = helper.create_tmp_variable(dtype=x.data_type)
+    helper.append_op(
+        type='merge_lod_tensor',
+        inputs={'X': x,
+                'Mask': mask,
+                'InTrue': in_true,
+                'InFalse': in_false},
+        outputs={'Out': out},
+        attrs={'level': level})
+    return out  # NOTE(review): looks like the inverse of split_lod_tensor -- confirm against the op.
+
+
 def cos_sim(X, Y, **kwargs):
    """
    This function performs the cosine similarity between two tensors
@@ -784,6 +839,23 @@ def batch_norm(input,
    return helper.append_activation(batch_norm_out)


+def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+    """
+    Append a `beam_search_decode` operator that reads `ids` and `scores`
+    and writes two freshly created temporary variables.
+
+    Args:
+        ids: variable passed as the operator's `Ids` input.
+        scores: variable passed as the operator's `Scores` input.
+        main_program: optional program handed to the LayerHelper.
+        startup_program: optional program handed to the LayerHelper.
+
+    Returns:
+        A tuple `(sentence_ids, sentence_scores)` holding the variables
+        bound to the operator's `SentenceIds` and `SentenceScores` outputs.
+    """
+    helper = LayerHelper('beam_search_decode', **locals())
+    sentence_ids = helper.create_tmp_variable(dtype=ids.data_type)
+    # The scores output takes its dtype from `scores`, not from `ids`;
+    # otherwise it would be declared with the ids' dtype.
+    sentence_scores = helper.create_tmp_variable(dtype=scores.data_type)
+
+    helper.append_op(
+        type="beam_search_decode",
+        inputs={"Ids": ids,
+                "Scores": scores},
+        outputs={
+            "SentenceIds": sentence_ids,
+            "SentenceScores": sentence_scores
+        })
+
+    return sentence_ids, sentence_scores
+
+
 class BlockGuard(object):
    """
    BlockGuard class.
@@ -1375,3 +1447,73 @@ def array_length(array, main_program=None):
    helper.append_op(
        type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
    return tmp
+
+
+class ConditionalBlockGuard(BlockGuard):
+    """
+    BlockGuard for a ConditionalBlock.
+
+    On exit it calls `block.complete()` and then delegates to BlockGuard's
+    own `__exit__`.
+    """
+
+    def __init__(self, block):
+        """
+        Args:
+            block: the ConditionalBlock to guard.
+
+        Raises:
+            TypeError: if `block` is not a ConditionalBlock.
+        """
+        if not isinstance(block, ConditionalBlock):
+            raise TypeError("block should be conditional block")
+        program = block.helper.main_program
+        super(ConditionalBlockGuard, self).__init__(program)
+        self.block = block
+
+    def __enter__(self):
+        guard_base = super(ConditionalBlockGuard, self)
+        return guard_base.__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # complete() runs before the base guard's __exit__; it reads the
+        # program's current block as the conditional sub-block.
+        self.block.complete()
+        guard_base = super(ConditionalBlockGuard, self)
+        return guard_base.__exit__(exc_type, exc_val, exc_tb)
+
+
+class ConditionalBlock(object):
+    """
+    Builds a `conditional_block` operator around a sub-block of ops.
+
+    Ops are added inside `with cond.block():`; when the guard exits,
+    `complete()` appends the `conditional_block` operator to the parent
+    block.
+    """
+
+    def __init__(self, inputs, name=None, main_program=None):
+        """
+        Args:
+            inputs: iterable of Variable objects, passed as the operator's
+                `X` input.
+            name: optional name handed to the LayerHelper.
+            main_program: optional program handed to the LayerHelper.
+
+        Raises:
+            TypeError: if any element of `inputs` is not a Variable.
+        """
+        for each_input in inputs:
+            if not isinstance(each_input, Variable):
+                raise TypeError("Each input should be variable")
+        self.inputs = inputs
+        self.helper = LayerHelper(
+            'conditional_block', name=name, main_program=main_program)
+
+    def block(self):
+        """Return a ConditionalBlockGuard bound to this block."""
+        return ConditionalBlockGuard(self)
+
+    def complete(self):
+        """
+        Append the `conditional_block` operator to the parent block.
+
+        The program's current block is taken as the sub-block. Its ops are
+        scanned to find which names it reads before writing (parameters)
+        and which names it writes (intermediates).
+        """
+        inside_block = self.helper.main_program.current_block()
+        parent_block = self.helper.main_program.block(inside_block.parent_idx)
+
+        intermediate = set()
+        params = set()
+
+        for each_op in inside_block.ops:
+            assert isinstance(each_op, Operator)
+            # A name read before any op in the sub-block has written it
+            # must come from outside the sub-block.
+            for iname in each_op.input_names:
+                for in_var_name in each_op.input(iname):
+                    if in_var_name not in intermediate:
+                        params.add(in_var_name)
+
+            for oname in each_op.output_names:
+                for out_var_name in each_op.output(oname):
+                    intermediate.add(out_var_name)
+        input_set = set([ipt.name for ipt in self.inputs])
+
+        # Names already passed through `X` are not repeated in `Params`.
+        param_list = [
+            parent_block.var(each_name) for each_name in params
+            if each_name not in input_set
+        ]
+
+        # The operator's outputs are the parent-block variables that the
+        # sub-block writes. Names written only inside the sub-block are not
+        # in `parent_block.vars` and are therefore left out.
+        out_list = [
+            parent_block.var(var_name) for var_name in parent_block.vars
+            if var_name in intermediate
+        ]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+        parent_block.append_op(
+            type='conditional_block',
+            inputs={
+                'X': self.inputs,
+                'Params': param_list,
+            },
+            outputs={'Out': out_list,
+                     'Scope': [step_scope]},
+            attrs={'block': inside_block})
--- a/python/paddle/v2/framework/net_drawer.py
+++ b/python/paddle/v2/framework/net_drawer.py
@@ -3,8 +3,8 @@ import json
 import logging
 from collections import defaultdict

-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2

 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)

--- a/python/paddle/v2/framework/nets.py
+++ b/python/paddle/v2/framework/nets.py
-import paddle.v2.framework.layers as layers
+import paddle.v2.fluid.layers as layers

 __all__ = ["simple_img_conv_pool", "sequence_conv_pool"]


--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2


 def get_all_op_protos():

--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/framework/optimizer.py
--- a/python/paddle/v2/framework/regularizer.py
+++ b/python/paddle/v2/framework/regularizer.py
-import paddle.v2.framework.framework as framework
+import paddle.v2.fluid.framework as framework

 __all__ = [
    'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer'

--- a/python/paddle/v2/framework/tests/.gitignore
+++ b/python/paddle/v2/framework/tests/.gitignore
--- a/python/paddle/v2/fluid/tests/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
+# Collect every test_*.py in this directory, as paths relative to it.
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+# Remove ".py" so each entry can serve as the test name below.
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+# Pass each collected name to py_test together with its source file.
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
+
+# Also process the book subdirectory.
+add_subdirectory(book)
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
--- a/python/paddle/v2/framework/tests/test_fit_a_line.py
+++ b/python/paddle/v2/framework/tests/test_fit_a_line.py
--- a/python/paddle/v2/framework/tests/test_image_classification_train.py
+++ b/python/paddle/v2/framework/tests/test_image_classification_train.py
--- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
--- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
--- a/python/paddle/v2/framework/tests/test_recommender_system.py
+++ b/python/paddle/v2/framework/tests/test_recommender_system.py
--- a/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
--- a/python/paddle/v2/framework/tests/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/framework/tests/test_understand_sentiment_dynamic_lstm.py
--- a/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
--- a/python/paddle/v2/framework/tests/test_word2vec.py
+++ b/python/paddle/v2/framework/tests/test_word2vec.py
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
--- a/python/paddle/v2/framework/tests/test_accuracy_op.py
+++ b/python/paddle/v2/framework/tests/test_accuracy_op.py
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
--- a/python/paddle/v2/framework/tests/test_adadelta_op.py
+++ b/python/paddle/v2/framework/tests/test_adadelta_op.py
--- a/python/paddle/v2/framework/tests/test_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_adam_op.py
+++ b/python/paddle/v2/framework/tests/test_adam_op.py
--- a/python/paddle/v2/framework/tests/test_adamax_op.py
+++ b/python/paddle/v2/framework/tests/test_adamax_op.py
--- a/python/paddle/v2/framework/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/framework/tests/test_array_read_write_op.py
--- a/python/paddle/v2/fluid/tests/test_assign_op.py
+++ b/python/paddle/v2/fluid/tests/test_assign_op.py
--- a/python/paddle/v2/framework/tests/test_auc_op.py
+++ b/python/paddle/v2/framework/tests/test_auc_op.py
--- a/python/paddle/v2/framework/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_batch_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
+++ b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
--- a/python/paddle/v2/framework/tests/test_cast_op.py
+++ b/python/paddle/v2/framework/tests/test_cast_op.py
--- a/python/paddle/v2/framework/tests/test_chunk_eval_op.py
+++ b/python/paddle/v2/framework/tests/test_chunk_eval_op.py
--- a/python/paddle/v2/framework/tests/test_clip_by_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_clip_by_norm_op.py
--- a/python/paddle/v2/framework/tests/test_clip_op.py
+++ b/python/paddle/v2/framework/tests/test_clip_op.py
--- a/python/paddle/v2/framework/tests/test_compare_op.py
+++ b/python/paddle/v2/framework/tests/test_compare_op.py
--- a/python/paddle/v2/framework/tests/test_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_concat_op.py
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
--- a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
--- a/python/paddle/v2/framework/tests/test_conv3d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv3d_op.py
--- a/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
--- a/python/paddle/v2/framework/tests/test_conv_shift_op.py
+++ b/python/paddle/v2/framework/tests/test_conv_shift_op.py
--- a/python/paddle/v2/framework/tests/test_cos_sim_op.py
+++ b/python/paddle/v2/framework/tests/test_cos_sim_op.py
--- a/python/paddle/v2/framework/tests/test_create_op_doc_string.py
+++ b/python/paddle/v2/framework/tests/test_create_op_doc_string.py
--- a/python/paddle/v2/framework/tests/test_crf_decoding_op.py
+++ b/python/paddle/v2/framework/tests/test_crf_decoding_op.py
--- a/python/paddle/v2/framework/tests/test_crop_op.py
+++ b/python/paddle/v2/framework/tests/test_crop_op.py
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
--- a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
--- a/python/paddle/v2/framework/tests/test_dropout_op.py
+++ b/python/paddle/v2/framework/tests/test_dropout_op.py
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_div_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_div_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
--- a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_sub_op.py
--- a/python/paddle/v2/framework/tests/test_evaluator.py
+++ b/python/paddle/v2/framework/tests/test_evaluator.py
--- a/python/paddle/v2/framework/tests/test_exception.py
+++ b/python/paddle/v2/framework/tests/test_exception.py
--- a/python/paddle/v2/framework/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/framework/tests/test_executor_and_mul.py
--- a/python/paddle/v2/framework/tests/test_expand_op.py
+++ b/python/paddle/v2/framework/tests/test_expand_op.py
--- a/python/paddle/v2/framework/tests/test_feed_fetch_method.py
+++ b/python/paddle/v2/framework/tests/test_feed_fetch_method.py
--- a/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
--- a/python/paddle/v2/framework/tests/test_fill_constant_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_constant_op.py
--- a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
+++ b/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
--- a/python/paddle/v2/framework/tests/test_framework_debug_str.py
+++ b/python/paddle/v2/framework/tests/test_framework_debug_str.py
--- a/python/paddle/v2/framework/tests/test_gather_op.py
+++ b/python/paddle/v2/framework/tests/test_gather_op.py
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
--- a/python/paddle/v2/framework/tests/test_gru_op.py
+++ b/python/paddle/v2/framework/tests/test_gru_op.py
--- a/python/paddle/v2/framework/tests/test_gru_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_gru_unit_op.py
--- a/python/paddle/v2/framework/tests/test_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_huber_loss_op.py
--- a/python/paddle/v2/framework/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/framework/tests/test_image_classification_layer.py
--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
--- a/python/paddle/v2/framework/tests/test_inference_model_io.py
+++ b/python/paddle/v2/framework/tests/test_inference_model_io.py
--- a/python/paddle/v2/framework/tests/test_initializer.py
+++ b/python/paddle/v2/framework/tests/test_initializer.py
--- a/python/paddle/v2/framework/tests/test_l1_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_l1_norm_op.py
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ b/python/paddle/v2/framework/tests/test_layers.py
--- a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
+++ b/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
--- a/python/paddle/v2/framework/tests/test_lod_array_length_op.py
+++ b/python/paddle/v2/framework/tests/test_lod_array_length_op.py
--- a/python/paddle/v2/framework/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/framework/tests/test_lod_rank_table.py
--- a/python/paddle/v2/fluid/tests/test_lod_reset_op.py
+++ b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
--- a/python/paddle/v2/framework/tests/test_lod_tensor_array.py
+++ b/python/paddle/v2/framework/tests/test_lod_tensor_array.py
--- a/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
--- a/python/paddle/v2/framework/tests/test_lookup_table_op.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table_op.py
--- a/python/paddle/v2/framework/tests/test_lrn_op.py
+++ b/python/paddle/v2/framework/tests/test_lrn_op.py
--- a/python/paddle/v2/framework/tests/test_lstm_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_op.py
--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
--- a/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
--- a/python/paddle/v2/framework/tests/test_matmul_op.py
+++ b/python/paddle/v2/framework/tests/test_matmul_op.py
--- a/python/paddle/v2/framework/tests/test_mean_op.py
+++ b/python/paddle/v2/framework/tests/test_mean_op.py
--- a/python/paddle/v2/framework/tests/test_minus_op.py
+++ b/python/paddle/v2/framework/tests/test_minus_op.py
--- a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
--- a/python/paddle/v2/framework/tests/test_momentum_op.py
+++ b/python/paddle/v2/framework/tests/test_momentum_op.py
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
--- a/python/paddle/v2/framework/tests/test_multiplex_op.py
+++ b/python/paddle/v2/framework/tests/test_multiplex_op.py
--- a/python/paddle/v2/framework/tests/test_nccl_init_op.py
+++ b/python/paddle/v2/framework/tests/test_nccl_init_op.py
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
--- a/python/paddle/v2/framework/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/framework/tests/test_op_support_gpu.py
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/framework/tests/test_operator.py
--- a/python/paddle/v2/framework/tests/test_operator_desc.py
+++ b/python/paddle/v2/framework/tests/test_operator_desc.py
--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/framework/tests/test_optimizer.py
--- a/python/paddle/v2/framework/tests/test_pad_op.py
+++ b/python/paddle/v2/framework/tests/test_pad_op.py
--- a/python/paddle/v2/framework/tests/test_parameter.py
+++ b/python/paddle/v2/framework/tests/test_parameter.py
--- a/python/paddle/v2/framework/tests/test_pool2d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
--- a/python/paddle/v2/framework/tests/test_pool3d_op.py
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
--- a/python/paddle/v2/framework/tests/test_pool_max_op.py
+++ b/python/paddle/v2/framework/tests/test_pool_max_op.py
--- a/python/paddle/v2/framework/tests/test_positive_negative_pair_op.py
+++ b/python/paddle/v2/framework/tests/test_positive_negative_pair_op.py
--- a/python/paddle/v2/framework/tests/test_precision_recall_op.py
+++ b/python/paddle/v2/framework/tests/test_precision_recall_op.py
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/framework/tests/test_program.py
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/framework/tests/test_protobuf.py
--- a/python/paddle/v2/framework/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py
--- a/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
+++ b/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
--- a/python/paddle/v2/framework/tests/test_proximal_gd_op.py
+++ b/python/paddle/v2/framework/tests/test_proximal_gd_op.py
--- a/python/paddle/v2/framework/tests/test_rank_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_rank_loss_op.py
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
--- a/python/paddle/v2/framework/tests/test_reduce_op.py
+++ b/python/paddle/v2/framework/tests/test_reduce_op.py
--- a/python/paddle/v2/framework/tests/test_regularizer.py
+++ b/python/paddle/v2/framework/tests/test_regularizer.py
--- a/python/paddle/v2/framework/tests/test_reshape_op.py
+++ b/python/paddle/v2/framework/tests/test_reshape_op.py
--- a/python/paddle/v2/framework/tests/test_rmsprop_op.py
+++ b/python/paddle/v2/framework/tests/test_rmsprop_op.py
--- a/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
--- a/python/paddle/v2/framework/tests/test_scale_op.py
+++ b/python/paddle/v2/framework/tests/test_scale_op.py
--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
--- a/python/paddle/v2/framework/tests/test_selected_rows.py
+++ b/python/paddle/v2/framework/tests/test_selected_rows.py
--- a/python/paddle/v2/framework/tests/test_seq_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_seq_concat_op.py
--- a/python/paddle/v2/framework/tests/test_seq_conv.py
+++ b/python/paddle/v2/framework/tests/test_seq_conv.py
--- a/python/paddle/v2/framework/tests/test_seq_expand.py
+++ b/python/paddle/v2/framework/tests/test_seq_expand.py
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
--- a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
--- a/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
--- a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
--- a/python/paddle/v2/framework/tests/test_sign_op.py
+++ b/python/paddle/v2/framework/tests/test_sign_op.py
--- a/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
+++ b/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
--- a/python/paddle/v2/framework/tests/test_split_op.py
+++ b/python/paddle/v2/framework/tests/test_split_op.py
--- a/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py
+++ b/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py
--- a/python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
+++ b/python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
--- a/python/paddle/v2/framework/tests/test_sum_op.py
+++ b/python/paddle/v2/framework/tests/test_sum_op.py
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
--- a/python/paddle/v2/framework/tests/test_tensor_array.py
+++ b/python/paddle/v2/framework/tests/test_tensor_array.py
--- a/python/paddle/v2/framework/tests/test_top_k_op.py
+++ b/python/paddle/v2/framework/tests/test_top_k_op.py
--- a/python/paddle/v2/framework/tests/test_transpose_op.py
+++ b/python/paddle/v2/framework/tests/test_transpose_op.py
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
--- a/python/paddle/v2/framework/tests/test_variable.py
+++ b/python/paddle/v2/framework/tests/test_variable.py
--- a/python/paddle/v2/framework/tests/test_while_op.py
+++ b/python/paddle/v2/framework/tests/test_while_op.py
--- a/python/paddle/v2/framework/tests/test_beam_search_decode_op.py
+++ b/python/paddle/v2/framework/tests/test_beam_search_decode_op.py
--- a/python/setup.py.in
+++ b/python/setup.py.in