Commit c1b23535 authored by xzl

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_prelu

......@@ -21,7 +21,7 @@ third_party/
cmake-build-*
# generated while compiling
python/paddle/v2/framework/core.so
python/paddle/v2/fluid/core.so
paddle/pybind/pybind.h
CMakeFiles
cmake_install.cmake
......
......@@ -18,7 +18,7 @@ limitations under the License. */
#include "hl_base.h"
/**
* @brief Maximum pool forward.
* @brief Maximum pool forward with Mask output.
*
* @param[in] frameCnt batch size of input image.
* @param[in] inputData input data.
......@@ -35,7 +35,7 @@ limitations under the License. */
* @param[in] paddingW padding width.
* @param[out] tgtData output data.
* @param[in] tgtStride stride between output data samples.
*
* @param[out] maskData the location indices of the selected max data.
*/
extern void hl_maxpool_forward(const int frameCnt,
const real* inputData,
......@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt,
const int paddingH,
const int paddingW,
real* tgtData,
const int tgtStride);
const int tgtStride,
real* maskData = NULL);
/**
* @brief Maximum pool backward.
......
......@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt,
const int paddingH,
const int paddingW,
real* tgtData,
const int tgtStride) {}
const int tgtStride,
real* maskData) {}
inline void hl_maxpool_backward(const int frameCnt,
const real* inputData,
......
......@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads,
const int offsetH,
const int offsetW,
real* tgtData,
const int tgtStride) {
const int tgtStride,
real* maskData) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) {
int pw = index % pooledW;
......@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads,
hstart = max(hstart, 0);
wstart = max(wstart, 0);
real maxval = -FLT_MAX;
int max_index = -1;
inputData += (frameNum * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
if (maxval < inputData[h * width + w])
maxval = inputData[h * width + w];
if (maxval < inputData[h * width + w]) {
max_index = h * width + w;
maxval = inputData[max_index];
}
}
}
int tgtIndex =
index % (pooledW * pooledH * channels) + frameNum * tgtStride;
tgtData[tgtIndex] = maxval;
if (maskData != NULL) {
maskData[tgtIndex] = max_index;
}
}
}
......@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt,
const int paddingH,
const int paddingW,
real* tgtData,
const int tgtStride) {
const int tgtStride,
real* maskData) {
int num_kernels = pooledH * pooledW * channels * frameCnt;
int blocks = (num_kernels + 1024 - 1) / 1024;
dim3 threads(1024, 1);
......@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt,
paddingH,
paddingW,
tgtData,
tgtStride);
tgtStride,
maskData);
CHECK_SYNC("hl_maxpool_forward failed");
}
......
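For reference, a self-contained CPU sketch (not part of this commit; all names are illustrative, and padding is omitted for brevity) of the semantics the new maskData output adds: for every pooling window, the flat input index h * width + w of the chosen maximum is stored alongside the pooled value, just as KeMaxPoolForward records max_index above.

#include <cfloat>
#include <cstddef>

void maxPoolWithMaskRef(const float* in, int height, int width,
                        int sizeY, int sizeX, int strideH, int strideW,
                        int pooledH, int pooledW, float* out, float* mask) {
  for (int ph = 0; ph < pooledH; ++ph) {
    for (int pw = 0; pw < pooledW; ++pw) {
      float maxval = -FLT_MAX;
      int maxIdx = -1;
      for (int h = ph * strideH; h < ph * strideH + sizeY && h < height; ++h) {
        for (int w = pw * strideW; w < pw * strideW + sizeX && w < width; ++w) {
          if (in[h * width + w] > maxval) {
            maxval = in[h * width + w];
            maxIdx = h * width + w;  // flat location index of the max
          }
        }
      }
      out[ph * pooledW + pw] = maxval;
      // The mask output is optional, mirroring maskData = NULL above.
      if (mask != NULL) mask[ph * pooledW + pw] = (float)maxIdx;
    }
  }
}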
......@@ -377,6 +377,12 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
return grad_op_descs;
}
static BlockDescBind* CreateStepBlock(
ProgramDescBind& program_desc,
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var,
int step_block_idx);
std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx,
std::unordered_set<std::string>* no_grad_vars,
......@@ -392,13 +398,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
if ((*it)->Type() == "recurrent") {
int step_block_idx = (*it)->GetBlockAttr("step_block");
auto backward_block_op_descs = MakeBlockBackward(
program_desc, step_block_idx, no_grad_vars, grad_to_var);
BlockDescBind* backward_block = CreateStepBlock(
program_desc, no_grad_vars, grad_to_var, step_block_idx);
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
} else if ((*it)->Type() == "conditional_block") {
BlockDescBind* backward_block =
program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
for (auto& ptr : backward_block_op_descs) {
backward_block->AppendAllocatedOp(std::move(ptr));
}
CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
(*it)->GetBlockAttr("block"));
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
} else {
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
......@@ -449,6 +455,21 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
return backward_descs;
}
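// CreateStepBlock (defined below, forward-declared above) recursively runs
// MakeBlockBackward on a step block, appends the resulting gradient ops to a
// newly appended block of the program, and returns that block. It factors out
// the logic shared by the "recurrent" and "conditional_block" branches.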
static BlockDescBind* CreateStepBlock(
ProgramDescBind& program_desc,
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var,
int step_block_idx) {
auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
no_grad_vars, grad_to_var);
BlockDescBind* backward_block =
program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
for (auto& ptr : backward_block_op_descs) {
backward_block->AppendAllocatedOp(std::move(ptr));
}
return backward_block;
}
ParamGradInfoMap AppendBackward(
ProgramDescBind& program_desc, const VarDescBind& target,
const std::unordered_set<std::string>& no_grad_vars) {
......
......@@ -27,10 +27,32 @@ inline VarDesc::VarType ToVarType(std::type_index type) {
return VarDesc_VarType_LOD_RANK_TABLE;
} else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
return VarDesc_VarType_LOD_TENSOR_ARRAY;
} else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
return VarDesc_VarType_SELECTED_ROWS;
} else {
PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
}
}
template <typename Visitor>
inline void VisitVarType(const Variable& var, Visitor visitor) {
switch (ToVarType(var.Type())) {
case VarDesc_VarType_LOD_TENSOR:
visitor(var.Get<framework::LoDTensor>());
return;
case VarDesc_VarType_LOD_RANK_TABLE:
visitor(var.Get<LoDRankTable>());
return;
case VarDesc_VarType_LOD_TENSOR_ARRAY:
visitor(var.Get<LoDTensorArray>());
return;
case VarDesc_VarType_SELECTED_ROWS:
visitor(var.Get<SelectedRows>());
return;
default:
PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
}
}
} // namespace framework
} // namespace paddle
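A minimal usage sketch for VisitVarType (the visitor below is hypothetical, modeled on the AssignFunctor added later in this commit):

#include <glog/logging.h>
#include "paddle/framework/var_type.h"

struct TypeNameVisitor {
  void operator()(const paddle::framework::LoDTensor&) const {
    VLOG(3) << "variable holds a LoDTensor";
  }
  void operator()(const paddle::framework::SelectedRows&) const {
    VLOG(3) << "variable holds SelectedRows";
  }
  template <typename T>  // fallback for LoDRankTable and LoDTensorArray
  void operator()(const T&) const {
    VLOG(3) << "variable holds another supported type";
  }
};

// usage: paddle::framework::VisitVarType(var, TypeNameVisitor());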
......@@ -61,6 +61,7 @@ public:
// function arguments
strides_ = config.get<std::vector<size_t>>("strides");
paddings_ = config.get<std::vector<size_t>>("paddings");
dilations_ = config.get<std::vector<size_t>>("dilations");
groups_ = config.get<size_t>("groups");
// number of inputs and outputs
......@@ -118,6 +119,7 @@ protected:
std::vector<size_t> strides_;
std::vector<size_t> paddings_;
std::vector<size_t> dilations_;
/// Group size, refer to grouped convolution in
/// Alex Krizhevsky's paper: when group=2, the first half of the
......@@ -133,6 +135,10 @@ protected:
inline int paddingW() const { return paddings_[1]; }
inline int dilationH() const { return dilations_[0]; }
inline int dilationW() const { return dilations_[1]; }
// A temporary memory in convolution calculation.
MemoryHandlePtr memory_;
......
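For context (a hedged restatement, not code from the diff): a filter of size k with dilation d spans (k - 1) * d + 1 input pixels, which is the arithmetic behind the outputSize expressions in the tests below.

#include <cstddef>

inline size_t convOutputSize(size_t inputSize, size_t filterSize,
                             size_t padding, size_t stride, size_t dilation) {
  size_t effectiveFilter = (filterSize - 1) * dilation + 1;
  return (inputSize - effectiveFilter + 2 * padding) / stride + 1;
}
// e.g. convOutputSize(32, 3, 1, 1, 2) == 30: with dilation 2, a 3x3 filter
// covers the same extent as a 5x5 filter.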
......@@ -79,45 +79,59 @@ void Convolution(const std::string& conv1,
if (outputChannels < inputChannels) continue;
for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) {
if (padding >= filterSize) break;
for (size_t dilation : {1, 3}) {
if (padding >= filterSize) break;
size_t filterS = (filterSize - 1) * dilation + 1;
// NNPACK only supports stride = 1 if batchSize > 1
if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
batchSize > 1 && stride > 1)
break;
if (inputSize + 2 * padding < filterS) break;
size_t outputSize =
(inputSize - filterSize + 2 * padding + stride) / stride;
VLOG(3) << " batchSize=" << batchSize
<< " inputChannels=" << inputChannels
<< " inputHeight=" << inputSize
<< " inputWidth=" << inputSize
<< " outputChannels=" << outputChannels
<< " filterHeight=" << filterSize
<< " filterWidth=" << filterSize
<< " outputHeight=" << outputSize
<< " outputWidth=" << outputSize << " stride=" << stride
<< " padding=" << padding;
if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
conv1 == "NNPACKConv-CPU" ||
conv2 == "NNPACKConv-CPU") &&
dilation > 1)
break;
std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride};
Compare2Function<DType1, DType2> test(
conv1,
conv2,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("algo", (std::string) "auto"));
// NNPACK only supports stride = 1 if batchSize > 1
if ((conv1 == "NNPACKConv-CPU" ||
conv2 == "NNPACKConv-CPU") &&
batchSize > 1 && stride > 1)
break;
TensorShape input{
batchSize, inputChannels, inputSize, inputSize};
TensorShape filter{
outputChannels, inputChannels, filterSize, filterSize};
TensorShape output{
batchSize, outputChannels, outputSize, outputSize};
size_t outputSize =
(inputSize - filterS + 2 * padding + stride) / stride;
VLOG(3) << " batchSize=" << batchSize
<< " inputChannels=" << inputChannels
<< " inputHeight=" << inputSize
<< " inputWidth=" << inputSize
<< " outputChannels=" << outputChannels
<< " filterHeight=" << filterSize
<< " filterWidth=" << filterSize
<< " outputHeight=" << outputSize
<< " outputWidth=" << outputSize
<< " stride=" << stride << " padding=" << padding;
function(test, input, filter, output);
std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride};
std::vector<size_t> dilations = {dilation, dilation};
Compare2Function<DType1, DType2> test(
conv1,
conv2,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)1)
.set("algo", (std::string) "auto"));
TensorShape input{
batchSize, inputChannels, inputSize, inputSize};
TensorShape filter{
outputChannels, inputChannels, filterSize, filterSize};
TensorShape output{
batchSize, outputChannels, outputSize, outputSize};
function(test, input, filter, output);
}
}
}
}
......@@ -144,6 +158,7 @@ void Convolution2(const std::string& conv1,
for (size_t outputChannels : {7}) {
size_t stride = 1;
size_t padding = 0;
size_t dilation = 1;
size_t outputHeight =
(inputHeight - filterHeight + 2 * padding + stride) /
stride;
......@@ -162,6 +177,7 @@ void Convolution2(const std::string& conv1,
std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride};
std::vector<size_t> dilations = {dilation, dilation};
Compare2Function<DType1, DType2> test(
conv1,
conv2,
......@@ -169,6 +185,7 @@ void Convolution2(const std::string& conv1,
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("dilations", dilations)
.set("algo", (std::string) "auto"));
TensorShape input{
......@@ -223,6 +240,7 @@ void DepthwiseConvolution(const std::string& conv1,
std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride};
std::vector<size_t> dilations = {1, 1};
size_t groups = inputChannels;
Compare2Function<DType1, DType2> test(
conv1,
......@@ -231,6 +249,7 @@ void DepthwiseConvolution(const std::string& conv1,
.set("paddings", paddings)
.set("strides", strides)
.set("groups", groups)
.set("dilations", dilations)
.set("algo", (std::string) "auto"));
TensorShape input{
......
......@@ -100,7 +100,9 @@ public:
strideH(),
strideW(),
paddingH(),
paddingW());
paddingW(),
dilationH(),
dilationW());
} else {
colData = inputData + g * inputOffset;
}
......@@ -223,7 +225,9 @@ public:
strideH(),
strideW(),
paddingH(),
paddingW());
paddingW(),
dilationH(),
dilationW());
}
}
inputGrad += inputChannels * inputHeight * inputWidth;
......@@ -310,7 +314,9 @@ public:
strideH(),
strideW(),
paddingH(),
paddingW());
paddingW(),
dilationH(),
dilationW());
} else {
colData = inputData + g * inputOffset;
}
......
......@@ -78,7 +78,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth);
int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1);
};
template <ColFormat Format, DeviceType Device, class T>
......@@ -91,7 +93,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth);
int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1);
};
} // namespace paddle
......@@ -31,7 +31,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth) {
int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0];
int inputHeight = imShape[1];
int inputWidth = imShape[2];
......@@ -47,8 +49,8 @@ public:
int c_im = c / filterWidth / filterHeight;
for (int h = 0; h < outputHeight; ++h) {
for (int w = 0; w < outputWidth; ++w) {
int imRowIdx = h * strideHeight + hOffset;
int imColIdx = w * strideWidth + wOffset;
int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 ||
......@@ -81,7 +83,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth) {
int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0];
int inputHeight = imShape[1];
int inputWidth = imShape[2];
......@@ -97,8 +101,8 @@ public:
int c_im = c / filterWidth / filterHeight;
for (int h = 0; h < outputHeight; ++h) {
for (int w = 0; w < outputWidth; ++w) {
int imRowIdx = h * strideHeight + hOffset;
int imColIdx = w * strideWidth + wOffset;
int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) >= 0 &&
(imRowIdx - paddingHeight) < inputHeight &&
(imColIdx - paddingWidth) >= 0 &&
......@@ -134,7 +138,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth) {
int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1) {
int inputChannels = imShape[0];
int inputHeight = imShape[1];
int inputWidth = imShape[2];
......@@ -147,9 +153,10 @@ public:
for (int channel = 0; channel < inputChannels; ++channel) {
for (int filterH = 0; filterH < filterHeight; ++filterH) {
for (int filterW = 0; filterW < filterWidth; ++filterW) {
int imRowOffset =
outputH * strideHeight + filterH - paddingHeight;
int imColOffset = outputW * strideWidth + filterW - paddingWidth;
int imRowOffset = outputH * strideHeight +
filterH * dilationHeight - paddingHeight;
int imColOffset = outputW * strideWidth +
filterW * dilationWidth - paddingWidth;
int colDataOffset =
(((outputH * outputWidth + outputW) * inputChannels +
channel) *
......@@ -189,7 +196,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth) {
int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1) {
int inputChannels = imShape[0];
int inputHeight = imShape[1];
int inputWidth = imShape[2];
......@@ -202,9 +211,10 @@ public:
for (int channel = 0; channel < inputChannels; ++channel) {
for (int filterH = 0; filterH < filterHeight; ++filterH) {
for (int filterW = 0; filterW < filterWidth; ++filterW) {
int imRowOffset =
outputH * strideHeight + filterH - paddingHeight;
int imColOffset = outputW * strideWidth + filterW - paddingWidth;
int imRowOffset = outputH * strideHeight +
filterH * dilationHeight - paddingHeight;
int imColOffset = outputW * strideWidth +
filterW * dilationWidth - paddingWidth;
int colDataOffset =
(((outputH * outputWidth + outputW) * inputChannels +
channel) *
......
......@@ -28,6 +28,8 @@ __global__ void im2col(const T* data_im,
int strideW,
int paddingH,
int paddingW,
int dilationH,
int dilationW,
int height_col,
int width_col,
T* data_col) {
......@@ -44,8 +46,8 @@ __global__ void im2col(const T* data_im,
data_col += (channel_out * height_col + h_out) * width_col + w_out;
for (int i = 0; i < blockH; ++i) {
for (int j = 0; j < blockW; ++j) {
int rIdx = int(h_in + i);
int cIdx = int(w_in + j);
int rIdx = int(h_in + i * dilationH);
int cIdx = int(w_in + j * dilationW);
if ((rIdx - (int)paddingH) >= (int)height ||
(rIdx - (int)paddingH) < 0 ||
(cIdx - (int)paddingW) >= (int)width ||
......@@ -77,7 +79,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth) {
int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0];
int inputHeight = imShape[1];
int inputWidth = imShape[2];
......@@ -102,6 +106,8 @@ public:
strideWidth,
paddingHeight,
paddingWidth,
dilationHeight,
dilationWidth,
outputHeight,
outputWidth,
colData);
......@@ -121,6 +127,8 @@ __global__ void col2im(size_t n,
size_t strideW,
size_t paddingH,
size_t paddingW,
size_t dilationH,
size_t dilationW,
size_t height_col,
size_t width_col,
T* data_im) {
......@@ -131,23 +139,34 @@ __global__ void col2im(size_t n,
int w = int(index % width);
int h = int((index / width) % height);
int c = int(index / (width * height));
int filterH = (blockH - 1) * dilationH + 1;
int filterW = (blockW - 1) * dilationW + 1;
if ((w - (int)paddingW) >= 0 &&
(w - (int)paddingW) < (width - 2 * paddingW) &&
(h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
// compute the start and end of the output
int w_col_start =
(w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
(w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
int h_col_start =
(h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
(h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
int h_col_end = min(int(h / strideH + 1), int(height_col));
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out]
int c_col = int(c * blockH * blockW) +
(h - h_col * (int)strideH) * (int)blockW +
(w - w_col * (int)strideW);
val += data_col[(c_col * height_col + h_col) * width_col + w_col];
int h_k = (h - h_col * strideH);
int w_k = (w - w_col * strideW);
if (h_k % dilationH == 0 && w_k % dilationW == 0) {
h_k /= dilationH;
w_k /= dilationW;
int c_col =
(((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
width_col +
w_col;
val += data_col[c_col];
}
}
}
h -= paddingH;
......@@ -173,7 +192,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth) {
int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0];
int inputHeight = imShape[1];
int inputWidth = imShape[2];
......@@ -205,6 +226,8 @@ public:
strideWidth,
paddingHeight,
paddingWidth,
dilationHeight,
dilationWidth,
outputHeight,
outputWidth,
imData);
......@@ -229,6 +252,8 @@ __global__ void im2colOCF(const T* imData,
int strideWidth,
int paddingHeight,
int paddingWidth,
int dilationHeight,
int dilationWidth,
int outputHeight,
int outputWidth) {
int swId = blockIdx.x;
......@@ -237,8 +262,10 @@ __global__ void im2colOCF(const T* imData,
channelId += blockDim.z) {
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
int widthOffset = idx + swId * strideWidth - paddingWidth;
int heightOffset = idy + shId * strideHeight - paddingHeight;
int widthOffset =
idx * dilationWidth + swId * strideWidth - paddingWidth;
int heightOffset =
idy * dilationHeight + shId * strideHeight - paddingHeight;
int imOffset = widthOffset + heightOffset * inputWidth +
channelId * inputHeight * inputWidth;
......@@ -273,7 +300,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth) {
int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0];
int inputHeight = imShape[1];
int inputWidth = imShape[2];
......@@ -312,6 +341,8 @@ public:
strideWidth,
paddingHeight,
paddingWidth,
dilationHeight,
dilationWidth,
outputHeight,
outputWidth);
CHECK_SYNC("Im2ColFunctor GPU failed");
......@@ -330,6 +361,8 @@ __global__ void col2imOCF(T* imData,
int strideWidth,
int paddingHeight,
int paddingWidth,
int dilationHeight,
int dilationWidth,
int outputHeight,
int outputWidth) {
int swId = blockIdx.x;
......@@ -338,8 +371,10 @@ __global__ void col2imOCF(T* imData,
channelId += blockDim.z) {
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
int widthOffset = idx + swId * strideWidth - paddingWidth;
int heightOffset = idy + shId * strideHeight - paddingHeight;
int widthOffset =
idx * dilationWidth + swId * strideWidth - paddingWidth;
int heightOffset =
idy * dilationHeight + shId * strideHeight - paddingHeight;
int imOffset = widthOffset + heightOffset * inputWidth +
channelId * inputHeight * inputWidth;
......@@ -372,7 +407,9 @@ public:
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth) {
int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0];
int inputHeight = imShape[1];
int inputWidth = imShape[2];
......@@ -411,6 +448,8 @@ public:
strideWidth,
paddingHeight,
paddingWidth,
dilationHeight,
dilationWidth,
outputHeight,
outputWidth);
CHECK_SYNC("Col2ImFunctor GPU failed");
......
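A standalone illustration (assuming stride 1 and dilation 2; not part of the diff) of the h_k % dilationH guard in the col2im kernel above: an input row only receives gradient from an output position when its offset inside that window falls exactly on a dilated filter tap.

#include <cstdio>

int main() {
  const int dilationH = 2, strideH = 1;
  const int h = 5;                            // input row under consideration
  for (int h_col = 3; h_col <= 5; ++h_col) {  // candidate output rows
    int h_k = h - h_col * strideH;            // offset inside the window
    if (h_k >= 0 && h_k % dilationH == 0) {
      std::printf("output row %d reads input row %d via filter tap %d\n",
                  h_col, h, h_k / dilationH);
    }
  }
  return 0;
}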
......@@ -29,82 +29,98 @@ void TestIm2ColFunctor() {
for (size_t filterWidth : {3, 7}) {
for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) {
if (inputHeight <= filterHeight || inputWidth <= filterWidth)
break;
if (padding >= filterHeight || padding >= filterWidth) break;
size_t outputHeight =
(inputHeight - filterHeight + 2 * padding + stride) /
stride;
size_t outputWidth =
(inputWidth - filterWidth + 2 * padding + stride) / stride;
TensorShape imShape =
TensorShape({channels, inputHeight, inputWidth});
TensorShape colShape1 = TensorShape({channels,
filterHeight,
filterWidth,
outputHeight,
outputWidth});
TensorShape colShape2 = TensorShape({outputHeight,
outputWidth,
channels,
filterHeight,
filterWidth});
size_t height = channels * filterHeight * filterWidth;
size_t width = outputHeight * outputWidth;
VectorPtr input1 = Vector::create(imShape.getElements(), false);
VectorPtr input2 = Vector::create(imShape.getElements(), false);
MatrixPtr output1 = Matrix::create(height, width, false, false);
MatrixPtr output2 = Matrix::create(width, height, false, false);
input1->uniform(0.001, 1);
input2->copyFrom(*input1);
Im2ColFunctor<kCFO, Device, T> im2Col1;
Im2ColFunctor<kOCF, Device, T> im2Col2;
im2Col1(input1->getData(),
imShape,
output1->getData(),
colShape1,
stride,
stride,
padding,
padding);
im2Col2(input2->getData(),
imShape,
output2->getData(),
colShape2,
stride,
stride,
padding,
padding);
// The transposition of the result of ColFormat == kCFO
// is equal to the result of ColFormat == kOCF.
MatrixPtr test;
output2->transpose(test, true);
autotest::TensorCheckErr(*output1, *test);
Col2ImFunctor<kCFO, Device, T> col2Im1;
Col2ImFunctor<kOCF, Device, T> col2Im2;
col2Im1(input1->getData(),
imShape,
output1->getData(),
colShape1,
stride,
stride,
padding,
padding);
col2Im2(input2->getData(),
imShape,
output2->getData(),
colShape2,
stride,
stride,
padding,
padding);
autotest::TensorCheckErr(*input1, *input2);
for (size_t dilation : {1, 3}) {
size_t filterSizeH = (filterHeight - 1) * dilation + 1;
size_t filterSizeW = (filterWidth - 1) * dilation + 1;
if (inputHeight + 2 * padding < filterSizeH ||
inputWidth + 2 * padding < filterSizeW)
break;
if (padding >= filterSizeH || padding >= filterSizeW) break;
size_t outputHeight =
(inputHeight - filterSizeH + 2 * padding) / stride + 1;
size_t outputWidth =
(inputWidth - filterSizeW + 2 * padding) / stride + 1;
TensorShape imShape =
TensorShape({channels, inputHeight, inputWidth});
TensorShape colShape1 = TensorShape({channels,
filterHeight,
filterWidth,
outputHeight,
outputWidth});
TensorShape colShape2 = TensorShape({outputHeight,
outputWidth,
channels,
filterHeight,
filterWidth});
size_t height = channels * filterHeight * filterWidth;
size_t width = outputHeight * outputWidth;
VectorPtr input1 =
Vector::create(imShape.getElements(), false);
VectorPtr input2 =
Vector::create(imShape.getElements(), false);
MatrixPtr output1 =
Matrix::create(height, width, false, false);
MatrixPtr output2 =
Matrix::create(width, height, false, false);
input1->uniform(0.001, 1);
input2->copyFrom(*input1);
Im2ColFunctor<kCFO, Device, T> im2Col1;
Im2ColFunctor<kOCF, Device, T> im2Col2;
im2Col1(input1->getData(),
imShape,
output1->getData(),
colShape1,
stride,
stride,
padding,
padding,
dilation,
dilation);
im2Col2(input2->getData(),
imShape,
output2->getData(),
colShape2,
stride,
stride,
padding,
padding,
dilation,
dilation);
// The transposition of the result of ColFormat == kCFO
// is equal to the result of ColFormat == kOCF.
MatrixPtr test;
output2->transpose(test, true);
autotest::TensorCheckErr(*output1, *test);
Col2ImFunctor<kCFO, Device, T> col2Im1;
Col2ImFunctor<kOCF, Device, T> col2Im2;
col2Im1(input1->getData(),
imShape,
output1->getData(),
colShape1,
stride,
stride,
padding,
padding,
dilation,
dilation);
col2Im2(input2->getData(),
imShape,
output2->getData(),
colShape2,
stride,
stride,
padding,
padding,
dilation,
dilation);
autotest::TensorCheckErr(*input1, *input2);
}
}
}
}
......
......@@ -79,6 +79,10 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
for (int i = 0; i < config_.inputs_size(); i++) {
std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
std::vector<size_t> dilations = {(size_t)dilationY_[i],
(size_t)dilation_[i]};
bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
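// Dilated convolution is only handled by the generic GemmConv path, so
// useDilation disables the NeonDepthwiseConv and NNPACKConv fast paths below.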
// Convolution Layer uses the GemmConv function by default.
convType = "GemmConv";
......@@ -97,13 +101,14 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
if ((filterSize_[i] == filterSizeY_[i]) &&
(filterSize_[i] == 3 || filterSize_[i] == 4) &&
(stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
(stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
!useDilation) {
convType = "NeonDepthwiseConv";
}
#endif
}
if (FLAGS_use_nnpack && !isDeconv_) {
if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
createFunction(forward_,
"NNPACKConv",
FuncConfig()
......@@ -117,6 +122,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)groups_[i]));
createFunction(backward_,
......@@ -124,6 +130,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)groups_[i]));
createFunction(backward_,
......@@ -131,6 +138,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)groups_[i]));
}
}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MaxPoolWithMaskLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
PoolLayer::init(layerMap, parameterMap);
setOutput("mask", &mask_);
return true;
}
size_t MaxPoolWithMaskLayer::getSize() {
CHECK_EQ(inputLayers_.size(), 1UL);
size_t layerSize = 0;
outputY_ = outputSize(imgSizeY_,
sizeY_,
confPaddingY_,
strideY_,
/* caffeMode */ false);
outputX_ = outputSize(imgSize_,
sizeX_,
confPadding_,
stride_,
/* caffeMode */ false);
layerSize = outputX_ * outputY_ * channels_;
getOutput().setFrameHeight(outputY_);
getOutput().setFrameWidth(outputX_);
return layerSize;
}
void MaxPoolWithMaskLayer::forward(PassType passType) {
size_t size = getSize();
MatrixPtr inputV = inputLayers_[0]->getOutputValue();
int batchSize = inputV->getHeight();
resetOutput(batchSize, size);
MatrixPtr outV = getOutputValue();
CHECK_EQ(size, outV->getWidth());
resetSpecifyOutput(mask_,
batchSize,
size,
/* isValueClean */ false,
/* isGradClean */ true);
MatrixPtr maskV = mask_.value;
outV->maxPoolForward(*inputV,
imgSizeY_,
imgSize_,
channels_,
sizeX_,
sizeY_,
strideY_,
stride_,
outputY_,
outputX_,
confPaddingY_,
confPadding_,
maskV);
}
void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
(void)callback;
if (NULL == getInputGrad(0)) {
return;
}
MatrixPtr outGrad = getOutputGrad();
MatrixPtr inputV = inputLayers_[0]->getOutputValue();
MatrixPtr outV = getOutputValue();
MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
inputGrad->maxPoolBackward(*inputV,
imgSizeY_,
imgSize_,
*outGrad,
*outV,
sizeX_,
sizeY_,
strideY_,
stride_,
outputY_,
outputX_,
1,
1,
confPaddingY_,
confPadding_);
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "PoolLayer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
* @brief Max pooling layer which additionally outputs the location
* indices (mask) of the max values
*/
class MaxPoolWithMaskLayer : public PoolLayer {
protected:
Argument mask_;
public:
explicit MaxPoolWithMaskLayer(const LayerConfig& config)
: PoolLayer(config) {}
size_t getSize();
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
};
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "PoolLayer.h"
#include "MaxPoolWithMaskLayer.h"
#include "PoolProjectionLayer.h"
#include "paddle/utils/Logging.h"
#ifdef PADDLE_WITH_CUDA
......@@ -44,7 +45,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
return true;
}
......@@ -57,6 +57,8 @@ Layer* PoolLayer::create(const LayerConfig& config) {
} else if (CudnnPoolLayer::typeCheck(pool)) {
return new CudnnPoolLayer(config);
#endif
} else if (pool == "max-pool-with-mask") {
return new MaxPoolWithMaskLayer(config);
} else {
LOG(FATAL) << "Unknown pool type: " << pool;
return nullptr;
......
......@@ -98,7 +98,7 @@ void ROIPoolLayer::forward(PassType passType) {
size_t roiStartH = round(bottomROIs[2] * spatialScale_);
size_t roiEndW = round(bottomROIs[3] * spatialScale_);
size_t roiEndH = round(bottomROIs[4] * spatialScale_);
CHECK_GE(roiBatchIdx, 0);
CHECK_GE(roiBatchIdx, 0UL);
CHECK_LT(roiBatchIdx, batchSize);
size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL);
size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL);
......
......@@ -24,6 +24,7 @@ gserver_test(test_ConvUnify)
gserver_test(test_BatchNorm)
gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
########## test_Mkldnn layers and activations ##########
if(WITH_MKLDNN)
......
......@@ -434,7 +434,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true);
int dilation = 1;
int dilation = 2;
if (type == "cudnn_conv") {
#if CUDNN_VERSION >= 6000
dilation = 2;
......@@ -1234,6 +1234,7 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
TEST(Layer, PoolLayer) {
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
#ifdef PADDLE_WITH_CUDA
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
......@@ -1242,6 +1243,7 @@ TEST(Layer, PoolLayer) {
testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
#endif
}
......
......@@ -297,7 +297,7 @@ static void getAddtoConfig(TestConfig& cfg,
}
void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
CHECK_GE(nInputs, 1);
CHECK_GE(nInputs, 1UL);
TestConfig dnnConfig;
getAddtoConfig(dnnConfig, pm, nInputs);
dnnConfig.layerConfig.set_type("mkldnn_addto");
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "LayerGradUtil.h"
#include "paddle/math/MathUtils.h"
#include "paddle/testing/TestUtil.h"
using namespace paddle;
void setPoolConfig(TestConfig* config,
PoolConfig* pool,
const string& poolType) {
(*config).biasSize = 0;
(*config).layerConfig.set_type("pool");
(*config).layerConfig.set_num_filters(1);
int kw = 3, kh = 3;
int pw = 0, ph = 0;
int sw = 2, sh = 2;
pool->set_pool_type(poolType);
pool->set_channels(1);
pool->set_size_x(kw);
pool->set_size_y(kh);
pool->set_start(0);
pool->set_padding(pw);
pool->set_padding_y(ph);
pool->set_stride(sw);
pool->set_stride_y(sh);
int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
pool->set_output_x(ow);
pool->set_output_y(oh);
}
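// Note: with caffeMode = false, outputSize() rounds up, so the 5x5 input
// used below with a 3x3 window, stride 2 and no padding yields a 2x2 output.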
void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
const string& poolType,
bool use_gpu,
MatrixPtr& maskMat) {
TestConfig config;
config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
LayerInputConfig* input = config.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf();
pool->set_img_size(5);
pool->set_img_size_y(5);
setPoolConfig(&config, pool, poolType);
config.layerConfig.set_size(pool->output_x() * pool->output_y() *
pool->channels());
config.layerConfig.set_name("MaxPoolWithMask");
std::vector<DataLayerPtr> dataLayers;
LayerMap layerMap;
vector<Argument> datas;
initDataLayer(config,
&dataLayers,
&datas,
&layerMap,
"MaxPoolWithMask",
1,
false,
use_gpu);
dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
FLAGS_use_gpu = use_gpu;
std::vector<ParameterPtr> parameters;
LayerPtr maxPoolingWithMaskOutputLayer;
initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
maxPoolingWithMaskOutputLayer->forward(PASS_GC);
checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
maskMat);
}
TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
bool useGpu = false;
MatrixPtr inputMat;
MatrixPtr maskMat;
real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
real maskData[] = {12, 4, 22, 24};
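// Expected mask, derived by hand from the 5x5 input above (3x3 window,
// stride 2, no padding -> 2x2 output): the window maxima 0.7, 1.1, 3.0 and
// 9.0 sit at flat indices 2*5+2=12, 0*5+4=4, 4*5+2=22 and 4*5+4=24.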
inputMat = Matrix::create(1, 25, false, useGpu);
maskMat = Matrix::create(1, 4, false, useGpu);
inputMat->setData(inputData);
maskMat->setData(maskData);
doOneMaxPoolingWithMaskOutputTest(
inputMat, "max-pool-with-mask", useGpu, maskMat);
#ifdef PADDLE_WITH_CUDA
useGpu = true;
inputMat = Matrix::create(1, 25, false, useGpu);
maskMat = Matrix::create(1, 4, false, useGpu);
inputMat->copyFrom(inputData, 25);
maskMat->copyFrom(maskData, 4);
doOneMaxPoolingWithMaskOutputTest(
inputMat, "max-pool-with-mask", useGpu, maskMat);
#endif
}
......@@ -1028,15 +1028,23 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
size_t outputH,
size_t outputW,
size_t paddingH,
size_t paddingW) {
size_t paddingW,
MatrixPtr maskMatP) {
CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
real* inputData = inputMat.getData();
real* maskData = NULL;
size_t frameNum = inputMat.getHeight();
CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
CHECK(height_ == inputMat.getHeight());
CHECK(width_ == outputH * outputW * channels);
if (maskMatP != NULL) {
CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
CHECK(outputH * outputW * channels == maskMatP->getWidth());
maskData = maskMatP->getData();
}
hl_maxpool_forward(frameNum,
inputData,
channels,
......@@ -1051,7 +1059,8 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
paddingH,
paddingW,
data_,
getStride());
getStride(),
maskData);
}
void GpuMatrix::maxPoolBackward(Matrix& inputMat,
......@@ -1973,9 +1982,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
size_t outputH,
size_t outputW,
size_t paddingH,
size_t paddingW) {
size_t paddingW,
MatrixPtr maskMatP) {
real* inputData = inputMat.getData();
real* outData = data_;
real* maskData = NULL;
size_t num = inputMat.getHeight();
size_t inLength = imgSizeH * imgSizeW;
size_t outLength = outputH * outputW;
......@@ -1984,6 +1995,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
CHECK_EQ(channels * outLength, this->getWidth());
size_t outStride = getStride();
if (maskMatP != NULL) {
maskData = maskMatP->getData();
CHECK_EQ(channels * outLength, maskMatP->getWidth());
}
/* initialize the data_ */
for (size_t i = 0; i < height_; i++) {
for (size_t j = 0; j < width_; j++) {
......@@ -2005,10 +2021,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
int wstart = pw * strideW - paddingW;
int wend = std::min(wstart + sizeX, imgSizeW);
wstart = std::max(wstart, 0);
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
outData[ph * outputW + pw] = std::max(
outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
if (maskData == NULL) {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
outData[ph * outputW + pw] = std::max(
outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
}
}
} else {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) {
outData[ph * outputW + pw] = inputData[h * imgSizeW + w];
maskData[ph * outputW + pw] = h * imgSizeW + w;
}
}
}
}
}
......@@ -2016,6 +2043,8 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
// compute offset
inputData += inLength;
outData += outLength;
if (maskData != NULL) maskData += outLength;
}
}
}
......
......@@ -861,7 +861,8 @@ public:
/**
* Pooling forward operation, pick out the largest element
* in the sizeX of value
* in the sizeX of value, if the maskMatP is not NULL, it will
* also calculate the location indices.
*/
virtual void maxPoolForward(Matrix& inputMat,
size_t imgSizeH,
......@@ -874,7 +875,8 @@ public:
size_t outputH,
size_t outputW,
size_t paddingH,
size_t paddingW) {
size_t paddingW,
MatrixPtr maskMatP = NULL) {
LOG(FATAL) << "Not implemeted";
}
......@@ -1426,7 +1428,8 @@ public:
size_t outputH,
size_t outputW,
size_t paddingH,
size_t paddingW);
size_t paddingW,
MatrixPtr maskMatP);
void maxPoolBackward(Matrix& image,
size_t imgSizeH,
......@@ -1697,7 +1700,8 @@ public:
size_t outputH,
size_t outputW,
size_t paddingH,
size_t paddingW);
size_t paddingW,
MatrixPtr maskMatP);
void maxPoolBackward(Matrix& image,
size_t imgSizeH,
......
......@@ -214,6 +214,7 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
cc_test(gather_test SRCS gather_test.cc DEPS tensor)
cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
rnn/recurrent_op_utils.cc
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/data_type.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/var_type.h"
namespace paddle {
namespace operators {
class AssignFunctor {
public:
AssignFunctor(framework::Variable *out,
const platform::DeviceContext &dev_ctx)
: out_(out), dev_ctx_(dev_ctx) {}
void operator()(const framework::LoDTensor &lod_tensor) const {
auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
copy_tensor(lod_tensor, &out_tensor);
}
void operator()(const framework::LoDTensorArray &array) const {
auto &out_array = *out_->GetMutable<framework::LoDTensorArray>();
out_array.resize(array.size());
for (size_t i = 0; i < array.size(); ++i) {
copy_tensor(array[i], &out_array[i]);
}
}
void operator()(const framework::SelectedRows &rows) const {
framework::SelectedRows &out_rows =
*out_->GetMutable<framework::SelectedRows>();
out_rows.set_rows(rows.rows());
out_rows.set_height(rows.height());
auto &t = rows.value();
out_rows.mutable_value()->CopyFrom(t, t.place(), dev_ctx_);
}
template <typename T>
void operator()(const T &v) const {
PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
}
private:
void copy_tensor(const framework::LoDTensor &lod_tensor,
framework::LoDTensor *out) const {
auto &out_tensor = *out;
out_tensor.CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_);
out_tensor.set_lod(lod_tensor.lod());
}
framework::Variable *out_;
const platform::DeviceContext &dev_ctx_;
};
class AssignOp : public framework::OperatorBase {
public:
AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto *x = scope.FindVar(Input("X"));
if (x == nullptr) {
return;
}
auto *out = scope.FindVar(Output("Out"));
PADDLE_ENFORCE(
out != nullptr,
"The Output(Out) should not be null if the Input(X) is set.");
framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
}
};
class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
AssignOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
"could be LoDTensor, SelectedRows or LoDTensorArray.")
.AsDispensable();
AddOutput("Out",
"(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
"is the same as input X.");
AddComment(R"DOC(Assign Operator
Out = X, when the type of X is LoDTensor, SelectedRows or LoDTensorArray.
An error is raised if the type is not listed above.
)DOC");
}
};
class AssignInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
if (context->HasInput("X")) {
auto type = context->GetInputsVarType("X")[0];
if (type == framework::VarDesc_VarType_SELECTED_ROWS ||
type == framework::VarDesc_VarType_LOD_TENSOR) {
context->SetOutputDim("Out", context->GetInputDim("X"));
}
}
}
};
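// The gradient of assign is assign itself (dX = dOut), so the grad maker
// below simply routes Out's gradient back through another assign op.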
class AssignGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto *op = new framework::OpDescBind();
op->SetType("assign");
op->SetInput("X", OutputGrad("Out"));
op->SetOutput("Out", InputGrad("X"));
return std::unique_ptr<framework::OpDescBind>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker,
ops::AssignInferShape, ops::AssignOpProtoMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/beam_search_decode_op.h"
namespace paddle {
namespace operators {
class BeamSearchDecodeOp : public framework::OperatorBase {
public:
BeamSearchDecodeOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope& scope,
const platform::DeviceContext& dev_ctx) const override {
framework::ExecutionContext ctx(*this, scope, dev_ctx);
const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
const size_t step_num = ids->size();
PADDLE_ENFORCE_GT(step_num, 0UL,
"beam search steps should be larger than 0");
const size_t source_num = ids->at(0).lod().at(0).size() - 1;
PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0");
for (size_t i = 0; i < step_num; ++i) {
PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL,
"Level of LodTensor should be 2");
}
// prepare output
LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
BeamSearchDecoder<float> beam_search_decoder;
beam_search_decoder.PackAllSteps(*ids, *scores, sentenceIds,
sentenceScores);
}
};
class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
BeamSearchDecodeOpProtoMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Ids",
"(LodTensorArray)"
"score of the candidate words in each step");
AddInput("Scores",
"(LodTensorArray)"
"score of the candidate words in each step");
AddOutput("SentenceIds",
"(LodTensor)"
"All possible result sentences of word ids");
AddOutput("SentenceScores",
"(LodTensor)"
"All possible result sentences of word scores");
AddComment(R"DOC(
Pack the result of Beam search op into SentenceIds and SentenceScores.
)DOC");
}
};
class BeamSearchDecodeInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* context) const override {
PADDLE_ENFORCE(context->HasInput("Ids"),
"BeamSearchDecodeOp must has input Ids");
PADDLE_ENFORCE(context->HasInput("Scores"),
"BeamSearchDecodeOp must has input Scores");
PADDLE_ENFORCE(context->HasOutput("SentenceIds"),
"BeamSearchDecodeOp must has output SentenceIds");
PADDLE_ENFORCE(context->HasOutput("SentenceScores"),
"BeamSearchDecodeOp must has output SentenceScores");
}
};
class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDescBind& op_desc,
framework::BlockDescBind* block) const override {
for (auto& o : op_desc.Output("SentenceIds")) {
block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
}
for (auto& o : op_desc.Output("SentenceScores")) {
block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp,
paddle::operators::BeamSearchDecodeOpProtoMaker,
paddle::operators::BeamSearchDecodeInferShape,
paddle::operators::BeamSearchDecodeInferVarType,
paddle::framework::EmptyGradOpMaker);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using LoDTensorArray = framework::LoDTensorArray;
// All the LoD have two levels.
// The first is the source level, the second is the sentence level.
// The source level describes how many candidate words each source has;
// the sentence level describes which prefix these candidates belong to.
const size_t kSourceLevel = 0;
const size_t kSentenceLevel = 1;
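// Illustrative example (values made up): with lod = {{0, 2, 4},
// {0, 1, 3, 5, 6}}, one step holds two sources; source 0 owns prefixes 0-1,
// prefix 0 has candidate ids data[0:1) and prefix 1 has data[1:3).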
template <typename T>
struct BeamNode {
BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {}
~BeamNode() {
if (parent_) {
parent_->DropKid(this);
if (parent_->kids_.size() == 0UL) {
delete parent_;
}
}
VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_;
}
void AppendTo(BeamNode* parent) {
parent_ = parent;
parent->kids_.insert(this);
}
void DropKid(BeamNode* kid) { kids_.erase(kid); }
BeamNode* parent_ = nullptr;
std::unordered_set<BeamNode*> kids_;
int64_t word_id_;
T score_;
};
template <typename T>
using BeamNodeVector = std::vector<std::unique_ptr<BeamNode<T>>>;
template <typename T>
struct Sentence {
std::vector<int64_t> word_ids;
std::vector<T> scores;
};
template <typename T>
using SentenceVector = std::vector<Sentence<T>>;
template <typename T>
struct BeamSearchDecoder {
/**
* make a BeamNode and all its related prefix BeamNodes into a Sentence.
*/
Sentence<T> MakeSentence(const BeamNode<T>* node) const;
/**
* Param:
* cur_ids: LoDTensor of One step for word ID
* cur_scores: LoDTensor of One Step for word score
* prefixes_list: prefixes for each source sentence.
* sentence_vector_list: result sentence_vector for each source sentence.
* Return:
* a new prefixes list for each source of current step
*/
std::vector<BeamNodeVector<T>> PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>& prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const;
/**
* convert the result sentence_vector for each source sentence into two
* LoDTensors.
* One holds the word ids of all candidate sentences, the other holds
* their word scores.
* Param:
* sentence_vector_list: sentence_vector for each source sentence.
* id_tensor: result LoDTensor for sentences of id.
* score_tensor: result LoDTensor for sentences of score.
*/
void ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
LoDTensor* score_tensor) const;
/**
* Pack all steps of id/score LoDTensors into sentence LoDTensors.
* Its main logic is:
* ```python
* prefix
* result_sentence
* result_lod_tensor
*
* for (step in steps):
* prefix = PackTwoSteps(prefix, step, &result_sentence)
* ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor)
* ```
*/
void PackAllSteps(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores, LoDTensor* id_tensor,
LoDTensor* score_tensor) const;
};
template <typename T>
Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
Sentence<T> sentence;
while (node != nullptr) {
sentence.word_ids.emplace_back(node->word_id_);
sentence.scores.emplace_back(node->score_);
node = node->parent_;
}
std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids));
std::reverse(std::begin(sentence.scores), std::end(sentence.scores));
return sentence;
}
template <typename T>
std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
const LoDTensor& cur_ids, const LoDTensor& cur_scores,
std::vector<BeamNodeVector<T>>& prefixes_list,
std::vector<SentenceVector<T>>* sentence_vector_list) const {
std::vector<BeamNodeVector<T>> result;
for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1;
++src_idx) {
size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx];
size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
BeamNodeVector<T> beam_nodes;
// If prefixes_list is empty, this is the first step. In this step,
// every candidate id is the start of a new candidate sentence.
if (prefixes_list.empty()) {
PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
cur_ids.lod().at(kSentenceLevel).back(),
"in the first step");
for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) {
beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(new BeamNode<T>(
cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
}
} else {
BeamNodeVector<T>& prefixes = prefixes_list[src_idx];
SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
"prefix and candidate set number should be the same");
auto candidate_offset = cur_ids.lod()[kSentenceLevel];
for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) {
std::unique_ptr<BeamNode<T>>& prefix = prefixes[prefix_idx];
size_t candidate_start = candidate_offset[src_start + prefix_idx];
size_t candidate_end = candidate_offset[src_start + prefix_idx + 1];
if (candidate_start == candidate_end) {
VLOG(3) << "this sentence has no more candidate, "
"add to result sentence and rm it from beam tree";
sentence_vector.push_back(MakeSentence(prefix.get()));
prefix.reset();
} else {
for (size_t candidate_idx = candidate_start;
candidate_idx < candidate_end; ++candidate_idx) {
auto* candidate =
new BeamNode<T>(cur_ids.data<int64_t>()[candidate_idx],
cur_scores.data<T>()[candidate_idx]);
candidate->AppendTo(prefix.get());
beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(candidate));
}
prefix.release();
}
}
}
result.push_back(std::move(beam_nodes));
}
return result;
}
template <typename T>
void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
size_t src_num = sentence_vector_list.size();
PADDLE_ENFORCE_NE(src_num, 0UL, "src_num should not be 0");
std::vector<size_t> source_level_lod = {0};
std::vector<size_t> sentence_level_lod = {0};
std::vector<int64_t> id_data;
std::vector<T> score_data;
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
id_data.insert(id_data.end(), sentence.word_ids.begin(),
sentence.word_ids.end());
score_data.insert(score_data.end(), sentence.scores.begin(),
sentence.scores.end());
sentence_level_lod.push_back(sentence_level_lod.back() +
sentence.word_ids.size());
}
source_level_lod.push_back(source_level_lod.back() +
sentence_vector_list[src_idx].size());
}
auto cpu_place = new paddle::platform::CPUPlace();
paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
framework::LoD lod;
lod.push_back(source_level_lod);
lod.push_back(sentence_level_lod);
id_tensor->set_lod(lod);
id_tensor->Resize({static_cast<int64_t>(id_data.size())});
id_tensor->mutable_data<int64_t>(paddle::platform::CPUPlace());
id_tensor->CopyFromVector<int64_t>(id_data, cpu_ctx);
score_tensor->set_lod(lod);
score_tensor->Resize({static_cast<int64_t>(score_data.size())});
score_tensor->mutable_data<T>(paddle::platform::CPUPlace());
score_tensor->CopyFromVector<T>(score_data, cpu_ctx);
}
template <typename T>
void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
const LoDTensorArray& step_scores,
LoDTensor* id_tensor,
LoDTensor* score_tensor) const {
PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
"step_ids and step_scores should be the same");
const size_t step_num = step_ids.size();
const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0");
// previous prefixes for each step;
// an initial length of 0 means this is the first step.
std::vector<BeamNodeVector<T>> beamnode_vector_list(0);
std::vector<SentenceVector<T>> sentence_vector_list(src_num);
// pack all steps for one batch first, then another batch
for (size_t step_id = 0; step_id < step_num; ++step_id) {
beamnode_vector_list =
PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
beamnode_vector_list, &sentence_vector_list);
}
// append last beam_node to result
for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
for (auto& beam_node : beamnode_vector_list.at(src_idx)) {
sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get()));
beam_node.reset();
}
}
ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
score_tensor);
}
} // namespace operators
} // namespace paddle
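A note on the two-level LoD walked above: kSourceLevel partitions a step's candidates by source sentence, while kSentenceLevel gives each prefix its candidate range inside the flat id/score arrays. The following standalone sketch uses plain std::vector offsets only (no Paddle types; the values mirror the unit test below) to show how the two levels are traversed:

#include <cstdio>
#include <vector>

int main() {
  // Source level: source 0 owns prefixes [0, 2), source 1 owns [2, 6).
  std::vector<size_t> source_lod = {0, 2, 6};
  // Sentence level: candidate range of each prefix in the flat id array.
  std::vector<size_t> sentence_lod = {0, 1, 2, 3, 4, 5, 6};
  std::vector<int> ids = {1, 2, 3, 4, 5, 6};
  for (size_t src = 0; src + 1 < source_lod.size(); ++src) {
    std::printf("source %zu:\n", src);
    for (size_t p = source_lod[src]; p < source_lod[src + 1]; ++p) {
      for (size_t i = sentence_lod[p]; i < sentence_lod[p + 1]; ++i) {
        std::printf("  prefix %zu -> candidate id %d\n", p, ids[i]);
      }
    }
  }
  return 0;
}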
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/beam_search_decode_op.h"
#include "gtest/gtest.h"
using CPUPlace = paddle::platform::CPUPlace;
using LoD = paddle::framework::LoD;
using LoDTensor = paddle::framework::LoDTensor;
using LoDTensorArray = paddle::framework::LoDTensorArray;
template <typename T>
using BeamNode = paddle::operators::BeamNode<T>;
template <typename T>
using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
template <typename T>
using Sentence = paddle::operators::Sentence<T>;
template <typename T>
using BeamNodeVector = paddle::operators::BeamNodeVector<T>;
template <typename T>
using SentenceVector = paddle::operators::SentenceVector<T>;
namespace paddle {
namespace test {
void GenerateExample(const std::vector<size_t>& level_0,
const std::vector<size_t>& level_1,
const std::vector<int>& data, LoDTensorArray* ids,
LoDTensorArray* scores) {
PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1,
"source level is used to describe candidate set");
PADDLE_ENFORCE_EQ(level_1.back(), data.size(),
"the lowest level is used to describe data"
", so its last element should be the data length");
CPUPlace place;
LoD lod;
lod.push_back(level_0);
lod.push_back(level_1);
// Ids
LoDTensor tensor_id;
tensor_id.set_lod(lod);
tensor_id.Resize({static_cast<int64_t>(data.size())});
// malloc memory
int64_t* id_ptr = tensor_id.mutable_data<int64_t>(place);
for (size_t i = 0; i < data.size(); ++i) {
id_ptr[i] = static_cast<int64_t>(data.at(i));
}
// Scores
LoDTensor tensor_score;
tensor_score.set_lod(lod);
tensor_score.Resize({static_cast<int64_t>(data.size())});
// malloc memory
float* score_ptr = tensor_score.mutable_data<float>(place);
for (size_t i = 0; i < data.size(); ++i) {
score_ptr[i] = static_cast<float>(data.at(i));
}
ids->push_back(tensor_id);
scores->push_back(tensor_score);
}
} // namespace test
} // namespace paddle
TEST(BeamSearchDecodeOp, DeleteBeamNode) {
auto* root = new BeamNode<float>(0, 0);
auto* b1 = new BeamNode<float>(1, 1);
auto* b2 = new BeamNode<float>(2, 2);
auto* b3 = new BeamNode<float>(3, 3);
b1->AppendTo(root);
b2->AppendTo(root);
b3->AppendTo(b1);
delete b3;
delete b2;
}
TEST(BeamSearchDecodeOp, MakeSentence) {
auto* root = new BeamNode<float>(0, 0);
auto* b1 = new BeamNode<float>(1, 1);
auto* end = new BeamNode<float>(2, 2);
b1->AppendTo(root);
end->AppendTo(b1);
BeamSearchDecoder<float> helper;
Sentence<float> sentence = helper.MakeSentence(end);
delete end;
std::vector<int64_t> expect_ids = {0, 1, 2};
ASSERT_EQ(sentence.word_ids, expect_ids);
std::vector<float> expect_scores = {0, 1, 2};
ASSERT_EQ(sentence.scores, expect_scores);
}
TEST(BeamSearchDecodeOp, PackTwoStepsFirstStep) {
CPUPlace place;
LoDTensorArray ids;
LoDTensorArray scores;
paddle::test::GenerateExample(
std::vector<size_t>{0, 2, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
std::vector<BeamNodeVector<float>> beamnode_vector_list;
std::vector<SentenceVector<float>> sentence_vector_list(
2, SentenceVector<float>());
BeamSearchDecoder<float> helper;
beamnode_vector_list = helper.PackTwoSteps(
ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(beamnode_vector_list.size(), 2UL);
ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
}
TEST(BeamSearchDecodeOp, PackTwoSteps) {
CPUPlace place;
// first source has three prefix
BeamNodeVector<float> source0_prefixes;
source0_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(1, 1)));
source0_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(0, 0)));
source0_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(3, 3)));
// second source has two prefix
BeamNodeVector<float> source1_prefixes;
source1_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(4, 4)));
source1_prefixes.push_back(
std::unique_ptr<BeamNode<float>>(new BeamNode<float>(5, 5)));
std::vector<BeamNodeVector<float>> beamnode_vector_list;
std::vector<SentenceVector<float>> sentence_vector_list(
2, SentenceVector<float>());
beamnode_vector_list.push_back(std::move(source0_prefixes));
beamnode_vector_list.push_back(std::move(source1_prefixes));
// generate data for one step
LoDTensorArray ids;
LoDTensorArray scores;
paddle::test::GenerateExample(std::vector<size_t>{0, 3, 5},
std::vector<size_t>{0, 1, 1, 3, 4, 5},
std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
BeamSearchDecoder<float> helper1;
beamnode_vector_list = helper1.PackTwoSteps(
ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
ASSERT_EQ(beamnode_vector_list[0].size(), 3UL);
ASSERT_EQ(beamnode_vector_list[1].size(), 2UL);
}
TEST(BeamSearchDecodeOp, PackAllSteps) {
CPUPlace place;
// we will construct sample data with 3 steps and 2 source sentences
LoDTensorArray ids;
LoDTensorArray scores;
paddle::test::GenerateExample(
std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
paddle::test::GenerateExample(
std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 1, 3, 5, 5, 6},
std::vector<int>{0, 1, 2, 3, 4, 5}, &ids, &scores);
paddle::test::GenerateExample(std::vector<size_t>{0, 3, 6},
std::vector<size_t>{0, 0, 1, 2, 3, 4, 5},
std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
ASSERT_EQ(ids.size(), 3UL);
ASSERT_EQ(scores.size(), 3UL);
BeamSearchDecoder<float> helper;
LoDTensor id_tensor;
LoDTensor score_tensor;
helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor);
LoD lod = id_tensor.lod();
std::vector<size_t> expect_source_lod = {0, 4, 8};
EXPECT_EQ(lod[0], expect_source_lod);
std::vector<size_t> expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19};
EXPECT_EQ(lod[1], expect_sentence_lod);
// 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4
std::vector<int> expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5,
4, 3, 2, 4, 4, 3, 6, 5, 4};
ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size()));
for (size_t i = 0; i < expect_data.size(); ++i) {
ASSERT_EQ(id_tensor.data<int64_t>()[i],
static_cast<int64_t>(expect_data[i]));
}
for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) {
ASSERT_EQ(score_tensor.data<float>()[i],
static_cast<float>(id_tensor.data<int64_t>()[i]));
}
}
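To make the expected LoD in PackAllSteps concrete: the sentence-level offsets slice expect_data into eight decoded sequences, four per source sentence. A standalone check, illustrative only, reusing the expected values from the test above:

#include <cstdio>
#include <vector>

int main() {
  std::vector<size_t> source_lod = {0, 4, 8};
  std::vector<size_t> sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19};
  std::vector<int> data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5,
                           4, 3, 2, 4, 4, 3, 6, 5, 4};
  for (size_t src = 0; src + 1 < source_lod.size(); ++src) {
    std::printf("source %zu:", src);
    for (size_t s = source_lod[src]; s < source_lod[src + 1]; ++s) {
      std::printf(" [");
      for (size_t i = sentence_lod[s]; i < sentence_lod[s + 1]; ++i) {
        std::printf(" %d", data[i]);
      }
      std::printf(" ]");
    }
    std::printf("\n");  // source 0: [2] [1 0] [3 1 0] [3 2 1], etc.
  }
  return 0;
}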
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/bilinear_tensor_product_op.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class BilinearTensorProductOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(Weight) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
auto weight_dims = ctx->GetInputDim("Weight");
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor.");
PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor.");
PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL,
"The input(Weight) must be a 3D tensor.");
PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
"The first dimension(batch_size) of input(X) must be "
"equal to the first dimension of the input(Y).");
PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1],
"The second dimension of input(X) must be equal to "
"the second dimension of the input(Weight).");
PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2],
"The second dimension of input(Y) must be equal to "
"the third dimension of the input(Weight).");
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL,
"The Input(Bias) must be a 2-D tensor with "
"the 2nd dimension fixed to 1 (a row vector).");
PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0],
"The second dimension of input(Bias) must be equal "
"to the first dimension of the input(Weight).");
}
ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]});
ctx->ShareLoD("X", /*->*/ "Out");
}
};
class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
public:
BilinearTensorProductOpMaker(framework::OpProto* proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of bilinear_tensor_product operator.");
AddInput("Y", "The second input of bilinear_tensor_product operator.");
AddInput("Weight",
"The learnable parameters of bilinear_tensor_product operator.");
AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.")
.AsDispensable();
AddOutput("Out", "The output of bilinear_tensor_product operator.");
AddComment(R"DOC(
Bilinear Tensor Product operator.
Given inputs X and Y, a 3D tensor Weight, and a Bias, each column of the
output is computed by one slice i = 1, . . . , k of the tensor:
M = (X W_i) \cdot Y
Out_i = \sum_j {M_j} + Bias_i
)DOC");
}
};
class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Weight"),
"Input(Weight) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
auto weight_dims = ctx->GetInputDim("Weight");
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(out_dims.size(), 2UL,
"The input(Out@GRAD) must be a 2D Tensor.");
PADDLE_ENFORCE_EQ(
x_dims[0], out_dims[0],
"The first dimension(batch_size) of input(Out@GRAD) must be "
"equal to the first dimension of the Input(X).");
PADDLE_ENFORCE_EQ(
weight_dims[0], out_dims[1],
"The second dimension of input(Out@GRAD) must be equal to "
"the third dimension of the Input(Weight).");
if (ctx->HasInput("Bias")) {
auto bias_dims = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(
bias_dims[1], out_dims[1],
"The second dimension of input(Out@GRAD) must be equal to "
"the second dimension of the Input(Bias).");
auto bias_grad_name = framework::GradVarName("Bias");
if (ctx->HasOutput(bias_grad_name))
ctx->SetOutputDim(bias_grad_name, bias_dims);
}
auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y");
auto weight_grad_name = framework::GradVarName("Weight");
if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims);
}
if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dims);
}
if (ctx->HasOutput(weight_grad_name)) {
ctx->SetOutputDim(weight_grad_name, weight_dims);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
ops::BilinearTensorProductOpGrad);
REGISTER_OP_CPU_KERNEL(
bilinear_tensor_product,
ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, float>,
ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
bilinear_tensor_product_grad,
ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, float>,
ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, double>);
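As a sanity check on the formula in the op comment, here is a naive standalone reference of Out[b][i] = sum_{j,k} x[b][j] * W[i][j][k] * y[b][k] + bias[i], assuming row-major flat storage (the names and sizes are illustrative, not part of the patch):

#include <cstdio>
#include <vector>

int main() {
  const int batch = 2, x_dim = 2, y_dim = 3, out_dim = 2;
  std::vector<float> x = {1, 2, 3, 4};                  // batch x x_dim
  std::vector<float> y = {1, 0, 1, 0, 1, 0};            // batch x y_dim
  std::vector<float> w(out_dim * x_dim * y_dim, 0.5f);  // out_dim x x_dim x y_dim
  std::vector<float> bias = {0.1f, -0.1f};
  for (int b = 0; b < batch; ++b) {
    for (int i = 0; i < out_dim; ++i) {
      float acc = bias[i];
      for (int j = 0; j < x_dim; ++j)
        for (int k = 0; k < y_dim; ++k)
          acc += x[b * x_dim + j] * w[(i * x_dim + j) * y_dim + k] *
                 y[b * y_dim + k];
      std::printf("out[%d][%d] = %g\n", b, i, acc);
    }
  }
  return 0;
}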
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/bilinear_tensor_product_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
bilinear_tensor_product,
ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, float>,
ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
bilinear_tensor_product_grad,
ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, float>,
ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
using framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename Place, typename T>
class BilinearTensorProductKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* weight = ctx.Input<Tensor>("Weight");
auto* bias = ctx.Input<Tensor>("Bias");
auto* out = ctx.Output<Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
auto y_mat = EigenMatrix<T>::From(*y);
auto output_mat = EigenMatrix<T>::From(*out);
auto batch_size = x->dims()[0];
auto weight_dims = weight->dims();
int out_dim = weight_dims[0];
auto x_dim = weight_dims[1];
auto y_dim = weight_dims[2];
auto place = ctx.GetEigenDevice<Place>();
// Create the intermediate variable to calculate the result of
// Input(X) multiplied by Input(Weight_i), the formula is:
// left_mul = X Weight_i.
Tensor left_mul;
left_mul.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
ctx.GetPlace());
auto left_mul_mat = EigenMatrix<T>::From(left_mul);
for (int i = 0; i < out_dim; ++i) {
auto output_col_vec = output_mat.chip(i, 1);
Tensor weight_mat =
weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
batch_size, y_dim, x_dim, 1, x->data<T>(),
weight_mat.data<T>(), 0, left_mul.data<T>());
output_col_vec.device(place) =
(left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
}
if (bias) {
auto bias_vec = EigenMatrix<T>::From(*bias);
Eigen::DSizes<int, 2> bcast(batch_size, 1);
output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
}
}
};
template <typename Place, typename T>
class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* x = ctx.Input<Tensor>("X");
const Tensor* y = ctx.Input<Tensor>("Y");
const Tensor* weight = ctx.Input<Tensor>("Weight");
Tensor* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
Tensor* d_y = ctx.Output<Tensor>(framework::GradVarName("Y"));
Tensor* d_weight = ctx.Output<Tensor>(framework::GradVarName("Weight"));
Tensor* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const Tensor* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto batch_size = x->dims()[0];
auto weight_dims = weight->dims();
int out_dim = weight_dims[0];
auto x_dim = weight_dims[1];
auto y_dim = weight_dims[2];
auto x_mat = EigenMatrix<T>::From(*x);
auto y_mat = EigenMatrix<T>::From(*y);
auto d_out_mat = EigenMatrix<T>::From(*d_out);
auto place = ctx.GetEigenDevice<Place>();
// Create the intermediate variable to calculate the Output(Y@Grad).
Tensor x_scale;
x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
ctx.GetPlace());
auto x_scale_mat = EigenMatrix<T>::From(x_scale);
// Create the intermediate variable to calculate the Output(X@Grad).
Tensor y_scale;
y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
ctx.GetPlace());
auto y_scale_mat = EigenMatrix<T>::From(y_scale);
math::SetConstant<Place, T> set_zero;
// Set Output(X@Grad) to zero.
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
set_zero(ctx.device_context(), d_x, static_cast<T>(0));
}
// Set Output(Y@Grad) to zero.
if (d_y) {
d_y->mutable_data<T>(ctx.GetPlace());
set_zero(ctx.device_context(), d_y, static_cast<T>(0));
}
// Calculate the Output(X@Grad) and Output(Y@Grad).
if (d_x || d_y) {
Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
for (int i = 0; i < out_dim; ++i) {
Tensor weight_i = weight->Slice(i, i + 1).Resize(
framework::make_ddim({x_dim, y_dim}));
auto output_vec = d_out_mat.chip(i, 1);
if (d_x) {
y_scale_mat.device(place) =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_x) *
y_mat;
math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasTrans,
batch_size, x_dim, y_dim, 1, y_scale.data<T>(),
weight_i.data<T>(), 1, d_x->data<T>());
}
if (d_y) {
x_scale_mat.device(place) =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_y) *
x_mat;
math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
batch_size, y_dim, x_dim, 1, x_scale.data<T>(),
weight_i.data<T>(), 1, d_y->data<T>());
}
}
}
// Calculate the gradient of Input(Weight).
if (d_weight) {
d_weight->mutable_data<T>(ctx.GetPlace());
Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
for (int i = 0; i < out_dim; ++i) {
Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
framework::make_ddim({x_dim, y_dim}));
auto output_vec = d_out_mat.chip(i, 1);
x_scale_mat.device(place) =
output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
.broadcast(bcast_for_weight) *
x_mat;
math::gemm<Place, T>(ctx.device_context(), CblasTrans, CblasNoTrans,
x_dim, y_dim, batch_size, 1, x_scale.data<T>(),
y->data<T>(), 0, d_weight_i.data<T>());
}
}
// Calculate the gradient of Input(Bias).
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_mat = EigenMatrix<T>::From(*d_bias);
d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
}
}
};
} // namespace operators
} // namespace paddle
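The gemm calls in the gradient kernel above implement, element by element, the rules in the comments below. A tiny standalone reference (W is the 2x2 identity, so with d_out = 1 the result is d_x = W y = y and d_y = W^T x = x):

#include <cstdio>
#include <vector>

// d_x[b][j]    += d_out[b][i] * sum_k W[i][j][k] * y[b][k]
// d_y[b][k]    += d_out[b][i] * sum_j W[i][j][k] * x[b][j]
// d_W[i][j][k] += d_out[b][i] * x[b][j] * y[b][k]
// d_bias[i]    += d_out[b][i]
int main() {
  const int batch = 1, x_dim = 2, y_dim = 2, out_dim = 1;
  std::vector<float> x = {1, 2}, y = {3, 4}, w = {1, 0, 0, 1}, d_out = {1};
  std::vector<float> d_x(batch * x_dim), d_y(batch * y_dim),
      d_w(out_dim * x_dim * y_dim), d_bias(out_dim);
  for (int b = 0; b < batch; ++b)
    for (int i = 0; i < out_dim; ++i) {
      float g = d_out[b * out_dim + i];
      d_bias[i] += g;
      for (int j = 0; j < x_dim; ++j)
        for (int k = 0; k < y_dim; ++k) {
          d_x[b * x_dim + j] += g * w[(i * x_dim + j) * y_dim + k] * y[b * y_dim + k];
          d_y[b * y_dim + k] += g * w[(i * x_dim + j) * y_dim + k] * x[b * x_dim + j];
          d_w[(i * x_dim + j) * y_dim + k] += g * x[b * x_dim + j] * y[b * y_dim + k];
        }
    }
  std::printf("d_x = {%g, %g}, d_y = {%g, %g}, d_bias = {%g}\n",
              d_x[0], d_x[1], d_y[0], d_y[1], d_bias[0]);
  return 0;
}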
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/framework/executor.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
class ConditionalOp : public framework::OperatorBase {
public:
ConditionalOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
protected:
std::vector<const framework::LoDTensor *> InputTensors(
const framework::Scope &scope) const {
std::vector<const framework::LoDTensor *> retv;
auto xs = Inputs("X");
retv.resize(xs.size(), nullptr);
std::transform(
xs.begin(), xs.end(), retv.begin(),
[&scope](const std::string &var_name) -> const framework::LoDTensor * {
auto *var = scope.FindVar(var_name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name);
return &var->Get<framework::LoDTensor>();
});
return retv;
}
};
class ConditionalBlockOp : public ConditionalOp {
public:
ConditionalBlockOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ConditionalOp(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto xs = InputTensors(scope);
bool need_run = std::all_of(
xs.begin(), xs.end(),
[](const framework::LoDTensor *t) { return t->numel() != 0; });
if (need_run) {
auto *scope_var = scope.FindVar(Output("Scope"));
PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
auto *scopes = scope_var->GetMutable<std::vector<framework::Scope *>>();
scopes->resize(1);
scopes->front() = &scope.NewScope();
auto &cur_scope = *scopes->front();
auto *block = Attr<framework::BlockDescBind *>("block");
framework::Executor exec(dev_ctx);
exec.Run(*block->Program(), &cur_scope, block->ID(), false);
}
}
};
class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
ConditionalBlockOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"The conditional variable of this operator. If X is empty, the "
"whole sub-block will not be executed.")
.AsDuplicable();
AddInput("Params", "The input variables of the sub-block.").AsDuplicable();
AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
AddOutput("Scope",
"(std::vector<Scope*>) The step scope of conditional block. To "
"unify the conditional block, rnn and while op, the type of "
"scope is std::vector<Scope*>");
AddAttr<framework::BlockDescBind *>(
"block", "The step block of conditional block operator");
AddComment(R"DOC(Conditional block operator
Run the sub-block if X is not empty. Params holds the other inputs of the
sub-block, and Out holds its outputs.
)DOC");
}
};
class ConditionalBlockGradOp : public ConditionalOp {
public:
ConditionalBlockGradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: ConditionalOp(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto xs = this->InputTensors(scope);
bool need_run = std::all_of(
xs.begin(), xs.end(),
[](const framework::LoDTensor *t) { return t->numel() != 0; });
if (need_run) {
auto *scope_var = scope.FindVar(Input("Scope"));
PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
framework::Scope &cur_scope = *scopes[0];
auto *block = Attr<framework::BlockDescBind *>("block");
framework::Executor exec(dev_ctx);
exec.Run(*block->Program(), &cur_scope, block->ID(), false);
AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"),
Outputs(framework::GradVarName("Params")));
AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"),
Outputs(framework::GradVarName("X")));
}
}
private:
void AssignLocalGradientToGlobal(
const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope,
const std::vector<std::string> &p_names,
const std::vector<std::string> &pg_names) const {
for (size_t i = 0; i < p_names.size(); ++i) {
auto out_grad_name = pg_names[i];
auto in_grad_name = framework::GradVarName(p_names[i]);
auto *in_var = cur_scope.FindVar(in_grad_name);
if (in_var == nullptr) {
continue;
}
auto new_in_grad_name = cur_scope.Rename(in_grad_name);
auto assign =
framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}},
{{"Out", {out_grad_name}}}, {});
assign->Run(cur_scope, dev_ctx);
cur_scope.Rename(new_in_grad_name, in_grad_name);
}
}
};
class ConditionalBlockGradInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInputs("X"));
if (context->HasInputs("Params")) {
PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params")));
context->SetOutputsDim(framework::GradVarName("Params"),
context->GetInputsDim("Params"));
}
PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X")));
context->SetOutputsDim(framework::GradVarName("X"),
context->GetInputsDim("X"));
}
};
class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto grad_op = new framework::OpDescBind();
grad_op->SetType("conditional_block_grad");
grad_op->SetInput("X", Input("X"));
grad_op->SetInput("Params", Input("Params"));
grad_op->SetInput("Out", Output("Out"));
grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
grad_op->SetInput("Scope", Output("Scope"));
grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params"));
grad_op->SetBlockAttr("block", *this->grad_block_[0]);
return std::unique_ptr<framework::OpDescBind>(grad_op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp,
ops::ConditionalBlockOpProtoMaker,
ops::ConditionalBlockGradMaker);
REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp,
ops::ConditionalBlockGradInferShape);
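The rename dance in AssignLocalGradientToGlobal is worth spelling out: the sub-block's local gradient variable is temporarily renamed to a unique name, an assign op copies it out to the global gradient variable, and the original name is then restored, so the sub-block scope looks untouched afterwards. One reading is that this keeps the assign op's input and output from resolving to the same local variable when the names collide. A toy analogy with a map-based scope (purely illustrative, no Paddle types):

#include <cstdio>
#include <map>
#include <string>

int main() {
  std::map<std::string, float> sub_scope = {{"w@GRAD", 3.5f}};
  std::map<std::string, float> parent_scope;
  // 1) Rename the local gradient to a unique temporary name.
  const std::string tmp = "w@GRAD@RENAME@0";
  sub_scope[tmp] = sub_scope["w@GRAD"];
  sub_scope.erase("w@GRAD");
  // 2) The "assign" step copies the temporary to the global gradient.
  parent_scope["w@GRAD"] = sub_scope[tmp];
  // 3) Rename back so the sub-block scope is unchanged afterwards.
  sub_scope["w@GRAD"] = sub_scope[tmp];
  sub_scope.erase(tmp);
  std::printf("parent w@GRAD = %g\n", parent_scope["w@GRAD"]);
  return 0;
}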
......@@ -29,7 +29,7 @@ class L1NormKernel : public framework::OpKernel<T> {
Out->mutable_data<T>(context.GetPlace());
auto x = framework::EigenVector<T>::Flatten(*X);
auto out = framework::EigenVector<T>::Flatten(*Out);
auto out = framework::EigenScalar<T>::From(*Out);
auto place = context.GetEigenDevice<Place>();
out.device(place) = x.abs().sum();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/lod_reset_op.h"
namespace paddle {
namespace operators {
class LoDResetOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
// input check
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of LoDResetOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of LoDResetOp should not be null.");
// If target LoD is not set from Input(), then it must be set from Attr().
if (!ctx->HasInput("TargetLoD")) {
auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
PADDLE_ENFORCE(level0.size() > 1,
"Target LoD is not found; it should be set to a valid one "
"through Input() or Attr().");
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LoDResetOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
AddInput("TargetLoD",
"(Tensor, optional) The target level 0 LoD from Input().")
.AsDispensable();
AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator.");
AddAttr<std::vector<int>>("target_lod",
"The target level 0 LoD from Attr().")
.SetDefault(std::vector<int>{});
AddComment(R"DOC(LoDReset operator
Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or
Attr(target_lod), or set LoD for Input(X) if it doesn't have one.
Currently the lod_reset operator only supports the reset of level 0 LoD.
At least one of Input(TargetLoD) and Attr(target_lod) must be set,
and if both of them are set, Input(TargetLoD) will be chosen as the
target LoD.
An example:
Given a float LoDTensor X with shape (6, 1), its transpose form represents
[1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like
[1.0, 2.0], [3.0, 4.0, 5.0], [6.0].
If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and
the sequences contained in the LoDTensor Output(Out) become:
[1.0, 2.0, 3.0, 4.0], [5.0, 6.0].
)DOC");
}
};
class LoDResetGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
ctx.device_context());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
ops::LoDResetGradOp);
REGISTER_OP_CPU_KERNEL(lod_reset,
ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
ops::LoDResetKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>);
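Here is a compact standalone check of the validation rules LoDResetKernel enforces on the target LoD (size greater than 1, starting at 0, strictly ascending, ending at the first dimension of Input(X)), applied to the example in the op comment above:

#include <cassert>
#include <cstdio>
#include <vector>

int main() {
  const int first_dim = 6;                  // rows of X in the doc example
  std::vector<int> target_lod = {0, 4, 6};  // the new level-0 LoD
  assert(target_lod.size() > 1);
  assert(target_lod.front() == 0);
  assert(target_lod.back() == first_dim);
  for (size_t i = 0; i + 1 < target_lod.size(); ++i)
    assert(target_lod[i + 1] > target_lod[i]);  // strictly ascending
  // Resulting sequences over [1.0 .. 6.0]: rows [0, 4) and [4, 6).
  for (size_t i = 0; i + 1 < target_lod.size(); ++i)
    std::printf("sequence %zu: rows [%d, %d)\n", i, target_lod[i],
                target_lod[i + 1]);
  return 0;
}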
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/lod_reset_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(lod_reset,
ops::LoDResetKernel<paddle::platform::GPUPlace, float>,
ops::LoDResetKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::GPUPlace, float>,
ops::LoDResetGradKernel<paddle::platform::GPUPlace, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename Place, typename T>
class LoDResetKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* out = ctx.Output<framework::LoDTensor>("Out");
auto* in = ctx.Input<framework::LoDTensor>("X");
auto* lod_t = ctx.Input<framework::Tensor>("TargetLoD");
std::vector<int> level0;
if (lod_t) {
auto* lod = lod_t->data<int>();
if (platform::is_gpu_place(ctx.GetPlace())) {
framework::Tensor lod_cpu;
lod_cpu.CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context());
lod = lod_cpu.data<int>();
}
level0 = std::vector<int>(lod, lod + lod_t->numel());
} else {
level0 = ctx.Attr<std::vector<int>>("target_lod");
}
PADDLE_ENFORCE(level0.size() > 1UL,
"The size of target LoD should be greater than 1.");
PADDLE_ENFORCE(level0[0] == 0,
"Target LoD should be a vector starting from 0.");
PADDLE_ENFORCE(level0.back() == in->dims()[0],
"Target LoD should be a vector ending with the "
"first dimension of Input(X).");
for (size_t i = 0; i < level0.size() - 1; ++i) {
PADDLE_ENFORCE(level0[i + 1] > level0[i],
"Target LoD should be an ascending vector.");
}
out->ShareDataWith(*in);
// cast level0 to size_t
std::vector<size_t> ulevel0(level0.size(), 0);
std::transform(level0.begin(), level0.end(), ulevel0.begin(),
[](int a) { return static_cast<size_t>(a); });
framework::LoD target_lod;
target_lod.push_back(ulevel0);
out->set_lod(target_lod);
}
};
template <typename Place, typename T>
class LoDResetGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
d_x->ShareDataWith(*d_out);
}
};
} // namespace operators
} // namespace paddle
......@@ -27,15 +27,15 @@ template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_process) {
const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_process, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
......@@ -47,7 +47,7 @@ class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
const int output_stride = output_height * output_width;
const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace());
T* output_data = output->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......@@ -87,11 +87,12 @@ template <typename PoolProcess, class T>
class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_grad_process) {
PoolProcess pool_grad_process,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
......@@ -110,7 +111,7 @@ class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......@@ -154,10 +155,11 @@ template <class T>
class MaxPool2dGradFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
......@@ -176,7 +178,7 @@ class MaxPool2dGradFunctor<platform::CPUPlace, T> {
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......@@ -240,17 +242,17 @@ template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_process) {
const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_process, framework::Tensor* output) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int output_channels = output->dims()[1];
const int output_depth = output->dims()[2];
const int output_height = output->dims()[3];
const int output_width = output->dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
......@@ -265,7 +267,7 @@ class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace());
T* output_data = output->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......@@ -315,11 +317,12 @@ template <typename PoolProcess, class T>
class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_grad_process) {
PoolProcess pool_grad_process,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
......@@ -343,7 +346,7 @@ class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......@@ -398,10 +401,11 @@ template <class T>
class MaxPool3dGradFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
......@@ -425,7 +429,7 @@ class MaxPool3dGradFunctor<platform::CPUPlace, T> {
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......@@ -498,15 +502,15 @@ template <typename T>
class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* output, framework::Tensor* mask) {
const int batch_size = input.dims()[0];
const int input_height = input.dims()[2];
const int input_width = input.dims()[3];
const int output_channels = output.dims()[1];
const int output_height = output.dims()[2];
const int output_width = output.dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
const int ksize_height = ksize[0];
const int ksize_width = ksize[1];
const int stride_height = strides[0];
......@@ -517,8 +521,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
const int output_stride = output_height * output_width;
const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace());
T* mask_data = mask.mutable_data<T>(context.GetPlace());
T* output_data = output->mutable_data<T>(context.GetPlace());
T* mask_data = mask->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......@@ -563,13 +567,13 @@ template <typename T>
class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
framework::Tensor& input_grad,
const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
const int batch_size = input_grad.dims()[0];
const int input_height = input_grad.dims()[2];
const int input_width = input_grad.dims()[3];
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad) {
const int batch_size = input_grad->dims()[0];
const int input_height = input_grad->dims()[2];
const int input_width = input_grad->dims()[3];
const int output_channels = output_grad.dims()[1];
const int output_height = output_grad.dims()[2];
const int output_width = output_grad.dims()[3];
......@@ -578,7 +582,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
const T* mask_data = mask.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int n = 0; n < batch_size; ++n) {
for (int c = 0; c < output_channels; ++c) {
......@@ -612,17 +616,17 @@ template <typename T>
class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* output, framework::Tensor* mask) {
const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2];
const int input_height = input.dims()[3];
const int input_width = input.dims()[4];
const int output_channels = output.dims()[1];
const int output_depth = output.dims()[2];
const int output_height = output.dims()[3];
const int output_width = output.dims()[4];
const int output_channels = output->dims()[1];
const int output_depth = output->dims()[2];
const int output_height = output->dims()[3];
const int output_width = output->dims()[4];
const int ksize_depth = ksize[0];
const int ksize_height = ksize[1];
const int ksize_width = ksize[2];
......@@ -636,8 +640,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace());
T* mask_data = mask.mutable_data<T>(context.GetPlace());
T* output_data = output->mutable_data<T>(context.GetPlace());
T* mask_data = mask->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......@@ -691,14 +695,14 @@ template <typename T>
class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
public:
void operator()(const platform::DeviceContext& context,
framework::Tensor& input_grad,
const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) {
const int batch_size = input_grad.dims()[0];
const int input_depth = input_grad.dims()[2];
const int input_height = input_grad.dims()[3];
const int input_width = input_grad.dims()[4];
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad) {
const int batch_size = input_grad->dims()[0];
const int input_depth = input_grad->dims()[2];
const int input_height = input_grad->dims()[3];
const int input_width = input_grad->dims()[4];
const int output_channels = output_grad.dims()[1];
const int output_depth = output_grad.dims()[2];
const int output_height = output_grad.dims()[3];
......@@ -708,7 +712,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
const T* mask_data = mask.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int n = 0; n < batch_size; ++n) {
for (int c = 0; c < output_channels; ++c) {
......
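For context on the *WithIndex functors: the mask stores, for each pooled output element, the flat offset of the winning input element, and the backward functors scatter gradients straight to those offsets. A minimal single-channel forward sketch (illustrative only; fixed stride, no padding):

#include <cstdio>
#include <vector>

int main() {
  const int H = 4, W = 4, k = 2, stride = 2;  // 4x4 input, 2x2 window
  std::vector<float> in = {1, 2, 0, 3,
                           4, 5, 1, 2,
                           0, 1, 9, 8,
                           2, 3, 7, 6};
  const int oh = H / stride, ow = W / stride;
  std::vector<float> out(oh * ow);
  std::vector<int> mask(oh * ow);  // flat argmax offset into `in`
  for (int ph = 0; ph < oh; ++ph)
    for (int pw = 0; pw < ow; ++pw) {
      float best = -1e30f;
      int best_idx = -1;
      for (int h = ph * stride; h < ph * stride + k; ++h)
        for (int w = pw * stride; w < pw * stride + k; ++w)
          if (in[h * W + w] > best) {
            best = in[h * W + w];
            best_idx = h * W + w;
          }
      out[ph * ow + pw] = best;
      mask[ph * ow + pw] = best_idx;
    }
  for (int i = 0; i < oh * ow; ++i)
    std::printf("out[%d] = %g (from input offset %d)\n", i, out[i], mask[i]);
  return 0;
}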
......@@ -88,60 +88,62 @@ template <typename Place, typename PoolProcess, typename T>
class Pool2dFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_compute);
const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_compute, framework::Tensor* output);
};
template <typename Place, typename PoolProcess, typename T>
class Pool2dGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_compute);
PoolProcess pool_compute, framework::Tensor* input_grad);
};
template <typename Place, class T>
class MaxPool2dGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings);
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad);
};
template <typename Place, typename PoolProcess, typename T>
class Pool3dFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
std::vector<int>& ksize, std::vector<int>& strides,
std::vector<int>& paddings, PoolProcess pool_compute);
const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_compute, framework::Tensor* output);
};
template <typename Place, typename PoolProcess, typename T>
class Pool3dGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_compute);
PoolProcess pool_compute, framework::Tensor* input_grad);
};
template <typename Place, class T>
class MaxPool3dGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad,
const framework::Tensor& input,
const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings);
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad);
};
/*
......@@ -155,38 +157,38 @@ template <typename Place, typename T>
class MaxPool2dWithIndexFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings);
const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* output, framework::Tensor* mask);
};
template <typename Place, typename T>
class MaxPool2dWithIndexGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
framework::Tensor& input_grad,
const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings);
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad);
};
template <typename Place, typename T>
class MaxPool3dWithIndexFunctor {
public:
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output,
framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings);
const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* output, framework::Tensor* mask);
};
template <typename Place, typename T>
class MaxPool3dWithIndexGradFunctor {
public:
void operator()(const platform::DeviceContext& context,
framework::Tensor& input_grad,
const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings);
std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad);
};
} // namespace math
......
......@@ -74,11 +74,10 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
Tensor output;
auto in_dims = input.dims();
if (in_dims.size() == 3) {
output.Resize(in_dims);
output.Resize({in_dims[1], in_dims[0], in_dims[2]});
output.mutable_data<T>(context.GetPlace());
EigenTranspose<Place, T, 3>(context, input, output, {1, 0, 2});
std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
output.Resize(make_ddim(out_dims));
output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
} else {
output.ShareDataWith(input);
}
......
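The CombineBatchAndN fix above resizes the temporary to {d1, d0, d2} before the axes-{1, 0, 2} transpose (the old code kept the input's dims, which mismatched the permuted layout) and only then flattens to {d1, d0 * d2}. The index math, checked standalone:

#include <cstdio>
#include <vector>

int main() {
  const int d0 = 2, d1 = 3, d2 = 2;  // input shape: d0 x d1 x d2
  std::vector<int> in(d0 * d1 * d2);
  for (int i = 0; i < static_cast<int>(in.size()); ++i) in[i] = i;
  // Transpose axes {1, 0, 2}: the result has shape d1 x d0 x d2.
  std::vector<int> out(in.size());
  for (int a = 0; a < d0; ++a)
    for (int b = 0; b < d1; ++b)
      for (int c = 0; c < d2; ++c)
        out[(b * d0 + a) * d2 + c] = in[(a * d1 + b) * d2 + c];
  // Flattening to {d1, d0 * d2} is then just a reinterpretation of `out`.
  for (int b = 0; b < d1; ++b) {
    for (int j = 0; j < d0 * d2; ++j) std::printf("%2d ", out[b * d0 * d2 + j]);
    std::printf("\n");
  }
  return 0;
}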
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memcpy.h"
namespace paddle {
namespace operators {
using LoD = framework::LoD;
class MergeLoDTensorOp : public framework::OperatorBase {
public:
MergeLoDTensorOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
auto &in_false =
scope.FindVar(Input("InFalse"))->Get<framework::LoDTensor>();
auto *out =
scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto level = static_cast<size_t>(Attr<int>("level"));
auto &mask_dim = mask.dims();
std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
if (platform::is_cpu_place(mask.place())) {
cpu_mask->ShareDataWith(mask);
} else if (platform::is_gpu_place(mask.place())) {
#ifdef PADDLE_WITH_CUDA
cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
#else
PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
}
auto *mask_data = cpu_mask->data<bool>();
int rank = in_true.dims().size();
platform::Place place = in_true.place();
std::type_index data_type = in_true.type();
framework::DDim in_true_dims =
framework::slice_ddim(in_true.dims(), 1, rank);
int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
auto in_true_dim_vec = framework::vectorize(in_true_dims);
in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size);
framework::DDim out_dims = framework::make_ddim(in_true_dim_vec);
out->Resize(out_dims);
out->mutable_data(place, data_type);
auto *out_lod = out->mutable_lod();
out_lod->clear();
size_t out_offset = 0;
// Build LoDTensor `out`
size_t in_true_idx = 0;
size_t in_false_idx = 0;
for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
const framework::LoDTensor *input = nullptr;
size_t *in_idx = nullptr;
if (static_cast<int>(mask_data[i]) == 0) {
input = &in_false;
in_idx = &in_false_idx;
} else {
input = &in_true;
in_idx = &in_true_idx;
}
auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
input->lod(), *in_idx, (*in_idx) + 1, 0);
auto &lod_length = lod_and_offset.first;
framework::AppendLoD(out_lod, lod_length);
size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second;
PADDLE_ENFORCE_GE(end_offset, start_offset);
size_t len = end_offset - start_offset;
if (len == 0) {
continue;
}
out->Slice(out_offset, out_offset + len)
.CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx);
out_offset += len;
(*in_idx) += 1;
}
for (size_t i = 0; i < level; i++) {
out_lod->insert(out_lod->begin(), x.lod()[i]);
}
}
};
class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
MergeLoDTensorOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"The input LoDTensor, contains complete lod information to "
"construct the output");
AddInput("Mask", "A bool column vector which mask the input");
AddInput("InTrue", "The True branch to be merged");
AddInput("InFalse", "The False branch to be merged");
AddOutput("Out", "The merged output LoDTensor");
AddAttr<int>("level", "(int) the specific lod level to rank.")
.SetDefault(0)
.EqualGreaterThan(0);
AddComment(
R"DOC(
Merge True and False branches of LoDTensor into a single Output,
with a mask at a certain lod level. X is used to obtain complete
lod information. Please refer to SplitLoDTensorOp.)DOC");
}
};
class MergeLoDTensorInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("X"),
"MergeLoDTensorOp must has input X.");
PADDLE_ENFORCE(context->HasInput("Mask"),
"MergeLoDTensorOp must has input Mask.");
PADDLE_ENFORCE(context->HasInput("InTrue"),
"MergeLoDTensorOp must has input InTrue.");
PADDLE_ENFORCE(context->HasInput("InFalse"),
"MergeLoDTensorOp must has input InFalse.");
PADDLE_ENFORCE(context->HasOutput("Out"),
"MergeLoDTensorOp must has output Out");
auto mask_dim = context->GetInputDim("Mask");
PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
PADDLE_ENFORCE_EQ(mask_dim[1], 1);
context->SetOutputDim("Out", context->GetInputDim("InTrue"));
}
};
class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto *grad_op = new framework::OpDescBind();
grad_op->SetType("split_lod_tensor");
grad_op->SetInput("X", OutputGrad("Out"));
grad_op->SetInput("Mask", Input("Mask"));
grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp,
ops::MergeLoDTensorOpProtoMaker,
ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker);
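At the row level, MergeLoDTensorOp walks the mask and copies the next unread piece of InTrue or InFalse into Out. Ignoring the LoD bookkeeping and treating every piece as a single row, a hedged numpy sketch of that interleaving (the function name and shapes are illustrative, not the operator's API):

```python
import numpy as np

def merge_by_mask(mask, in_true, in_false):
    """Interleave rows: mask[i] == 1 consumes the next row of in_true,
    mask[i] == 0 consumes the next row of in_false."""
    out, true_it, false_it = [], iter(in_true), iter(in_false)
    for m in mask:
        out.append(next(true_it) if m else next(false_it))
    return np.stack(out)

mask = [0, 1, 0]
in_true = np.array([[10.0, 11.0]])
in_false = np.array([[1.0, 2.0], [3.0, 4.0]])
print(merge_by_mask(mask, in_true, in_false))
# [[ 1.  2.]
#  [10. 11.]
#  [ 3.  4.]]
```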
......@@ -75,16 +75,16 @@ class PoolKernel : public framework::OpKernel<T> {
Place, paddle::operators::math::MaxPool<T>, T>
pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
paddings, pool_process);
pool2d_forward(context.device_context(), *in_x, ksize, strides,
paddings, pool_process, out);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool2dFunctor<
Place, paddle::operators::math::AvgPool<T>, T>
pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
paddings, pool_process);
pool2d_forward(context.device_context(), *in_x, ksize, strides,
paddings, pool_process, out);
}
} break;
case 3: {
......@@ -93,15 +93,15 @@ class PoolKernel : public framework::OpKernel<T> {
Place, paddle::operators::math::MaxPool<T>, T>
pool3d_forward;
paddle::operators::math::MaxPool<T> pool_process;
pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
paddings, pool_process);
pool3d_forward(context.device_context(), *in_x, ksize, strides,
paddings, pool_process, out);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool3dFunctor<
Place, paddle::operators::math::AvgPool<T>, T>
pool3d_forward;
paddle::operators::math::AvgPool<T> pool_process;
pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
paddings, pool_process);
pool3d_forward(context.device_context(), *in_x, ksize, strides,
paddings, pool_process, out);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
......@@ -142,30 +142,30 @@ class PoolGradKernel : public framework::OpKernel<T> {
if (pooling_type == "max") {
paddle::operators::math::MaxPool2dGradFunctor<Place, T>
pool2d_backward;
pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
*out_grad, ksize, strides, paddings);
pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
ksize, strides, paddings, in_x_grad);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool2dGradFunctor<
Place, paddle::operators::math::AvgPoolGrad<T>, T>
pool2d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
*out_grad, ksize, strides, paddings, pool_process);
pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
ksize, strides, paddings, pool_process, in_x_grad);
}
} break;
case 3: {
if (pooling_type == "max") {
paddle::operators::math::MaxPool3dGradFunctor<Place, T>
pool3d_backward;
pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
*out_grad, ksize, strides, paddings);
pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
ksize, strides, paddings, in_x_grad);
} else if (pooling_type == "avg") {
paddle::operators::math::Pool3dGradFunctor<
Place, paddle::operators::math::AvgPoolGrad<T>, T>
pool3d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process;
pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
*out_grad, ksize, strides, paddings, pool_process);
pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
ksize, strides, paddings, pool_process, in_x_grad);
}
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
......
......@@ -46,14 +46,14 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
case 2: {
paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
pool2d_forward;
pool2d_forward(context.device_context(), *in_x, *out, *mask, ksize,
strides, paddings);
pool2d_forward(context.device_context(), *in_x, ksize, strides,
paddings, out, mask);
} break;
case 3: {
paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
pool3d_forward;
pool3d_forward(context.device_context(), *in_x, *out, *mask, ksize,
strides, paddings);
pool3d_forward(context.device_context(), *in_x, ksize, strides,
paddings, out, mask);
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
}
......@@ -89,14 +89,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
case 2: {
paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
pool2d_backward;
pool2d_backward(context.device_context(), *in_x_grad, *out_grad,
*mask, ksize, strides, paddings);
pool2d_backward(context.device_context(), *out_grad, *mask, ksize,
strides, paddings, in_x_grad);
} break;
case 3: {
paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
pool3d_backward;
pool3d_backward(context.device_context(), *in_x_grad, *out_grad,
*mask, ksize, strides, paddings);
pool3d_backward(context.device_context(), *out_grad, *mask, ksize,
strides, paddings, in_x_grad);
} break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
}
......
......@@ -47,7 +47,7 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(vector<LoDTensor>) Input is a vector of LoDTensor, "
"(LodTensorArray) Input is a vector of LoDTensor, "
"each of which is a variable-length sequence or nested sequence.")
.AsDuplicable();
AddOutput("Out",
......
......@@ -126,6 +126,7 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
Eigen::DSizes<int, 2> bcast(h, 1);
if (pooltype == "AVERAGE") {
......@@ -136,9 +137,9 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
in_g_e.device(place) =
(out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
} else if (pooltype == "LAST") {
in_g_e.chip(h - 1, 0).device(place) = out_g_e;
in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
} else if (pooltype == "FIRST") {
in_g_e.chip(0, 0).device(place) = out_g_e;
in_g_e.chip(0, 0).device(place) = out_g_e_v;
} else {
PADDLE_THROW("unsupported pooling pooltype");
}
......
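For the LAST and FIRST branches above, the output gradient is flattened to a vector before being assigned into a single time step via chip(), while AVERAGE broadcasts it over all steps. A numpy sketch of what each branch computes for one sequence (shapes are illustrative):

```python
import numpy as np

h, w = 4, 3                            # sequence length, feature width
out_g = np.array([0.5, -1.0, 2.0])     # gradient of the pooled row, shape (w,)

in_g_last = np.zeros((h, w))
in_g_last[h - 1] = out_g               # LAST: only the final step gets gradient

in_g_first = np.zeros((h, w))
in_g_first[0] = out_g                  # FIRST: only the first step gets gradient

in_g_avg = np.tile(out_g / h, (h, 1))  # AVERAGE: spread evenly over h steps
```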
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/memory/memcpy.h"
namespace paddle {
namespace operators {
struct CopyRange {
size_t begin;
size_t end;
};
using LoD = framework::LoD;
class SplitLoDTensorOp : public framework::OperatorBase {
public:
SplitLoDTensorOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::DeviceContext &dev_ctx) const override {
auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
auto *out_true =
scope.FindVar(Output("OutTrue"))->GetMutable<framework::LoDTensor>();
auto *out_false =
scope.FindVar(Output("OutFalse"))->GetMutable<framework::LoDTensor>();
auto level = static_cast<size_t>(Attr<int>("level"));
auto &x_lod = x.lod();
auto &mask_dim = mask.dims();
std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
if (platform::is_cpu_place(mask.place())) {
cpu_mask->ShareDataWith(mask);
} else if (platform::is_gpu_place(mask.place())) {
#ifdef PADDLE_WITH_CUDA
cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
#else
PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
#endif
}
auto *mask_data = cpu_mask->data<bool>();
std::vector<std::vector<CopyRange>> copy_ranges(mask_dim[0]);
// set out_true/out_false lod
for (size_t t = 0; t < 2; t++) {
LoD *lod = nullptr;
if (t == 0) {
lod = out_false->mutable_lod();
} else {
lod = out_true->mutable_lod();
}
lod->clear();
for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
if (static_cast<size_t>(mask_data[i]) == t) {
size_t start_idx = i;
auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
x_lod, start_idx, start_idx + 1, level);
auto &lod_length = lod_and_offset.first;
framework::AppendLoD(lod, lod_length);
size_t start_offset = lod_and_offset.second.first;
size_t end_offset = lod_and_offset.second.second;
copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
}
}
}
for (size_t t = 0; t < 2; ++t) {
framework::LoDTensor *out;
if (t == 0) {
out = out_false;
} else {
out = out_true;
}
auto &ranges = copy_ranges[t];
size_t height = std::accumulate(
ranges.begin(), ranges.end(), 0UL,
[](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
auto x_dim = x.dims();
x_dim[0] = static_cast<int64_t>(height);
out->Resize(x_dim);
out->mutable_data(x.place(), x.type());
size_t offset = 0;
for (auto &each_range : ranges) {
size_t len = each_range.end - each_range.begin;
if (len == 0) {
continue;
}
// out[offset: offset+len] = x[each_range.begin: each_range.end]
out->Slice(static_cast<int>(offset), static_cast<int>(offset + len))
.CopyFrom(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)),
x.place(), dev_ctx);
offset += len;
}
}
}
};
class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
SplitLoDTensorOpProtoMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The input LoDTensor");
AddInput("Mask", "A bool column vector which mask the input");
AddOutput("OutTrue", "True branch of input LoDTensor");
AddOutput("OutFalse", "False branch of input LoDTensor");
AddAttr<int>("level", "(int) the specific lod level to split.")
.SetDefault(0)
.EqualGreaterThan(0);
AddComment(
R"DOC(
Split a LoDTensor with a Mask at a certain level. Suppose the input LoDTensor
has 3 sequences at that lod level, and the Mask is a bool column vector at the
same level, such as [0, 1, 0]. Then the first and third sequences are sent to
the False output LoDTensor, whereas the second sequence is sent to the True
output LoDTensor. Please refer to MergeLoDTensorOp.)DOC");
}
};
class SplitLoDTensorInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("X"),
"SplitLoDTensorOp must has input X.");
PADDLE_ENFORCE(context->HasInput("Mask"),
"SplitLoDTensorOp must has input Mask.");
PADDLE_ENFORCE(context->HasOutput("OutTrue"),
"SplitLoDTensorOp must has output OutTrue.");
PADDLE_ENFORCE(context->HasOutput("OutFalse"),
"SplitLoDTensorOp must has output OutFalse.");
auto mask_dim = context->GetInputDim("Mask");
PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
PADDLE_ENFORCE_EQ(mask_dim[1], 1);
context->SetOutputDim("OutTrue", context->GetInputDim("X"));
context->SetOutputDim("OutFalse", context->GetInputDim("X"));
}
};
class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDescBind> Apply() const override {
auto *grad_op = new framework::OpDescBind();
grad_op->SetType("merge_lod_tensor");
grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
grad_op->SetInput("Mask", Input("Mask"));
grad_op->SetInput("X", Input("X"));
grad_op->SetOutput("Out", InputGrad("X"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDescBind>(grad_op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp,
ops::SplitLoDTensorOpProtoMaker,
ops::SplitLoDTensorInferShape,
ops::SplitLoDTensorArrayGradMaker);
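SplitLoDTensorOp performs the inverse routing: each piece of X goes to OutTrue or OutFalse according to the mask, which is why the grad maker above registers merge_lod_tensor as its backward op (and MergeLoDTensorGradMaker registers split_lod_tensor, making the pair mutually inverse). A row-level numpy sketch that ignores the LoD structure (names are illustrative):

```python
import numpy as np

def split_by_mask(mask, x):
    """Route each row of x to the True or False branch by a boolean mask."""
    mask = np.asarray(mask, dtype=bool)
    x = np.asarray(x)
    return x[mask], x[~mask]        # (out_true, out_false)

x = np.array([[1.0, 2.0], [10.0, 11.0], [3.0, 4.0]])
out_true, out_false = split_by_mask([0, 1, 0], x)
# out_true  == [[10., 11.]]
# out_false == [[ 1.,  2.], [ 3.,  4.]]
```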
......@@ -29,7 +29,7 @@ class SquaredL2NormKernel : public framework::OpKernel<T> {
Out->mutable_data<T>(context.GetPlace());
auto x = framework::EigenVector<T>::Flatten(*X);
auto out = framework::EigenVector<T>::Flatten(*Out);
auto out = framework::EigenScalar<T>::From(*Out);
auto place = context.GetEigenDevice<Place>();
out.device(place) = x.square().sum();
......
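The squared-L2-norm kernel reduces the whole input to a single scalar, which is why the output view becomes an EigenScalar rather than a length-1 EigenVector. Numerically the op is just a sum of squares:

```python
import numpy as np

x = np.array([[1.0, -2.0], [3.0, 0.5]])
squared_l2 = np.square(x).sum()   # 1 + 4 + 9 + 0.25 = 14.25
```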
......@@ -42,6 +42,9 @@ limitations under the License. */
#include "paddle/platform/gpu_info.h"
#endif
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
namespace paddle {
namespace pybind {
static size_t UniqueIntegerGenerator(const std::string &prefix) {
......
......@@ -37,10 +37,10 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
DEPENDS paddle_pybind)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
......@@ -66,7 +66,7 @@ if (WITH_TESTING)
add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/plot/tests)
add_subdirectory(paddle/v2/framework/tests)
add_subdirectory(paddle/v2/fluid/tests)
endif()
endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
......
......@@ -1200,8 +1200,14 @@ def TestData(data_config, async_load_data=None):
#caffe_mode: compute the output size using floor instead of ceil,
# which is consistent with caffe's and CuDNN's convention.
def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
output = (2 * padding + img_size - filter_size) / float(stride)
def cnn_output_size(img_size,
filter_size,
padding,
stride,
caffe_mode,
dilation=1):
filter_s = (filter_size - 1) * dilation + 1
output = (2 * padding + img_size - filter_s) / float(stride)
if caffe_mode:
return 1 + int(math.floor(output))
else:
......@@ -1210,8 +1216,14 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
#calculate image_size based on output_size for de-convolution (ConvTransLayer).
#It is the reverse function of cnn_output_size
def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
img_size = (output_size - 1) * stride + filter_size - 2 * padding
def cnn_image_size(output_size,
filter_size,
padding,
stride,
caffe_mode,
dilation=1):
filter_s = (filter_size - 1) * dilation + 1
img_size = (output_size - 1) * stride + filter_s - 2 * padding
if not caffe_mode:
img_size = img_size + 1
return img_size
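The dilation support rests on one identity: a filter of size k with dilation d covers an effective extent of (k - 1) * d + 1 input pixels, which is exactly the filter_s computed above. A quick self-contained check (the non-caffe ceil branch is assumed from convention, since the hunk is truncated):

```python
import math

def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode,
                    dilation=1):
    filter_s = (filter_size - 1) * dilation + 1
    output = (2 * padding + img_size - filter_s) / float(stride)
    # caffe_mode uses floor; the other branch is assumed to use ceil.
    return 1 + int(math.floor(output) if caffe_mode else math.ceil(output))

# A 3x3 filter with dilation 2 covers the same extent as a plain 5x5 filter:
assert cnn_output_size(32, 3, 0, 1, True, dilation=2) == \
       cnn_output_size(32, 5, 0, 1, True) == 28
```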
......@@ -1253,9 +1265,9 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
pool_conf.pool_type = pool.pool_type
config_assert(pool.pool_type in [
'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
], "pool-type %s is not in "
"['max-projection', 'avg-projection', "
'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
], "pool-type %s is not in " \
"['max-projection', 'avg-projection', 'max-pool-with-mask'," \
"'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)
pool_conf.channels = pool.channels
......@@ -1376,6 +1388,12 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
conv_conf.stride_y = conv.stride_y
conv_conf.groups = conv.groups
conv_conf.caffe_mode = conv.caffe_mode
if not conv.dilation:
conv.dilation = 1
conv.dilation_y = 1
else:
conv_conf.dilation = conv.dilation
conv_conf.dilation_y = conv.dilation_y
if not trans:
conv_conf.filter_channels = conv.channels / conv.groups
......@@ -1383,20 +1401,20 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
get_img_size(input_layer_name, conv.channels)
conv_conf.output_x = cnn_output_size(
conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
conv_conf.stride, conv_conf.caffe_mode)
conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
conv_conf.output_y = cnn_output_size(
conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
conv_conf.stride_y, conv_conf.caffe_mode)
conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
else:
conv_conf.filter_channels = num_filters / conv.groups
conv_conf.output_x, conv_conf.output_y = \
get_img_size(input_layer_name, conv.channels)
conv_conf.img_size = cnn_image_size(
conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
conv_conf.stride, conv_conf.caffe_mode)
conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
conv_conf.img_size_y = cnn_image_size(
conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
conv_conf.stride_y, conv_conf.caffe_mode)
conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
#caffe_mode: compute the output size using floor instead of ceil,
......
......@@ -20,7 +20,7 @@ from paddle.trainer.config_parser import *
from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
from .evaluators import *
from .poolings import MaxPooling, AvgPooling, BasePoolingType, \
from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
CudnnAvgPooling, CudnnMaxPooling
from .attrs import *
from .default_decorators import *
......@@ -2571,7 +2571,9 @@ def img_conv_layer(input,
if layer_type:
if dilation > 1 or dilation_y > 1:
assert layer_type in ["cudnn_conv", "cudnn_convt"]
assert layer_type in [
"cudnn_conv", "cudnn_convt", "exconv", "exconvt"
]
if trans:
assert layer_type in ["exconvt", "cudnn_convt"]
else:
......@@ -2699,9 +2701,9 @@ def img_pool_layer(input,
elif isinstance(pool_type, AvgPooling):
pool_type.name = 'avg'
assert type(pool_type) in [AvgPooling, MaxPooling, CudnnAvgPooling,
assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
CudnnMaxPooling], \
"only (Cudnn)AvgPooling, (Cudnn)MaxPooling are supported"
"only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"
type_name = pool_type.name + '-projection' \
if (
......@@ -3592,10 +3594,9 @@ def lstm_step_layer(input,
:type gate_act: BaseActivation
:param state_act: State Activation Type. TanhActivation is the default.
:type state_act: BaseActivation
:param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any
:param bias_attr: The parameter attribute for bias. If this parameter is
set to True or None, the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | True
:param layer_attr: layer's extra attribute.
:type layer_attr: ExtraLayerAttribute
:return: LayerOutput object.
......@@ -3650,9 +3651,10 @@ def gru_step_layer(input,
:param name: The name of this layer. It is optional.
:param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
:type gate_act: BaseActivation
:param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero.
:param bias_attr: The parameter attribute for bias. If this parameter is set to
False or an object whose type is not ParameterAttribute, no bias
is defined. If this parameter is set to True,
the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any
:param param_attr: the parameter_attribute for transforming the output_mem
from previous step.
......@@ -3712,9 +3714,10 @@ def gru_step_naive_layer(input,
:type act: BaseActivation
:param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
:type gate_act: BaseActivation
:param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero.
:param bias_attr: The parameter attribute for bias. If this parameter is set to
False or an object whose type is not ParameterAttribute, no bias
is defined. If this parameter is set to True,
the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any
:param param_attr:
:param layer_attr:
......@@ -3844,9 +3847,10 @@ def recurrent_layer(input,
:type input: LayerOutput
:param act: Activation type. TanhActivation is the default.
:type act: BaseActivation
:param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero.
:param bias_attr: The parameter attribute for bias. If this parameter is set to
False or an object whose type is not ParameterAttribute,
no bias is defined. If the parameter is set to True,
the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any
:param param_attr: parameter attribute.
:type param_attr: ParameterAttribute
......@@ -4836,9 +4840,10 @@ def tensor_layer(a,
:type act: BaseActivation
:param param_attr: The Parameter Attribute.
:type param_attr: ParameterAttribute
:param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero.
:param bias_attr: The parameter attribute for bias. If this parameter is set to
False or an object whose type is not ParameterAttribute,
no bias is defined. If this parameter is set to True,
the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute | None
......@@ -4900,9 +4905,10 @@ def selective_fc_layer(input,
:type act: BaseActivation
:param param_attr: The Parameter Attribute.
:type param_attr: ParameterAttribute
:param bias_attr: The bias attribute. If the parameter is set to False or an object
whose type is not ParameterAttribute, no bias is defined. If the
parameter is set to True, the bias is initialized to zero.
:param bias_attr: The parameter attribute for bias. If this parameter is set to
False or an object whose type is not ParameterAttribute,
no bias is defined. If this parameter is set to True,
the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute | None
......@@ -5585,10 +5591,10 @@ def nce_layer(input,
to the num_classes. Each member of the list defines
the probability of a class given input x.
:type neg_distribution: list | tuple | collections.Sequence | None
:param bias_attr: The attribute for bias. If this parameter is set False or
any object whose type is not ParameterAttribute, no bias
is added. If this parameter is set True, the bias is
initialized to zero.
:param bias_attr: The parameter attribute for bias. If this parameter is set to
False or an object whose type is not ParameterAttribute,
no bias is defined. If this parameter is set to True,
the bias is initialized to zero.
:type bias_attr: ParameterAttribute | None | bool | Any
:param layer_attr: Extra Layer Attribute.
:type layer_attr: ExtraLayerAttribute
......@@ -6519,9 +6525,9 @@ def gated_unit_layer(input,
:param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
for details.
:type gate_param_attr: ParameterAttribute
:param gate_bias_attr: The bias attribute of the gate. If the parameter is set to False or
:param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
an object whose type is not ParameterAttribute, no bias is defined.
If the parameter is set to True, the bias is initialized to zero.
If this parameter is set to True, the bias is initialized to zero.
:type gate_bias_attr: ParameterAttribute | bool | None | Any
:param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
details.
......@@ -6529,9 +6535,9 @@ def gated_unit_layer(input,
:param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
for details.
:type inproj_param_attr: ParameterAttribute
:param inproj_bias_attr: The bias attribute of the projection. If the parameter is set to False
:param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
or an object whose type is not ParameterAttribute, no bias is defined.
If the parameter is set to True, the bias is initialized to zero.
If this parameter is set to True, the bias is initialized to zero.
:type inproj_bias_attr: ParameterAttribute | bool | None | Any
:param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
details.
......
......@@ -681,34 +681,42 @@ def lstmemory_unit(input,
state_act=TanhActivation())
:param input: input layer.
:param input: Input layer.
:type input: LayerOutput
:param out_memory: output of previous time step
:param out_memory: The output of previous time step.
:type out_memory: LayerOutput | None
:param name: lstmemory unit name.
:param name: The lstmemory unit name.
:type name: basestring
:param size: lstmemory unit size.
:param size: The lstmemory unit size.
:type size: int
:param param_attr: parameter attribute, None means default attribute.
:param param_attr: The parameter attribute for the weights in
input to hidden projection.
None means default attribute.
:type param_attr: ParameterAttribute
:param act: last activation type of lstm.
:param act: The last activation type of lstm.
:type act: BaseActivation
:param gate_act: gate activation type of lstm.
:param gate_act: The gate activation type of lstm.
:type gate_act: BaseActivation
:param state_act: state activation type of lstm.
:param state_act: The state activation type of lstm.
:type state_act: BaseActivation
:param input_proj_bias_attr: bias attribute for input to hidden projection.
False means no bias, None means default bias.
:type input_proj_bias_attr: ParameterAttribute|False|None
:param input_proj_layer_attr: extra layer attribute for input to hidden
projection of the LSTM unit, such as dropout, error clipping.
:param input_proj_bias_attr: The parameter attribute for the bias in
input to hidden projection.
False or None means no bias.
If this parameter is set to True,
the bias is initialized to zero.
:type input_proj_bias_attr: ParameterAttribute|bool|None
:param input_proj_layer_attr: The extra layer attribute for
input to hidden projection of the LSTM unit,
such as dropout, error clipping.
:type input_proj_layer_attr: ExtraLayerAttribute
:param lstm_bias_attr: bias parameter attribute of lstm layer.
False means no bias, None means default bias.
:type lstm_bias_attr: ParameterAttribute|False|None
:param lstm_layer_attr: extra attribute of lstm layer.
:param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
False or None means no bias.
If this parameter is set to True,
the bias is initialized to zero.
:type lstm_bias_attr: ParameterAttribute|True|None
:param lstm_layer_attr: The extra attribute of lstm layer.
:type lstm_layer_attr: ExtraLayerAttribute
:return: lstmemory unit name.
:return: The lstmemory unit name.
:rtype: LayerOutput
"""
if size is None:
......@@ -786,34 +794,42 @@ def lstmemory_group(input,
gate_act=SigmoidActivation(),
state_act=TanhActivation())
:param input: input layer.
:param input: Input layer.
:type input: LayerOutput
:param size: lstmemory group size.
:param size: The lstmemory group size.
:type size: int
:param name: name of lstmemory group.
:param name: The name of lstmemory group.
:type name: basestring
:param out_memory: output of previous time step.
:param out_memory: The output of previous time step.
:type out_memory: LayerOutput | None
:param reverse: process the input in a reverse order or not.
:param reverse: Process the input in a reverse order or not.
:type reverse: bool
:param param_attr: parameter attribute, None means default attribute.
:param param_attr: The parameter attribute for the weights in
input to hidden projection.
None means default attribute.
:type param_attr: ParameterAttribute
:param act: last activation type of lstm.
:param act: The last activation type of lstm.
:type act: BaseActivation
:param gate_act: gate activation type of lstm.
:param gate_act: The gate activation type of lstm.
:type gate_act: BaseActivation
:param state_act: state activation type of lstm.
:param state_act: The state activation type of lstm.
:type state_act: BaseActivation
:param lstm_bias_attr: bias parameter attribute of lstm layer.
False means no bias, None means default bias.
:type lstm_bias_attr: ParameterAttribute|False|None
:param input_proj_bias_attr: bias attribute for input to hidden projection.
False means no bias, None means default bias.
:type input_proj_bias_attr: ParameterAttribute|False|None
:param input_proj_layer_attr: extra layer attribute for input to hidden
projection of the LSTM unit, such as dropout, error clipping.
:param input_proj_bias_attr: The parameter attribute for the bias in
input to hidden projection.
False or None means no bias.
If this parameter is set to True,
the bias is initialized to zero.
:type input_proj_bias_attr: ParameterAttribute|bool|None
:param input_proj_layer_attr: The extra layer attribute for
input to hidden projection of the LSTM unit,
such as dropout, error clipping.
:type input_proj_layer_attr: ExtraLayerAttribute
:param lstm_layer_attr: lstm layer's extra attribute.
:param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
False or None means no bias.
If this parameter is set to True,
the bias is initialized to zero.
:type lstm_bias_attr: ParameterAttribute|True|None
:param lstm_layer_attr: The extra attribute of lstm layer.
:type lstm_layer_attr: ExtraLayerAttribute
:return: the lstmemory group.
:rtype: LayerOutput
......
......@@ -15,8 +15,8 @@
"""
__all__ = [
"BasePoolingType", "MaxPooling", "AvgPooling", "CudnnMaxPooling",
"CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
"BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
"CudnnMaxPooling", "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
]
......@@ -55,6 +55,19 @@ class MaxPooling(BasePoolingType):
self.output_max_index = output_max_index
class MaxWithMaskPooling(BasePoolingType):
"""
MaxWithMask pooling.
Return not only the maximum value for each dimension in the sequence or time steps,
but also the location indices of the found maximum values.
"""
def __init__(self):
BasePoolingType.__init__(self, "max-pool-with-mask")
class CudnnMaxPooling(BasePoolingType):
"""
Cudnn max pooling only supports GPU. Return the maximum value in the
......
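MaxWithMaskPooling returns, alongside each pooled maximum, the location index at which that maximum was found. A hedged numpy sketch for a single 2-D channel, assuming indices are flattened as h * width + w (the function and shapes are illustrative, not the layer's API):

```python
import numpy as np

def max_pool_with_mask(img, ksize, stride):
    """2-D max pooling that also records the flat argmax index per window."""
    H, W = img.shape
    out_h = (H - ksize) // stride + 1
    out_w = (W - ksize) // stride + 1
    out = np.empty((out_h, out_w))
    mask = np.empty((out_h, out_w), dtype=np.int64)
    for i in range(out_h):
        for j in range(out_w):
            win = img[i * stride:i * stride + ksize,
                      j * stride:j * stride + ksize]
            r, c = np.unravel_index(np.argmax(win), win.shape)
            out[i, j] = win[r, c]
            mask[i, j] = (i * stride + r) * W + (j * stride + c)
    return out, mask

img = np.arange(16.0).reshape(4, 4)
out, mask = max_pool_with_mask(img, ksize=2, stride=2)
# out  == [[ 5.,  7.], [13., 15.]]
# mask == [[ 5,  7], [13, 15]]   (flat indices into the 4x4 input)
```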
......@@ -28,6 +28,8 @@ layers {
stride_y: 1
output_y: 227
img_size_y: 256
dilation: 1
dilation_y: 1
}
}
bias_parameter_name: "___conv_0__.wbias"
......
......@@ -28,6 +28,8 @@ layers {
stride_y: 1
output_y: 227
img_size_y: 256
dilation: 1
dilation_y: 1
}
}
bias_parameter_name: "___conv_0__.wbias"
......
......@@ -28,6 +28,8 @@ layers {
stride_y: 1
output_y: 48
img_size_y: 48
dilation: 1
dilation_y: 1
}
}
bias_parameter_name: "___conv_0__.wbias"
......
......@@ -30,6 +30,8 @@ layers {
stride_y: 1
output_y: 48
img_size_y: 48
dilation: 1
dilation_y: 1
}
}
bias_parameter_name: "___conv_0__.wbias"
......@@ -105,6 +107,8 @@ layers {
stride_y: 1
output_y: 24
img_size_y: 24
dilation: 1
dilation_y: 1
}
}
bias_parameter_name: "___conv_1__.wbias"
......
......@@ -30,6 +30,8 @@ layers {
stride_y: 1
output_y: 48
img_size_y: 48
dilation: 1
dilation_y: 1
}
}
bias_parameter_name: "___conv_0__.wbias"
......
......@@ -36,6 +36,8 @@ layers {
stride_y: 1
output_y: 14
img_size_y: 14
dilation: 1
dilation_y: 1
}
}
bias_parameter_name: "___conv_0__.wbias"
......
from paddle.v2.framework import framework as framework
from paddle.v2.fluid import framework as framework
__all__ = ['append_backward_ops']
......
......@@ -13,7 +13,7 @@ A `scoped_function` will take a `function` as input. That function will be
invoked in a new local scope.
"""
import paddle.v2.framework.core
import paddle.v2.fluid.core
import threading
__tl_scope__ = threading.local()
......@@ -27,13 +27,13 @@ __all__ = [
def get_cur_scope():
"""
Get current scope.
:rtype: paddle.v2.framework.core.Scope
:rtype: paddle.v2.fluid.core.Scope
"""
cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
if cur_scope_stack is None:
__tl_scope__.cur_scope = list()
if len(__tl_scope__.cur_scope) == 0:
__tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope())
__tl_scope__.cur_scope.append(paddle.v2.fluid.core.Scope())
return __tl_scope__.cur_scope[-1]
......
import paddle.v2.framework.op as op
import paddle.v2.fluid.op as op
import numpy as np
import paddle.v2.framework.core as core
import paddle.v2.fluid.core as core
def avg_accumulate(accumulated_var, per_eval, num_batches, place):
......@@ -22,7 +22,7 @@ class Evaluator(object):
NOTE: runs on CPUPlace() by default; running on GPUPlace doesn't improve performance much.
:param scope: the scope instance contains the input.
:type scope: paddle.v2.framework.core.scope
:type scope: paddle.v2.fluid.core.scope
:param operator: operator name for calculating the evaluation for each mini-batch.
:type operator: string
:param input: output variable name of forward network.
......
import paddle.v2.framework.core as core
from paddle.v2.framework.framework import Block, Program, g_main_program
import paddle.v2.fluid.core as core
from paddle.v2.fluid.framework import Block, Program, g_main_program
g_scope = core.Scope()
......
import paddle.v2.framework.core as core
import paddle.v2.framework.proto.framework_pb2 as framework_pb2
import paddle.v2.fluid.core as core
import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
import collections
import numpy as np
import copy
......@@ -285,7 +285,7 @@ class Operator(object):
self.desc.check_attrs()
no_kernel_op_set = {
'feed', 'fetch', 'save', 'load', 'recurrent',
'rnn_memory_helper_grad', 'while'
'rnn_memory_helper_grad', 'conditional_block', 'while'
}
if type not in no_kernel_op_set:
self.desc.infer_var_type(self.block.desc)
......
import paddle.v2.framework.framework as framework
import paddle.v2.fluid.framework as framework
import numpy as np
__all__ = [
......
import os
import cPickle as pickle
from paddle.v2.framework.framework import Program, Parameter, g_main_program, \
from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
Variable
__all__ = [
......
import copy
import itertools
from paddle.v2.framework.framework import Variable, g_main_program, \
from paddle.v2.fluid.framework import Variable, g_main_program, \
g_startup_program, unique_name, Program
from paddle.v2.framework.initializer import ConstantInitializer, \
from paddle.v2.fluid.initializer import ConstantInitializer, \
UniformInitializer, XavierInitializer
......
......@@ -3,8 +3,8 @@ import json
import logging
from collections import defaultdict
import paddle.v2.framework.core as core
import paddle.v2.framework.proto.framework_pb2 as framework_pb2
import paddle.v2.fluid.core as core
import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
......
import paddle.v2.framework.layers as layers
import paddle.v2.fluid.layers as layers
__all__ = ["simple_img_conv_pool", "sequence_conv_pool"]
......
import paddle.v2.framework.core as core
import paddle.v2.framework.proto.framework_pb2 as framework_pb2
import paddle.v2.fluid.core as core
import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
def get_all_op_protos():
......
from collections import defaultdict
import paddle.v2.framework.framework as framework
from paddle.v2.framework.framework import unique_name, Program
from paddle.v2.framework.backward import append_backward_ops
from paddle.v2.framework.initializer import ConstantInitializer
from paddle.v2.framework.regularizer import append_regularization_ops
from paddle.v2.framework.layer_helper import LayerHelper
import paddle.v2.fluid.framework as framework
from paddle.v2.fluid.framework import unique_name, Program
from paddle.v2.fluid.backward import append_backward_ops
from paddle.v2.fluid.initializer import ConstantInitializer
from paddle.v2.fluid.regularizer import append_regularization_ops
from paddle.v2.fluid.layer_helper import LayerHelper
__all__ = [
'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
......
import paddle.v2.framework.framework as framework
import paddle.v2.fluid.framework as framework
__all__ = [
'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer'
......