diff --git a/.gitignore b/.gitignore
index 1512c1438e9e0b0b7b6e0c273a24b273cb652b04..7480bd53a403d74932d56409fdb0a9dd7bb6b9d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,7 +21,7 @@ third_party/
 cmake-build-*
 
 # generated while compiling
-python/paddle/v2/framework/core.so
+python/paddle/v2/fluid/core.so
 paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 53a36f8f20d1143470928f57eda6f575d9048236..d5b55e1c95f248f551e6a0a3b39123169dd7784f 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -121,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat,
 
 paddle_matrix paddle_matrix_create_sparse(
     uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+#ifndef PADDLE_MOBILE_INFERENCE
   auto ptr = new paddle::capi::CMatrix();
   ptr->mat = paddle::Matrix::createSparseMatrix(
       height,
@@ -131,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse(
       false,
       useGpu);
   return ptr;
+#else
+  return nullptr;
+#endif
 }
 
 paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
@@ -140,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                             uint64_t colSize,
                                             float* valueArray,
                                             uint64_t valueSize) {
+#ifndef PADDLE_MOBILE_INFERENCE
   if (mat == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (rowArray == nullptr || colArray == nullptr ||
@@ -160,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
   } else {
     return kPD_NOT_SUPPORTED;
   }
+#else
+  return kPD_NOT_SUPPORTED;
+#endif
 }
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index bb5223f8a275fa2550bf8b7e94a9c4333de4c8c9..01b8bad2ee9f528f8622346f43b9ff82225a7e73 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height,
  * @param isBinary is binary (either 1 or 0 in matrix) or not.
  * @param useGpu is using GPU or not.
  * @return paddle_matrix.
+ * @note Mobile inference does not support this interface.
  */
 PD_API paddle_matrix paddle_matrix_create_sparse(
     uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
@@ -129,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
  * NULL if the matrix is binary.
  * @param [in] valueSize length of value array. Zero if the matrix is binary.
  * @return paddle_error
+ * @note Mobile inference does not support this interface.
  */
 PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                                    int* rowArray,
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index 0865b02c4f275f3d5069109917b05dff1393fc1e..efd1b7a73e1655f95eb83a5e2f59e82cbf7eba16 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -27,7 +27,9 @@ if(WITH_GPU)
     set_source_files_properties(${CUDA_CXX_SOURCES}
                                 PROPERTIES COMPILE_FLAGS "-D__NVCC__")
 else()
+    if (NOT MOBILE_INFERENCE)
     set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
+    endif()
 endif()
 
 set(CUDA_CU_SOURCES
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index 6b56d9ec8d3daae96aaaa04ed79cb637331e2281..89c1f48edacbe0a4432957fe066481412db7e6e1 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "hl_base.h"
 
 /**
- * @brief   Maximum pool forward.
+ * @brief   Maximum pool forward with Mask output.
  *
  * @param[in]   frameCnt    batch size of input image.
  * @param[in]   inputData   input data.
@@ -35,7 +35,7 @@ limitations under the License. */
  * @param[in]   paddingW    padding width.
  * @param[out]  tgtData     output data.
  * @param[in]   tgtStride   stride between output data samples.
- *
+ * @param[out]  maskData    the location indices of select max data.
  */
 extern void hl_maxpool_forward(const int frameCnt,
                                const real* inputData,
@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               real* maskData = NULL);
 
 /**
  * @brief   Maximum pool backward.
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index a76dbf0b6578de0606702ad1af227fbf6e1cd62e..968ed4840ffb0623b57bd6e6d839973e109394de 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               real* MaskData) {}
 
 inline void hl_maxpool_backward(const int frameCnt,
                                 const real* inputData,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index 58674febdc4a094c95ff03701e4586c32729847d..3699b1e8ae9d8f813439eaeaa760c4a9f6e100a0 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads,
                                  const int offsetH,
                                  const int offsetW,
                                  real* tgtData,
-                                 const int tgtStride) {
+                                 const int tgtStride,
+                                 real* maskData) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads,
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     real maxval = -FLT_MAX;
+    int max_index = -1;
     inputData += (frameNum * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        if (maxval < inputData[h * width + w])
-          maxval = inputData[h * width + w];
+        if (maxval < inputData[h * width + w]) {
+          max_index = h * width + w;
+          maxval = inputData[max_index];
+        }
       }
     }
     int tgtIndex =
         index % (pooledW * pooledH * channels) + frameNum * tgtStride;
     tgtData[tgtIndex] = maxval;
+    if (maskData != NULL) {
+      maskData[tgtIndex] = max_index;
+    }
   }
 }
 
@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt,
                         const int paddingH,
                         const int paddingW,
                         real* tgtData,
-                        const int tgtStride) {
+                        const int tgtStride,
+                        real* maskData) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt,
                                                          paddingH,
                                                          paddingW,
                                                          tgtData,
-                                                         tgtStride);
+                                                         tgtStride,
+                                                         maskData);
   CHECK_SYNC("hl_maxpool_forward failed");
 }
 
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 913cd0f81eaef37014f38c71e7c3d23bfeec1466..b3b9c45ded95ce2e735b8898d47760956dcacdce 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -377,6 +377,12 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
   return grad_op_descs;
 }
 
+static BlockDescBind* CreateStepBlock(
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx);
+
 std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     ProgramDescBind& program_desc, int block_idx,
     std::unordered_set<std::string>* no_grad_vars,
@@ -392,13 +398,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
 
     if ((*it)->Type() == "recurrent") {
       int step_block_idx = (*it)->GetBlockAttr("step_block");
-      auto backward_block_op_descs = MakeBlockBackward(
-          program_desc, step_block_idx, no_grad_vars, grad_to_var);
+      BlockDescBind* backward_block = CreateStepBlock(
+          program_desc, no_grad_vars, grad_to_var, step_block_idx);
+      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
+    } else if ((*it)->Type() == "conditional_block") {
       BlockDescBind* backward_block =
-          program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
-      for (auto& ptr : backward_block_op_descs) {
-        backward_block->AppendAllocatedOp(std::move(ptr));
-      }
+          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
+                          (*it)->GetBlockAttr("block"));
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
     } else {
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
@@ -449,6 +455,21 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
   return backward_descs;
 }
 
+static BlockDescBind* CreateStepBlock(
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx) {
+  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
+                                                   no_grad_vars, grad_to_var);
+  BlockDescBind* backward_block =
+      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
+  for (auto& ptr : backward_block_op_descs) {
+    backward_block->AppendAllocatedOp(move(ptr));
+  }
+  return backward_block;
+}
+
 ParamGradInfoMap AppendBackward(
     ProgramDescBind& program_desc, const VarDescBind& target,
     const std::unordered_set<std::string>& no_grad_vars) {
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
index d060196bb2c478b776851288cb71a1880d60660d..0f19870bec3e69d07278507cc556a86bbd25d12d 100644
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -27,10 +27,32 @@ inline VarDesc::VarType ToVarType(std::type_index type) {
     return VarDesc_VarType_LOD_RANK_TABLE;
   } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
     return VarDesc_VarType_LOD_TENSOR_ARRAY;
+  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
+    return VarDesc_VarType_SELECTED_ROWS;
   } else {
     PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
   }
 }
 
+template <typename Visitor>
+inline void VisitVarType(const Variable& var, Visitor visitor) {
+  switch (ToVarType(var.Type())) {
+    case VarDesc_VarType_LOD_TENSOR:
+      visitor(var.Get<framework::LoDTensor>());
+      return;
+    case VarDesc_VarType_LOD_RANK_TABLE:
+      visitor(var.Get<LoDRankTable>());
+      return;
+    case VarDesc_VarType_LOD_TENSOR_ARRAY:
+      visitor(var.Get<LoDTensorArray>());
+      return;
+    case VarDesc_VarType_SELECTED_ROWS:
+      visitor(var.Get<SelectedRows>());
+      return;
+    default:
+      PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h
index baf78bc6c88d0d294f4457b81c52b22e425d9fdb..062ea25a11470dd9ecdafb278dee9a2e0979f00b 100644
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -61,6 +61,7 @@ public:
     // function arguments
     strides_ = config.get<std::vector<size_t>>("strides");
     paddings_ = config.get<std::vector<size_t>>("paddings");
+    dilations_ = config.get<std::vector<size_t>>("dilations");
     groups_ = config.get<size_t>("groups");
 
     // number of inputs and outputs
@@ -118,6 +119,7 @@ protected:
 
   std::vector<size_t> strides_;
   std::vector<size_t> paddings_;
+  std::vector<size_t> dilations_;
 
   /// Group size, refer to grouped convolution in
   /// Alex Krizhevsky's paper: when group=2, the first half of the
@@ -133,6 +135,10 @@ protected:
 
   inline int paddingW() const { return paddings_[1]; }
 
+  inline int dilationH() const { return dilations_[0]; }
+
+  inline int dilationW() const { return dilations_[1]; }
+
   // A temporary memory in convolution calculation.
   MemoryHandlePtr memory_;
 
diff --git a/paddle/function/ConvOpTest.h b/paddle/function/ConvOpTest.h
index cb02a96d0dbef6f64fd9e7576179572e68bf5513..d8d3c792df236ab0fd412b0cf77f275355848627 100644
--- a/paddle/function/ConvOpTest.h
+++ b/paddle/function/ConvOpTest.h
@@ -79,45 +79,59 @@ void Convolution(const std::string& conv1,
             if (outputChannels < inputChannels) continue;
             for (size_t stride : {1, 2}) {
               for (size_t padding : {0, 1}) {
-                if (padding >= filterSize) break;
+                for (size_t dilation : {1, 3}) {
+                  if (padding >= filterSize) break;
+                  size_t filterS = (filterSize - 1) * dilation + 1;
 
-                // NNPACK only supports stride = 1 if batchSize > 1
-                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
-                    batchSize > 1 && stride > 1)
-                  break;
+                  if (inputSize + 2 * padding < filterS) break;
 
-                size_t outputSize =
-                    (inputSize - filterSize + 2 * padding + stride) / stride;
-                VLOG(3) << " batchSize=" << batchSize
-                        << " inputChannels=" << inputChannels
-                        << " inputHeight=" << inputSize
-                        << " inputWidth=" << inputSize
-                        << " outputChannels=" << outputChannels
-                        << " filterHeight=" << filterSize
-                        << " filterWidth=" << filterSize
-                        << " outputHeight=" << outputSize
-                        << " outputWidth=" << outputSize << " stride=" << stride
-                        << " padding=" << padding;
+                  if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
+                       conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
+                      dilation > 1)
+                    break;
 
-                std::vector<size_t> paddings = {padding, padding};
-                std::vector<size_t> strides = {stride, stride};
-                Compare2Function<DType1, DType2> test(
-                    conv1,
-                    conv2,
-                    FuncConfig()
-                        .set("paddings", paddings)
-                        .set("strides", strides)
-                        .set("groups", (size_t)1)
-                        .set("algo", (std::string) "auto"));
+                  // NNPACK only supports stride = 1 if batchSize > 1
+                  if ((conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
+                      batchSize > 1 && stride > 1)
+                    break;
 
-                TensorShape input{
-                    batchSize, inputChannels, inputSize, inputSize};
-                TensorShape filter{
-                    outputChannels, inputChannels, filterSize, filterSize};
-                TensorShape output{
-                    batchSize, outputChannels, outputSize, outputSize};
+                  size_t outputSize =
+                      (inputSize - filterS + 2 * padding + stride) / stride;
+                  VLOG(3) << " batchSize=" << batchSize
+                          << " inputChannels=" << inputChannels
+                          << " inputHeight=" << inputSize
+                          << " inputWidth=" << inputSize
+                          << " outputChannels=" << outputChannels
+                          << " filterHeight=" << filterSize
+                          << " filterWidth=" << filterSize
+                          << " outputHeight=" << outputSize
+                          << " outputWidth=" << outputSize
+                          << " stride=" << stride << " padding=" << padding;
 
-                function(test, input, filter, output);
+                  std::vector<size_t> paddings = {padding, padding};
+                  std::vector<size_t> strides = {stride, stride};
+                  std::vector<size_t> dilations = {dilation, dilation};
+                  Compare2Function<DType1, DType2> test(
+                      conv1,
+                      conv2,
+                      FuncConfig()
+                          .set("paddings", paddings)
+                          .set("strides", strides)
+                          .set("dilations", dilations)
+                          .set("groups", (size_t)1)
+                          .set("algo", (std::string) "auto"));
+
+                  TensorShape input{
+                      batchSize, inputChannels, inputSize, inputSize};
+                  TensorShape filter{
+                      outputChannels, inputChannels, filterSize, filterSize};
+                  TensorShape output{
+                      batchSize, outputChannels, outputSize, outputSize};
+
+                  function(test, input, filter, output);
+                }
               }
             }
           }
@@ -144,6 +158,7 @@ void Convolution2(const std::string& conv1,
               for (size_t outputChannels : {7}) {
                 size_t stride = 1;
                 size_t padding = 0;
+                size_t dilation = 1;
                 size_t outputHeight =
                     (inputHeight - filterHeight + 2 * padding + stride) /
                     stride;
@@ -162,6 +177,7 @@ void Convolution2(const std::string& conv1,
 
                 std::vector<size_t> paddings = {padding, padding};
                 std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {dilation, dilation};
                 Compare2Function<DType1, DType2> test(
                     conv1,
                     conv2,
@@ -169,6 +185,7 @@ void Convolution2(const std::string& conv1,
                         .set("paddings", paddings)
                         .set("strides", strides)
                         .set("groups", (size_t)1)
+                        .set("dilations", dilations)
                         .set("algo", (std::string) "auto"));
 
                 TensorShape input{
@@ -223,6 +240,7 @@ void DepthwiseConvolution(const std::string& conv1,
 
                 std::vector<size_t> paddings = {padding, padding};
                 std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {1, 1};
                 size_t groups = inputChannels;
                 Compare2Function<DType1, DType2> test(
                     conv1,
@@ -231,6 +249,7 @@ void DepthwiseConvolution(const std::string& conv1,
                         .set("paddings", paddings)
                         .set("strides", strides)
                         .set("groups", groups)
+                        .set("dilations", dilations)
                         .set("algo", (std::string) "auto"));
 
                 TensorShape input{
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index bdb56ddac38b91d756fc6f31282f29c0489fd660..8d34eee886a6202691e5dec2ab62e7c5b0ac7fb1 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -100,7 +100,9 @@ public:
                  strideH(),
                  strideW(),
                  paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
         } else {
           colData = inputData + g * inputOffset;
         }
@@ -223,7 +225,9 @@ public:
                  strideH(),
                  strideW(),
                  paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
         }
       }
       inputGrad += inputChannels * inputHeight * inputWidth;
@@ -310,7 +314,9 @@ public:
                  strideH(),
                  strideW(),
                  paddingH(),
-                 paddingW());
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
         } else {
           colData = inputData + g * inputOffset;
         }
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 1e0cff436ff60d5a029e89657d00af2b0bf8b454..0c37fc972484bfbede01d23652e384071bf883af 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -78,7 +78,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth);
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
 };
 
 template <ColFormat Format, DeviceType Device, class T>
@@ -91,7 +93,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth);
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
 };
 
 }  // namespace paddle
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
index b7d1eb1eded7a7471fd5833a649916d3ee3e598e..f864d42f8075209c70ca2e16a70e4f2c9d58eef4 100644
--- a/paddle/function/Im2ColOp.cpp
+++ b/paddle/function/Im2ColOp.cpp
@@ -31,7 +31,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -47,8 +49,8 @@ public:
       int c_im = c / filterWidth / filterHeight;
       for (int h = 0; h < outputHeight; ++h) {
         for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
           if ((imRowIdx - paddingHeight) < 0 ||
               (imRowIdx - paddingHeight) >= inputHeight ||
               (imColIdx - paddingWidth) < 0 ||
@@ -81,7 +83,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -97,8 +101,8 @@ public:
       int c_im = c / filterWidth / filterHeight;
       for (int h = 0; h < outputHeight; ++h) {
         for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
           if ((imRowIdx - paddingHeight) >= 0 &&
               (imRowIdx - paddingHeight) < inputHeight &&
               (imColIdx - paddingWidth) >= 0 &&
@@ -134,7 +138,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -147,9 +153,10 @@ public:
         for (int channel = 0; channel < inputChannels; ++channel) {
           for (int filterH = 0; filterH < filterHeight; ++filterH) {
             for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
               int colDataOffset =
                   (((outputH * outputWidth + outputW) * inputChannels +
                     channel) *
@@ -189,7 +196,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -202,9 +211,10 @@ public:
         for (int channel = 0; channel < inputChannels; ++channel) {
           for (int filterH = 0; filterH < filterHeight; ++filterH) {
             for (int filterW = 0; filterW < filterWidth; ++filterW) {
-              int imRowOffset =
-                  outputH * strideHeight + filterH - paddingHeight;
-              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
               int colDataOffset =
                   (((outputH * outputWidth + outputW) * inputChannels +
                     channel) *
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
index bd98610498b1af003574129118be4684d38e5813..71da11b95557d7b59de5ea6c65d1d43db42f211c 100644
--- a/paddle/function/Im2ColOpGpu.cu
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -28,6 +28,8 @@ __global__ void im2col(const T* data_im,
                        int strideW,
                        int paddingH,
                        int paddingW,
+                       int dilationH,
+                       int dilationW,
                        int height_col,
                        int width_col,
                        T* data_col) {
@@ -44,8 +46,8 @@ __global__ void im2col(const T* data_im,
     data_col += (channel_out * height_col + h_out) * width_col + w_out;
     for (int i = 0; i < blockH; ++i) {
       for (int j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in + i);
-        int cIdx = int(w_in + j);
+        int rIdx = int(h_in + i * dilationH);
+        int cIdx = int(w_in + j * dilationW);
         if ((rIdx - (int)paddingH) >= (int)height ||
             (rIdx - (int)paddingH) < 0 ||
             (cIdx - (int)paddingW) >= (int)width ||
@@ -77,7 +79,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -102,6 +106,8 @@ public:
                                                     strideWidth,
                                                     paddingHeight,
                                                     paddingWidth,
+                                                    dilationHeight,
+                                                    dilationWidth,
                                                     outputHeight,
                                                     outputWidth,
                                                     colData);
@@ -121,6 +127,8 @@ __global__ void col2im(size_t n,
                        size_t strideW,
                        size_t paddingH,
                        size_t paddingW,
+                       size_t dilationH,
+                       size_t dilationW,
                        size_t height_col,
                        size_t width_col,
                        T* data_im) {
@@ -131,23 +139,34 @@ __global__ void col2im(size_t n,
     int w = int(index % width);
     int h = int((index / width) % height);
     int c = int(index / (width * height));
+    int filterH = (blockH - 1) * dilationH + 1;
+    int filterW = (blockW - 1) * dilationW + 1;
+
     if ((w - (int)paddingW) >= 0 &&
         (w - (int)paddingW) < (width - 2 * paddingW) &&
         (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
       // compute the start and end of the output
       int w_col_start =
-          (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
+          (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
       int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
       int h_col_start =
-          (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+          (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
       int h_col_end = min(int(h / strideH + 1), int(height_col));
+
       for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
         for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
           // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH * blockW) +
-                      (h - h_col * (int)strideH) * (int)blockW +
-                      (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+          int h_k = (h - h_col * strideH);
+          int w_k = (w - w_col * strideW);
+          if (h_k % dilationH == 0 && w_k % dilationW == 0) {
+            h_k /= dilationH;
+            w_k /= dilationW;
+            int c_col =
+                (((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
+                    width_col +
+                w_col;
+            val += data_col[c_col];
+          }
         }
       }
       h -= paddingH;
@@ -173,7 +192,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -205,6 +226,8 @@ public:
         strideWidth,
         paddingHeight,
         paddingWidth,
+        dilationHeight,
+        dilationWidth,
         outputHeight,
         outputWidth,
         imData);
@@ -229,6 +252,8 @@ __global__ void im2colOCF(const T* imData,
                           int strideWidth,
                           int paddingHeight,
                           int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
                           int outputHeight,
                           int outputWidth) {
   int swId = blockIdx.x;
@@ -237,8 +262,10 @@ __global__ void im2colOCF(const T* imData,
        channelId += blockDim.z) {
     for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
       for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset = idx + swId * strideWidth - paddingWidth;
-        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int widthOffset =
+            idx * dilationHeight + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationWidth + shId * strideHeight - paddingHeight;
         int imOffset = widthOffset + heightOffset * inputWidth +
                        channelId * inputHeight * inputWidth;
 
@@ -273,7 +300,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -312,6 +341,8 @@ public:
                                                        strideWidth,
                                                        paddingHeight,
                                                        paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
                                                        outputHeight,
                                                        outputWidth);
     CHECK_SYNC("Im2ColFunctor GPU failed");
@@ -330,6 +361,8 @@ __global__ void col2imOCF(T* imData,
                           int strideWidth,
                           int paddingHeight,
                           int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
                           int outputHeight,
                           int outputWidth) {
   int swId = blockIdx.x;
@@ -338,8 +371,10 @@ __global__ void col2imOCF(T* imData,
        channelId += blockDim.z) {
     for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
       for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
-        int widthOffset = idx + swId * strideWidth - paddingWidth;
-        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int widthOffset =
+            idx * dilationWidth + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationHeight + shId * strideHeight - paddingHeight;
         int imOffset = widthOffset + heightOffset * inputWidth +
                        channelId * inputHeight * inputWidth;
 
@@ -372,7 +407,9 @@ public:
                   int strideHeight,
                   int strideWidth,
                   int paddingHeight,
-                  int paddingWidth) {
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
     int inputChannels = imShape[0];
     int inputHeight = imShape[1];
     int inputWidth = imShape[2];
@@ -411,6 +448,8 @@ public:
                                                        strideWidth,
                                                        paddingHeight,
                                                        paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
                                                        outputHeight,
                                                        outputWidth);
     CHECK_SYNC("Col2ImFunctor GPU failed");
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index a0a01a5fc7fc055dce6ddb3ee51c7ab18f8a4ca7..1f085538d81904dbd5b5d6bcd014adaed22e37d7 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -29,82 +29,98 @@ void TestIm2ColFunctor() {
           for (size_t filterWidth : {3, 7}) {
             for (size_t stride : {1, 2}) {
               for (size_t padding : {0, 1}) {
-                if (inputHeight <= filterHeight || inputWidth <= filterWidth)
-                  break;
-                if (padding >= filterHeight || padding >= filterWidth) break;
-                size_t outputHeight =
-                    (inputHeight - filterHeight + 2 * padding + stride) /
-                    stride;
-                size_t outputWidth =
-                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
-
-                TensorShape imShape =
-                    TensorShape({channels, inputHeight, inputWidth});
-                TensorShape colShape1 = TensorShape({channels,
-                                                     filterHeight,
-                                                     filterWidth,
-                                                     outputHeight,
-                                                     outputWidth});
-                TensorShape colShape2 = TensorShape({outputHeight,
-                                                     outputWidth,
-                                                     channels,
-                                                     filterHeight,
-                                                     filterWidth});
-
-                size_t height = channels * filterHeight * filterWidth;
-                size_t width = outputHeight * outputWidth;
-                VectorPtr input1 = Vector::create(imShape.getElements(), false);
-                VectorPtr input2 = Vector::create(imShape.getElements(), false);
-                MatrixPtr output1 = Matrix::create(height, width, false, false);
-                MatrixPtr output2 = Matrix::create(width, height, false, false);
-                input1->uniform(0.001, 1);
-                input2->copyFrom(*input1);
-
-                Im2ColFunctor<kCFO, Device, T> im2Col1;
-                Im2ColFunctor<kOCF, Device, T> im2Col2;
-                im2Col1(input1->getData(),
-                        imShape,
-                        output1->getData(),
-                        colShape1,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-                im2Col2(input2->getData(),
-                        imShape,
-                        output2->getData(),
-                        colShape2,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-
-                // The transposition of the result of ColFormat == kCFO
-                // is equal to the result of ColFormat == kOCF.
-                MatrixPtr test;
-                output2->transpose(test, true);
-                autotest::TensorCheckErr(*output1, *test);
-
-                Col2ImFunctor<kCFO, Device, T> col2Im1;
-                Col2ImFunctor<kOCF, Device, T> col2Im2;
-                col2Im1(input1->getData(),
-                        imShape,
-                        output1->getData(),
-                        colShape1,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-                col2Im2(input2->getData(),
-                        imShape,
-                        output2->getData(),
-                        colShape2,
-                        stride,
-                        stride,
-                        padding,
-                        padding);
-
-                autotest::TensorCheckErr(*input1, *input2);
+                for (size_t dilation : {1, 3}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+                  TensorShape colShape2 = TensorShape({outputHeight,
+                                                       outputWidth,
+                                                       channels,
+                                                       filterHeight,
+                                                       filterWidth});
+
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(width, height, false, false);
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, Device, T> im2Col1;
+                  Im2ColFunctor<kOCF, Device, T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+
+                  // The transposition of the result of ColFormat == kCFO
+                  // is equal to the result of ColFormat == kOCF.
+                  MatrixPtr test;
+                  output2->transpose(test, true);
+                  autotest::TensorCheckErr(*output1, *test);
+
+                  Col2ImFunctor<kCFO, Device, T> col2Im1;
+                  Col2ImFunctor<kOCF, Device, T> col2Im2;
+
+                  col2Im1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  col2Im2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  autotest::TensorCheckErr(*input1, *input2);
+                }
               }
             }
           }
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 5f39167afc34affbea7858fa0794ef52b786a383..91d732641a4a5eed050841b59fd10da397eb732f 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -85,9 +85,49 @@ if(MOBILE_INFERENCE)
          gradientmachines/GradientMachineMode.cpp
          gradientmachines/MultiGradientMachine.cpp)
 
-    # Remove useless layers
+    # Remove layers that used in training
     list(REMOVE_ITEM GSERVER_SOURCES
-    	 layers/RecurrentLayerGroup.cpp)
+    	 layers/RecurrentLayerGroup.cpp
+         layers/CostLayer.cpp
+         layers/MultiBoxLossLayer.cpp
+         layers/WarpCTCLayer.cpp
+         layers/CTCLayer.cpp
+         layers/LinearChainCTC.cpp
+         layers/PrintLayer.cpp)
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/OuterProdLayer.cpp
+         layers/SumToOneNormLayer.cpp
+         layers/ConvShiftLayer.cpp
+         layers/InterpolationLayer.cpp
+         layers/AgentLayer.cpp
+         layers/DotMulOperator.cpp
+         layers/GruStepLayer.cpp
+         layers/LstmStepLayer.cpp
+         layers/ConvexCombinationLayer.cpp
+         layers/Conv3DLayer.cpp
+         layers/DeConv3DLayer.cpp
+         layers/CropLayer.cpp
+         layers/CrossEntropyOverBeam.cpp
+         layers/DataNormLayer.cpp
+         layers/FeatureMapExpandLayer.cpp
+         layers/HierarchicalSigmoidLayer.cpp
+         layers/MultinomialSampler.cpp
+         layers/NCELayer.cpp
+         layers/KmaxSeqScoreLayer.cpp
+         layers/MDLstmLayer.cpp
+         layers/MultiplexLayer.cpp
+         layers/PadLayer.cpp
+         layers/Pool3DLayer.cpp
+         layers/ResizeLayer.cpp
+         layers/RotateLayer.cpp
+         layers/RowConvLayer.cpp
+         layers/RowL2NormLayer.cpp
+         layers/SamplingIdLayer.cpp
+         layers/ScaleShiftLayer.cpp
+         layers/SelectiveFullyConnectedLayer.cpp
+         layers/SpatialPyramidPoolLayer.cpp
+         layers/BilinearInterpLayer.cpp
+         layers/ClipLayer.cpp)
 endif()
 
 if(WITH_GPU)
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index dbadc352a4ccd7483bf67e1025c212f514e32a24..be112b41239cace3fa9b9ee97923f8c3c7a9a98f 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include "NeuralNetwork.h"
 #include "hl_gpu.h"
-#include "paddle/gserver/layers/AgentLayer.h"
 #include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -28,6 +27,7 @@ limitations under the License. */
 #ifndef PADDLE_MOBILE_INFERENCE
 #include "MultiNetwork.h"
 #include "RecurrentGradientMachine.h"
+#include "paddle/gserver/layers/AgentLayer.h"
 #endif
 
 namespace paddle {
@@ -192,9 +192,11 @@ void NeuralNetwork::init(const ModelConfig& config,
 void NeuralNetwork::connect(LayerPtr agentLayer,
                             LayerPtr realLayer,
                             int height) {
+#ifndef PADDLE_MOBILE_INFERENCE
   AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
   CHECK_NOTNULL(agent);
   agent->setRealLayer(realLayer, height);
+#endif
 }
 
 void NeuralNetwork::connect(std::string agentLayerName,
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index 48dfcb49a4c2c46891bb5236fc1f8e644c03f327..7ff0c73721d3de93aa7fa5fae58876884592c51f 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -79,6 +79,10 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
   for (int i = 0; i < config_.inputs_size(); i++) {
     std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
     std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
+    std::vector<size_t> dilations = {(size_t)dilationY_[i],
+                                     (size_t)dilation_[i]};
+
+    bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
 
     // Convolution Layer uses the GemmConv function by default.
     convType = "GemmConv";
@@ -97,13 +101,14 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
       if ((filterSize_[i] == filterSizeY_[i]) &&
           (filterSize_[i] == 3 || filterSize_[i] == 4) &&
-          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) {
+          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
+          !useDilation) {
         convType = "NeonDepthwiseConv";
       }
 #endif
     }
 
-    if (FLAGS_use_nnpack && !isDeconv_) {
+    if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
       createFunction(forward_,
                      "NNPACKConv",
                      FuncConfig()
@@ -117,6 +122,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
+                         .set("dilations", dilations)
                          .set("groups", (size_t)groups_[i]));
 
       createFunction(backward_,
@@ -124,6 +130,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
+                         .set("dilations", dilations)
                          .set("groups", (size_t)groups_[i]));
 
       createFunction(backward_,
@@ -131,6 +138,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
+                         .set("dilations", dilations)
                          .set("groups", (size_t)groups_[i]));
     }
   }
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 01f2aae6cf88d47296da804061b9b039cca593db..b55b86221cd411addfa8c5e93f8089f5ed9b0557 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -98,6 +98,7 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
 LayerPtr Layer::create(const LayerConfig& config) {
   std::string type = config.type();
 
+#ifndef PADDLE_MOBILE_INFERENCE
   // NOTE: As following types have illegal character '-',
   // they can not use REGISTER_LAYER to registrar.
   // Besides, to fit with old training models,
@@ -106,7 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) {
     return LayerPtr(new MultiClassCrossEntropy(config));
   else if (type == "rank-cost")
     return LayerPtr(new RankingCost(config));
-#ifndef PADDLE_MOBILE_INFERENCE
   else if (type == "auc-validation")
     return LayerPtr(new AucValidation(config));
   else if (type == "pnpair-validation")
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
index 6ffe4fbec643e50d27924a989875454d307f5b9b..0f2b67fd758ec1513f42c4cb1a36f2f3915f4740 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -54,7 +54,6 @@ void MKLDNNAddtoLayer::reshape(
   ow = iw;
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
-  printSizeInfo();
 }
 
 void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
index ed3887cbf653878623764a310c9f364f4d8be27f..071bdf54d5dc9538d5ced580a73b9c0fbcea41fb 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -125,7 +125,6 @@ void MKLDNNBatchNormLayer::reshape(
       << "Input channel can not be changed";
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
-  printSizeInfo();
 }
 
 void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index b8120eda1e2dadab943869a05546351a369af6fd..8aa54e0a9efa7adb766cbb6009f6a29410c6ae7d 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -102,8 +102,6 @@ void MKLDNNConvLayer::reshape(
 
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
-
-  printSizeInfo();
 }
 
 void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
index 1fed0e1c6565b763a3ee73a0853f560ddfbd44c6..9c69136684e5f9005860b476ec6ed1bbc9ceff6c 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -92,7 +92,7 @@ public:
   void printSizeInfo() override {
     MKLDNNLayer::printSizeInfo();
     VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
-                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
                        << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
   }
 
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 3429c53d2396e051d62fe0ae405934758e89f9c2..350ec65fffbc73c3a6e4245f763f4c6aa868f574 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -84,8 +84,6 @@ void MKLDNNFcLayer::reshape(
 
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc);
-
-  printSizeInfo();
 }
 
 void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index 6e89260f49979d4edb4da138507a73dc2bf120de..a18c455beab96ef25b5545281bae4d48cec98d9e 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -71,8 +71,6 @@ void MKLDNNPoolLayer::reshape(
   reshapeOutput(oh, ow);
 
   resizeOutput(bs, oc * oh * ow);
-
-  printSizeInfo();
 }
 
 void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d810a58d9a3aea4333806dc9805d3444c3772ba3
--- /dev/null
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MaxPoolWithMaskLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  PoolLayer::init(layerMap, parameterMap);
+  setOutput("mask", &mask_);
+  return true;
+}
+
+size_t MaxPoolWithMaskLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t layerSize = 0;
+
+  outputY_ = outputSize(imgSizeY_,
+                        sizeY_,
+                        confPaddingY_,
+                        strideY_,
+                        /* caffeMode */ false);
+  outputX_ = outputSize(imgSize_,
+                        sizeX_,
+                        confPadding_,
+                        stride_,
+                        /* caffeMode */ false);
+
+  layerSize = outputX_ * outputY_ * channels_;
+  getOutput().setFrameHeight(outputY_);
+  getOutput().setFrameWidth(outputX_);
+
+  return layerSize;
+}
+
+void MaxPoolWithMaskLayer::forward(PassType passType) {
+  size_t size = getSize();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  int batchSize = inputV->getHeight();
+  resetOutput(batchSize, size);
+
+  MatrixPtr outV = getOutputValue();
+  CHECK_EQ(size, outV->getWidth());
+
+  resetSpecifyOutput(mask_,
+                     batchSize,
+                     size,
+                     /* isValueClean */ false,
+                     /* isGradClean */ true);
+
+  MatrixPtr maskV = mask_.value;
+  outV->maxPoolForward(*inputV,
+                       imgSizeY_,
+                       imgSize_,
+                       channels_,
+                       sizeX_,
+                       sizeY_,
+                       strideY_,
+                       stride_,
+                       outputY_,
+                       outputX_,
+                       confPaddingY_,
+                       confPadding_,
+                       maskV);
+}
+
+void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  if (NULL == getInputGrad(0)) {
+    return;
+  }
+
+  MatrixPtr outGrad = getOutputGrad();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
+
+  inputGrad->maxPoolBackward(*inputV,
+                             imgSizeY_,
+                             imgSize_,
+                             *outGrad,
+                             *outV,
+                             sizeX_,
+                             sizeY_,
+                             strideY_,
+                             stride_,
+                             outputY_,
+                             outputX_,
+                             1,
+                             1,
+                             confPaddingY_,
+                             confPadding_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0174add9d944930289f2bdf78d9f730fd1fcc7d
--- /dev/null
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "PoolLayer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief Basic parent layer of different kinds of pooling
+ */
+class MaxPoolWithMaskLayer : public PoolLayer {
+protected:
+  Argument mask_;
+
+public:
+  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
+      : PoolLayer(config) {}
+
+  size_t getSize();
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 7b932d5a76e9c4fe7cbe5882bbc19eb3de4b503a..87613a96c5b3c2da212f63e9e678bcd22308b08e 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PoolLayer.h"
+#include "MaxPoolWithMaskLayer.h"
 #include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
 #ifdef PADDLE_WITH_CUDA
@@ -44,7 +45,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
   strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-
   return true;
 }
 
@@ -57,6 +57,8 @@ Layer* PoolLayer::create(const LayerConfig& config) {
   } else if (CudnnPoolLayer::typeCheck(pool)) {
     return new CudnnPoolLayer(config);
 #endif
+  } else if (pool == "max-pool-with-mask") {
+    return new MaxPoolWithMaskLayer(config);
   } else {
     LOG(FATAL) << "Unknown pool type: " << pool;
     return nullptr;
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
index 99cfddb0cf3337745a716a8c329713c18b99eda3..35d4b12d3d357800fe72899069b5377c252fac5f 100644
--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -98,7 +98,7 @@ void ROIPoolLayer::forward(PassType passType) {
     size_t roiStartH = round(bottomROIs[2] * spatialScale_);
     size_t roiEndW = round(bottomROIs[3] * spatialScale_);
     size_t roiEndH = round(bottomROIs[4] * spatialScale_);
-    CHECK_GE(roiBatchIdx, 0);
+    CHECK_GE(roiBatchIdx, 0UL);
     CHECK_LT(roiBatchIdx, batchSize);
     size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL);
     size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL);
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index aa94ee406e27c86e6d49b6d2b5327a3f86bcacd6..4bea348f637f39444e8aad89278e6366ecd73b1d 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -1,9 +1,12 @@
 # gserver pacakge unittests
 
 add_simple_unittest(test_LinearChainCRF)
-add_simple_unittest(test_MultinomialSampler)
 add_simple_unittest(test_RecurrentLayer)
 
+if(NOT MOBILE_INFERENCE)
+  add_simple_unittest(test_MultinomialSampler)
+endif()
+
 function(gserver_test TARGET)
   add_unittest_without_exec(${TARGET}
       ${TARGET}.cpp
@@ -24,6 +27,7 @@ gserver_test(test_ConvUnify)
 gserver_test(test_BatchNorm)
 gserver_test(test_KmaxSeqScore)
 gserver_test(test_Expand)
+gserver_test(test_MaxPoolingWithMaskOutput)
 
 ########## test_Mkldnn layers and activations ##########
 if(WITH_MKLDNN)
@@ -48,7 +52,7 @@ if(WITH_PYTHON)
 endif()
 
 ############### test_WarpCTCLayer #######################
-if(NOT WITH_DOUBLE)
+if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
     add_unittest_without_exec(test_WarpCTCLayer
         test_WarpCTCLayer.cpp)
 
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index fcbcb5b0f1f4cb07066363c9fa93fb1726459f30..3517d293e3c901caaa19952b04e56d1ef0d2b46e 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -434,7 +434,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
   config.layerConfig.set_partial_sum(1);
   config.layerConfig.set_shared_biases(true);
 
-  int dilation = 1;
+  int dilation = 2;
   if (type == "cudnn_conv") {
 #if CUDNN_VERSION >= 6000
     dilation = 2;
@@ -1234,6 +1234,7 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
 TEST(Layer, PoolLayer) {
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
 
 #ifdef PADDLE_WITH_CUDA
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
@@ -1242,6 +1243,7 @@ TEST(Layer, PoolLayer) {
   testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
 
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index a0e039c2a33b586e21775ad06c1278a10804d654..a859e34c8996d81f14bf1edcb6e23d5a4f687e6b 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -297,7 +297,7 @@ static void getAddtoConfig(TestConfig& cfg,
 }
 
 void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
-  CHECK_GE(nInputs, 1);
+  CHECK_GE(nInputs, 1UL);
   TestConfig dnnConfig;
   getAddtoConfig(dnnConfig, pm, nInputs);
   dnnConfig.layerConfig.set_type("mkldnn_addto");
diff --git a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..16438886df94cab9d29d05924bb047e6c7f1f6fa
--- /dev/null
+++ b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;
+
+void setPoolConfig(TestConfig* config,
+                   PoolConfig* pool,
+                   const string& poolType) {
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool");
+  (*config).layerConfig.set_num_filters(1);
+
+  int kw = 3, kh = 3;
+  int pw = 0, ph = 0;
+  int sw = 2, sh = 2;
+  pool->set_pool_type(poolType);
+  pool->set_channels(1);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_start(0);
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+}
+
+void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
+                                       const string& poolType,
+                                       bool use_gpu,
+                                       MatrixPtr& maskMat) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_img_size(5);
+  pool->set_img_size_y(5);
+  setPoolConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  config.layerConfig.set_name("MaxPoolWithMask");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+
+  initDataLayer(config,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "MaxPoolWithMask",
+                1,
+                false,
+                use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
+
+  FLAGS_use_gpu = use_gpu;
+  std::vector<ParameterPtr> parameters;
+  LayerPtr maxPoolingWithMaskOutputLayer;
+  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
+  maxPoolingWithMaskOutputLayer->forward(PASS_GC);
+
+  checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
+                   maskMat);
+}
+
+TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
+  bool useGpu = false;
+  MatrixPtr inputMat;
+  MatrixPtr maskMat;
+  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
+                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
+                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
+  real maskData[] = {12, 4, 22, 24};
+
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->setData(inputData);
+  maskMat->setData(maskData);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#ifdef PADDLE_WITH_CUDA
+  useGpu = true;
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->copyFrom(inputData, 25);
+  maskMat->copyFrom(maskData, 4);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#endif
+}
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 53dd5383601782231e6e742784007d1c9154dc6b..e3eff59dc575ee43552e401bc887f885a9804b61 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1902,5 +1902,52 @@ void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
 }
 
 template class BaseMatrixT<real>;
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 template class BaseMatrixT<int>;
+
+#else
+
+template <>
+void BaseMatrixT<int>::zero() {
+  applyUnary(unary::Zero<int>());
+}
+
+template <>
+void BaseMatrixT<int>::assign(int p) {
+  applyUnary(unary::Assign<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
+  applyBinary(binary::IsEqual<int>(value), b);
+}
+
+template <>
+void BaseMatrixT<int>::neg() {
+  applyUnary(unary::Neg<int>());
+}
+
+template <>
+void BaseMatrixT<int>::abs2() {
+  applyUnary(unary::Abs<int>());
+}
+
+template <>
+void BaseMatrixT<int>::add(int p) {
+  applyUnary(unary::Add<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::add(int p1, int p2) {
+  applyUnary(unary::Add2<int>(p1, p2));
+}
+
+template <>
+void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
+  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
+}
+
+#endif
 }  // namespace paddle
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index 68b5296228cd733dc3cb7ca0f762e0a69187dbff..86bb270a4372841b3e6f4676e222d2190549c153 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -25,6 +25,19 @@ else()
     message(STATUS "Compile with MKLDNNMatrix")
 endif()
 
+if(MOBILE_INFERENCE)
+    list(REMOVE_ITEM MATH_SOURCES
+         ${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
+    # Remove sparse
+    list(REMOVE_ITEM MATH_HEADERS
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
+    list(REMOVE_ITEM MATH_SOURCES
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
+endif()
 set(MATH_SOURCES
     "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
     "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index 36d57bbb65245de6b0de5909b55fbc4be3eccd78..aad1348353d558abca72ed0fa5cf943237e3ac78 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "Matrix.h"
 
@@ -309,3 +312,57 @@ private:
   using Matrix::subMatrix;
 };
 }  // namespace paddle
+
+#else
+
+#include "Matrix.h"
+
+namespace paddle {
+
+class CpuSparseMatrix : public Matrix {
+public:
+  CpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  CpuSparseMatrix(real* data,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  real* getValue() const { return nullptr; }
+  size_t getColStartIdx(size_t i) const { return 0; }
+  size_t getRowStartIdx(size_t i) const { return 0; }
+  size_t getColNum(size_t i) const { return 0; }
+  int* getRowCols(size_t i) const { return nullptr; }
+
+  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
+    return nullptr;
+  }
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
+};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 21a8f73c3e650d4b3c3b86247594cd965f4ead35..a710479bab82ed52122cf59bb14a05ccbd4aa05c 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -152,12 +152,7 @@ void MKLDNNMatrix::downSpatial() {
   }
   memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
   memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
-  mkldnn_primitive_t result;
-  mkldnn::error::wrap_c_api(
-      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
-      "could not create a memory primitive");
-  reset(result);
-  set_data_handle(data_);
+  resetMKLDNNMemory(pd, data_);
 }
 
 }  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 54cfefe23b3dc70fd12fd2ca8886c941047b59f7..39d40a1f61609a649d3341c170d24b0604921ac2 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -145,6 +145,27 @@ public:
     m_.reset();
   }
 
+  /**
+   * override the CpuMatrix::resize
+   */
+  void resize(size_t newHeight, size_t newWidth) override {
+    m_->resize(newHeight, newWidth);
+    if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
+      return;
+    }
+    CpuMatrix::setData(data_);
+    height_ = newHeight;
+    width_ = newWidth;
+    elementCnt_ = newHeight * newWidth;
+    stride_ = width_;
+    auto pd = mkldnn::memory::primitive_desc(
+        mkldnn::memory::desc({(int)newHeight, (int)newWidth},
+                             getDtype(),
+                             mkldnn::memory::format::nc),
+        getEngine());
+    resetMKLDNNMemory(pd, data_);
+  }
+
   /**
    * override Matrix::getData
    * check data before return
@@ -215,6 +236,17 @@ protected:
                    memory::format srcFmt,
                    memory::format dstFmt,
                    memory::dims dm);
+  /**
+   * reset this MKLDNN Memory from primitve desc
+   */
+  void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
+    mkldnn_primitive_t result;
+    mkldnn::error::wrap_c_api(
+        mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+        "could not create a memory primitive");
+    reset(result);
+    set_data_handle(data);
+  }
 
 private:
   // save the CpuMatrixPtr in case the buffer released outside
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index c3e34d5309d9ca8a32d7b0a8043e668cdb5be54b..88e9180690606c92cf46c5b295d80f14e5d64567 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -451,6 +451,7 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) {
 }
 
 void GpuMatrix::collectBias(Matrix& a, real scale) {
+#ifdef PADDLE_WITH_CUDA
   CHECK_EQ(getHeight(), (size_t)1);
   CHECK_EQ(width_, a.getWidth());
   GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
@@ -461,6 +462,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
     hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
     hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
   }
+#endif
 }
 
 void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
@@ -552,6 +554,7 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                     const GpuMatrix& b,
                     real scaleAB,
                     real scaleT) {
+#ifdef PADDLE_WITH_CUDA
   CHECK(isContiguous());
   CHECK(b.isContiguous());
   CHECK(b.useGpu_ == true) << "Matrix type are not equal";
@@ -578,12 +581,14 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                           b.height_,
                           scaleAB,
                           scaleT);
+#endif
 }
 
 void GpuMatrix::mul(const GpuMatrix& a,
                     const GpuSparseMatrix& b,
                     real scaleAB,
                     real scaleT) {
+#ifdef PADDLE_WITH_CUDA
   CHECK(isContiguous());
   CHECK(a.isContiguous());
   CHECK(a.useGpu_ == true) << "Matrix type are not equal";
@@ -622,6 +627,7 @@ void GpuMatrix::mul(const GpuMatrix& a,
                             scaleAB,
                             scaleT);
   }
+#endif
 }
 
 /* this = a*b */
@@ -1028,15 +1034,23 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
   CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
 
   real* inputData = inputMat.getData();
+  real* maskData = NULL;
   size_t frameNum = inputMat.getHeight();
   CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
   CHECK(height_ == inputMat.getHeight());
   CHECK(width_ == outputH * outputW * channels);
 
+  if (maskMatP != NULL) {
+    CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
+    CHECK(outputH * outputW * channels == maskMatP->getWidth());
+    maskData = maskMatP->getData();
+  }
+
   hl_maxpool_forward(frameNum,
                      inputData,
                      channels,
@@ -1051,7 +1065,8 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                      paddingH,
                      paddingW,
                      data_,
-                     getStride());
+                     getStride(),
+                     maskData);
 }
 
 void GpuMatrix::maxPoolBackward(Matrix& inputMat,
@@ -1548,6 +1563,7 @@ void GpuMatrix::bilinearBackward(const Matrix& out,
 }
 
 void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
   GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
   auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
 
@@ -1563,9 +1579,11 @@ void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
   hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
   hl_matrix_multi_binary_cross_entropy(
       output_d, entropy_d, mat_d, height_, outputPtr->width_);
+#endif
 }
 
 void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
   GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
   auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
 
@@ -1581,6 +1599,7 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
   hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
   hl_matrix_multi_binary_cross_entropy_bp(
       output_d, grad_d, mat_d, height_, width_);
+#endif
 }
 
 void GpuMatrix::vol2Col(real* dataSrc,
@@ -1973,9 +1992,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
   real* inputData = inputMat.getData();
   real* outData = data_;
+  real* maskData = NULL;
   size_t num = inputMat.getHeight();
   size_t inLength = imgSizeH * imgSizeW;
   size_t outLength = outputH * outputW;
@@ -1984,6 +2005,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
   CHECK_EQ(channels * outLength, this->getWidth());
   size_t outStride = getStride();
 
+  if (maskMatP != NULL) {
+    maskData = maskMatP->getData();
+    CHECK_EQ(channels * outLength, maskMatP->getWidth());
+  }
+
   /* initialize the data_ */
   for (size_t i = 0; i < height_; i++) {
     for (size_t j = 0; j < width_; j++) {
@@ -2005,10 +2031,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
           int wstart = pw * strideW - paddingW;
           int wend = std::min(wstart + sizeX, imgSizeW);
           wstart = std::max(wstart, 0);
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              outData[ph * outputW + pw] = std::max(
-                  outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+          if (maskData == NULL) {
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                outData[ph * outputW + pw] = std::max(
+                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+              }
+            }
+          } else {
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) {
+                  outData[ph * outputW + pw] = inputData[h * imgSizeW + w];
+                  maskData[ph * outputW + pw] = h * imgSizeW + w;
+                }
+              }
             }
           }
         }
@@ -2016,6 +2053,8 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
       // compute offset
       inputData += inLength;
       outData += outLength;
+
+      if (maskData != NULL) maskData += outLength;
     }
   }
 }
@@ -3226,6 +3265,7 @@ template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
                                                            real scaleAB,
                                                            real scaleT);
 
+#ifndef PADDLE_MOBILE_INFERENCE
 void SharedCpuMatrix::mul(CpuSparseMatrix* a,
                           CpuMatrix* b,
                           real scaleAB,
@@ -3354,6 +3394,7 @@ void SharedCpuMatrix::initBlock(int blockNum) {
   }
 }
 
+#endif
 /* Add a (column) vector b to matrix a, column by column */
 void CpuMatrix::addColumnVector(const Matrix& b) {
   BaseMatrix::addColVector(const_cast<Matrix&>(b));
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 44180bca8bca53e74d71ce7bed3516399c01c81d..e273f1123690e31984c97185c5a8bc5e7b92c38c 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -861,7 +861,8 @@ public:
 
   /**
    * Pooling forward operation, pick out the largest element
-   * in the sizeX of value
+   * in the sizeX of value, if the maskMatP is not NULL, it will
+   * also caculate the location indices.
    */
   virtual void maxPoolForward(Matrix& inputMat,
                               size_t imgSizeH,
@@ -874,7 +875,8 @@ public:
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                              size_t paddingW) {
+                              size_t paddingW,
+                              MatrixPtr maskMatP = NULL) {
     LOG(FATAL) << "Not implemeted";
   }
 
@@ -1426,7 +1428,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);
 
   void maxPoolBackward(Matrix& image,
                        size_t imgSizeH,
@@ -1697,7 +1700,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);
 
   void maxPoolBackward(Matrix& image,
                        size_t imgSizeH,
@@ -2066,6 +2070,7 @@ public:
 
 class SharedCpuMatrix : public CpuMatrix {
 public:
+#ifndef PADDLE_MOBILE_INFERENCE
   /* blockNum is number of partitions of the matrix  */
   SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
       : CpuMatrix(height, width, trans) {
@@ -2111,6 +2116,7 @@ private:
   ThreadLocal<CpuMatrixPtr> localBuf_;
   ThreadLocal<std::vector<int>> localBufRows_;
   ThreadLocal<std::vector<int>> blockSeq_;
+#endif
 };
 
 typedef struct { unsigned int col; } sparse_non_value_t;
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index 16300db081f89182faa82ea5798e8ec2f1cd93f9..e0a3c6d2286521f6030867b747099514a16df5cf 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "CpuSparseMatrix.h"
 #include "Matrix.h"
@@ -237,3 +240,47 @@ private:
 };
 
 }  // namespace paddle
+
+#else
+
+#include "CpuSparseMatrix.h"
+
+namespace paddle {
+
+class GpuSparseMatrix : public Matrix {
+public:
+  GpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format_ = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  GpuSparseMatrix(real* value,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, true) {}
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
+};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 8704eb038d5d42ca834d232c0a651e9ffb2b40f3..ca7a6806da3a58ad5fffdbb6505319964c25bc6f 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <gflags/gflags.h>
 #include <string.h>
 #include <algorithm>
@@ -313,3 +315,27 @@ private:
 };
 
 }  // namespace paddle
+
+#else
+namespace paddle {
+
+class SparseRowCpuMatrix : public CpuMatrix {
+public:
+  void reserveStore() {}
+  void clearIndices() {}
+};
+
+class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
+public:
+  void setupIndices() {}
+  void addRows(MatrixPtr input) {}
+  void addRows(IVectorPtr ids) {}
+};
+
+class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
+class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
+class SparseRowIdsCpuMatrix : public CpuMatrix {};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index ceb96b2e250d8e04ffb2b1d8c77ad498dca91cf3..d8b7f9e3fc74040189ade83049e4a1c3348e08de 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -3,8 +3,10 @@
 add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
-add_simple_unittest(test_SparseMatrix)
 add_simple_unittest(test_RowBuffer)
+if(NOT MOBILE_INFERENCE)
+    add_simple_unittest(test_SparseMatrix)
+endif()
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 29ce44c23308cb5ae1c1df5c9be1412c28abe96f..709f7de2e43093114d096cbfca5b5d49293a6d3e 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -214,6 +214,7 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
+cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
         rnn/recurrent_op_utils.cc
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..609e915b932e2bc4d5abee1e5f868cc07a7619d3
--- /dev/null
+++ b/paddle/operators/assign_op.cc
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/var_type.h"
+
+namespace paddle {
+namespace operators {
+class AssignFunctor {
+ public:
+  AssignFunctor(framework::Variable *out,
+                const platform::DeviceContext &dev_ctx)
+      : out_(out), dev_ctx_(dev_ctx) {}
+
+  void operator()(const framework::LoDTensor &lod_tensor) const {
+    auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
+    copy_tensor(lod_tensor, &out_tensor);
+  }
+
+  void operator()(const framework::LoDTensorArray &array) const {
+    auto &out_array = *out_->GetMutable<framework::LoDTensorArray>();
+    out_array.resize(array.size());
+    for (size_t i = 0; i < array.size(); ++i) {
+      copy_tensor(array[i], &out_array[i]);
+    }
+  }
+
+  void operator()(const framework::SelectedRows &rows) const {
+    framework::SelectedRows &out_rows =
+        *out_->GetMutable<framework::SelectedRows>();
+    out_rows.set_rows(rows.rows());
+    out_rows.set_height(rows.height());
+    auto &t = rows.value();
+    out_rows.mutable_value()->CopyFrom(t, t.place(), dev_ctx_);
+  }
+
+  template <typename T>
+  void operator()(const T &v) const {
+    PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
+  }
+
+ private:
+  void copy_tensor(const framework::LoDTensor &lod_tensor,
+                   framework::LoDTensor *out) const {
+    auto &out_tensor = *out;
+    out_tensor.CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_);
+    out_tensor.set_lod(lod_tensor.lod());
+  }
+
+  framework::Variable *out_;
+  const platform::DeviceContext &dev_ctx_;
+};
+
+class AssignOp : public framework::OperatorBase {
+ public:
+  AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
+           const framework::VariableNameMap &outputs,
+           const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *x = scope.FindVar(Input("X"));
+    if (x == nullptr) {
+      return;
+    }
+    auto *out = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(
+        out != nullptr,
+        "The Output(Out) should not be null if the Input(X) is set.");
+    framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
+  }
+};
+
+class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AssignOpProtoMaker(framework::OpProto *proto,
+                     framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
+             "could be LoDTensor, SelectedRows or LoDTensorArray.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
+              "is the same as input X.");
+    AddComment(R"DOC(Assign Operator
+
+Out = X,  when type in [LoDTensor/SelectedRows/LoDTensorArray]
+raise error if the type is not listed above.
+)DOC");
+  }
+};
+
+class AssignInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    if (context->HasInput("X")) {
+      auto type = context->GetInputsVarType("X")[0];
+      if (type == framework::VarDesc_VarType_SELECTED_ROWS ||
+          type == framework::VarDesc_VarType_LOD_TENSOR) {
+        context->SetOutputDim("Out", context->GetInputDim("X"));
+      }
+    }
+  }
+};
+
+class AssignGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *op = new framework::OpDescBind();
+    op->SetType("assign");
+    op->SetInput("X", OutputGrad("Out"));
+    op->SetOutput("Out", InputGrad("X"));
+    return std::unique_ptr<framework::OpDescBind>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker,
+                  ops::AssignInferShape, ops::AssignOpProtoMaker);
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3904a97d58166cfeeb2be7d2144700dbd8bc5721
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BeamSearchDecodeOp : public framework::OperatorBase {
+ public:
+  BeamSearchDecodeOp(const std::string& type,
+                     const framework::VariableNameMap& inputs,
+                     const framework::VariableNameMap& outputs,
+                     const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    framework::ExecutionContext ctx(*this, scope, dev_ctx);
+
+    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
+    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
+    const size_t step_num = ids->size();
+    PADDLE_ENFORCE_GT(step_num, 0UL,
+                      "beam search steps should be larger than 0");
+    const size_t source_num = ids->at(0).lod().at(0).size() - 1;
+    PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0");
+
+    for (size_t i = 0; i < step_num; ++i) {
+      PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL,
+                        "Level of LodTensor should be 2");
+    }
+
+    // prepare output
+    LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
+    LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
+
+    BeamSearchDecoder<float> beam_search_decoder;
+    beam_search_decoder.PackAllSteps(*ids, *scores, sentenceIds,
+                                     sentenceScores);
+  }
+};
+
+class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BeamSearchDecodeOpProtoMaker(framework::OpProto* proto,
+                               framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Ids",
+             "(LodTensorArray)"
+             "score of the candidate words in each step");
+    AddInput("Scores",
+             "(LodTensorArray)"
+             "score of the candidate words in each step");
+    AddOutput("SentenceIds",
+              "(LodTensor)"
+              "All possible result sentences of word ids");
+    AddOutput("SentenceScores",
+              "(LodTensor)"
+              "All possible result sentences of word scores");
+    AddComment(R"DOC(
+Pack the result of Beam search op into SentenceIds and SentenceScores.
+)DOC");
+  }
+};
+
+class BeamSearchDecodeInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("Ids"),
+                   "BeamSearchDecodeOp must has input Ids");
+    PADDLE_ENFORCE(context->HasInput("Scores"),
+                   "BeamSearchDecodeOp must has input Scores");
+    PADDLE_ENFORCE(context->HasOutput("SentenceIds"),
+                   "BeamSearchDecodeOp must has output SentenceIds");
+    PADDLE_ENFORCE(context->HasOutput("SentenceScores"),
+                   "BeamSearchDecodeOp must has output SentenceScores");
+  }
+};
+
+class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDescBind& op_desc,
+                  framework::BlockDescBind* block) const override {
+    for (auto& o : op_desc.Output("SentenceIds")) {
+      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+    }
+    for (auto& o : op_desc.Output("SentenceScores")) {
+      block->Var(o)->SetType(framework::VarDesc::LOD_TENSOR);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp,
+                  paddle::operators::BeamSearchDecodeOpProtoMaker,
+                  paddle::operators::BeamSearchDecodeInferShape,
+                  paddle::operators::BeamSearchDecodeInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/operators/beam_search_decode_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f007ec22f9a66572971516a711317f348e1ec5a
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.h
@@ -0,0 +1,280 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using LoDTensorArray = framework::LoDTensorArray;
+
+// all the lod have 2 levels.
+// The First is source level, the second is sentence level.
+// source level describe how many candidate words for this source.
+// sentence level describe these candidates belong to which prefix
+const size_t kSourceLevel = 0;
+const size_t kSentenceLevel = 1;
+
+template <typename T>
+struct BeamNode {
+  BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {}
+
+  ~BeamNode() {
+    if (parent_) {
+      parent_->DropKid(this);
+      if (parent_->kids_.size() == 0UL) {
+        delete parent_;
+      }
+    }
+    VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_;
+  }
+
+  void AppendTo(BeamNode* parent) {
+    parent_ = parent;
+    parent->kids_.insert(this);
+  }
+
+  void DropKid(BeamNode* kid) { kids_.erase(kid); }
+
+  BeamNode* parent_ = nullptr;
+  std::unordered_set<BeamNode*> kids_;
+  int64_t word_id_;
+  T score_;
+};
+
+template <typename T>
+using BeamNodeVector = std::vector<std::unique_ptr<BeamNode<T>>>;
+
+template <typename T>
+struct Sentence {
+  std::vector<int64_t> word_ids;
+  std::vector<T> scores;
+};
+
+template <typename T>
+using SentenceVector = std::vector<Sentence<T>>;
+
+template <typename T>
+struct BeamSearchDecoder {
+  /**
+   * make a BeamNode and all it's related prefix BeanNode into a Sentence.
+   */
+  Sentence<T> MakeSentence(const BeamNode<T>* node) const;
+
+  /**
+   * Param:
+   *  cur_ids: LoDTensor of One step for word ID
+   *  cur_scores: LoDTensor of One Step for word score
+   *  prefixes_list: prefixes for each source sentence.
+   *  sentence_vector_list: result sentence_vector for each source sentence.
+   * Return:
+   *  a new prefixes list for each source of current step
+   */
+  std::vector<BeamNodeVector<T>> PackTwoSteps(
+      const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+      std::vector<BeamNodeVector<T>>& prefixes_list,
+      std::vector<SentenceVector<T>>* sentence_vector_list) const;
+
+  /**
+   * convert the result sentence_vector for each source sentence into two
+   * LodTensor.
+   * One is all candidate sentences with word id, one is all candidate sentences
+   * with word score.
+   * Param:
+   *  sentence_vector_list: sentence_vector for each source sentence.
+   *  id_tensor: result LoDTensor for sentences of id.
+   *  score_tensor: result LoDTensor for sentences of score.
+   */
+  void ConvertSentenceVectorToLodTensor(
+      std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
+      LoDTensor* score_tensor) const;
+
+  /**
+   * Pack all steps of id/score LodTensor into sentence LoDTensor
+   * it's main logic is:
+   * ```python
+   *   prefix
+   *   result_sentence
+   *   result_lod_tensor
+   *
+   *   for (step in steps):
+   *     prefix = PackTwoSteps(prefix, step, &result_sentence)
+   *   ConvertSentenceVector<T>ToLodTensor(result_sentence, &result_lod_tensor)
+   * ```
+   */
+  void PackAllSteps(const LoDTensorArray& step_ids,
+                    const LoDTensorArray& step_scores, LoDTensor* id_tensor,
+                    LoDTensor* score_tensor) const;
+};
+
+template <typename T>
+Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
+  Sentence<T> sentence;
+  while (node != nullptr) {
+    sentence.word_ids.emplace_back(node->word_id_);
+    sentence.scores.emplace_back(node->score_);
+    node = node->parent_;
+  }
+
+  std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids));
+  std::reverse(std::begin(sentence.scores), std::end(sentence.scores));
+
+  return sentence;
+}
+
+template <typename T>
+std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
+    const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+    std::vector<BeamNodeVector<T>>& prefixes_list,
+    std::vector<SentenceVector<T>>* sentence_vector_list) const {
+  std::vector<BeamNodeVector<T>> result;
+
+  for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1;
+       ++src_idx) {
+    size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx];
+    size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
+
+    BeamNodeVector<T> beam_nodes;
+
+    // if prefixes size is 0, it means this is the first step. In this step,
+    // all candidate id is the start of candidate sentences.
+    if (prefixes_list.empty()) {
+      PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
+                        cur_ids.lod().at(kSentenceLevel).back(),
+                        "in the first step");
+      for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) {
+        beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(new BeamNode<T>(
+            cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
+      }
+    } else {
+      BeamNodeVector<T>& prefixes = prefixes_list[src_idx];
+      SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
+
+      PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
+                        "prefix and candidate set number should be the same");
+
+      auto candidate_offset = cur_ids.lod()[kSentenceLevel];
+      for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) {
+        std::unique_ptr<BeamNode<T>>& prefix = prefixes[prefix_idx];
+        size_t candidate_start = candidate_offset[src_start + prefix_idx];
+        size_t candidate_end = candidate_offset[src_start + prefix_idx + 1];
+        if (candidate_start == candidate_end) {
+          VLOG(3) << "this sentence has no more candidate, "
+                     "add to result sentence and rm it from beam tree";
+          sentence_vector.push_back(MakeSentence(prefix.get()));
+          prefix.reset();
+        } else {
+          for (size_t candidate_idx = candidate_start;
+               candidate_idx < candidate_end; ++candidate_idx) {
+            auto* candidate =
+                new BeamNode<T>(cur_ids.data<int64_t>()[candidate_idx],
+                                cur_scores.data<T>()[candidate_idx]);
+            candidate->AppendTo(prefix.get());
+            beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(candidate));
+          }
+          prefix.release();
+        }
+      }
+    }
+    result.push_back(std::move(beam_nodes));
+  }
+  return result;
+}
+
+template <typename T>
+void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
+    std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
+    LoDTensor* score_tensor) const {
+  size_t src_num = sentence_vector_list.size();
+
+  PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0");
+
+  std::vector<size_t> source_level_lod = {0};
+  std::vector<size_t> sentence_level_lod = {0};
+  std::vector<int64_t> id_data;
+  std::vector<T> score_data;
+
+  for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+    for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
+      id_data.insert(id_data.end(), sentence.word_ids.begin(),
+                     sentence.word_ids.end());
+      score_data.insert(score_data.end(), sentence.scores.begin(),
+                        sentence.scores.end());
+      sentence_level_lod.push_back(sentence_level_lod.back() +
+                                   sentence.word_ids.size());
+    }
+    source_level_lod.push_back(source_level_lod.back() +
+                               sentence_vector_list[src_idx].size());
+  }
+
+  auto cpu_place = new paddle::platform::CPUPlace();
+  paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place);
+
+  framework::LoD lod;
+  lod.push_back(source_level_lod);
+  lod.push_back(sentence_level_lod);
+
+  id_tensor->set_lod(lod);
+  id_tensor->Resize({static_cast<int64_t>(id_data.size())});
+  id_tensor->mutable_data<int64_t>(paddle::platform::CPUPlace());
+  id_tensor->CopyFromVector<int64_t>(id_data, cpu_ctx);
+
+  score_tensor->set_lod(lod);
+  score_tensor->Resize({static_cast<int64_t>(score_data.size())});
+  score_tensor->mutable_data<T>(paddle::platform::CPUPlace());
+  score_tensor->CopyFromVector<T>(score_data, cpu_ctx);
+}
+
+template <typename T>
+void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
+                                        const LoDTensorArray& step_scores,
+                                        LoDTensor* id_tensor,
+                                        LoDTensor* score_tensor) const {
+  PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
+  PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
+                    "step_ids and step_scores should be the same");
+  const size_t step_num = step_ids.size();
+  const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
+
+  PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0");
+
+  // previous prefixes for each step,
+  // the init length is 0, means this is the first step.
+  std::vector<BeamNodeVector<T>> beamnode_vector_list(0);
+  std::vector<SentenceVector<T>> sentence_vector_list(src_num);
+
+  // pack all steps for one batch first, then another batch
+  for (size_t step_id = 0; step_id < step_num; ++step_id) {
+    beamnode_vector_list =
+        PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
+                     beamnode_vector_list, &sentence_vector_list);
+  }
+  // append last beam_node to result
+  for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+    for (auto& beam_node : beamnode_vector_list.at(src_idx)) {
+      sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get()));
+      beam_node.reset();
+    }
+  }
+
+  ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
+                                   score_tensor);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/beam_search_decode_op_test.cc b/paddle/operators/beam_search_decode_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ac23991f3c7768abaf94f3a4b750697de0ef114
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op_test.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+#include "gtest/gtest.h"
+
+using CPUPlace = paddle::platform::CPUPlace;
+using LoD = paddle::framework::LoD;
+using LoDTensor = paddle::framework::LoDTensor;
+using LoDTensorArray = paddle::framework::LoDTensorArray;
+
+template <typename T>
+using BeamNode = paddle::operators::BeamNode<T>;
+template <typename T>
+using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
+template <typename T>
+using Sentence = paddle::operators::Sentence<T>;
+template <typename T>
+using BeamNodeVector = paddle::operators::BeamNodeVector<T>;
+template <typename T>
+using SentenceVector = paddle::operators::SentenceVector<T>;
+
+namespace paddle {
+namespace test {
+
+void GenerateExample(const std::vector<size_t>& level_0,
+                     const std::vector<size_t>& level_1,
+                     const std::vector<int>& data, LoDTensorArray* ids,
+                     LoDTensorArray* scores) {
+  PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1,
+                    "source level is used to describe candidate set");
+  PADDLE_ENFORCE_EQ(level_1.back(), data.size(),
+                    "the lowest level is used to describe data"
+                    ", so it's last element should be data length");
+
+  CPUPlace place;
+
+  LoD lod;
+  lod.push_back(level_0);
+  lod.push_back(level_1);
+
+  // Ids
+  LoDTensor tensor_id;
+  tensor_id.set_lod(lod);
+  tensor_id.Resize({static_cast<int64_t>(data.size())});
+  // malloc memory
+  int64_t* id_ptr = tensor_id.mutable_data<int64_t>(place);
+  for (size_t i = 0; i < data.size(); ++i) {
+    id_ptr[i] = static_cast<int64_t>(data.at(i));
+  }
+
+  // Scores
+  LoDTensor tensor_score;
+  tensor_score.set_lod(lod);
+  tensor_score.Resize({static_cast<int64_t>(data.size())});
+  // malloc memory
+  float* score_ptr = tensor_score.mutable_data<float>(place);
+  for (size_t i = 0; i < data.size(); ++i) {
+    score_ptr[i] = static_cast<float>(data.at(i));
+  }
+
+  ids->push_back(tensor_id);
+  scores->push_back(tensor_score);
+}
+
+}  // namespace test
+}  // namespace paddle
+
+TEST(BeamSearchDecodeOp, DeleteBeamNode) {
+  auto* root = new BeamNode<float>(0, 0);
+  auto* b1 = new BeamNode<float>(1, 1);
+  auto* b2 = new BeamNode<float>(2, 2);
+  auto* b3 = new BeamNode<float>(3, 3);
+
+  b1->AppendTo(root);
+  b2->AppendTo(root);
+  b3->AppendTo(b1);
+
+  delete b3;
+  delete b2;
+}
+
+TEST(BeamSearchDecodeOp, MakeSentence) {
+  auto* root = new BeamNode<float>(0, 0);
+  auto* b1 = new BeamNode<float>(1, 1);
+  auto* end = new BeamNode<float>(2, 2);
+  b1->AppendTo(root);
+  end->AppendTo(b1);
+
+  BeamSearchDecoder<float> helper;
+  Sentence<float> sentence = helper.MakeSentence(end);
+  delete end;
+
+  std::vector<int64_t> expect_ids = {0, 1, 2};
+  ASSERT_EQ(sentence.word_ids, expect_ids);
+
+  std::vector<float> expect_scores = {0, 1, 2};
+  ASSERT_EQ(sentence.scores, expect_scores);
+}
+
+TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
+  CPUPlace place;
+
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 2, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
+      std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
+
+  std::vector<BeamNodeVector<float>> beamnode_vector_list;
+  std::vector<SentenceVector<float>> sentence_vector_list(
+      2, SentenceVector<float>());
+
+  BeamSearchDecoder<float> helper;
+  beamnode_vector_list = helper.PackTwoSteps(
+      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+  ASSERT_EQ(beamnode_vector_list.size(), 2UL);
+  ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
+  ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
+}
+
+TEST(BeamSearchDecodeOp, PackTwoSteps) {
+  CPUPlace place;
+
+  // first source has three prefix
+  BeamNodeVector<float> source0_prefixes;
+  source0_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(1, 1)));
+  source0_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(0, 0)));
+  source0_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(3, 3)));
+
+  // second source has two prefix
+  BeamNodeVector<float> source1_prefixes;
+  source1_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(4, 4)));
+  source1_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(5, 5)));
+
+  std::vector<BeamNodeVector<float>> beamnode_vector_list;
+  std::vector<SentenceVector<float>> sentence_vector_list(
+      2, SentenceVector<float>());
+
+  beamnode_vector_list.push_back(std::move(source0_prefixes));
+  beamnode_vector_list.push_back(std::move(source1_prefixes));
+
+  // generate data for one step
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+
+  paddle::test::GenerateExample(std::vector<size_t>{0, 3, 5},
+                                std::vector<size_t>{0, 1, 1, 3, 4, 5},
+                                std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
+
+  BeamSearchDecoder<float> helper1;
+  beamnode_vector_list = helper1.PackTwoSteps(
+      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+
+  ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
+  ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
+  ASSERT_EQ(beamnode_vector_list[0].size(), 3UL);
+  ASSERT_EQ(beamnode_vector_list[1].size(), 2UL);
+}
+
+TEST(BeamSearchDecodeOp, PackAllSteps) {
+  CPUPlace place;
+
+  // we will constuct a sample data with 3 steps and 2 source sentences
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
+      std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 1, 3, 5, 5, 6},
+      std::vector<int>{0, 1, 2, 3, 4, 5}, &ids, &scores);
+  paddle::test::GenerateExample(std::vector<size_t>{0, 3, 6},
+                                std::vector<size_t>{0, 0, 1, 2, 3, 4, 5},
+                                std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
+
+  ASSERT_EQ(ids.size(), 3UL);
+  ASSERT_EQ(scores.size(), 3UL);
+
+  BeamSearchDecoder<float> helper;
+
+  LoDTensor id_tensor;
+  LoDTensor score_tensor;
+  helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor);
+
+  LoD lod = id_tensor.lod();
+  std::vector<size_t> expect_source_lod = {0, 4, 8};
+  EXPECT_EQ(lod[0], expect_source_lod);
+  std::vector<size_t> expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19};
+  EXPECT_EQ(lod[1], expect_sentence_lod);
+  // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4
+  std::vector<int> expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5,
+                                  4, 3, 2, 4, 4, 3, 6, 5, 4};
+  ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size()));
+  for (size_t i = 0; i < expect_data.size(); ++i) {
+    ASSERT_EQ(id_tensor.data<int64_t>()[i],
+              static_cast<int64_t>(expect_data[i]));
+  }
+  for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) {
+    ASSERT_EQ(score_tensor.data<float>()[i],
+              static_cast<float>(id_tensor.data<int64_t>()[i]));
+  }
+}
diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c65ba7eb262f3aabe2c00837b79806c0b40b60fd
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class BilinearTensorProductOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL,
+                      "The input(Weight) must be a 3D tensor.");
+    PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
+                      "The first dimension(batch_size) of input(X) must be "
+                      "equal to the first dimension of the input(Y).");
+    PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1],
+                      "The second dimension of input(X) must be equal to "
+                      "the second dimension of the input(Weight).");
+    PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2],
+                      "The second dimension of input(Y) must be equal to "
+                      "the third dimension of the input(Weight).");
+
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL,
+                     "The Input(Bias) must be a 2-D tensor with "
+                     "the 2nd dimension fixed to 1 (a row vector).");
+      PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0],
+                        "The second dimension of input(Bias) must be equal "
+                        "to the first dimension of the input(Weight).");
+    }
+
+    ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BilinearTensorProductOpMaker(framework::OpProto* proto,
+                               framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of bilinear_tensor_product operator.");
+    AddInput("Y", "The second input of bilinear_tensor_product operator.");
+    AddInput("Weight",
+             "The learnable parameters of bilinear_tensor_product operator.");
+    AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.")
+        .AsDispensable();
+    AddOutput("Out", "The output of bilinear_tensor_product operator.");
+    AddComment(R"DOC(
+Bilinear Tensor Product operator.
+Given input X and Y, a 3D tensor weight, and bias. Each column of the
+output is computed by one slice i = 1, . . . , k of the tensor:
+
+    M =  (X W_i) \cdot Y
+    Out_i = \sum_i {M_i} + Bias_i
+
+)DOC");
+  }
+};
+
+class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(out_dims.size(), 2UL,
+                      "The input(Out@GRAD) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[0], out_dims[0],
+        "The first dimension(batch_size) of input(Out@GRAD) must be "
+        "equal to the first dimension of the Input(X).");
+    PADDLE_ENFORCE_EQ(
+        weight_dims[0], out_dims[1],
+        "The second dimension of input(Out@GRAD) must be equal to "
+        "the third dimension of the Input(Weight).");
+
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      PADDLE_ENFORCE_EQ(
+          bias_dims[1], out_dims[1],
+          "The second dimension of input(Out@GRAD) must be equal to "
+          "the second dimension of the Input(Bias).");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    auto weight_grad_name = framework::GradVarName("Weight");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+    if (ctx->HasOutput(weight_grad_name)) {
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
+            ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
+            ops::BilinearTensorProductOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..858d2668d01379afe8082cd1eda32a2a5d09bd18
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffa4f43a327418498c1f110504127e7d2878409d
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.h
@@ -0,0 +1,184 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class BilinearTensorProductKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto output_mat = EigenMatrix<T>::From(*out);
+
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // Create the intermediate variable to caculate the result of
+    // Input(X) multiplied by Input(Weight_i), the formula is:
+    // left_mul = X Weight_i.
+    Tensor left_mul;
+    left_mul.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                             ctx.GetPlace());
+    auto left_mul_mat = EigenMatrix<T>::From(left_mul);
+
+    for (int i = 0; i < out_dim; ++i) {
+      auto output_col_vec = output_mat.chip(i, 1);
+      Tensor weight_mat =
+          weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
+      math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+                           batch_size, y_dim, x_dim, 1, x->data<T>(),
+                           weight_mat.data<T>(), 0, left_mul.data<T>());
+      output_col_vec.device(place) =
+          (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
+    }
+    if (bias) {
+      auto bias_vec = EigenMatrix<T>::From(*bias);
+      Eigen::DSizes<int, 2> bcast(batch_size, 1);
+      output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
+    }
+  }
+};
+
+template <typename Place, typename T>
+class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor* weight = ctx.Input<Tensor>("Weight");
+    Tensor* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* d_y = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    Tensor* d_weight = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    Tensor* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    const Tensor* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+
+    auto x_mat = EigenMatrix<T>::From(*x);
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto d_out_mat = EigenMatrix<T>::From(*d_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // Create the intermediate variable to caculate the Output(Y@Grad).
+    Tensor x_scale;
+    x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
+                            ctx.GetPlace());
+    auto x_scale_mat = EigenMatrix<T>::From(x_scale);
+
+    // Create the intermediate variable to caculate the Output(X@Grad).
+    Tensor y_scale;
+    y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                            ctx.GetPlace());
+    auto y_scale_mat = EigenMatrix<T>::From(y_scale);
+
+    math::SetConstant<Place, T> set_zero;
+
+    // Set Output(X@Grad) be zero.
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      set_zero(ctx.device_context(), d_x, static_cast<T>(0));
+    }
+
+    // Set Output(Y@Grad) be zero.
+    if (d_y) {
+      d_y->mutable_data<T>(ctx.GetPlace());
+      set_zero(ctx.device_context(), d_y, static_cast<T>(0));
+    }
+
+    // Caculate the Output(X@Grad) and Output(Y@Grad).
+    if (d_x || d_y) {
+      Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
+      Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        Tensor weight_i = weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        auto output_vec = d_out_mat.chip(i, 1);
+        if (d_x) {
+          y_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_x) *
+              y_mat;
+          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasTrans,
+                               batch_size, x_dim, y_dim, 1, y_scale.data<T>(),
+                               weight_i.data<T>(), 1, d_x->data<T>());
+        }
+        if (d_y) {
+          x_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_y) *
+              x_mat;
+          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+                               batch_size, y_dim, x_dim, 1, x_scale.data<T>(),
+                               weight_i.data<T>(), 1, d_y->data<T>());
+        }
+      }
+    }
+
+    // Caculate the gradient of Input(Weight).
+    if (d_weight) {
+      d_weight->mutable_data<T>(ctx.GetPlace());
+      Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        auto output_vec = d_out_mat.chip(i, 1);
+        x_scale_mat.device(place) =
+            output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                .broadcast(bcast_for_weight) *
+            x_mat;
+        math::gemm<Place, T>(ctx.device_context(), CblasTrans, CblasNoTrans,
+                             x_dim, y_dim, batch_size, 1, x_scale.data<T>(),
+                             y->data<T>(), 0, d_weight_i.data<T>());
+      }
+    }
+
+    // Caculate the gradient of Input(Bias).
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      auto d_bias_mat = EigenMatrix<T>::From(*d_bias);
+      d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc
index 716b5ee92d0d8737d2069460f53989f691ff7c77..bf7e88368157d29e627c3c06384f28b6e5e4ecc1 100644
--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -94,5 +94,13 @@ class CompareOp : public framework::OperatorWithKernel {
 
 REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
 REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
+REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_OP(greater_than, "Out = X > Y");
+REGISTER_LOGICAL_KERNEL(greater_than, CPU,
+                        paddle::operators::GreaterThanFunctor);
+REGISTER_LOGICAL_OP(greater_equal, "Out = X >= Y");
+REGISTER_LOGICAL_KERNEL(greater_equal, CPU,
+                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_OP(equal, "Out = X == Y");
 REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu
index 42a5bb2f45fd389f60c3dc034cade7f56a907e35..6ac8c124b9b2e7c808808ecc8802a2e5aeaa5b5d 100644
--- a/paddle/operators/compare_op.cu
+++ b/paddle/operators/compare_op.cu
@@ -15,4 +15,9 @@
 #include "paddle/operators/compare_op.h"
 
 REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_KERNEL(greater_than, GPU,
+                        paddle::operators::GreaterThanFunctor);
+REGISTER_LOGICAL_KERNEL(greater_equal, GPU,
+                        paddle::operators::GreaterEqualFunctor);
 REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h
index 04e04e347b398abb5fb66876bf801b1eee688ec6..afdf3ab3e098b4e7f4c996471617d97ec49264b1 100644
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -27,6 +27,24 @@ struct LessThanFunctor {
   HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
 };
 
+template <typename T>
+struct LessEqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; }
+};
+
+template <typename T>
+struct GreaterThanFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; }
+};
+
+template <typename T>
+struct GreaterEqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; }
+};
+
 template <typename T>
 struct EqualFunctor {
   using ELEM_TYPE = T;
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5b124682d755ffb39f32c9f001a3cf113a01a2c
--- /dev/null
+++ b/paddle/operators/conditional_block_op.cc
@@ -0,0 +1,197 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <algorithm>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class ConditionalOp : public framework::OperatorBase {
+ public:
+  ConditionalOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  std::vector<const framework::LoDTensor *> InputTensors(
+      const framework::Scope &scope) const {
+    std::vector<const framework::LoDTensor *> retv;
+    auto xs = Inputs("X");
+    retv.resize(xs.size(), nullptr);
+    std::transform(
+        xs.begin(), xs.end(), retv.begin(),
+        [&scope](const std::string &var_name) -> const framework::LoDTensor * {
+          auto *var = scope.FindVar(var_name);
+          PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name);
+          return &var->Get<framework::LoDTensor>();
+        });
+    return retv;
+  }
+};
+
+class ConditionalBlockOp : public ConditionalOp {
+ public:
+  ConditionalBlockOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto xs = InputTensors(scope);
+    bool need_run = std::all_of(
+        xs.begin(), xs.end(),
+        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    if (need_run) {
+      auto *scope_var = scope.FindVar(Output("Scope"));
+      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+      auto *scopes = scope_var->GetMutable<std::vector<framework::Scope *>>();
+      scopes->resize(1);
+      scopes->front() = &scope.NewScope();
+      auto &cur_scope = *scopes->front();
+
+      auto *block = Attr<framework::BlockDescBind *>("block");
+      framework::Executor exec(dev_ctx);
+      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
+    }
+  }
+};
+
+class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConditionalBlockOpProtoMaker(framework::OpProto *proto,
+                               framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The conditional variable of this operator. If X is empty, the "
+             "whole sub-block will not be executed.")
+        .AsDuplicable();
+    AddInput("Params", "The input variables of the sub-block.").AsDuplicable();
+    AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
+    AddOutput("Scope",
+              "(std::vector<Scope*>) The step scope of conditional block. To "
+              "unify the conditional block, rnn and while op, the type of "
+              "scope is std::vector<Scope*>");
+    AddAttr<framework::BlockDescBind *>(
+        "block", "The step block of conditional block operator");
+    AddComment(R"DOC(Conditional block operator
+
+Run the sub-block if X is not empty. Params is the other inputs and Out is the
+outputs of the sub-block.
+)DOC");
+  }
+};
+
+class ConditionalBlockGradOp : public ConditionalOp {
+ public:
+  ConditionalBlockGradOp(const std::string &type,
+                         const framework::VariableNameMap &inputs,
+                         const framework::VariableNameMap &outputs,
+                         const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto xs = this->InputTensors(scope);
+    bool need_run = std::all_of(
+        xs.begin(), xs.end(),
+        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    if (need_run) {
+      auto *scope_var = scope.FindVar(Input("Scope"));
+      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+      auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
+      framework::Scope &cur_scope = *scopes[0];
+
+      auto *block = Attr<framework::BlockDescBind *>("block");
+      framework::Executor exec(dev_ctx);
+      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
+
+      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"),
+                                  Outputs(framework::GradVarName("Params")));
+
+      AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"),
+                                  Outputs(framework::GradVarName("X")));
+    }
+  }
+
+ private:
+  void AssignLocalGradientToGlobal(
+      const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope,
+      const std::vector<std::string> &p_names,
+      const std::vector<std::string> &pg_names) const {
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      auto out_grad_name = pg_names[i];
+      auto in_grad_name = framework::GradVarName(p_names[i]);
+      auto *in_var = cur_scope.FindVar(in_grad_name);
+      if (in_var == nullptr) {
+        continue;
+      }
+      auto new_in_grad_name = cur_scope.Rename(in_grad_name);
+      auto assign =
+          framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}},
+                                          {{"Out", {out_grad_name}}}, {});
+      assign->Run(cur_scope, dev_ctx);
+      cur_scope.Rename(new_in_grad_name, in_grad_name);
+    }
+  }
+};
+
+class ConditionalBlockGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInputs("X"));
+    if (context->HasInputs("Params")) {
+      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params")));
+      context->SetOutputsDim(framework::GradVarName("Params"),
+                             context->GetInputsDim("Params"));
+    }
+    PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X")));
+    context->SetOutputsDim(framework::GradVarName("X"),
+                           context->GetInputsDim("X"));
+  }
+};
+
+class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto grad_op = new framework::OpDescBind();
+    grad_op->SetType("conditional_block_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput("Params", Input("Params"));
+    grad_op->SetInput("Out", Output("Out"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetInput("Scope", Output("Scope"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params"));
+    grad_op->SetBlockAttr("block", *this->grad_block_[0]);
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp,
+                  ops::ConditionalBlockOpProtoMaker,
+                  ops::ConditionalBlockGradMaker);
+REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp,
+                  ops::ConditionalBlockGradInferShape);
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..282775fcda45fe3bbd72bf04a7ae828f2c840ab7
--- /dev/null
+++ b/paddle/operators/expand_op.cc
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto x_dims = ctx->GetInputDim("X");
+
+    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims.size()), expand_times.size(),
+                      "The number of Attr(expand_times)'s value must be equal "
+                      "to the rank of Input(X).");
+    PADDLE_ENFORCE_LE(x_dims.size(), 6,
+                      "The rank of Input(X) must not be greater than 6.");
+
+    std::vector<int64_t> out_shape(x_dims.size());
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_GE(expand_times[i], 1,
+                        "Each value of Attr(expand_times) should not be "
+                        "less than 1.");
+      out_shape[i] = x_dims[i] * expand_times[i];
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    if (out_shape[0] == x_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+};
+
+class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
+             "X is the input tensor to be expanded.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
+              "The rank of Output(Out) is same as Input(X) except that each "
+              "dimension size of Output(Out) is equal to corresponding "
+              "dimension size of Input(X) multiplying corresponding value of "
+              "Attr(expand_times).");
+    AddAttr<std::vector<int>>("expand_times",
+                              "Expand times number for each dimension.");
+    AddComment(R"DOC(
+Expand operator tiles the input by given times number. You should set times
+number for each dimension by providing attribute 'expand_times'. The rank of X
+should be in [1, 6]. Please notice that size of 'expand_times' must be same with
+X's rank. Following is a using case:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+
+Attr(expand_times):  [1, 2, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+
+)DOC");
+  }
+};
+
+class ExpandGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
+                        "Each dimension size of Input(Out@GRAD) should be "
+                        "equal to multiplication of crroresponding dimension "
+                        "size of Input(X) and Attr(expand_times) value.");
+    }
+
+    auto x_grad_name = framework::GradVarName("X");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
+            ops::ExpandGradOp);
+REGISTER_OP_CPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    expand_grad, ops::ExpandGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6744562b6c21dd8bfeb7e4cb6b809dc7913aa3a5
--- /dev/null
+++ b/paddle/operators/expand_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/expand_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    expand_grad, ops::ExpandGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ae2c11a5d31dafc1b90d129054ebfabfb761bfe
--- /dev/null
+++ b/paddle/operators/expand_op.h
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   You may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_TEMPLATE(z, n, data) \
+  case n + 1: {                     \
+    Expand<n + 1>(context);         \
+    break;                          \
+  }
+#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
+#define COND(n)                                               \
+  BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \
+                         BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_GRAD_CASE(n)                                        \
+  case n: {                                                        \
+    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                         \
+  }
+#define EXPAND_GRAD_TEMPLATE(z, n, data) \
+  BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
+#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ExpandKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {
+      REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED)
+      default:
+        PADDLE_ENFORCE(false,
+                       "Only support tensor with rank being between 1 and 6.");
+    }
+  }
+
+ protected:
+  template <int Rank>
+  void Expand(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    auto x_dims = in0->dims();
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      bcast_dims[i] = expand_times[i];
+    }
+    auto x = EigenTensor<T, Rank>::From(*in0);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0);
+    auto place = context.GetEigenDevice<Place>();
+    y.device(place) = x.broadcast(bcast_dims);
+  }
+};
+
+template <typename Place, typename T>
+class ExpandGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto x_dims = in0->dims();
+    // 1. reshape_dims_vec is the broadcast parameter. For each dimension i,
+    //    if expand_times[i] > 1 and x_dims[i] > 1, i will be splitted to two
+    //    dimensions [expand_times[i], x_dims[i]].
+    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
+    //    each dimension expanded, the gradients should be summed to original
+    //    size.
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      if (expand_times[i] == 1) {
+        reshape_dims_vec.push_back(x_dims[i]);
+      } else {
+        if (x_dims[i] == 1) {
+          reduce_dims_vec.push_back(reshape_dims_vec.size());
+          reshape_dims_vec.push_back(expand_times[i]);
+        } else {
+          reduce_dims_vec.push_back(reshape_dims_vec.size());
+          reshape_dims_vec.push_back(expand_times[i]);
+          reshape_dims_vec.push_back(x_dims[i]);
+        }
+      }
+    }
+
+    int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED +
+               reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1;
+    // no need reduce, just copy
+    if (reduce_dims_vec.size() == 0) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      out0->CopyFrom(*in0, context.GetPlace(), context.device_context());
+    } else {
+      switch (dims) {
+        REP_EXPAND_GRAD_TEMPLATE(72)
+        default:
+          PADDLE_ENFORCE(
+              false, "Only support tensor with rank being between 1 and 6.");
+      }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void ExpandBackward(const framework::ExecutionContext& context,
+                      const std::vector<int>& reshape_dims_vec,
+                      const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1;
+    size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1;
+    PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reshape dimensions.");
+    PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reduce dimensions.");
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto x = EigenVector<T>::Flatten(*(context.Input<Tensor>("X")));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims / MAX_RANK_SUPPORTED + 1> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims % MAX_RANK_SUPPORTED + 1> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(context.GetEigenDevice<Place>()) =
+        out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h
index de459818ad83d389e5a95e0303ae40b32743c4e7..3c60dc3dc7415f34ed9d238e6f41b197ec404883 100644
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
@@ -29,7 +29,7 @@ class L1NormKernel : public framework::OpKernel<T> {
     Out->mutable_data<T>(context.GetPlace());
 
     auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto out = framework::EigenScalar<T>::From(*Out);
     auto place = context.GetEigenDevice<Place>();
 
     out.device(place) = x.abs().sum();
diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32831cb1e2cf188a507773ef1e00b22de98d82ab
--- /dev/null
+++ b/paddle/operators/lod_reset_op.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LoDResetOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // input check
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LoDResetOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LoDResetOp should not be null.");
+    // If target LoD is not set form Input(), then it must be set from Attr().
+    if (!ctx->HasInput("TargetLoD")) {
+      auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
+      PADDLE_ENFORCE(level0.size() > 1,
+                     "Target LoD is not found, should be set to be a valid one "
+                     "through Input() or Attr().");
+    }
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDResetOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
+    AddInput("TargetLoD",
+             "(Tensor, optional) The target level 0 LoD from Input().")
+        .AsDispensable();
+    AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator.");
+    AddAttr<std::vector<int>>("target_lod",
+                              "The target level 0 LoD from Attr().")
+        .SetDefault(std::vector<int>{});
+    AddComment(R"DOC(LoDReset operator
+
+Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or
+Attr(target_lod), or set LoD for Input(X) if it doesn't have one.
+Currently the lod_reset operator only supports the reset of level 0 LoD.
+At least one of Input(TargetLoD) and Attr(target_lod) must be set,
+and if both of them are set, Input(TargetLoD) will be chosen as the
+target LoD.
+
+An example:
+Given a float LoDTensor X with shape (6, 1), its transpose form represents
+
+    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+
+with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like
+
+    [1.0, 2.0], [3.0, 4.0, 5.0], [6.0].
+
+If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and
+the sequences that the LoDTensor Output(Out) contains becomes:
+
+    [1.0, 2.0, 3.0, 4.0], [5.0, 6.0].
+
+)DOC");
+  }
+};
+
+class LoDResetGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
+            ops::LoDResetGradOp);
+REGISTER_OP_CPU_KERNEL(lod_reset,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5244a17c3aad01909e3b8cf5f4d5abf8a44edc7f
--- /dev/null
+++ b/paddle/operators/lod_reset_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(lod_reset,
+                       ops::LoDResetKernel<paddle::platform::GPUPlace, float>,
+                       ops::LoDResetKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::GPUPlace, float>,
+    ops::LoDResetGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2bb916ccee80c83a02ea429fe95f5fafc86ccfa6
--- /dev/null
+++ b/paddle/operators/lod_reset_op.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class LoDResetKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    auto* in = ctx.Input<framework::LoDTensor>("X");
+    auto* lod_t = ctx.Input<framework::Tensor>("TargetLoD");
+
+    std::vector<int> level0;
+    if (lod_t) {
+      auto* lod = lod_t->data<int>();
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        framework::Tensor lod_cpu;
+        lod_cpu.CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context());
+        lod = lod_cpu.data<int>();
+      }
+      level0 = std::vector<int>(lod, lod + lod_t->numel());
+    } else {
+      level0 = ctx.Attr<std::vector<int>>("target_lod");
+    }
+
+    PADDLE_ENFORCE(level0.size() > 1UL,
+                   "The size of target LoD should be greater than 1.");
+    PADDLE_ENFORCE(level0[0] == 0,
+                   "Target LoD should be a vector starting from 0.");
+    PADDLE_ENFORCE(level0.back() == in->dims()[0],
+                   "Target LoD should be a vector end with the "
+                   "first dimension of Input(X).");
+    for (size_t i = 0; i < level0.size() - 1; ++i) {
+      PADDLE_ENFORCE(level0[i + 1] > level0[i],
+                     "Target LoD should be an ascending vector.");
+    }
+
+    out->ShareDataWith(*in);
+    // cast level0 to size_t
+    std::vector<size_t> ulevel0(level0.size(), 0);
+    std::transform(level0.begin(), level0.end(), ulevel0.begin(),
+                   [](int a) { return static_cast<size_t>(a); });
+    framework::LoD target_lod;
+    target_lod.push_back(ulevel0);
+    out->set_lod(target_lod);
+  }
+};
+
+template <typename Place, typename T>
+class LoDResetGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    d_x->ShareDataWith(*d_out);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 90bc9f4f922e7aa09523bad8ffb3ef477dd89857..ab7f23f57043844d45c36acc475422613164bee1 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -13,7 +13,7 @@ if(WITH_GPU)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
-    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc
index 50cfb88bb5700dda3785e63e0ccc6457cc928da0..ead89e146f32ef005b06f4f6f04224d691805d74 100644
--- a/paddle/operators/math/pooling.cc
+++ b/paddle/operators/math/pooling.cc
@@ -27,15 +27,15 @@ template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
     const int ksize_height = ksize[0];
     const int ksize_width = ksize[1];
     const int stride_height = strides[0];
@@ -47,7 +47,7 @@ class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
     const int output_stride = output_height * output_width;
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -87,11 +87,12 @@ template <typename PoolProcess, class T>
 class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process) {
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -110,7 +111,7 @@ class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -154,10 +155,11 @@ template <class T>
 class MaxPool2dGradFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
@@ -176,7 +178,7 @@ class MaxPool2dGradFunctor<platform::CPUPlace, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -240,17 +242,17 @@ template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
     const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
     const int ksize_depth = ksize[0];
     const int ksize_height = ksize[1];
     const int ksize_width = ksize[2];
@@ -265,7 +267,7 @@ class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
     const int output_stride = output_depth * output_height * output_width;
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -315,11 +317,12 @@ template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_grad_process) {
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -343,7 +346,7 @@ class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -398,10 +401,11 @@ template <class T>
 class MaxPool3dGradFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
@@ -425,7 +429,7 @@ class MaxPool3dGradFunctor<platform::CPUPlace, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -498,15 +502,15 @@ template <typename T>
 class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
     const int ksize_height = ksize[0];
     const int ksize_width = ksize[1];
     const int stride_height = strides[0];
@@ -517,8 +521,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
     const int output_stride = output_height * output_width;
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -563,13 +567,13 @@ template <typename T>
 class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_height = input_grad.dims()[2];
-    const int input_width = input_grad.dims()[3];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
     const int output_channels = output_grad.dims()[1];
     const int output_height = output_grad.dims()[2];
     const int output_width = output_grad.dims()[3];
@@ -578,7 +582,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
 
     const T* mask_data = mask.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int n = 0; n < batch_size; ++n) {
       for (int c = 0; c < output_channels; ++c) {
@@ -612,17 +616,17 @@ template <typename T>
 class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
     const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
     const int ksize_depth = ksize[0];
     const int ksize_height = ksize[1];
     const int ksize_width = ksize[2];
@@ -636,8 +640,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
     const int output_stride = output_depth * output_height * output_width;
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());
 
     for (int i = 0; i < batch_size; i++) {
       for (int c = 0; c < output_channels; ++c) {
@@ -691,14 +695,14 @@ template <typename T>
 class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_depth = input_grad.dims()[2];
-    const int input_height = input_grad.dims()[3];
-    const int input_width = input_grad.dims()[4];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_depth = input_grad->dims()[2];
+    const int input_height = input_grad->dims()[3];
+    const int input_width = input_grad->dims()[4];
     const int output_channels = output_grad.dims()[1];
     const int output_depth = output_grad.dims()[2];
     const int output_height = output_grad.dims()[3];
@@ -708,7 +712,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
 
     const T* mask_data = mask.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     for (int n = 0; n < batch_size; ++n) {
       for (int c = 0; c < output_channels; ++c) {
diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu
index 736327f4b7b9e9df9ce8f7f60b0437fc1d2d373a..6d1138ad50cb095e85b4ceb44fa81731316f10dd 100644
--- a/paddle/operators/math/pooling.cu
+++ b/paddle/operators/math/pooling.cu
@@ -21,13 +21,13 @@ namespace math {
 
 template <typename PoolProcess, typename T>
 __global__ void KernelPool2D(const int nthreads, const T* input_data,
-                             T* output_data, const int channels,
-                             const int input_height, const int input_width,
-                             const int output_height, const int output_width,
-                             const int ksize_height, const int ksize_width,
-                             const int stride_height, const int stride_width,
-                             const int padding_height, const int padding_width,
-                             PoolProcess pool_process) {
+                             const int channels, const int input_height,
+                             const int input_width, const int output_height,
+                             const int output_width, const int ksize_height,
+                             const int ksize_width, const int stride_height,
+                             const int stride_width, const int padding_height,
+                             const int padding_width, PoolProcess pool_process,
+                             T* output_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -59,11 +59,11 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
 template <typename PoolProcess, typename T>
 __global__ void KernelPool2DGrad(
     const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_height, const int input_width, const int output_height,
-    const int output_width, const int ksize_height, const int ksize_width,
-    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width, PoolProcess pool_process) {
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    PoolProcess pool_process, T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int offsetW = index % input_width + padding_width;
@@ -107,11 +107,11 @@ __global__ void KernelPool2DGrad(
 template <typename T>
 __global__ void KernelMaxPool2DGrad(
     const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_height, const int input_width, const int output_height,
-    const int output_width, const int ksize_height, const int ksize_width,
-    const int stride_height, const int stride_width, const int padding_height,
-    const int padding_width) {
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -158,16 +158,16 @@ template <typename PoolProcess, typename T>
 class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
     const int ksize_height = ksize[0];
     const int ksize_width = ksize[1];
     const int stride_height = strides[0];
@@ -176,7 +176,7 @@ class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
     const int padding_width = paddings[1];
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -187,11 +187,10 @@ class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
         PoolProcess,
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_data, output_data, input_channels,
-                              input_height, input_width, output_height,
-                              output_width, ksize_height, ksize_width,
-                              stride_height, stride_width, padding_height,
-                              padding_width, pool_process);
+                 .stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, pool_process, output_data);
   }
 };
 
@@ -204,11 +203,11 @@ template <typename PoolProcess, typename T>
 class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process) {
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -225,7 +224,7 @@ class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * input_channels * input_height * input_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -237,10 +236,10 @@ class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, output_height, output_width,
-        ksize_height, ksize_width, stride_height, stride_width, padding_height,
-        padding_width, pool_process);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        pool_process, input_grad_data);
   }
 };
 
@@ -253,10 +252,11 @@ template <typename T>
 class MaxPool2dGradFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -274,7 +274,7 @@ class MaxPool2dGradFunctor<platform::GPUPlace, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -285,10 +285,10 @@ class MaxPool2dGradFunctor<platform::GPUPlace, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_height, input_width, output_height, output_width,
-        ksize_height, ksize_width, stride_height, stride_width, padding_height,
-        padding_width);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
   }
 };
 
@@ -313,14 +313,16 @@ template class Pool2dGradFunctor<
     platform::GPUPlace, paddle::operators::math::AvgPoolGrad<double>, double>;
 
 template <typename PoolProcess, typename T>
-__global__ void KernelPool3D(
-    const int nthreads, const T* input_data, T* output_data, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    PoolProcess pool_process) {
+__global__ void KernelPool3D(const int nthreads, const T* input_data,
+                             const int channels, const int input_depth,
+                             const int input_height, const int input_width,
+                             const int output_depth, const int output_height,
+                             const int output_width, const int ksize_depth,
+                             const int ksize_height, const int ksize_width,
+                             const int stride_depth, const int stride_height,
+                             const int stride_width, const int padding_depth,
+                             const int padding_height, const int padding_width,
+                             PoolProcess pool_process, T* output_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -358,13 +360,13 @@ __global__ void KernelPool3D(
 template <typename PoolProcess, typename T>
 __global__ void KernelPool3DGrad(
     const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height, const int padding_width,
-    PoolProcess pool_process) {
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, PoolProcess pool_process,
+    T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int offsetW = index % input_width + padding_width;
@@ -422,13 +424,12 @@ __global__ void KernelPool3DGrad(
 template <typename T>
 __global__ void KernelMaxPool3DGrad(
     const int nthreads, const T* input_data, const T* output_data,
-    const T* output_grad, T* input_grad, const int channels,
-    const int input_depth, const int input_height, const int input_width,
-    const int output_depth, const int output_height, const int output_width,
-    const int ksize_depth, const int ksize_height, const int ksize_width,
-    const int stride_depth, const int stride_height, const int stride_width,
-    const int padding_depth, const int padding_height,
-    const int padding_width) {
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -480,18 +481,18 @@ template <typename PoolProcess, class T>
 class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_process) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
     const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
     const int ksize_depth = ksize[0];
     const int ksize_height = ksize[1];
     const int ksize_width = ksize[2];
@@ -503,7 +504,7 @@ class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
     const int padding_width = paddings[2];
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
@@ -516,11 +517,11 @@ class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, input_channels, input_depth,
-        input_height, input_width, output_depth, output_height, output_width,
-        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
-        stride_width, padding_depth, padding_height, padding_width,
-        pool_process);
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, pool_process,
+        output_data);
   }
 };
 
@@ -533,11 +534,11 @@ template <typename PoolProcess, class T>
 class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_process) {
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -560,7 +561,7 @@ class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads =
         batch_size * input_channels * input_depth * input_height * input_width;
@@ -573,11 +574,11 @@ class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_depth, input_height, input_width, output_depth,
-        output_height, output_width, ksize_depth, ksize_height, ksize_width,
-        stride_depth, stride_height, stride_width, padding_depth,
-        padding_height, padding_width, pool_process);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, pool_process, input_grad_data);
   }
 };
 
@@ -590,10 +591,11 @@ template <class T>
 class MaxPool3dGradFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
@@ -616,7 +618,7 @@ class MaxPool3dGradFunctor<platform::GPUPlace, T> {
     const T* input_data = input.data<T>();
     const T* output_data = output.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
@@ -628,11 +630,11 @@ class MaxPool3dGradFunctor<platform::GPUPlace, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, output_grad_data, input_grad_data,
-        input_channels, input_depth, input_height, input_width, output_depth,
-        output_height, output_width, ksize_depth, ksize_height, ksize_width,
-        stride_depth, stride_height, stride_width, padding_depth,
-        padding_height, padding_width);
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, input_grad_data);
   }
 };
 
@@ -658,11 +660,11 @@ template class Pool3dGradFunctor<
 
 template <typename T>
 __global__ void KernelMaxPool2dWithIdx(
-    const int nthreads, const T* input_data, T* output_data, T* mask_data,
-    const int channels, const int input_height, const int input_width,
-    const int output_height, const int output_width, const int ksize_height,
-    const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width) {
+    const int nthreads, const T* input_data, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width, T* output_data, T* mask_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -697,11 +699,11 @@ __global__ void KernelMaxPool2dWithIdx(
 
 template <typename T>
 __global__ void KernelMaxPool2DWithIdxGrad(
-    const int nthreads, T* input_grad, const T* output_grad, const T* mask_data,
+    const int nthreads, const T* output_grad, const T* mask_data,
     const int channels, const int input_height, const int input_width,
     const int output_height, const int output_width, const int ksize_height,
     const int ksize_width, const int stride_height, const int stride_width,
-    const int padding_height, const int padding_width) {
+    const int padding_height, const int padding_width, T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset = index % input_width;
@@ -748,16 +750,16 @@ template <typename T>
 class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
     const int input_width = input.dims()[3];
-    const int output_channels = output.dims()[1];
-    const int output_height = output.dims()[2];
-    const int output_width = output.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
     const int ksize_height = ksize[0];
     const int ksize_width = ksize[1];
     const int stride_height = strides[0];
@@ -766,8 +768,8 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
     const int padding_width = paddings[1];
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_height * output_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -777,11 +779,10 @@ class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
     KernelMaxPool2dWithIdx<
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_data, output_data, mask_data,
-                              input_channels, input_height, input_width,
-                              output_height, output_width, ksize_height,
-                              ksize_width, stride_height, stride_width,
-                              padding_height, padding_width);
+                 .stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, output_data, mask_data);
   }
 };
 
@@ -794,14 +795,14 @@ template <typename T>
 class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_channels = input_grad.dims()[1];
-    const int input_height = input_grad.dims()[2];
-    const int input_width = input_grad.dims()[3];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_channels = input_grad->dims()[1];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
     const int output_height = output_grad.dims()[2];
     const int output_width = output_grad.dims()[3];
     const int ksize_height = ksize[0];
@@ -813,7 +814,7 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
 
     const T* mask_data = mask.data<T>();
     const T* output_grad_data = output_grad.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * input_channels * input_height * input_width;
     int blocks = (nthreads + 1024 - 1) / 1024;
@@ -823,11 +824,11 @@ class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
     KernelMaxPool2DWithIdxGrad<
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                 .stream()>>>(nthreads, input_grad_data, output_grad_data,
-                              mask_data, input_channels, input_height,
-                              input_width, output_height, output_width,
-                              ksize_height, ksize_width, stride_height,
-                              stride_width, padding_height, padding_width);
+                 .stream()>>>(nthreads, output_grad_data, mask_data,
+                              input_channels, input_height, input_width,
+                              output_height, output_width, ksize_height,
+                              ksize_width, stride_height, stride_width,
+                              padding_height, padding_width, input_grad_data);
   }
 };
 
@@ -838,13 +839,13 @@ template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double>;
 
 template <typename T>
 __global__ void KernelMaxPool3DWithIdx(
-    const int nthreads, const T* input_data, T* output_data, T* mask_data,
-    const int channels, const int input_depth, const int input_height,
-    const int input_width, const int output_depth, const int output_height,
-    const int output_width, const int ksize_depth, const int ksize_height,
-    const int ksize_width, const int stride_depth, const int stride_height,
-    const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width) {
+    const int nthreads, const T* input_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    T* output_data, T* mask_data) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int pw = index % output_width;
@@ -886,13 +887,13 @@ __global__ void KernelMaxPool3DWithIdx(
 
 template <typename T>
 __global__ void KernelMaxPool3DWithIdxGrad(
-    const int nthreads, T* input_grad, const T* output_grad, const T* mask,
-    const int channels, const int input_depth, const int input_height,
-    const int input_width, const int output_depth, const int output_height,
-    const int output_width, const int ksize_depth, const int ksize_height,
-    const int ksize_width, const int stride_depth, const int stride_height,
-    const int stride_width, const int padding_depth, const int padding_height,
-    const int padding_width) {
+    const int nthreads, const T* output_grad, const T* mask, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    T* input_grad) {
   for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
        index += blockDim.x * gridDim.x) {
     int w_offset = index % input_width;
@@ -952,18 +953,18 @@ template <typename T>
 class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_depth = input.dims()[2];
     const int input_height = input.dims()[3];
     const int input_width = input.dims()[4];
-    const int output_channels = output.dims()[1];
-    const int output_depth = output.dims()[2];
-    const int output_height = output.dims()[3];
-    const int output_width = output.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
     const int ksize_depth = ksize[0];
     const int ksize_height = ksize[1];
     const int ksize_width = ksize[2];
@@ -975,8 +976,8 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
     const int padding_width = paddings[2];
 
     const T* input_data = input.data<T>();
-    T* output_data = output.mutable_data<T>(context.GetPlace());
-    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    T* mask_data = mask->mutable_data<T>(context.GetPlace());
 
     int nthreads = batch_size * output_channels * output_depth * output_height *
                    output_width;
@@ -988,11 +989,10 @@ class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_data, output_data, mask_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width);
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, output_data, mask_data);
   }
 };
 
@@ -1005,15 +1005,15 @@ template <typename T>
 class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings) {
-    const int batch_size = input_grad.dims()[0];
-    const int input_channels = input_grad.dims()[1];
-    const int input_depth = input_grad.dims()[2];
-    const int input_height = input_grad.dims()[3];
-    const int input_width = input_grad.dims()[4];
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_channels = input_grad->dims()[1];
+    const int input_depth = input_grad->dims()[2];
+    const int input_height = input_grad->dims()[3];
+    const int input_width = input_grad->dims()[4];
     const int output_depth = output_grad.dims()[2];
     const int output_height = output_grad.dims()[3];
     const int output_width = output_grad.dims()[4];
@@ -1029,7 +1029,7 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
 
     const T* output_grad_data = output_grad.data<T>();
     const T* mask_data = mask.data<T>();
-    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
 
     int nthreads =
         batch_size * input_channels * input_depth * input_height * input_width;
@@ -1041,11 +1041,11 @@ class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
         T><<<grid, threads, 0,
              reinterpret_cast<const platform::CUDADeviceContext&>(context)
                  .stream()>>>(
-        nthreads, input_grad_data, output_grad_data, mask_data, input_channels,
-        input_depth, input_height, input_width, output_depth, output_height,
-        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
-        stride_height, stride_width, padding_depth, padding_height,
-        padding_width);
+        nthreads, output_grad_data, mask_data, input_channels, input_depth,
+        input_height, input_width, output_depth, output_height, output_width,
+        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        input_grad_data);
   }
 };
 
diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h
index c50c57b5c52cdc5c12425cb119b80502aef5451e..f6719e1e628cdd2cf7445ec9cd05713bc4f14c84 100644
--- a/paddle/operators/math/pooling.h
+++ b/paddle/operators/math/pooling.h
@@ -88,60 +88,62 @@ template <typename Place, typename PoolProcess, typename T>
 class Pool2dFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* output);
 };
 
 template <typename Place, typename PoolProcess, typename T>
 class Pool2dGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute);
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
 };
 
 template <typename Place, class T>
 class MaxPool2dGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };
 
 template <typename Place, typename PoolProcess, typename T>
 class Pool3dFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  std::vector<int>& ksize, std::vector<int>& strides,
-                  std::vector<int>& paddings, PoolProcess pool_compute);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* output);
 };
 
 template <typename Place, typename PoolProcess, typename T>
 class Pool3dGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
                   std::vector<int>& strides, std::vector<int>& paddings,
-                  PoolProcess pool_compute);
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
 };
 
 template <typename Place, class T>
 class MaxPool3dGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& input,
                   const framework::Tensor& output,
                   const framework::Tensor& output_grad, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };
 
 /*
@@ -155,38 +157,38 @@ template <typename Place, typename T>
 class MaxPool2dWithIndexFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask);
 };
 
 template <typename Place, typename T>
 class MaxPool2dWithIndexGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };
 
 template <typename Place, typename T>
 class MaxPool3dWithIndexFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  const framework::Tensor& input, framework::Tensor& output,
-                  framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask);
 };
 
 template <typename Place, typename T>
 class MaxPool3dWithIndexGradFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
-                  framework::Tensor& input_grad,
                   const framework::Tensor& output_grad,
                   const framework::Tensor& mask, std::vector<int>& ksize,
-                  std::vector<int>& strides, std::vector<int>& paddings);
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
 };
 
 }  // namespace math
diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h
index 5ce30740c90b5cd0bd4f8ab183cf985ed5d827c1..4f565946d596b5e5fbf90f16c0c13c780c36886c 100644
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
@@ -74,11 +74,10 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
   Tensor output;
   auto in_dims = input.dims();
   if (in_dims.size() == 3) {
-    output.Resize(in_dims);
+    output.Resize({in_dims[1], in_dims[0], in_dims[2]});
     output.mutable_data<T>(context.GetPlace());
     EigenTranspose<Place, T, 3>(context, input, output, {1, 0, 2});
-    std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
-    output.Resize(make_ddim(out_dims));
+    output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
   } else {
     output.ShareDataWith(input);
   }
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80460c476921b63ec5228a9780880c7db3c85217
--- /dev/null
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -0,0 +1,182 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+class MergeLoDTensorOp : public framework::OperatorBase {
+ public:
+  MergeLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
+    auto &in_false =
+        scope.FindVar(Input("InFalse"))->Get<framework::LoDTensor>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+
+    auto &mask_dim = mask.dims();
+
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+
+    int rank = in_true.dims().size();
+    platform::Place place = in_true.place();
+    std::type_index data_type = in_true.type();
+    framework::DDim in_true_dims =
+        framework::slice_ddim(in_true.dims(), 1, rank);
+
+    int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
+
+    auto in_true_dim_vec = framework::vectorize(in_true_dims);
+    in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size);
+
+    framework::DDim out_dims = framework::make_ddim(in_true_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+
+    // Build LoDTensor `out`
+
+    size_t in_true_idx = 0;
+    size_t in_false_idx = 0;
+    for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+      const framework::LoDTensor *input = nullptr;
+      size_t *in_idx = nullptr;
+      if (static_cast<int>(mask_data[i]) == 0) {
+        input = &in_false;
+        in_idx = &in_false_idx;
+      } else {
+        input = &in_true;
+        in_idx = &in_true_idx;
+      }
+      auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+          input->lod(), *in_idx, (*in_idx) + 1, 0);
+      auto &lod_length = lod_and_offset.first;
+
+      framework::AppendLoD(out_lod, lod_length);
+
+      size_t start_offset = lod_and_offset.second.first;
+      size_t end_offset = lod_and_offset.second.second;
+
+      PADDLE_ENFORCE_GE(end_offset, start_offset);
+      size_t len = end_offset - start_offset;
+      if (len == 0) {
+        continue;
+      }
+      out->Slice(out_offset, out_offset + len)
+          .CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx);
+      out_offset += len;
+      (*in_idx) += 1;
+    }
+
+    for (size_t i = 0; i < level; i++) {
+      out_lod->insert(out_lod->begin(), x.lod()[i]);
+    }
+  }
+};
+
+class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MergeLoDTensorOpProtoMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input LoDTensor, contains complete lod information to "
+             "construct the output");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddInput("InTrue", "The True branch to be merged");
+    AddInput("InFalse", "The False branch to be merged");
+    AddOutput("Out", "The merged output LoDTensor");
+    AddAttr<int>("level", "(int) the specific lod level to rank.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Merge True and False branches of LoDTensor into a single Output,
+        with a mask at certain lod level. X is used to obtain complete
+        lod information. Please refer to SplitLoDTensorOp.)DOC");
+  }
+};
+
+class MergeLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "MergeLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "MergeLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasInput("InTrue"),
+                   "MergeLoDTensorOp must has input InTrue.");
+    PADDLE_ENFORCE(context->HasInput("InFalse"),
+                   "MergeLoDTensorOp must has input InFalse.");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "MergeLoDTensorOp must has output Out");
+
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("Out", context->GetInputDim("InTrue"));
+  }
+};
+
+class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("split_lod_tensor");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
+    grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp,
+                  ops::MergeLoDTensorOpProtoMaker,
+                  ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker);
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
index 4da1941ab541483e706257667b14aa5a95e0c3cc..63492a89e8d4e44a036bc3c2b16cc54c7e77b534 100644
--- a/paddle/operators/pool_op.h
+++ b/paddle/operators/pool_op.h
@@ -75,16 +75,16 @@ class PoolKernel : public framework::OpKernel<T> {
               Place, paddle::operators::math::MaxPool<T>, T>
               pool2d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
 
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool2dFunctor<
               Place, paddle::operators::math::AvgPool<T>, T>
               pool2d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
-          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
         }
       } break;
       case 3: {
@@ -93,15 +93,15 @@ class PoolKernel : public framework::OpKernel<T> {
               Place, paddle::operators::math::MaxPool<T>, T>
               pool3d_forward;
           paddle::operators::math::MaxPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
         } else if (pooling_type == "avg") {
           paddle::operators::math::Pool3dFunctor<
               Place, paddle::operators::math::AvgPool<T>, T>
               pool3d_forward;
           paddle::operators::math::AvgPool<T> pool_process;
-          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
-                         paddings, pool_process);
+          pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                         paddings, pool_process, out);
         }
       } break;
       default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
@@ -142,30 +142,30 @@ class PoolGradKernel : public framework::OpKernel<T> {
           if (pooling_type == "max") {
             paddle::operators::math::MaxPool2dGradFunctor<Place, T>
                 pool2d_backward;
-            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings);
+            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, in_x_grad);
           } else if (pooling_type == "avg") {
             paddle::operators::math::Pool2dGradFunctor<
                 Place, paddle::operators::math::AvgPoolGrad<T>, T>
                 pool2d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings, pool_process);
+            pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, pool_process, in_x_grad);
           }
         } break;
         case 3: {
           if (pooling_type == "max") {
             paddle::operators::math::MaxPool3dGradFunctor<Place, T>
                 pool3d_backward;
-            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings);
+            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, in_x_grad);
           } else if (pooling_type == "avg") {
             paddle::operators::math::Pool3dGradFunctor<
                 Place, paddle::operators::math::AvgPoolGrad<T>, T>
                 pool3d_backward;
             paddle::operators::math::AvgPoolGrad<T> pool_process;
-            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
-                            *out_grad, ksize, strides, paddings, pool_process);
+            pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
+                            ksize, strides, paddings, pool_process, in_x_grad);
           }
         } break;
         default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h
index ea37de84abeb577461ccd5c1f0eda8bacb4458eb..c0e3b117dc3ea351b9edfed4d1823de0db27d30a 100644
--- a/paddle/operators/pool_with_index_op.h
+++ b/paddle/operators/pool_with_index_op.h
@@ -46,14 +46,14 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
       case 2: {
         paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
             pool2d_forward;
-        pool2d_forward(context.device_context(), *in_x, *out, *mask, ksize,
-                       strides, paddings);
+        pool2d_forward(context.device_context(), *in_x, ksize, strides,
+                       paddings, out, mask);
       } break;
       case 3: {
         paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
             pool3d_forward;
-        pool3d_forward(context.device_context(), *in_x, *out, *mask, ksize,
-                       strides, paddings);
+        pool3d_forward(context.device_context(), *in_x, ksize, strides,
+                       paddings, out, mask);
       } break;
       default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
     }
@@ -89,14 +89,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
         case 2: {
           paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
               pool2d_backward;
-          pool2d_backward(context.device_context(), *in_x_grad, *out_grad,
-                          *mask, ksize, strides, paddings);
+          pool2d_backward(context.device_context(), *out_grad, *mask, ksize,
+                          strides, paddings, in_x_grad);
         } break;
         case 3: {
           paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
               pool3d_backward;
-          pool3d_backward(context.device_context(), *in_x_grad, *out_grad,
-                          *mask, ksize, strides, paddings);
+          pool3d_backward(context.device_context(), *out_grad, *mask, ksize,
+                          strides, paddings, in_x_grad);
         } break;
         default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
       }
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
index db737bed7a4d2dc5b60cbc6ac172caec95acd35e..d1de0b444712a8c304c33bd194e306dfe3c41f02 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -47,7 +47,7 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
                         framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "(vector<LoDTensor>) Input is a vector of LoDTensor, "
+             "(LodTensorArray) Input is a vector of LoDTensor, "
              "each of which is a variable-length sequence or nested sequence.")
         .AsDuplicable();
     AddOutput("Out",
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index 2b8a25c2414c20efaffedfc8603697b3a104634f..7f136d8cf0e1eaae7b4de32988b60ae8a5034cc6 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -126,6 +126,7 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
       int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
       auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
       auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
+      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
       Eigen::DSizes<int, 2> bcast(h, 1);
 
       if (pooltype == "AVERAGE") {
@@ -136,9 +137,9 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
         in_g_e.device(place) =
             (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
       } else if (pooltype == "LAST") {
-        in_g_e.chip(h - 1, 0).device(place) = out_g_e;
+        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
       } else if (pooltype == "FIRST") {
-        in_g_e.chip(0, 0).device(place) = out_g_e;
+        in_g_e.chip(0, 0).device(place) = out_g_e_v;
       } else {
         PADDLE_THROW("unsupported pooling pooltype");
       }
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..db635f2ba0804143c9a2e04ff006dfbc8744f3fc
--- /dev/null
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -0,0 +1,186 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+struct CopyRange {
+  size_t begin;
+  size_t end;
+};
+
+using LoD = framework::LoD;
+
+class SplitLoDTensorOp : public framework::OperatorBase {
+ public:
+  SplitLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto *out_true =
+        scope.FindVar(Output("OutTrue"))->GetMutable<framework::LoDTensor>();
+    auto *out_false =
+        scope.FindVar(Output("OutFalse"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+    auto &x_lod = x.lod();
+    auto &mask_dim = mask.dims();
+
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+
+    std::vector<std::vector<CopyRange>> copy_ranges(mask_dim[0]);
+
+    // set out_true/out_false lod
+    for (size_t t = 0; t < 2; t++) {
+      LoD *lod = nullptr;
+      if (t == 0) {
+        lod = out_false->mutable_lod();
+      } else {
+        lod = out_true->mutable_lod();
+      }
+      lod->clear();
+      for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+        if (static_cast<size_t>(mask_data[i]) == t) {
+          size_t start_idx = i;
+          auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+              x_lod, start_idx, start_idx + 1, level);
+
+          auto &lod_length = lod_and_offset.first;
+          framework::AppendLoD(lod, lod_length);
+
+          size_t start_offset = lod_and_offset.second.first;
+          size_t end_offset = lod_and_offset.second.second;
+          copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
+        }
+      }
+    }
+
+    for (size_t t = 0; t < 2; ++t) {
+      framework::LoDTensor *out;
+      if (t == 0) {
+        out = out_false;
+      } else {
+        out = out_true;
+      }
+      auto &ranges = copy_ranges[t];
+      size_t height = std::accumulate(
+          ranges.begin(), ranges.end(), 0UL,
+          [](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
+      auto x_dim = x.dims();
+      x_dim[0] = static_cast<int64_t>(height);
+      out->Resize(x_dim);
+      out->mutable_data(x.place(), x.type());
+      size_t offset = 0;
+      for (auto &each_range : ranges) {
+        size_t len = each_range.end - each_range.begin;
+        if (len == 0) {
+          continue;
+        }
+        // out[offset: offset+len] = x[each_range.begin: each_range.end]
+        out->Slice(static_cast<int>(offset), static_cast<int>(offset + len))
+            .CopyFrom(x.Slice(static_cast<int>(each_range.begin),
+                              static_cast<int>(each_range.end)),
+                      x.place(), dev_ctx);
+        offset += len;
+      }
+    }
+  }
+};
+
+class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitLoDTensorOpProtoMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input LoDTensor");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddOutput("OutTrue", "True branch of input LoDTensor");
+    AddOutput("OutFalse", "False branch of input LoDTensor");
+    AddAttr<int>("level", "(int) the specific lod level to split.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Split a LoDTensor with a Mask at certain level. The input LoDTensor
+        has 3 sequence at certain lod level. The Mask is a bool column vector,
+        such as [0, 1, 0] at the same level. The first and third sequence will
+        be send to False Output LoDTensor; whereas the second sequence will
+        be send to True Output LoDTensor. Please refer to MergeLoDTensorOp.)DOC");
+  }
+};
+
+class SplitLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "SplitLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "SplitLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasOutput("OutTrue"),
+                   "SplitLoDTensorOp must has output OutTrue.");
+    PADDLE_ENFORCE(context->HasOutput("OutFalse"),
+                   "SplitLoDTensorOp must has output OutFalse.");
+
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("OutTrue", context->GetInputDim("X"));
+    context->SetOutputDim("OutFalse", context->GetInputDim("X"));
+  }
+};
+
+class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("merge_lod_tensor");
+    grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
+    grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp,
+                  ops::SplitLoDTensorOpProtoMaker,
+                  ops::SplitLoDTensorInferShape,
+                  ops::SplitLoDTensorArrayGradMaker);
diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h
index c8d37ac40c1533a77acf78e6a42e1659555127e1..48d7b1c2d56882f04330dbf27b0a92e37cb8874c 100644
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/operators/squared_l2_norm_op.h
@@ -29,7 +29,7 @@ class SquaredL2NormKernel : public framework::OpKernel<T> {
     Out->mutable_data<T>(context.GetPlace());
 
     auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto out = framework::EigenScalar<T>::From(*Out);
     auto place = context.GetEigenDevice<Place>();
 
     out.device(place) = x.square().sum();
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index f0311095012d944768d80abe423d4a9bfc0e97f5..3b0f09cea6eb34915f21b11fcea6028821a8c3ff 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -200,7 +200,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
                                      false,
                                      useGpu_);
     }
-  } else if (matType == MAT_NORMAL_SHARED) {
+  }
+#ifndef PADDLE_MOBILE_INFERENCE
+  // NOLINTNEXTLINE
+  else if (matType == MAT_NORMAL_SHARED) {
     CHECK_EQ(height * width, bufs_[pType]->getSize());
     size_t blockNum = 0;
     CHECK(isGradShared(&blockNum));
@@ -259,7 +262,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
   } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
     CHECK(isGradSparseUpdate());
     mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
-  } else {
+  }
+#endif
+  // NOLINTNEXTLINE
+  else {
     LOG(FATAL) << "Unsupported mat type" << matType;
   }
 }
diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h
index 248baf6613c5caebdabcecff5d57290585238d78..d9f49527dcf150fcb35d3af512088f75dec0b5c6 100644
--- a/paddle/platform/call_once.h
+++ b/paddle/platform/call_once.h
@@ -27,20 +27,22 @@ namespace platform {
 
  This wrap is a hack to avoid this bug.
 */
-template <class Callable, class... Args>
+template <typename Callable, typename... Args>
 inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
   bool good = false;
   std::exception ex;
-  std::call_once(flag, [&]() {
-    try {
-      f(args...);
-      good = true;
-    } catch (const std::exception& e) {
-      ex = e;
-    } catch (...) {
-      ex = std::runtime_error("excption caught in call_once");
-    }
-  });
+  std::call_once(flag,
+                 [&](Args&&... args) {
+                   try {
+                     f(args...);
+                     good = true;
+                   } catch (const std::exception& e) {
+                     ex = e;
+                   } catch (...) {
+                     ex = std::runtime_error("excption caught in call_once");
+                   }
+                 },
+                 args...);
   if (!good) {
     throw std::exception(ex);
   }
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 0f906e0e470b7f95bb2103ae55330fc1831aa78f..3d8d3f1d2fd3977f945928c723db5fcafffeae85 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -42,6 +42,9 @@ limitations under the License. */
 #include "paddle/platform/gpu_info.h"
 #endif
 
+// disable auto conversion to list in Python
+PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
+
 namespace paddle {
 namespace pybind {
 static size_t UniqueIntegerGenerator(const std::string &prefix) {
diff --git a/paddle/testing/TestUtil.cpp b/paddle/testing/TestUtil.cpp
index c691fe26255914811c8861cff80495c821990179..cfb8c713d96008a74287fb1248657c30f3b81164 100644
--- a/paddle/testing/TestUtil.cpp
+++ b/paddle/testing/TestUtil.cpp
@@ -33,6 +33,7 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
                                  bool withValue,
                                  bool useGpu,
                                  bool equalNnzPerSample) {
+#ifndef PADDLE_MOBILE_INFERENCE
   std::vector<int64_t> ids(height);
   std::vector<int64_t> indices(height + 1);
   indices[0] = 0;
@@ -84,6 +85,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
     }
     return mat;
   }
+#endif
+  return nullptr;
 }
 
 void generateSequenceStartPositions(size_t batchSize,
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 32578ad7799c0a276972ccef7770c2eae8438069..c8632295a25b160513a8e154bf1a5453c0005031 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -37,10 +37,10 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
 
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
         DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so)
 
 
 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
@@ -66,7 +66,7 @@ if (WITH_TESTING)
     add_subdirectory(paddle/v2/tests)
     add_subdirectory(paddle/v2/reader/tests)
     add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/v2/framework/tests)
+    add_subdirectory(paddle/v2/fluid/tests)
   endif()
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 43d02bf70e74c3903d50a4a2177059f4f474045a..5bd68e211ac1c8e05f40dc3ca37eef99f32af47f 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1200,8 +1200,14 @@ def TestData(data_config, async_load_data=None):
 
 #caffe_mode: compute the output size using floor instead of ceil,
 #            which is consistent of caffe and CuDNN's convention.
-def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
-    output = (2 * padding + img_size - filter_size) / float(stride)
+def cnn_output_size(img_size,
+                    filter_size,
+                    padding,
+                    stride,
+                    caffe_mode,
+                    dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    output = (2 * padding + img_size - filter_s) / float(stride)
     if caffe_mode:
         return 1 + int(math.floor(output))
     else:
@@ -1210,8 +1216,14 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
 
 #calcualte image_size based on output_size for de-convolution (ConvTransLayer).
 #It is the reverse function of cnn_output_size
-def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
-    img_size = (output_size - 1) * stride + filter_size - 2 * padding
+def cnn_image_size(output_size,
+                   filter_size,
+                   padding,
+                   stride,
+                   caffe_mode,
+                   dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    img_size = (output_size - 1) * stride + filter_s - 2 * padding
     if not caffe_mode:
         img_size = img_size + 1
     return img_size
@@ -1253,9 +1265,9 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
 def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
-        'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
-    ], "pool-type %s is not in "
-                  "['max-projection', 'avg-projection', "
+        'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
+    ], "pool-type %s is not in " \
+              "['max-projection', 'avg-projection', 'max-pool-with-mask'," \
                   "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)
 
     pool_conf.channels = pool.channels
@@ -1376,6 +1388,12 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
     conv_conf.stride_y = conv.stride_y
     conv_conf.groups = conv.groups
     conv_conf.caffe_mode = conv.caffe_mode
+    if not conv.dilation:
+        conv.dilation = 1
+        conv.dilation_y = 1
+    else:
+        conv_conf.dilation = conv.dilation
+        conv_conf.dilation_y = conv.dilation_y
 
     if not trans:
         conv_conf.filter_channels = conv.channels / conv.groups
@@ -1383,20 +1401,20 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
             get_img_size(input_layer_name, conv.channels)
         conv_conf.output_x = cnn_output_size(
             conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
         conv_conf.output_y = cnn_output_size(
             conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
     else:
         conv_conf.filter_channels = num_filters / conv.groups
         conv_conf.output_x, conv_conf.output_y = \
             get_img_size(input_layer_name, conv.channels)
         conv_conf.img_size = cnn_image_size(
             conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
         conv_conf.img_size_y = cnn_image_size(
             conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
 
 
 #caffe_mode: compute the output size using floor instead of ceil,
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 617fbff948bf03098eca4a31f44d4ff05e73dbcf..b59f2f657d7cd0232be87ee2699e00f08ac3dc5b 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -20,7 +20,7 @@ from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
     ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, BasePoolingType, \
+from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
     CudnnAvgPooling, CudnnMaxPooling
 from .attrs import *
 from .default_decorators import *
@@ -2571,7 +2571,9 @@ def img_conv_layer(input,
 
     if layer_type:
         if dilation > 1 or dilation_y > 1:
-            assert layer_type in ["cudnn_conv", "cudnn_convt"]
+            assert layer_type in [
+                "cudnn_conv", "cudnn_convt", "exconv", "exconvt"
+            ]
         if trans:
             assert layer_type in ["exconvt", "cudnn_convt"]
         else:
@@ -2699,9 +2701,9 @@ def img_pool_layer(input,
     elif isinstance(pool_type, AvgPooling):
         pool_type.name = 'avg'
 
-    assert type(pool_type) in [AvgPooling, MaxPooling, CudnnAvgPooling,
+    assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
                                CudnnMaxPooling], \
-        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling are supported"
+        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"
 
     type_name = pool_type.name + '-projection' \
         if (
@@ -3592,10 +3594,9 @@ def lstm_step_layer(input,
     :type gate_act: BaseActivation
     :param state_act: State Activation Type. TanhActivation is the default.
     :type state_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param bias_attr: The parameter attribute for bias. If this parameter is
+                     set to True or None, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | True
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -3650,9 +3651,10 @@ def gru_step_layer(input,
     :param name: The name of this layer. It is optional.
     :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
     :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: the parameter_attribute for transforming the output_mem
                        from previous step.
@@ -3712,9 +3714,10 @@ def gru_step_naive_layer(input,
     :type act: BaseActivation
     :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
     :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr:
     :param layer_attr:
@@ -3844,9 +3847,10 @@ def recurrent_layer(input,
     :type input: LayerOutput
     :param act: Activation type. TanhActivation is the default.
     :type act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to 
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If the parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: parameter attribute.
     :type param_attr: ParameterAttribute
@@ -4836,9 +4840,10 @@ def tensor_layer(a,
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer config.
     :type layer_attr: ExtraLayerAttribute | None
@@ -4900,9 +4905,10 @@ def selective_fc_layer(input,
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer config.
     :type layer_attr: ExtraLayerAttribute | None
@@ -5585,10 +5591,10 @@ def nce_layer(input,
                              to the num_classes. Each member of the list defines
                              the probability of a class given input x.
     :type neg_distribution: list | tuple | collections.Sequence | None
-    :param bias_attr: The attribute for bias. If this parameter is set False or
-                      any object whose type is not ParameterAttribute, no bias
-                      is added. If this parameter is set True, the bias is
-                      initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -6498,9 +6504,9 @@ def gated_unit_layer(input,
     :param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
                             for details.
     :type gate_param_attr: ParameterAttribute
-    :param gate_bias_attr: The bias attribute of the gate. If the parameter is set to False or
+    :param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
                            an object whose type is not ParameterAttribute, no bias is defined.
-                           If the parameter is set to True, the bias is initialized to zero.
+                           If this parameter is set to True, the bias is initialized to zero.
     :type gate_bias_attr: ParameterAttribute | bool | None | Any
     :param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
                         details.
@@ -6508,9 +6514,9 @@ def gated_unit_layer(input,
     :param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
                               for details.
     :type inproj_param_attr: ParameterAttribute
-    :param inproj_bias_attr: The bias attribute of the projection. If the parameter is set to False
+    :param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
                              or an object whose type is not ParameterAttribute, no bias is defined.
-                             If the parameter is set to True, the bias is initialized to zero.
+                             If this parameter is set to True, the bias is initialized to zero.
     :type inproj_bias_attr: ParameterAttribute | bool | None | Any
     :param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
                        details.
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 3821d075cba5d39b5808a39093b8570d9302b667..d323d34c3ff47614342934c2a02492f66d27dc10 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -681,34 +681,42 @@ def lstmemory_unit(input,
                                    state_act=TanhActivation())
 
 
-    :param input: input layer.
+    :param input: Input layer.
     :type input: LayerOutput
-    :param out_memory: output of previous time step
+    :param out_memory: The output of previous time step.
     :type out_memory: LayerOutput | None
-    :param name: lstmemory unit name.
+    :param name: The lstmemory unit name.
     :type name: basestring
-    :param size: lstmemory unit size.
+    :param size: The lstmemory unit size.
     :type size: int
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
     :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activiation type of lstm.
     :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activiation type of lstm.
     :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activiation type of lstm.
     :type state_act: BaseActivation
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
     :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param lstm_layer_attr: extra attribute of lstm layer.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|True|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
     :type lstm_layer_attr: ExtraLayerAttribute
-    :return: lstmemory unit name.
+    :return: The lstmemory unit name.
     :rtype: LayerOutput
     """
     if size is None:
@@ -786,34 +794,42 @@ def lstmemory_group(input,
                                     gate_act=SigmoidActivation(),
                                     state_act=TanhActivation())
 
-    :param input: input layer.
+    :param input: Input layer.
     :type input: LayerOutput
-    :param size: lstmemory group size.
+    :param size: The lstmemory group size.
     :type size: int
-    :param name: name of lstmemory group.
+    :param name: The name of lstmemory group.
     :type name: basestring
-    :param out_memory: output of previous time step.
+    :param out_memory: The output of previous time step.
     :type out_memory: LayerOutput | None
-    :param reverse: process the input in a reverse order or not.
+    :param reverse: Process the input in a reverse order or not.
     :type reverse: bool
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
     :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activiation type of lstm.
     :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activiation type of lstm.
     :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activiation type of lstm.
     :type state_act: BaseActivation
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                           False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
     :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_layer_attr: lstm layer's extra attribute.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|True|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
     :type lstm_layer_attr: ExtraLayerAttribute
     :return: the lstmemory group.
     :rtype: LayerOutput
diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py
index 0c38a8dce553ec120cacc72edb604bfeb1819f93..f45616551bcd4822c668234c3afaf6aa35cd2953 100644
--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
@@ -15,8 +15,8 @@
 """
 
 __all__ = [
-    "BasePoolingType", "MaxPooling", "AvgPooling", "CudnnMaxPooling",
-    "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
+    "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
+    "CudnnMaxPooling", "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
 ]
 
 
@@ -55,6 +55,19 @@ class MaxPooling(BasePoolingType):
         self.output_max_index = output_max_index
 
 
+class MaxWithMaskPooling(BasePoolingType):
+    """
+    MaxWithMask pooling.
+
+    Not only return the very large values for each dimension in sequence or time steps,
+    but also the location indices of found maxinum values.
+
+    """
+
+    def __init__(self):
+        BasePoolingType.__init__(self, "max-pool-with-mask")
+
+
 class CudnnMaxPooling(BasePoolingType):
     """
     Cudnn max pooling only support GPU. Return the maxinum value in the
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
index 5ddf6052df021b055390a42c25ce6c0d650e4aee..b14121e82cb7d9516c4771fc896b9b3b9e01d1c8 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
@@ -28,6 +28,8 @@ layers {
       stride_y: 1
       output_y: 227
       img_size_y: 256
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
index c0252b945b4c7fd6b4dad8770e3e1dccb88df28a..c7a487a11231cba6182b654108773037bdb0ec35 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -28,6 +28,8 @@ layers {
       stride_y: 1
       output_y: 227
       img_size_y: 256
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
index fd5224ca55cd1f642ca2f927f867a7cbf8a47cf6..25ec6323751fae5778657945a765d8ca162ee2c4 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
@@ -28,6 +28,8 @@ layers {
       stride_y: 1
       output_y: 48
       img_size_y: 48
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
index 03f4f3a31d6c222d949f64341bb8ac4c2a56fc5a..39dc4871469785fbe667e43f1f0fb9da7a19e2d2 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
@@ -30,6 +30,8 @@ layers {
       stride_y: 1
       output_y: 48
       img_size_y: 48
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
@@ -105,6 +107,8 @@ layers {
       stride_y: 1
       output_y: 24
       img_size_y: 24
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_1__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
index 15c6ab4dc8e61dedc10acaa49db7d8ae136d4952..d5d6d31a17b84d8ddb4e555caca804f2f6c50992 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
@@ -30,6 +30,8 @@ layers {
       stride_y: 1
       output_y: 48
       img_size_y: 48
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
index f1bc65b3aee7488700a9d24e049adb510649c475..0ec88aa998cce91be4d0ca5430ad49aa4dc6aa63 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
@@ -36,6 +36,8 @@ layers {
       stride_y: 1
       output_y: 14
       img_size_y: 14
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
diff --git a/python/paddle/v2/framework/.gitignore b/python/paddle/v2/fluid/.gitignore
similarity index 100%
rename from python/paddle/v2/framework/.gitignore
rename to python/paddle/v2/fluid/.gitignore
diff --git a/python/paddle/v2/framework/__init__.py b/python/paddle/v2/fluid/__init__.py
similarity index 100%
rename from python/paddle/v2/framework/__init__.py
rename to python/paddle/v2/fluid/__init__.py
diff --git a/python/paddle/v2/framework/backward.py b/python/paddle/v2/fluid/backward.py
similarity index 97%
rename from python/paddle/v2/framework/backward.py
rename to python/paddle/v2/fluid/backward.py
index 678efd5d20585355a684bb2df16fdb57a69e0eeb..f188582178f667125ec95cd230100fdb10ce7e88 100644
--- a/python/paddle/v2/framework/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -1,4 +1,4 @@
-from paddle.v2.framework import framework as framework
+from paddle.v2.fluid import framework as framework
 
 __all__ = ['append_backward_ops']
 
diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/fluid/default_scope_funcs.py
similarity index 92%
rename from python/paddle/v2/framework/default_scope_funcs.py
rename to python/paddle/v2/fluid/default_scope_funcs.py
index c07f9a6ab96ac86fd6d20fbe0bc560845107f063..60c6165b6bd959f7bb3d92afed667f00f73f144f 100644
--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/fluid/default_scope_funcs.py
@@ -13,7 +13,7 @@ A `scoped_function` will take a `function` as input. That function will be
 invoked in a new local scope. 
 """
 
-import paddle.v2.framework.core
+import paddle.v2.fluid.core
 import threading
 
 __tl_scope__ = threading.local()
@@ -27,13 +27,13 @@ __all__ = [
 def get_cur_scope():
     """
     Get current scope.
-    :rtype: paddle.v2.framework.core.Scope
+    :rtype: paddle.v2.fluid.core.Scope
     """
     cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
     if cur_scope_stack is None:
         __tl_scope__.cur_scope = list()
     if len(__tl_scope__.cur_scope) == 0:
-        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope())
+        __tl_scope__.cur_scope.append(paddle.v2.fluid.core.Scope())
     return __tl_scope__.cur_scope[-1]
 
 
diff --git a/python/paddle/v2/framework/evaluator.py b/python/paddle/v2/fluid/evaluator.py
similarity index 94%
rename from python/paddle/v2/framework/evaluator.py
rename to python/paddle/v2/fluid/evaluator.py
index 254dd5f1a33eef17ad7a0117541255a4399ef23c..180d0135ffe8fa8982cfcde242033b5a69eed1cf 100644
--- a/python/paddle/v2/framework/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -1,6 +1,6 @@
-import paddle.v2.framework.op as op
+import paddle.v2.fluid.op as op
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 def avg_accumulate(accumulated_var, per_eval, num_batches, place):
@@ -22,7 +22,7 @@ class Evaluator(object):
         NOTE: default run on CPUPlace(), running on GPUPlace doesn't improve performance much.
 
         :param scope: the scope instance contains the input.
-        :type scope: paddle.v2.framework.core.scope
+        :type scope: paddle.v2.fluid.core.scope
         :param operator: operator name for caculating the evaluation for each mini-batch.
         :type operator: string
         :param input: output variable name of forward network.
diff --git a/python/paddle/v2/framework/executor.py b/python/paddle/v2/fluid/executor.py
similarity index 94%
rename from python/paddle/v2/framework/executor.py
rename to python/paddle/v2/fluid/executor.py
index f5c833190e73a277bef2509e02c4be051768933d..ed1c2c06daa7ede97e138049a1f7044d071c31e8 100644
--- a/python/paddle/v2/framework/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Block, Program, g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import Block, Program, g_main_program
 
 g_scope = core.Scope()
 
diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/fluid/framework.py
similarity index 99%
rename from python/paddle/v2/framework/framework.py
rename to python/paddle/v2/fluid/framework.py
index b9db2707c0705659260c04ab3412f429058a1316..e2587b4f74506c6eb2b253fa9b24db4838bfedbc 100644
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 import collections
 import numpy as np
 import copy
@@ -285,7 +285,7 @@ class Operator(object):
         self.desc.check_attrs()
         no_kernel_op_set = {
             'feed', 'fetch', 'save', 'load', 'recurrent',
-            'rnn_memory_helper_grad', 'while'
+            'rnn_memory_helper_grad', 'conditional_block', 'while'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)
diff --git a/python/paddle/v2/framework/initializer.py b/python/paddle/v2/fluid/initializer.py
similarity index 99%
rename from python/paddle/v2/framework/initializer.py
rename to python/paddle/v2/fluid/initializer.py
index 98a87bfa86efb39f381b9f99b2b1f0d7ec7d9833..ded144ecd5db83ce50ca0dc6243fdc52ac0b7a2f 100644
--- a/python/paddle/v2/framework/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.framework as framework
+import paddle.v2.fluid.framework as framework
 import numpy as np
 
 __all__ = [
diff --git a/python/paddle/v2/framework/io.py b/python/paddle/v2/fluid/io.py
similarity index 98%
rename from python/paddle/v2/framework/io.py
rename to python/paddle/v2/fluid/io.py
index 5c247904a330e25b1a9f53db431947840db3f615..394a171c67a99ffb0c7caaf71e850fe541f8286e 100644
--- a/python/paddle/v2/framework/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -1,7 +1,7 @@
 import os
 import cPickle as pickle
 
-from paddle.v2.framework.framework import Program, Parameter, g_main_program, \
+from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
     Variable
 
 __all__ = [
diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
similarity index 94%
rename from python/paddle/v2/framework/layer_helper.py
rename to python/paddle/v2/fluid/layer_helper.py
index c38346b79fecfb2f82a60b360c505da16ecdf3c0..9dc3c119ea47ca11956d85119ce1ec6d3d6bb7e8 100644
--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -1,10 +1,10 @@
 import copy
 import itertools
 
-from paddle.v2.framework.framework import Variable, g_main_program, \
+from paddle.v2.fluid.framework import Variable, g_main_program, \
     g_startup_program, unique_name, Program
-from paddle.v2.framework.initializer import ConstantInitializer, \
-    UniformInitializer
+from paddle.v2.fluid.initializer import ConstantInitializer, \
+    UniformInitializer, XavierInitializer
 
 
 class LayerHelper(object):
@@ -61,7 +61,7 @@ class LayerHelper(object):
 
     @property
     def param_attr(self):
-        default = {'name': None, 'initializer': UniformInitializer()}
+        default = {'name': None, 'initializer': XavierInitializer()}
         actual = self.kwargs.get('param_attr', None)
         if actual is None:
             actual = default
@@ -70,10 +70,11 @@ class LayerHelper(object):
                 actual[default_field] = default[default_field]
         return actual
 
+    @property
     def bias_attr(self):
-        default = {'name': None, 'initializer': ConstantInitializer()}
+        default = {'name': None, 'initializer': XavierInitializer()}
         bias_attr = self.kwargs.get('bias_attr', None)
-        if bias_attr is True:
+        if bias_attr is None:
             bias_attr = default
 
         if isinstance(bias_attr, dict):
@@ -166,7 +167,7 @@ class LayerHelper(object):
                 num_flatten_dims = 1
 
         size = list(input_var.shape[num_flatten_dims:])
-        bias_attr = self.bias_attr()
+        bias_attr = self.bias_attr
         if not bias_attr:
             return input_var
 
diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/fluid/layers.py
similarity index 83%
rename from python/paddle/v2/framework/layers.py
rename to python/paddle/v2/fluid/layers.py
index 9a1999243750aca62a4ef898ae979d273902b45c..8a1aa1c42d5a006539d221f96e3535434c9a4c43 100644
--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -1,22 +1,24 @@
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, \
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
+from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \
     Operator
-from paddle.v2.framework.initializer import ConstantInitializer, \
+from paddle.v2.fluid.initializer import ConstantInitializer, \
     NormalInitializer
-from paddle.v2.framework.layer_helper import LayerHelper, unique_name
+from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 import re
+import cStringIO
 
 __all__ = [
     'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
     'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim',
-    'batch_norm', 'accuracy'
+    'batch_norm', 'accuracy', 'split_lod_tensor'
 ]
 
 
 def fc(input,
        size,
        param_attr=None,
-       bias_attr=True,
+       bias_attr=None,
        name=None,
        act=None,
        num_flatten_dims=1,
@@ -125,6 +127,55 @@ def embedding(input,
     return tmp
 
 
+# TODO(qijun): expose H0 and C0
+def dynamic_lstm(input,
+                 size,
+                 data_type='float32',
+                 param_attr=None,
+                 bias_attr=None,
+                 use_peepholes=True,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 cell_activation='tanh',
+                 candidate_activation='tanh',
+                 main_program=None,
+                 startup_program=None):
+    helper = LayerHelper('lstm', **locals())
+    size = size / 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 4 * size], dtype=data_type)
+    bias_size = [1, 7 * size]
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=data_type, suffix='b')
+
+    hidden = helper.create_tmp_variable(data_type)
+    cell = helper.create_tmp_variable(data_type)
+    batch_gate = helper.create_tmp_variable(data_type)
+    batch_cell_pre_act = helper.create_tmp_variable(data_type)
+
+    helper.append_op(
+        type='lstm',
+        inputs={'Input': input,
+                'Weight': weight,
+                'Bias': bias},
+        outputs={
+            'Hidden': hidden,
+            'Cell': cell,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation
+        })
+    return hidden, cell
+
+
 def data(name,
          shape,
          data_type='float32',
@@ -175,6 +226,11 @@ def data(name,
         stop_gradient=stop_gradient)
 
 
+def create_tensor(dtype, name=None, main_program=None):
+    helper = LayerHelper("create_tensor", **locals())
+    return helper.create_variable(name=helper.name, dtype=dtype)
+
+
 def _convert_(name):
     """
     Formatting.
@@ -191,6 +247,58 @@ def _convert_(name):
     return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
 
 
+def _generate_doc_string_(op_proto):
+    """
+    Generate docstring by OpProto
+    
+    Args:
+        op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
+
+    Returns:
+        str: the document string
+    """
+
+    def _type_to_str_(tp):
+        return framework_pb2.AttrType.Name(tp)
+
+    if not isinstance(op_proto, framework_pb2.OpProto):
+        raise TypeError("OpProto should be `framework_pb2.OpProto`")
+
+    buf = cStringIO.StringIO()
+    buf.write(op_proto.comment)
+    buf.write('\nArgs:\n')
+    for each_input in op_proto.inputs:
+        line_begin = '    {0}: '.format(_convert_(each_input.name))
+        buf.write(line_begin)
+        buf.write(each_input.comment)
+        buf.write('\n')
+        buf.write(' ' * len(line_begin))
+        buf.write('Duplicable: ')
+        buf.write(str(each_input.duplicable))
+        buf.write('  Optional: ')
+        buf.write(str(each_input.dispensable))
+        buf.write('\n')
+
+    for each_attr in op_proto.attrs:
+        buf.write('    ')
+        buf.write(each_attr.name)
+        buf.write(' (')
+        buf.write(_type_to_str_(each_attr.type))
+        buf.write('): ')
+        buf.write(each_attr.comment)
+        buf.write('\n')
+
+    if len(op_proto.outputs) != 0:
+        buf.write('\nReturns:\n')
+        buf.write('    ')
+        for each_opt in op_proto.outputs:
+            if not each_opt.intermediate:
+                break
+        buf.write(each_opt.comment)
+
+    return buf.getvalue()
+
+
 def _create_op_func_(op_type):
     """
     Create an Operator for a Function.
@@ -249,11 +357,6 @@ def _create_op_func_(op_type):
         return dtype
 
     def func(**kwargs):
-        """
-        This function implements the function for the operator. This process
-        involves doing the sanity check (using the function above), reading
-        inputs from protobuf and applying the activations on top.
-        """
         helper = LayerHelper(op_type, **kwargs)
 
         dtype = infer_and_check_data_type(op_proto, **kwargs)
@@ -277,6 +380,7 @@ def _create_op_func_(op_type):
 
     func.__name__ = op_type
     globals()[op_type] = func
+    func.__doc__ = _generate_doc_string_(op_proto)
     global __all__
     __all__.append(op_type)
 
@@ -352,6 +456,56 @@ def sums(input, main_program=None, startup_program=None):
     return out
 
 
+def assign(input, output, main_program=None):
+    helper = LayerHelper('assign', **locals())
+    helper.append_op(
+        type='scale',
+        inputs={'X': [input]},
+        outputs={'Out': [output]},
+        attrs={'scale': 1.0})
+    return output
+
+
+def split_lod_tensor(input,
+                     mask,
+                     level,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('split_lod_tensor', **locals())
+    out_true = helper.create_tmp_variable(dtype=input.data_type)
+    out_false = helper.create_tmp_variable(dtype=input.data_type)
+    helper.append_op(
+        type='split_lod_tensor',
+        inputs={
+            'X': input,
+            'Mask': mask,
+        },
+        outputs={'OutTrue': out_true,
+                 'OutFalse': out_false},
+        attrs={'level': level})
+    return out_true, out_false
+
+
+def merge_lod_tensor(in_true,
+                     in_false,
+                     x,
+                     mask,
+                     level,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('merge_lod_tensor', **locals())
+    out = helper.create_tmp_variable(dtype=x.data_type)
+    helper.append_op(
+        type='merge_lod_tensor',
+        inputs={'X': x,
+                'Mask': mask,
+                'InTrue': in_true,
+                'InFalse': in_false},
+        outputs={'Out': out},
+        attrs={'level': level})
+    return out
+
+
 def cos_sim(X, Y, **kwargs):
     """
     This function performs the cosine similarity between two tensors
@@ -685,6 +839,23 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)
 
 
+def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+    helper = LayerHelper('beam_search_decode', **locals())
+    sentence_ids = helper.create_tmp_variable(dtype=ids.data_type)
+    sentence_scores = helper.create_tmp_variable(dtype=ids.data_type)
+
+    helper.append_op(
+        type="beam_search_decode",
+        inputs={"Ids": ids,
+                "Scores": scores},
+        outputs={
+            "SentenceIds": sentence_ids,
+            "SentenceScores": sentence_scores
+        })
+
+    return sentence_ids, sentence_scores
+
+
 class BlockGuard(object):
     """
     BlockGuard class.
@@ -1276,3 +1447,73 @@ def array_length(array, main_program=None):
     helper.append_op(
         type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
     return tmp
+
+
+class ConditionalBlockGuard(BlockGuard):
+    def __init__(self, block):
+        if not isinstance(block, ConditionalBlock):
+            raise TypeError("block should be conditional block")
+        super(ConditionalBlockGuard, self).__init__(block.helper.main_program)
+        self.block = block
+
+    def __enter__(self):
+        return super(ConditionalBlockGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.block.complete()
+        return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val,
+                                                           exc_tb)
+
+
+class ConditionalBlock(object):
+    def __init__(self, inputs, name=None, main_program=None):
+        for each_input in inputs:
+            if not isinstance(each_input, Variable):
+                raise TypeError("Each input should be variable")
+        self.inputs = inputs
+        self.helper = LayerHelper(
+            'conditional_block', name=name, main_program=main_program)
+
+    def block(self):
+        return ConditionalBlockGuard(self)
+
+    def complete(self):
+        inside_block = self.helper.main_program.current_block()
+        parent_block = self.helper.main_program.block(inside_block.parent_idx)
+
+        intermediate = set()
+        params = set()
+
+        for each_op in inside_block.ops:
+            assert isinstance(each_op, Operator)
+            for iname in each_op.input_names:
+                for in_var_name in each_op.input(iname):
+                    if in_var_name not in intermediate:
+                        params.add(in_var_name)
+
+            for oname in each_op.output_names:
+                for out_var_name in each_op.output(oname):
+                    intermediate.add(out_var_name)
+        input_set = set([ipt.name for ipt in self.inputs])
+
+        param_list = [
+            parent_block.var(each_name) for each_name in params
+            if each_name not in input_set
+        ]
+
+        out_list = [
+            parent_block.var(var_name) for var_name in parent_block.vars
+            if var_name not in intermediate
+        ]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+        parent_block.append_op(
+            type='conditional_block',
+            inputs={
+                'X': self.inputs,
+                'Params': param_list,
+            },
+            outputs={'Out': out_list,
+                     'Scope': [step_scope]},
+            attrs={'block': inside_block})
diff --git a/python/paddle/v2/framework/net_drawer.py b/python/paddle/v2/fluid/net_drawer.py
similarity index 96%
rename from python/paddle/v2/framework/net_drawer.py
rename to python/paddle/v2/fluid/net_drawer.py
index 045e267c253e2485e75df3fb95cc0e591ee29ea5..17ad547c2bb5b79ef8225dd1a8f1ef49a6572508 100644
--- a/python/paddle/v2/framework/net_drawer.py
+++ b/python/paddle/v2/fluid/net_drawer.py
@@ -3,8 +3,8 @@ import json
 import logging
 from collections import defaultdict
 
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
diff --git a/python/paddle/v2/framework/nets.py b/python/paddle/v2/fluid/nets.py
similarity index 98%
rename from python/paddle/v2/framework/nets.py
rename to python/paddle/v2/fluid/nets.py
index 725d2fa7f5e7a862eea0ef9172a9e63858ebd0dd..5e14ca594bc7965dc29039ba57bb7b26b1ce6871 100644
--- a/python/paddle/v2/framework/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.layers as layers
+import paddle.v2.fluid.layers as layers
 
 __all__ = ["simple_img_conv_pool", "sequence_conv_pool"]
 
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/fluid/op.py
similarity index 98%
rename from python/paddle/v2/framework/op.py
rename to python/paddle/v2/fluid/op.py
index bc771a964adf9f97cbeae87c06ce954c76051150..5828803497ec06bc7644da18ca752f61469ca53f 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/fluid/op.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 
 def get_all_op_protos():
diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/fluid/optimizer.py
similarity index 84%
rename from python/paddle/v2/framework/optimizer.py
rename to python/paddle/v2/fluid/optimizer.py
index 5b4cdecf2c4285618131657a09fbe437191ea75a..4252a6f08509fec92ac5c45d32169232e1dd190f 100644
--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -1,11 +1,11 @@
 from collections import defaultdict
 
-import paddle.v2.framework.framework as framework
-from paddle.v2.framework.framework import unique_name, Program
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.initializer import ConstantInitializer
-from paddle.v2.framework.regularizer import append_regularization_ops
-from paddle.v2.framework.layer_helper import LayerHelper
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.framework import unique_name, Program
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.initializer import ConstantInitializer
+from paddle.v2.fluid.regularizer import append_regularization_ops
+from paddle.v2.fluid.layer_helper import LayerHelper
 
 __all__ = [
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
@@ -35,15 +35,21 @@ class Optimizer(object):
         """
         raise NotImplementedError()
 
-    def _initialize_tensors(self, block):
-        """Create all necessary tensors, that will be shared for all parameter updates.
-
-        Tensors like learning rate should be initialized here.
-
-        Args:
-            block: the block in which the loss variable is present
-        """
-        pass
+    def _create_param_lr(self, param_and_grad):
+        # create learning rate variable for every parameter
+        param = param_and_grad[0]
+        param_lr = param.optimize_attr['learning_rate']
+        param_lr_shape = [1]
+        param_lr_var = self.helper.create_global_variable(
+            name=unique_name("learning_rate"),
+            dtype='float32',
+            shape=param_lr_shape,
+            lod_level=1,
+            persistable=True)
+        param_lr = param_lr * self._learning_rate
+        self.helper.set_variable_initializer(
+            var=param_lr_var, initializer=ConstantInitializer(param_lr))
+        return param_lr_var
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -161,8 +167,6 @@ class Optimizer(object):
             startup_program=startup_program)
         self._create_accumulators(loss.block,
                                   [p[0] for p in parameters_and_grads])
-        # Create any necessary tensors
-        self._initialize_tensors(loss.block)
 
         optimize_ops = []
         for param_and_grad in parameters_and_grads:
@@ -214,27 +218,16 @@ class SGDOptimizer(Optimizer):
         self.type = "sgd"
         self._learning_rate = learning_rate
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
+
         # create the optimize op
         sgd_op = block.append_op(
             type=self.type,
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
             },
             outputs={"ParamOut": param_and_grad[0]})
 
@@ -259,19 +252,6 @@ class MomentumOptimizer(Optimizer):
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
 
-    def _initialize_tensors(self, block):
-        assert isinstance(block, framework.Block)
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
@@ -290,7 +270,7 @@ class MomentumOptimizer(Optimizer):
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
                 "Velocity": velocity_acc,
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -315,18 +295,6 @@ class AdagradOptimizer(Optimizer):
         self._learning_rate = learning_rate
         self._epsilon = epsilon
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
@@ -346,7 +314,7 @@ class AdagradOptimizer(Optimizer):
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
                 "Moment": moment_acc,
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
             },
             outputs={"ParamOut": param_and_grad[0],
                      "MomentOut": moment_acc},
@@ -378,18 +346,6 @@ class AdamOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
@@ -433,7 +389,7 @@ class AdamOptimizer(Optimizer):
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": self._lr,
+                "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment1": moment1,
                 "Moment2": moment2,
                 "Beta1Pow": self._beta1_pow_acc,
@@ -495,18 +451,6 @@ class AdamaxOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         # Create beta1 power accumulator tensor
         beta_shape = [1]
@@ -536,7 +480,7 @@ class AdamaxOptimizer(Optimizer):
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": self._lr,
+                "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment": moment,
                 "InfNorm": inf_norm,
                 "Beta1Pow": self._beta1_pow_acc
diff --git a/python/paddle/v2/framework/regularizer.py b/python/paddle/v2/fluid/regularizer.py
similarity index 98%
rename from python/paddle/v2/framework/regularizer.py
rename to python/paddle/v2/fluid/regularizer.py
index 5111ac5566feb7d334ff4cd8e70daa0cfbd6e552..098cd0dd6439554f49e429ab75fb11bfa2c9d28c 100644
--- a/python/paddle/v2/framework/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.framework as framework
+import paddle.v2.fluid.framework as framework
 
 __all__ = [
     'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer'
diff --git a/python/paddle/v2/framework/tests/.gitignore b/python/paddle/v2/fluid/tests/.gitignore
similarity index 100%
rename from python/paddle/v2/framework/tests/.gitignore
rename to python/paddle/v2/fluid/tests/.gitignore
diff --git a/python/paddle/v2/fluid/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e795627bfe9e8ad0c196349a332e62e975f20aa3
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
+
+add_subdirectory(book)
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
similarity index 100%
rename from python/paddle/v2/framework/tests/CMakeLists.txt
rename to python/paddle/v2/fluid/tests/book/CMakeLists.txt
diff --git a/python/paddle/v2/framework/tests/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
similarity index 87%
rename from python/paddle/v2/framework/tests/test_fit_a_line.py
rename to python/paddle/v2/fluid/tests/book/test_fit_a_line.py
index 6e09b88dca34de2579131e7bdc16b26cf6cde49c..5ef963bffa4e4fa3992e1f811d7f514662809410 100644
--- a/python/paddle/v2/framework/tests/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.io import save_persistables, load_persistables
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.io import save_persistables, load_persistables
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_image_classification_train.py
rename to python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index a4165da9703c55ae3347123409407f0cae30856f..e253b8d27fd29746b41d82a63b11485032e77ebb 100644
--- a/python/paddle/v2/framework/tests/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -1,12 +1,12 @@
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.framework.core as core
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_startup_program, g_main_program
-from paddle.v2.framework.initializer import XavierInitializer
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import g_startup_program, g_main_program
+from paddle.v2.fluid.initializer import XavierInitializer
 
 
 def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
similarity index 90%
rename from python/paddle/v2/framework/tests/test_recognize_digits_conv.py
rename to python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
index 66c629eb4261a9b971f25611d8e49f0cb671304a..2b723125412c17f3805ee3cae046b0788aa34997 100644
--- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
similarity index 88%
rename from python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
rename to python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
index 076cf882160cd53f45ef291d82ba57ada843a287..2e1a9f236b6621c7334a9eb04272a6eb69c86fab 100644
--- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -1,12 +1,12 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.regularizer import L2DecayRegularizer
-from paddle.v2.framework.initializer import UniformInitializer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.regularizer import L2DecayRegularizer
+from paddle.v2.fluid.initializer import UniformInitializer
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_recommender_system.py
rename to python/paddle/v2/fluid/tests/book/test_recommender_system.py
index 31562b4391d16b831d53801cfa21c7bdf8c3ab8d..4708dfe3e9209a3254a9e1903cbedf07ebc5d2d0 100644
--- a/python/paddle/v2/framework/tests/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
similarity index 90%
rename from python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
rename to python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
index eb377e9264b6031e9bf484a90b7c2b39442407f1..dc4b63da9b37aff55fc6362f239e3e61004a3866 100644
--- a/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program, g_main_program, g_startup_program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program, g_main_program, g_startup_program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d507f4c8e39ba039603a5b7618e7a82d1dcb21b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
@@ -0,0 +1,110 @@
+import paddle.v2 as paddle
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+
+from paddle.v2.fluid.framework import Program, g_main_program, g_startup_program
+from paddle.v2.fluid.executor import Executor
+
+import numpy as np
+
+
+def stacked_lstm_net(input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    assert stacked_num % 2 == 1
+    data = layers.data(name="words", shape=[1], data_type="int64")
+    label = layers.data(name="label", shape=[1], data_type="int64")
+
+    emb = layers.embedding(input=data, size=[input_dim, emb_dim])
+    # add bias attr
+
+    # TODO(qijun) linear act
+    fc1 = layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+    inputs = [fc1, lstm1]
+
+    for i in range(2, stacked_num + 1):
+        fc = layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        inputs = [fc, lstm]
+
+    fc_last = layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = layers.sequence_pool(input=inputs[1], pool_type='max')
+
+    prediction = layers.fc(input=[fc_last, lstm_last],
+                           size=class_dim,
+                           act='softmax')
+    cost = layers.cross_entropy(input=prediction, label=label)
+    avg_cost = layers.mean(x=cost)
+    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
+    opts = adam_optimizer.minimize(avg_cost)
+    acc = layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    print "load word dict successfully"
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    cost, acc = stacked_lstm_net(input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(g_startup_program)
+
+    for pass_id in xrange(PASS_NUM):
+        for data in train_data():
+            tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
+
+            label = np.array(map(lambda x: x[1], data)).astype("int64")
+            label = label.reshape([BATCH_SIZE, 1])
+
+            tensor_label = core.LoDTensor()
+            tensor_label.set(label, place)
+
+            outs = exe.run(g_main_program,
+                           feed={"words": tensor_words,
+                                 "label": tensor_label},
+                           fetch_list=[cost, acc])
+            cost_val = np.array(outs[0])
+            acc_val = np.array(outs[1])
+
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+            if cost_val < 1.0 and acc_val > 0.7:
+                exit(0)
+    exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
rename to python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
index 26cbd01bc04916e53554e6f70bee7bcf25d6371c..848dcce974a107402c33013e9f84211fd4979e21 100644
--- a/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -1,10 +1,10 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import g_main_program, g_startup_program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import g_main_program, g_startup_program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_word2vec.py
rename to python/paddle/v2/fluid/tests/book/test_word2vec.py
index cb9fc2ab62b56348db7a320f7d40d2f0a7bf9d21..054dbd5a3d090ba8a08e8f101de11c69ddd36d8a 100644
--- a/python/paddle/v2/framework/tests/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -1,10 +1,10 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
similarity index 98%
rename from python/paddle/v2/framework/tests/op_test.py
rename to python/paddle/v2/fluid/tests/op_test.py
index 4a269341a4be6c1b72fde5166b7dd089236700b8..90269e308a31d2606b23d741ce0d0fa91a0a6aeb 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -2,12 +2,12 @@ import unittest
 import numpy as np
 import random
 import itertools
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import collections
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.op import Operator
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import Program, OpProtoHolder
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import Program, OpProtoHolder
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
diff --git a/python/paddle/v2/framework/tests/test_accuracy_op.py b/python/paddle/v2/fluid/tests/test_accuracy_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_accuracy_op.py
rename to python/paddle/v2/fluid/tests/test_accuracy_op.py
diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_activation_op.py
rename to python/paddle/v2/fluid/tests/test_activation_op.py
diff --git a/python/paddle/v2/framework/tests/test_adadelta_op.py b/python/paddle/v2/fluid/tests/test_adadelta_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adadelta_op.py
rename to python/paddle/v2/fluid/tests/test_adadelta_op.py
diff --git a/python/paddle/v2/framework/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adagrad_op.py
rename to python/paddle/v2/fluid/tests/test_adagrad_op.py
diff --git a/python/paddle/v2/framework/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adam_op.py
rename to python/paddle/v2/fluid/tests/test_adam_op.py
diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/fluid/tests/test_adamax_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adamax_op.py
rename to python/paddle/v2/fluid/tests/test_adamax_op.py
diff --git a/python/paddle/v2/framework/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
similarity index 91%
rename from python/paddle/v2/framework/tests/test_array_read_write_op.py
rename to python/paddle/v2/fluid/tests/test_array_read_write_op.py
index 79e9938216e2abda5432e525804b0bcb9a655655..e019a4e15f0e25deaedf30911b44e576c8f89013 100644
--- a/python/paddle/v2/framework/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -1,9 +1,9 @@
 import unittest
-import paddle.v2.framework.core as core
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.framework import g_main_program
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.framework import g_main_program
 import numpy
 
 
diff --git a/python/paddle/v2/fluid/tests/test_assign_op.py b/python/paddle/v2/fluid/tests/test_assign_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b0c145f1a69678b228bc70e4e4e273f5bcf9888
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_assign_op.py
@@ -0,0 +1,21 @@
+import op_test
+import numpy
+import unittest
+
+
+class TestAssignOp(op_test.OpTest):
+    def setUp(self):
+        self.op_type = "assign"
+        x = numpy.random.random(size=(100, 10))
+        self.inputs = {'X': x}
+        self.outputs = {'Out': x}
+
+    def test_forward(self):
+        self.check_output()
+
+    def test_backward(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_auc_op.py b/python/paddle/v2/fluid/tests/test_auc_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_auc_op.py
rename to python/paddle/v2/fluid/tests/test_auc_op.py
diff --git a/python/paddle/v2/framework/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
similarity index 99%
rename from python/paddle/v2/framework/tests/test_batch_norm_op.py
rename to python/paddle/v2/fluid/tests/test_batch_norm_op.py
index dee339f43c2ee33fc8a691e0915bddf2c1679285..71f9599e0de83c86808f7e62547f80d3d50ffc7d 100644
--- a/python/paddle/v2/framework/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -1,8 +1,8 @@
 import unittest
 import numpy as np
 from op_test import OpTest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 
 
 def grad_var_name(var_name):
diff --git a/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..080ca43b8269e0f6a9f4d0ce3973f4d4a07a8e2a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
@@ -0,0 +1,37 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestBilinearTensorProductOp(OpTest):
+    def setUp(self):
+        self.op_type = "bilinear_tensor_product"
+        batch_size = 6
+        size0 = 3
+        size1 = 4
+        size2 = 5
+        a = np.random.random((batch_size, size0)).astype("float32")
+        b = np.random.random((batch_size, size1)).astype("float32")
+        w = np.random.random((size2, size0, size1)).astype("float32")
+        bias = np.random.random((1, size2)).astype("float32")
+        output = np.zeros((batch_size, size2)).astype("float32")
+        for i in range(size2):
+            w_i = w[i, :, :]
+            output[:, i] = np.sum(np.matmul(a, w_i) * b, axis=1)
+        self.inputs = {
+            'X': a,
+            'Y': b,
+            'Weight': w,
+            'Bias': bias,
+        }
+        self.outputs = {'Out': output + bias}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y', 'Weight', 'Bias'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_cast_op.py b/python/paddle/v2/fluid/tests/test_cast_op.py
similarity index 93%
rename from python/paddle/v2/framework/tests/test_cast_op.py
rename to python/paddle/v2/fluid/tests/test_cast_op.py
index 52ee71a8a4058a1367d9e493e02d8f2469ccfc9f..0c4b6310652e84d3dd7f281a8b98ae0435072afb 100644
--- a/python/paddle/v2/framework/tests/test_cast_op.py
+++ b/python/paddle/v2/fluid/tests/test_cast_op.py
@@ -1,7 +1,7 @@
 import op_test
 import unittest
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestCastOp(op_test.OpTest):
diff --git a/python/paddle/v2/framework/tests/test_chunk_eval_op.py b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_chunk_eval_op.py
rename to python/paddle/v2/fluid/tests/test_chunk_eval_op.py
diff --git a/python/paddle/v2/framework/tests/test_clip_by_norm_op.py b/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_clip_by_norm_op.py
rename to python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
diff --git a/python/paddle/v2/framework/tests/test_clip_op.py b/python/paddle/v2/fluid/tests/test_clip_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_clip_op.py
rename to python/paddle/v2/fluid/tests/test_clip_op.py
diff --git a/python/paddle/v2/framework/tests/test_compare_op.py b/python/paddle/v2/fluid/tests/test_compare_op.py
similarity index 79%
rename from python/paddle/v2/framework/tests/test_compare_op.py
rename to python/paddle/v2/fluid/tests/test_compare_op.py
index bb0256694d77323f12c50856533e93b090dc6198..5d0dfab6ffd1cbbbfbcdb3af60f1868b7b780456 100644
--- a/python/paddle/v2/framework/tests/test_compare_op.py
+++ b/python/paddle/v2/fluid/tests/test_compare_op.py
@@ -23,6 +23,9 @@ def create_test_class(op_type, typename, callback):
 
 for _type_name in {'float32', 'float64', 'int32', 'int64'}:
     create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
+    create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b)
+    create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b)
+    create_test_class('greater_equal', _type_name, lambda _a, _b: _a >= _b)
     create_test_class('equal', _type_name, lambda _a, _b: _a == _b)
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/framework/tests/test_concat_op.py b/python/paddle/v2/fluid/tests/test_concat_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_concat_op.py
rename to python/paddle/v2/fluid/tests/test_concat_op.py
diff --git a/python/paddle/v2/framework/tests/test_cond_op.py b/python/paddle/v2/fluid/tests/test_cond_op.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_cond_op.py
rename to python/paddle/v2/fluid/tests/test_cond_op.py
index 09a3f5dc97c342fc61cd407bb338c1696e8d6c76..9d1df44b9065f8101e90b87815660f8c0818645f 100644
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/fluid/tests/test_cond_op.py
@@ -1,8 +1,8 @@
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
-from paddle.v2.framework.op import Operator, CondOp
+from paddle.v2.fluid.op import Operator, CondOp
 
 
 class PySimpleCond(object):
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..293803f004a1513611fba30634d5552e1da84fef
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -0,0 +1,40 @@
+import unittest
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import g_startup_program, g_main_program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+import numpy
+
+
+class ConditionalBlock(unittest.TestCase):
+    def test_forward(self):
+        data = layers.data(name='X', shape=[1], data_type='float32')
+        data.stop_gradient = False
+        cond = layers.ConditionalBlock(inputs=[data])
+        out = layers.create_tensor(dtype='float32')
+        with cond.block():
+            hidden = layers.fc(input=data, size=10)
+            layers.assign(hidden, out)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        exe.run(g_startup_program)
+
+        x = core.LoDTensor()
+        x.set(numpy.random.random(size=(10, 1)).astype('float32'), cpu)
+
+        outs = map(numpy.array, exe.run(feed={'X': x}, fetch_list=[out]))[0]
+        print outs
+        loss = layers.mean(x=out)
+        append_backward_ops(loss=loss)
+        outs = map(numpy.array,
+                   exe.run(feed={'X': x},
+                           fetch_list=[
+                               g_main_program.block(0).var(data.name + "@GRAD")
+                           ]))[0]
+        print outs
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv2d_op.py
rename to python/paddle/v2/fluid/tests/test_conv2d_op.py
diff --git a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
rename to python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
diff --git a/python/paddle/v2/framework/tests/test_conv3d_op.py b/python/paddle/v2/fluid/tests/test_conv3d_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv3d_op.py
rename to python/paddle/v2/fluid/tests/test_conv3d_op.py
diff --git a/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
rename to python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
diff --git a/python/paddle/v2/framework/tests/test_conv_shift_op.py b/python/paddle/v2/fluid/tests/test_conv_shift_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv_shift_op.py
rename to python/paddle/v2/fluid/tests/test_conv_shift_op.py
diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/fluid/tests/test_cos_sim_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_cos_sim_op.py
rename to python/paddle/v2/fluid/tests/test_cos_sim_op.py
diff --git a/python/paddle/v2/fluid/tests/test_create_op_doc_string.py b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
new file mode 100644
index 0000000000000000000000000000000000000000..42b6f7a3616bbce53a8cae68a5fc1eda411a7422
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
@@ -0,0 +1,11 @@
+import unittest
+import paddle.v2.fluid.layers as layers
+
+
+class TestDocString(unittest.TestCase):
+    def test_layer_doc_string(self):
+        print layers.dropout.__doc__
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_crf_decoding_op.py b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_crf_decoding_op.py
rename to python/paddle/v2/fluid/tests/test_crf_decoding_op.py
diff --git a/python/paddle/v2/framework/tests/test_crop_op.py b/python/paddle/v2/fluid/tests/test_crop_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_crop_op.py
rename to python/paddle/v2/fluid/tests/test_crop_op.py
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_cross_entropy_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_cross_entropy_op.py
rename to python/paddle/v2/fluid/tests/test_cross_entropy_op.py
diff --git a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py b/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
rename to python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
similarity index 94%
rename from python/paddle/v2/framework/tests/test_default_scope_funcs.py
rename to python/paddle/v2/fluid/tests/test_default_scope_funcs.py
index 09a9850d054e3d7e6bf6db363fc577bdff8e9f43..738e69529ea447e87516d5e0efc098910b966ded 100644
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
@@ -1,4 +1,4 @@
-from paddle.v2.framework.default_scope_funcs import *
+from paddle.v2.fluid.default_scope_funcs import *
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_dropout_op.py
rename to python/paddle/v2/fluid/tests/test_dropout_op.py
diff --git a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
rename to python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
index 70af9dbc49f5ff3222cf3d549a110931140b43c4..c2d8b48ea944ae40a451492b8e9fad38dda0835c 100644
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
@@ -1,7 +1,7 @@
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
-from paddle.v2.framework.op import Operator, DynamicRecurrentOp
+from paddle.v2.fluid.op import Operator, DynamicRecurrentOp
 import numpy as np
 
 # for siplicity, just one level LoD
diff --git a/python/paddle/v2/framework/tests/test_elementwise_add_op.py b/python/paddle/v2/fluid/tests/test_elementwise_add_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_add_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_add_op.py
diff --git a/python/paddle/v2/framework/tests/test_elementwise_div_op.py b/python/paddle/v2/fluid/tests/test_elementwise_div_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_div_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_div_op.py
diff --git a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py b/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_mul_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
diff --git a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py b/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_sub_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
diff --git a/python/paddle/v2/framework/tests/test_evaluator.py b/python/paddle/v2/fluid/tests/test_evaluator.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_evaluator.py
rename to python/paddle/v2/fluid/tests/test_evaluator.py
index 37dbfbc06bcd0da7e11924a048679c74a1cfb373..1d51205b703f83ec32c2e948394e5d3f5c87d1d9 100644
--- a/python/paddle/v2/framework/tests/test_evaluator.py
+++ b/python/paddle/v2/fluid/tests/test_evaluator.py
@@ -1,6 +1,6 @@
-from paddle.v2.framework.evaluator import Evaluator
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.evaluator import Evaluator
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 import unittest
 import op_test
 import numpy as np
diff --git a/python/paddle/v2/framework/tests/test_exception.py b/python/paddle/v2/fluid/tests/test_exception.py
similarity index 89%
rename from python/paddle/v2/framework/tests/test_exception.py
rename to python/paddle/v2/fluid/tests/test_exception.py
index 5ae048817cfcc1ec85e0d0e0c5db749da4521012..b871f40c4a07ae2db7559e5a0f15664b21e94402 100644
--- a/python/paddle/v2/framework/tests/test_exception.py
+++ b/python/paddle/v2/fluid/tests/test_exception.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_executor_and_mul.py b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
similarity index 83%
rename from python/paddle/v2/framework/tests/test_executor_and_mul.py
rename to python/paddle/v2/fluid/tests/test_executor_and_mul.py
index c885cfbebd4b665ddf50adbc43673942dc949a0b..709250d0c86dde84ac22c37d8e2385ca4a80a40a 100644
--- a/python/paddle/v2/framework/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
@@ -1,8 +1,8 @@
 import unittest
-from paddle.v2.framework.layers import mul, data
-import paddle.v2.framework.core as core
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_main_program
+from paddle.v2.fluid.layers import mul, data
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import g_main_program
 import numpy
 
 
diff --git a/python/paddle/v2/fluid/tests/test_expand_op.py b/python/paddle/v2/fluid/tests/test_expand_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440f7a2bb159bab4923683b5d0980e59e0a69c9
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_expand_op.py
@@ -0,0 +1,97 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestExpandOpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random(12).astype("float32")}
+        self.attrs = {'expand_times': [2]}
+        output = np.tile(self.inputs['X'], 2)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank2_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank2(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [2, 3]}
+        output = np.tile(self.inputs['X'], (2, 3))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank4(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5, 7)).astype("float32")}
+        self.attrs = {'expand_times': [3, 2, 1, 2]}
+        output = np.tile(self.inputs['X'], (3, 2, 1, 2))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_feed_fetch_method.py b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_feed_fetch_method.py
rename to python/paddle/v2/fluid/tests/test_feed_fetch_method.py
index fbd659ece0188140e197982ea818d7c3897daf4e..178c85b0dd50df61b1fd35ef5d53ebbf39445cb4 100644
--- a/python/paddle/v2/framework/tests/test_feed_fetch_method.py
+++ b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
rename to python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
diff --git a/python/paddle/v2/framework/tests/test_fill_constant_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_fill_constant_op.py
rename to python/paddle/v2/fluid/tests/test_fill_constant_op.py
diff --git a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
rename to python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
diff --git a/python/paddle/v2/framework/tests/test_framework_debug_str.py b/python/paddle/v2/fluid/tests/test_framework_debug_str.py
similarity index 85%
rename from python/paddle/v2/framework/tests/test_framework_debug_str.py
rename to python/paddle/v2/fluid/tests/test_framework_debug_str.py
index 8fdf8f91171ee334fac93c05a4d49056fa0e803d..a4cbabdb36362c4ca14b76f366b648d6dbdbf7b3 100644
--- a/python/paddle/v2/framework/tests/test_framework_debug_str.py
+++ b/python/paddle/v2/fluid/tests/test_framework_debug_str.py
@@ -1,5 +1,5 @@
 import unittest
-from paddle.v2.framework.framework import Program
+from paddle.v2.fluid.framework import Program
 
 
 class TestDebugStringFramework(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_gather_op.py b/python/paddle/v2/fluid/tests/test_gather_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_gather_op.py
rename to python/paddle/v2/fluid/tests/test_gather_op.py
diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
similarity index 91%
rename from python/paddle/v2/framework/tests/test_gaussian_random_op.py
rename to python/paddle/v2/fluid/tests/test_gaussian_random_op.py
index 0dc7e091a5c8dd046f36cab7f79a15b2281cdd90..627ab4e23562f14538d85f2e21edeb7d72d940bb 100644
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
@@ -1,6 +1,6 @@
 import unittest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_gru_op.py b/python/paddle/v2/fluid/tests/test_gru_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_gru_op.py
rename to python/paddle/v2/fluid/tests/test_gru_op.py
diff --git a/python/paddle/v2/framework/tests/test_gru_unit_op.py b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_gru_unit_op.py
rename to python/paddle/v2/fluid/tests/test_gru_unit_op.py
diff --git a/python/paddle/v2/framework/tests/test_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_huber_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_huber_loss_op.py
rename to python/paddle/v2/fluid/tests/test_huber_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_image_classification_layer.py
rename to python/paddle/v2/fluid/tests/test_image_classification_layer.py
index b1a267ec32b1c937b946bee82e41b846ebbf1288..bf5444107fa1609e67b09823b82e5fb92234b0a4 100644
--- a/python/paddle/v2/framework/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -1,8 +1,8 @@
 import unittest
 
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-from paddle.v2.framework.framework import Program
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program
 
 
 def conv_block(input,
diff --git a/python/paddle/v2/framework/tests/test_infer_shape.py b/python/paddle/v2/fluid/tests/test_infer_shape.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_infer_shape.py
rename to python/paddle/v2/fluid/tests/test_infer_shape.py
index 2b2995f5e22d8c50d67498688c069252bf6e02fc..9f6695ce02de749178046fbb613a58ba591b3dbc 100644
--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/fluid/tests/test_infer_shape.py
@@ -1,6 +1,6 @@
 import unittest
 
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestInferShape(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_inference_model_io.py b/python/paddle/v2/fluid/tests/test_inference_model_io.py
similarity index 90%
rename from python/paddle/v2/framework/tests/test_inference_model_io.py
rename to python/paddle/v2/fluid/tests/test_inference_model_io.py
index 48984f86a1864baade58aeb8e35c6065cc2a4bbb..98b95713b73e8eba93bd6a58eaaed603cfae7952 100644
--- a/python/paddle/v2/framework/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.io import save_inference_model, load_inference_model
-import paddle.v2.framework.executor as executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.io import save_inference_model, load_inference_model
+import paddle.v2.fluid.executor as executor
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_initializer.py b/python/paddle/v2/fluid/tests/test_initializer.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_initializer.py
rename to python/paddle/v2/fluid/tests/test_initializer.py
index bd4d2e39d770aebb7468d516f463533185ea8680..f2eb79b209627f5814847db6d96c0a17300d9b5a 100644
--- a/python/paddle/v2/framework/tests/test_initializer.py
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
@@ -1,8 +1,8 @@
 import numpy as np
 import unittest
 
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.initializer as initializer
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.initializer as initializer
 
 DELTA = 0.00001
 
diff --git a/python/paddle/v2/framework/tests/test_l1_norm_op.py b/python/paddle/v2/fluid/tests/test_l1_norm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_l1_norm_op.py
rename to python/paddle/v2/fluid/tests/test_l1_norm_op.py
diff --git a/python/paddle/v2/framework/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_layers.py
rename to python/paddle/v2/fluid/tests/test_layers.py
index b42af5ea45d54723e96279f9e16f82a1d52ad236..3d18e7ce3a4dc6c6b917a1000de39fca71f6ac18 100644
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -1,7 +1,7 @@
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-from paddle.v2.framework.framework import Program
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program
+import paddle.v2.fluid.core as core
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
rename to python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
diff --git a/python/paddle/v2/framework/tests/test_lod_array_length_op.py b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
similarity index 79%
rename from python/paddle/v2/framework/tests/test_lod_array_length_op.py
rename to python/paddle/v2/fluid/tests/test_lod_array_length_op.py
index af2b4d705e7ec121bd5f1350f0a642ae8c44bf1e..a01ae83772185df218b8c453557dc0cac719673b 100644
--- a/python/paddle/v2/framework/tests/test_lod_array_length_op.py
+++ b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
@@ -1,7 +1,7 @@
 import unittest
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.executor import Executor
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
similarity index 78%
rename from python/paddle/v2/framework/tests/test_lod_rank_table.py
rename to python/paddle/v2/fluid/tests/test_lod_rank_table.py
index 408145c10f46e24e8a54b05b4f3afa9231b6ffd6..bbc11930b9e804c2769cc590c298c6e90dc36ca6 100644
--- a/python/paddle/v2/framework/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
@@ -1,7 +1,7 @@
-from paddle.v2.framework.layers import lod_rank_table, data
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_main_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.layers import lod_rank_table, data
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import g_main_program
+import paddle.v2.fluid.core as core
 import numpy
 import unittest
 
diff --git a/python/paddle/v2/fluid/tests/test_lod_reset_op.py b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..652ccecfa443fc95f08f52df766709cb550f4049
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
@@ -0,0 +1,64 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLodResetOpByAttr(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0 = [0, 7, 10]
+        self.inputs = {'X': (x, lod)}
+        self.attrs = {'target_lod': target_lod_0}
+        self.outputs = {'Out': (x, [target_lod_0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestLodResetOpByInput(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0 = [0, 4, 7, 10]
+        self.inputs = {
+            'X': (x, lod),
+            'TargetLoD': np.array([target_lod_0]).astype('int32')
+        }
+        self.outputs = {'Out': (x, [target_lod_0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+
+
+class TestLodResetOpBoth(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0_attr = [0, 7, 10]
+        target_lod_0_in = [0, 4, 7, 10]
+        self.inputs = {
+            'X': (x, lod),
+            'TargetLoD': np.array(target_lod_0_in).astype('int32')
+        }
+        self.attrs = {'target_lod': target_lod_0_attr}
+        self.outputs = {'Out': (x, [target_lod_0_in])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lod_tensor_array.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_lod_tensor_array.py
rename to python/paddle/v2/fluid/tests/test_lod_tensor_array.py
index a433bcf622b14a1d2d33b5b98d555e1a21e4b9e8..d6d3e23fd8898a62528d63795d1bff1b72752477 100644
--- a/python/paddle/v2/framework/tests/test_lod_tensor_array.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
@@ -1,5 +1,5 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
rename to python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
index e9713666b3f64d7a39afadab7da6b22f149b8cf8..b18cb6b49fa41f26e1b6de1128690507c5a2f099 100644
--- a/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -1,10 +1,10 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import numpy
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_lookup_table_op.py b/python/paddle/v2/fluid/tests/test_lookup_table_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lookup_table_op.py
rename to python/paddle/v2/fluid/tests/test_lookup_table_op.py
diff --git a/python/paddle/v2/framework/tests/test_lrn_op.py b/python/paddle/v2/fluid/tests/test_lrn_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lrn_op.py
rename to python/paddle/v2/fluid/tests/test_lrn_op.py
diff --git a/python/paddle/v2/framework/tests/test_lstm_op.py b/python/paddle/v2/fluid/tests/test_lstm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lstm_op.py
rename to python/paddle/v2/fluid/tests/test_lstm_op.py
diff --git a/python/paddle/v2/framework/tests/test_lstm_unit_op.py b/python/paddle/v2/fluid/tests/test_lstm_unit_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lstm_unit_op.py
rename to python/paddle/v2/fluid/tests/test_lstm_unit_op.py
diff --git a/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
rename to python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_matmul_op.py b/python/paddle/v2/fluid/tests/test_matmul_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_matmul_op.py
rename to python/paddle/v2/fluid/tests/test_matmul_op.py
diff --git a/python/paddle/v2/framework/tests/test_mean_op.py b/python/paddle/v2/fluid/tests/test_mean_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_mean_op.py
rename to python/paddle/v2/fluid/tests/test_mean_op.py
diff --git a/python/paddle/v2/framework/tests/test_minus_op.py b/python/paddle/v2/fluid/tests/test_minus_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_minus_op.py
rename to python/paddle/v2/fluid/tests/test_minus_op.py
diff --git a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
rename to python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_momentum_op.py b/python/paddle/v2/fluid/tests/test_momentum_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_momentum_op.py
rename to python/paddle/v2/fluid/tests/test_momentum_op.py
diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/fluid/tests/test_mul_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_mul_op.py
rename to python/paddle/v2/fluid/tests/test_mul_op.py
diff --git a/python/paddle/v2/framework/tests/test_multiplex_op.py b/python/paddle/v2/fluid/tests/test_multiplex_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_multiplex_op.py
rename to python/paddle/v2/fluid/tests/test_multiplex_op.py
diff --git a/python/paddle/v2/framework/tests/test_nccl_init_op.py b/python/paddle/v2/fluid/tests/test_nccl_init_op.py
similarity index 91%
rename from python/paddle/v2/framework/tests/test_nccl_init_op.py
rename to python/paddle/v2/fluid/tests/test_nccl_init_op.py
index 054909fdf5517a68c6a07971c65a1d5bdc20d4fa..a536800ccd81fdc2f3b7c8320cede4f8ecf3a8cb 100644
--- a/python/paddle/v2/framework/tests/test_nccl_init_op.py
+++ b/python/paddle/v2/fluid/tests/test_nccl_init_op.py
@@ -1,8 +1,8 @@
 import unittest, os
 import numpy as np
 import paddle.v2 as paddle
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 from op_test import OpTest, create_op, set_input
 
 if not core.is_compile_gpu():
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/fluid/tests/test_net.py
similarity index 93%
rename from python/paddle/v2/framework/tests/test_net.py
rename to python/paddle/v2/fluid/tests/test_net.py
index 8503257feb8e1a5802f3f889f72c559a2aaa583a..318df08a9e73ac95cab73c34182bc6220ef6c681 100644
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/fluid/tests/test_net.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_op_support_gpu.py b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
similarity index 84%
rename from python/paddle/v2/framework/tests/test_op_support_gpu.py
rename to python/paddle/v2/fluid/tests/test_op_support_gpu.py
index dd36c666c440a5c378dfceac4502cd8277417412..a0eb4bd5fd2cc178ffe0763efdee61524ad6d4bd 100644
--- a/python/paddle/v2/framework/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
@@ -1,5 +1,5 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestOpSupportGPU(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/fluid/tests/test_operator.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_operator.py
rename to python/paddle/v2/fluid/tests/test_operator.py
index 98f6b2f5ee639120557cb85b3ada6d2931f7d0d2..4aa022ef90159cd96eed4e4dbe30cf5d1e8a41a7 100644
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/fluid/tests/test_operator.py
@@ -1,7 +1,7 @@
 import unittest
-import paddle.v2.framework.op as op
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.op as op
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 
 class TestGetAllProtos(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_operator_desc.py
rename to python/paddle/v2/fluid/tests/test_operator_desc.py
index a0bc4e0b91602cfc90f91a1e2dd4bce22c0dbf6d..e8362d2e9c6038c04c24dce35de8c53bfde78142 100644
--- a/python/paddle/v2/framework/tests/test_operator_desc.py
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
@@ -1,6 +1,6 @@
 import unittest
-from paddle.v2.framework.framework import Variable, Program, g_main_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import Variable, Program, g_main_program
+import paddle.v2.fluid.core as core
 
 
 class TestOperator(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_optimizer.py
rename to python/paddle/v2/fluid/tests/test_optimizer.py
index a39e7402600c7a94301de030c90ea51264248cf1..0ebf7cdf208c41eacfdff88f59455584eff4ff8f 100644
--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -1,8 +1,8 @@
 import unittest
 
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.backward import append_backward_ops
 
 
 class TestOptimizer(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_pad_op.py b/python/paddle/v2/fluid/tests/test_pad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_pad_op.py
rename to python/paddle/v2/fluid/tests/test_pad_op.py
diff --git a/python/paddle/v2/framework/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py
similarity index 87%
rename from python/paddle/v2/framework/tests/test_parameter.py
rename to python/paddle/v2/fluid/tests/test_parameter.py
index f04eb4cf27276b0f7da0793c97742ac42e4583be..71a1bd2aaf5a9c6362ce0d35c256ed228e942fce 100644
--- a/python/paddle/v2/framework/tests/test_parameter.py
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
@@ -1,6 +1,6 @@
 import unittest
-from paddle.v2.framework.framework import g_main_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import g_main_program
+import paddle.v2.fluid.core as core
 
 
 class TestParameter(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_pool2d_op.py b/python/paddle/v2/fluid/tests/test_pool2d_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_pool2d_op.py
rename to python/paddle/v2/fluid/tests/test_pool2d_op.py
diff --git a/python/paddle/v2/framework/tests/test_pool3d_op.py b/python/paddle/v2/fluid/tests/test_pool3d_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_pool3d_op.py
rename to python/paddle/v2/fluid/tests/test_pool3d_op.py
diff --git a/python/paddle/v2/framework/tests/test_pool_max_op.py b/python/paddle/v2/fluid/tests/test_pool_max_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_pool_max_op.py
rename to python/paddle/v2/fluid/tests/test_pool_max_op.py
diff --git a/python/paddle/v2/framework/tests/test_positive_negative_pair_op.py b/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_positive_negative_pair_op.py
rename to python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
diff --git a/python/paddle/v2/framework/tests/test_precision_recall_op.py b/python/paddle/v2/fluid/tests/test_precision_recall_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_precision_recall_op.py
rename to python/paddle/v2/fluid/tests/test_precision_recall_op.py
diff --git a/python/paddle/v2/framework/tests/test_prelu_op.py b/python/paddle/v2/fluid/tests/test_prelu_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_prelu_op.py
rename to python/paddle/v2/fluid/tests/test_prelu_op.py
diff --git a/python/paddle/v2/framework/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_program.py
rename to python/paddle/v2/fluid/tests/test_program.py
index 7be67b6614ee3302a319289b821a214a81b6f64e..ef2daf6916e14c015a39ae0193948e7ff6531449 100644
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
@@ -1,8 +1,8 @@
 import unittest
 
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.framework import g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import g_main_program
 
 
 class TestProgram(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/fluid/tests/test_protobuf.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_protobuf.py
rename to python/paddle/v2/fluid/tests/test_protobuf.py
index 848a396b3b6eec57d500b464780b64f339b09e94..e064374176fa221cfd042b7dbd2ddcb3b5ec41ec 100644
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
similarity index 99%
rename from python/paddle/v2/framework/tests/test_protobuf_descs.py
rename to python/paddle/v2/fluid/tests/test_protobuf_descs.py
index 2fd3d5d165ada5026510e0dc3e2c55b6e0596ff3..098a9802dfc6763ce2a2356b7267a439145b7939 100644
--- a/python/paddle/v2/framework/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -1,5 +1,5 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestOpDesc(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py b/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
rename to python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
diff --git a/python/paddle/v2/framework/tests/test_proximal_gd_op.py b/python/paddle/v2/fluid/tests/test_proximal_gd_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_proximal_gd_op.py
rename to python/paddle/v2/fluid/tests/test_proximal_gd_op.py
diff --git a/python/paddle/v2/framework/tests/test_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_rank_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_rank_loss_op.py
rename to python/paddle/v2/fluid/tests/test_rank_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_recurrent_op.py
rename to python/paddle/v2/fluid/tests/test_recurrent_op.py
index 16100429dd4010eb5c9a3e8896212f39295a4c8a..b623d1231838faff9e91c9234befb1f647fe8ec2 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -1,11 +1,11 @@
 import unittest
 
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class PyRNNBase(object):
diff --git a/python/paddle/v2/framework/tests/test_reduce_op.py b/python/paddle/v2/fluid/tests/test_reduce_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_reduce_op.py
rename to python/paddle/v2/fluid/tests/test_reduce_op.py
diff --git a/python/paddle/v2/framework/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_regularizer.py
rename to python/paddle/v2/fluid/tests/test_regularizer.py
index b21dceb584bdc660e48598a600f57cb6095b3802..f5d1eb3b96211bd7c7335dbe116a1d765d7bae50 100644
--- a/python/paddle/v2/framework/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -1,9 +1,9 @@
 import unittest
 
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.optimizer as optimizer
-import paddle.v2.framework.regularizer as regularizer
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.regularizer as regularizer
+from paddle.v2.fluid.backward import append_backward_ops
 
 
 class TestL2DecayRegularizer(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_reshape_op.py b/python/paddle/v2/fluid/tests/test_reshape_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_reshape_op.py
rename to python/paddle/v2/fluid/tests/test_reshape_op.py
diff --git a/python/paddle/v2/framework/tests/test_rmsprop_op.py b/python/paddle/v2/fluid/tests/test_rmsprop_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_rmsprop_op.py
rename to python/paddle/v2/fluid/tests/test_rmsprop_op.py
diff --git a/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
rename to python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
index 731beff17cc96d26c2d9390a956c774b8676b179..a3cba92504a28590083df57e69f7662a887d94a6 100644
--- a/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
@@ -1,10 +1,10 @@
 import unittest
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 def create_tensor(np_data, place):
diff --git a/python/paddle/v2/framework/tests/test_scale_op.py b/python/paddle/v2/fluid/tests/test_scale_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_scale_op.py
rename to python/paddle/v2/fluid/tests/test_scale_op.py
diff --git a/python/paddle/v2/framework/tests/test_scatter_op.py b/python/paddle/v2/fluid/tests/test_scatter_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_scatter_op.py
rename to python/paddle/v2/fluid/tests/test_scatter_op.py
diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/fluid/tests/test_scope.py
similarity index 81%
rename from python/paddle/v2/framework/tests/test_scope.py
rename to python/paddle/v2/fluid/tests/test_scope.py
index 14743654792716e4a7ebce5238b142addc86337e..e4857b590aa6e09f1fa37c4a8a70a3ec9495b085 100644
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/fluid/tests/test_scope.py
@@ -1,22 +1,22 @@
-import paddle.v2.framework.core
+import paddle.v2.fluid.core
 import unittest
 
 
 class TestScope(unittest.TestCase):
     def test_create_destroy(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         self.assertIsNotNone(scope)
         scope_with_parent = scope.new_scope()
         self.assertIsNotNone(scope_with_parent)
 
     def test_none_variable(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         self.assertIsNone(scope.find_var("test"))
 
     def test_create_var_get_var(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         var_a = scope.var("var_a")
         self.assertIsNotNone(var_a)
@@ -25,7 +25,7 @@ class TestScope(unittest.TestCase):
         self.assertIsNotNone(scope2.find_var('var_a'))
 
     def test_var_get_int(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         var = scope.var("test_int")
         var.set_int(10)
diff --git a/python/paddle/v2/framework/tests/test_selected_rows.py b/python/paddle/v2/fluid/tests/test_selected_rows.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_selected_rows.py
rename to python/paddle/v2/fluid/tests/test_selected_rows.py
index e8a930cb08c42b48f678bdd7bdb7698923535d4f..93daf37aa2ceb8a599973f7b02874f23fe0763ff 100644
--- a/python/paddle/v2/framework/tests/test_selected_rows.py
+++ b/python/paddle/v2/fluid/tests/test_selected_rows.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_seq_concat_op.py b/python/paddle/v2/fluid/tests/test_seq_concat_op.py
similarity index 99%
rename from python/paddle/v2/framework/tests/test_seq_concat_op.py
rename to python/paddle/v2/fluid/tests/test_seq_concat_op.py
index 7659fa8789ed2f11f46d37397b8bc1ab32571ddb..dccc6ed8afe2315da74f6886878b15d58b26b3c9 100644
--- a/python/paddle/v2/framework/tests/test_seq_concat_op.py
+++ b/python/paddle/v2/fluid/tests/test_seq_concat_op.py
@@ -2,6 +2,7 @@ import unittest
 import numpy as np
 import sys
 from op_test import OpTest
+exit(0)
 
 
 def to_abs_lod(lod):
diff --git a/python/paddle/v2/framework/tests/test_seq_conv.py b/python/paddle/v2/fluid/tests/test_seq_conv.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_seq_conv.py
rename to python/paddle/v2/fluid/tests/test_seq_conv.py
diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/fluid/tests/test_seq_expand.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_seq_expand.py
rename to python/paddle/v2/fluid/tests/test_seq_expand.py
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/fluid/tests/test_seq_pool.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_seq_pool.py
rename to python/paddle/v2/fluid/tests/test_seq_pool.py
diff --git a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sequence_softmax_op.py
rename to python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_sgd_op.py
rename to python/paddle/v2/fluid/tests/test_sgd_op.py
index 01262bba4d43adaed179baef88ccab6e69b0884b..ca05a381f06cfd40b7939dbda8d4f1f4aacd0271 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 from op_test import OpTest
 
 
diff --git a/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
similarity index 86%
rename from python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
rename to python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
index 2090455b969806685b525f1e588b6570e3072430..1a3b88e18e38b88d75ad17a0bb6a2965d1e60406 100644
--- a/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -1,9 +1,9 @@
 import unittest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.executor import Executor
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.framework import g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.framework import g_main_program
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
rename to python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
diff --git a/python/paddle/v2/framework/tests/test_sign_op.py b/python/paddle/v2/fluid/tests/test_sign_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sign_op.py
rename to python/paddle/v2/fluid/tests/test_sign_op.py
diff --git a/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py b/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
rename to python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/fluid/tests/test_softmax_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_softmax_op.py
rename to python/paddle/v2/fluid/tests/test_softmax_op.py
diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
rename to python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aed83b2ea3418c54f9540279ae6e2e0045421fa
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -0,0 +1,181 @@
+import unittest
+import paddle.v2.fluid.core as core
+import numpy as np
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+
+
+class TestCPULoDTensorArrayOps(unittest.TestCase):
+    def place(self):
+        return core.CPUPlace()
+
+    def test_split_and_merge_lod_tensor_no_lod(self):
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
+
+        mask_np = np.array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, self.place())
+
+        expect_true_tensor = np.array([2, 3, 4, 5]).astype('int32')
+        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
+        expect_true = core.LoDTensor()
+        expect_true.set(expect_true_tensor, self.place())
+
+        expect_false_tensor = np.array([0, 1, 6, 7, 8, 9]).astype('int32')
+        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
+
+        expect_false = core.LoDTensor()
+        expect_false.set(expect_false_tensor, self.place())
+
+        self.main(
+            tensor=tensor,
+            mask=mask,
+            expect_true=expect_true,
+            expect_false=expect_false,
+            expect_out=tensor)
+
+    def test_split_and_merge_lod_tensor_level_0(self):
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 10]])
+
+        mask_np = np.array([0, 1, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, self.place())
+
+        expect_true_tensor = np.array([3, 4, 5, 6, 7, 8]).astype('int32')
+        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
+        expect_true = core.LoDTensor()
+        expect_true.set(expect_true_tensor, self.place())
+        expect_true.set_lod([[0, 6]])
+
+        expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32')
+        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
+        expect_false_lod = [[0, 3, 4]]
+
+        expect_false = core.LoDTensor()
+        expect_false.set(expect_false_tensor, self.place())
+        expect_false.set_lod(expect_false_lod)
+
+        self.main(
+            tensor=tensor,
+            mask=mask,
+            expect_true=expect_true,
+            expect_false=expect_false,
+            expect_out=tensor)
+
+    def main(self, tensor, mask, expect_true, expect_false, expect_out,
+             level=0):
+        place = self.place()
+        program = Program()
+        x = layers.data(name='x', shape=[1], main_program=program)
+        x.persistable = True
+
+        y = layers.data(name='y', shape=[1], main_program=program)
+        y.persistable = True
+
+        out_true, out_false = layers.split_lod_tensor(
+            input=x, mask=y, level=level, main_program=program)
+        out_true.persistable = True
+        out_false.persistable = True
+
+        out = layers.merge_lod_tensor(
+            in_true=out_true,
+            in_false=out_false,
+            mask=y,
+            x=x,
+            level=level,
+            main_program=program)
+
+        out.persistable = True
+
+        exe = Executor(place)
+        scope = core.Scope()
+        exe.run(program, feed={'x': tensor, 'y': mask}, scope=scope)
+
+        var_true = scope.find_var(out_true.name).get_tensor()
+
+        var_false = scope.find_var(out_false.name).get_tensor()
+
+        var_out = scope.find_var(out.name).get_tensor()
+
+        self.check_tensor_same(var_true, expect_true)
+        self.check_tensor_same(var_false, expect_false)
+        self.check_tensor_same(var_out, expect_out)
+
+    def check_tensor_same(self, actual, expect):
+        self.assertTrue(np.allclose(np.array(actual), np.array(expect)))
+        self.assertEqual(actual.lod(), expect.lod())
+
+
+class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
+    def test_grad(self):
+        place = core.CPUPlace()
+        program = Program()
+
+        x = layers.data(
+            name='x',
+            shape=[1],
+            data_type='float32',
+            main_program=program,
+            stop_gradient=False)
+        y = layers.data(
+            name='y',
+            shape=[1],
+            data_type='bool',
+            main_program=program,
+            stop_gradient=False)
+
+        level = 0
+
+        out_true, out_false = layers.split_lod_tensor(
+            input=x, mask=y, level=level, main_program=program)
+        out = layers.merge_lod_tensor(
+            in_true=out_true,
+            in_false=out_false,
+            mask=y,
+            x=x,
+            level=level,
+            main_program=program)
+        mean = layers.mean(x=out, main_program=program)
+
+        append_backward_ops(mean)
+
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
+        tensor.set_lod([[0, 3, 9, 10]])
+
+        mask_np = np.array([0, 1, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, place)
+
+        exe = Executor(place)
+        scope = core.Scope()
+
+        g_vars = program.global_block().var(x.name + "@GRAD")
+        g_out = [
+            item.sum()
+            for item in map(np.array,
+                            exe.run(program,
+                                    feed={'x': tensor,
+                                          'y': mask},
+                                    fetch_list=[g_vars],
+                                    scope=scope))
+        ]
+
+        g_out_sum = np.array(g_out).sum()
+
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_split_op.py b/python/paddle/v2/fluid/tests/test_split_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_split_op.py
rename to python/paddle/v2/fluid/tests/test_split_op.py
diff --git a/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py b/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_squared_l2_distance_op.py
rename to python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
diff --git a/python/paddle/v2/framework/tests/test_squared_l2_norm_op.py b/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
rename to python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
diff --git a/python/paddle/v2/framework/tests/test_sum_op.py b/python/paddle/v2/fluid/tests/test_sum_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sum_op.py
rename to python/paddle/v2/fluid/tests/test_sum_op.py
diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/fluid/tests/test_tensor.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_tensor.py
rename to python/paddle/v2/fluid/tests/test_tensor.py
index e0cd2fa8aaf2db2991ad2b9a3053f0d00b509cd4..9f870d9eb3485aa0b54eb781b906f4232d12c49e 100644
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_tensor.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy
 
diff --git a/python/paddle/v2/framework/tests/test_tensor_array.py b/python/paddle/v2/fluid/tests/test_tensor_array.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_tensor_array.py
rename to python/paddle/v2/fluid/tests/test_tensor_array.py
index 50b3e09162a24201ee45cbd017dfef8a60f0da78..d6929ba16e4dae0c57adcceb4f0e78c094eee55c 100644
--- a/python/paddle/v2/framework/tests/test_tensor_array.py
+++ b/python/paddle/v2/fluid/tests/test_tensor_array.py
@@ -1,5 +1,5 @@
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_top_k_op.py b/python/paddle/v2/fluid/tests/test_top_k_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_top_k_op.py
rename to python/paddle/v2/fluid/tests/test_top_k_op.py
diff --git a/python/paddle/v2/framework/tests/test_transpose_op.py b/python/paddle/v2/fluid/tests/test_transpose_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_transpose_op.py
rename to python/paddle/v2/fluid/tests/test_transpose_op.py
diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
similarity index 90%
rename from python/paddle/v2/framework/tests/test_uniform_random_op.py
rename to python/paddle/v2/fluid/tests/test_uniform_random_op.py
index ded777105e0fc64eb82bf4013bfba7ba9d0ddefa..f736dfb2e85552b321403c961da517f3b3efb100 100644
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
@@ -1,6 +1,6 @@
 import unittest
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py
similarity index 93%
rename from python/paddle/v2/framework/tests/test_variable.py
rename to python/paddle/v2/fluid/tests/test_variable.py
index 03115f10a5a494424c6f8310c544c569be818e5b..a3e60a751719666bdca56a3096b688125d09f4b2 100644
--- a/python/paddle/v2/framework/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
@@ -1,6 +1,6 @@
 import unittest
-from paddle.v2.framework.framework import Variable, g_main_program, Program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import Variable, g_main_program, Program
+import paddle.v2.fluid.core as core
 import numpy as np
 
 
diff --git a/python/paddle/v2/framework/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
similarity index 94%
rename from python/paddle/v2/framework/tests/test_while_op.py
rename to python/paddle/v2/fluid/tests/test_while_op.py
index 1c344eae49705ecce586154c30c4d4f770022e7e..0f01acb3b94dc55a3536e751108e785ddc6e47bb 100644
--- a/python/paddle/v2/framework/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -1,7 +1,7 @@
 import unittest
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.executor import Executor
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_beam_search_decode_op.py b/python/paddle/v2/framework/tests/test_beam_search_decode_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9f180bbaea0f5922bee0a3e2a8c715d683c0d16
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_beam_search_decode_op.py
@@ -0,0 +1,75 @@
+import unittest
+
+import numpy as np
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+
+
+class TestBeamSearchDecodeOp(unittest.TestCase):
+    def setUp(self):
+        self.scope = core.Scope()
+        self.cpu_place = core.CPUPlace()
+
+    def append_lod_tensor(self, tensor_array, lod, data):
+        lod_tensor = core.LoDTensor()
+        lod_tensor.set_lod(lod)
+        lod_tensor.set(data, self.cpu_place)
+        tensor_array.append(lod_tensor)
+
+    def test_get_set(self):
+        ids = self.scope.var("ids").get_lod_tensor_array()
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="int64"))
+
+        scores = self.scope.var("scores").get_lod_tensor_array()
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="float32"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="float32"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="float32"))
+
+        sentence_ids = self.scope.var("sentence_ids").get_tensor()
+        sentence_scores = self.scope.var("sentence_scores").get_tensor()
+
+        beam_search_decode_op = Operator(
+            "beam_search_decode",
+            # inputs
+            Ids="ids",
+            Scores="scores",
+            # outputs
+            SentenceIds="sentence_ids",
+            SentenceScores="sentence_scores")
+
+        ctx = core.DeviceContext.create(self.cpu_place)
+        beam_search_decode_op.run(self.scope, ctx)
+
+        expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
+        self.assertEqual(sentence_ids.lod(), expected_lod)
+        self.assertEqual(sentence_scores.lod(), expected_lod)
+
+        expected_data = np.array(
+            [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64")
+        self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
+        self.assertTrue(
+            np.array_equal(np.array(sentence_scores), expected_data))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 5348c2d8d7e9b5adc5fe93e2943bef149ba047cc..fe91df10daf303bb14d1e5f28817984d261e0880 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -13,8 +13,8 @@ packages=['paddle',
           'paddle.v2.reader',
           'paddle.v2.master',
           'paddle.v2.plot',
-          'paddle.v2.framework',
-          'paddle.v2.framework.proto',
+          'paddle.v2.fluid',
+          'paddle.v2.fluid.proto',
           'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
@@ -44,14 +44,14 @@ setup(name='paddlepaddle',
       ext_modules=[Extension('_foo', ['stub.cc'])],
       package_data={
         'paddle.v2.master': ['libpaddle_master.so'],
-        'paddle.v2.framework': ['core.so'],
+        'paddle.v2.fluid': ['core.so'],
         'py_paddle':['*.py','_swig_paddle.so']
       },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
-          # The paddle.v2.framework.proto will be generated while compiling.
+          # The paddle.v2.fluid.proto will be generated while compiling.
           # So that package points to other directory.
-          'paddle.v2.framework.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
+          'paddle.v2.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
           'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
       scripts=paddle_bins,