diff --git a/.gitignore b/.gitignore
index 1512c1438e9e0b0b7b6e0c273a24b273cb652b04..7480bd53a403d74932d56409fdb0a9dd7bb6b9d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,7 +21,7 @@ third_party/
 cmake-build-*
 
 # generated while compiling
-python/paddle/v2/framework/core.so
+python/paddle/v2/fluid/core.so
 paddle/pybind/pybind.h
 CMakeFiles
 cmake_install.cmake
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index f9918c306d0c68412f5b07e694353957c97594ee..05d83ad58ef8485d36829e7aeede79f625cfdc43 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -115,7 +115,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF(${CBLAS_PROVIDER} EQUAL MKLML)
+IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
     ADD_LIBRARY(cblas SHARED ${dummyfile})
 ELSE()
     ADD_LIBRARY(cblas STATIC ${dummyfile})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c311783aa3187678c31c27ddbbd074790ca444f3..b9c1dde97bc444d793d67ff622fd6b13c6435a9a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 if(NOT APPLE AND NOT ANDROID)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
-    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)
 
 function(merge_static_libs TARGET_NAME)
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2..203506d7ab84e5a5be2232b077eac2d433a99766 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -82,6 +82,11 @@ maxout
 ..  autoclass:: paddle.v2.layer.maxout
     :noindex:
 
+roi_pool
+--------
+..  autoclass:: paddle.v2.layer.roi_pool
+    :noindex:
+
 Norm Layer
 ==========
 
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index fe8da907d9d45a2164031430ac5b7a3d5523967a..16236763a73770f3fe5eadf67645765d0456f875 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -15,6 +15,7 @@
  	- [CMake](#cmake)
 	- [Layers](#layers)
 	- [Activations](#activations)
+	- [Weights](#weights)
 	- [Unit Tests](#unit-tests)
 	- [Protobuf Messages](#protobuf-messages)
 	- [Python API](#python-api)
@@ -45,17 +46,23 @@ Figure 1. PaddlePaddle on IA.
 
 ### Layers
 所有MKL-DNN相关的C++ layers，都会按照PaddlePaddle的目录结构存放在
-`paddle/gserver/layers`中，并且文件名都会一以*Mkldnn*开头。
+`paddle/gserver/layers`中，并且文件名都会以*MKLDNN*开头。
 
-所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类，该父类继承于PaddlePaddle的基类`Layer`。
+所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类，该父类继承于PaddlePaddle的基类`Layer`。
+
+在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑。部分函数定义为纯虚函数，子类只需要实现这些函数即可。
 
 ### Activations
-由于在PaddlePaddle中，激活函数是独立于layer概念的，所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口，实现方法还是会在`ActivationFunction.cpp`文件。
+由于在PaddlePaddle中，激活函数是独立于layer概念的，所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h`和`MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。
 
-### Unit Tests
-会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。
+### Weights
+由于有些layer是含有参数的，我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。
+同时，由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致，我们会在网络训练的开始和结束时分别转换这个layout，使得最终保存的参数格式与PaddlePaddle一致。
 
-Activation的测试，计划在PaddlePaddle原有的测试文件上直接添加新的测试type。
+### Unit Tests
+会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
+测试分为每个layer(或activation)的单元测试和简单网络的整体测试。
+每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
 
 ### Protobuf Messages
 根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
@@ -82,7 +89,7 @@ if use_mkldnn
 会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹，里面放入一些用于MKL-DNN测试的demo脚本。
 
 ### Benchmarking
-会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`，添加使用MKL-DNN的测试。
+会添加`benchmark/paddle/image/run_mkldnn.sh`，用于测试使用MKL-DNN之后的性能。
 
 ### Others
 1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为64。
@@ -94,14 +101,16 @@ if use_mkldnn
 
 我们总结出一些特别需要注意的点：
 
-1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MkldnnLayer`特有的设备ID。
+1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
 2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
-3. 创建`MkldnnMatrix`，用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
-4. 创建`MkldnnBase`，定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`，和未来可能还会用到`FPGAEngine`等。
-5. 在**Argument**里添加两个`MkldnnMatrixPtr`，取名为`mkldnnValue`和`mkldnnGrad`，用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名)，用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。
-6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑，用于判断`deviceId`，并针对device在MKL-DNN和CPU之间不统一的情况，做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。
-7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
-8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况，所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面，一直保存的是0，所以可以充分利用这个信息，定义一个枚举处理所有MKLDNN的参数格式，从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。
+3. 创建`MKLDNNMatrix`，同时继承`CpuMatrix`和`mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用到的格式信息。
+4. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
+5. 每个`MKLDNNLayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表input value， input gradient，output value和output gradient。它们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory)，主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时，用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好，每个子类只需要调用定义好的reset buffer函数即可。
+6. 每个`MKLDNNLayer`的resetbuffer相关的函数（包括reset input、output的Value和grad），它们会根据输入参数reset internal和external的memory，当然这两者也可以相等，即表示不需要转换。只需要把握一个原则，每个`MKLDNNLayer`的子类，只需要使用internal的memory就可以了，所有external的转换工作在父类的reset函数中都提前准备好了。
+7. 一般来说，external的memory会尽量与PaddlePaddle中的`value`和`grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_`和`extOutGrad_`)必须分别与`output_.value`和`output_.grad`共享内存，因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`。如果不需要external的buffer用于转换，那么internal的buffer也会与他们共享内存。
+8. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，同时数据格式就是`nchw`，这样下一个cpu device就能拿到正确的数据。在有cpu device的时候，external的memory的格式始终是`nchw`或者`nc`。
+9. 由于MKL-DNN的输出操作都是覆盖data的，不是在原来的数据上累加，所以当网络出现分支时，在`backward`时会需要merge不同layer的梯度。`MKLDNNLayer`中会实现merge的方法，此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中，由分支处的layer负责求和，并把结果放到这个layer的`output_.grad`中。所以整体上，每个子类并不会需要关心分支的事情，也是在父类都实现好了。
+10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
 
 ## References
 
diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst
index 0e939a2671ace8682c90cdc1c1bb2da1dda0d568..b331d9d36e6a279881c3b1a5586835e7186957fb 100644
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
@@ -99,7 +99,7 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 利用更多的计算资源
 ++++++++++++++++++
 
-利用更多的计算资源可以分为一下几个方式来进行\:
+利用更多的计算资源可以分为以下几个方式来进行\:
 
 * 单机CPU训练
 
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index c823d7e9fcd63dd7719ac1403952b03c2d2f03c0..6cfc9536f20e88571a9845a50be0341fe4d9f78b 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -214,7 +214,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
     ```cpp
     // if use Eigen unsupported module before include head files
-    #define EIGEN_USE_GPU
+    // #define EIGEN_USE_GPU
 
     namespace ops = paddle::operators;
     REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index bf6b8de8cc13779fd0083588f027e0bb5a6f27b1..d5b55e1c95f248f551e6a0a3b39123169dd7784f 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -54,6 +54,46 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
   return kPD_NO_ERROR;
 }
 
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+                                          paddle_real* value) {
+  // Overwrite the whole matrix with height * width elements read from value.
+  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
+  auto handle = cast(mat);
+  if (handle->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* dst = handle->mat->getRowBuf(0);
+  const size_t count = handle->mat->getWidth() * handle->mat->getHeight();
+  if (!handle->mat->useGpu()) {
+    std::copy(value, value + count, dst);
+    return kPD_NO_ERROR;
+  }
+#ifdef PADDLE_WITH_CUDA
+  hl_memcpy(dst, value, sizeof(paddle::real) * count);
+  return kPD_NO_ERROR;
+#else
+  return kPD_NOT_SUPPORTED;
+#endif
+}
+
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+                                          paddle_real* result) {
+  // Copy all height * width elements of the matrix out into result.
+  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
+  auto handle = cast(mat);
+  if (handle->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* src = handle->mat->getRowBuf(0);
+  const size_t count = handle->mat->getWidth() * handle->mat->getHeight();
+  if (!handle->mat->useGpu()) {
+    std::copy(src, src + count, result);
+    return kPD_NO_ERROR;
+  }
+#ifdef PADDLE_WITH_CUDA
+  hl_memcpy(result, src, count * sizeof(paddle::real));
+  return kPD_NO_ERROR;
+#else
+  return kPD_NOT_SUPPORTED;
+#endif
+}
+
 paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                    uint64_t rowID,
                                    paddle_real** rawRowBuffer) {
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 3e6bd5285058a297c4574631e2a5c033b83936e8..876af2aa7615c098d225b56ce2ea0b1529a6e3c6 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -27,18 +27,20 @@ int main() {
   CHECK(paddle_arguments_resize(in_args, 1));
 
   // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
                                            /* size */ 784,
                                            /* useGPU */ false);
   srand(time(0));
-  paddle_real* array;
 
-  // Get First row.
-  CHECK(paddle_matrix_get_row(mat, 0, &array));
+  // 10 samples x 784 features in a plain C array (std::vector is not C).
+  paddle_real input[784 * 10];
 
-  for (int i = 0; i < 784; ++i) {
-    array[i] = rand() / ((float)RAND_MAX);
+  for (size_t i = 0; i < sizeof(input) / sizeof(input[0]); ++i) {
+    input[i] = rand() / ((float)RAND_MAX);
   }
+
+  // Set value for the input matrix
+  CHECK(paddle_matrix_set_value(mat, input));
 
   CHECK(paddle_arguments_set_value(in_args, 0, mat));
 
@@ -51,11 +53,17 @@ int main() {
 
   CHECK(paddle_arguments_get_value(out_args, 0, prob));
 
-  CHECK(paddle_matrix_get_row(prob, 0, &array));
+  // Copy the whole output into a heap buffer sized from the matrix shape; a
+  // failed malloc surfaces below, since get_value rejects a NULL result.
+  uint64_t height, width;
+  CHECK(paddle_matrix_get_shape(prob, &height, &width));
+  paddle_real* result = (paddle_real*)malloc(height * width * sizeof(*result));
+  CHECK(paddle_matrix_get_value(prob, result));
 
   printf("Prob: ");
-  for (int i = 0; i < 10; ++i) {
-    printf("%.2f ", array[i]);
+  for (uint64_t i = 0; i < height * width; ++i) {
+    printf("%.2f ", result[i]);
   }
   printf("\n");
+  free(result);
 
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index 03dcbdd40cf8d6c30bb00e5e7283aa794a3d7c48..01b8bad2ee9f528f8622346f43b9ff82225a7e73 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -71,6 +71,16 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
                                           uint64_t rowID,
                                           paddle_real* rowArray);
 
+/**
+ * @brief paddle_matrix_set_value Set value to matrix.
+ * @param mat Target Matrix
+ * @param value Raw data to copy into the matrix.
+ * @return paddle_error
+ * @note  value must contain at least height * width elements of the matrix
+ */
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+                                          paddle_real* value);
+
 /**
  * @brief PDMatGetRow Get raw row buffer from matrix
  * @param [in] mat Target matrix
@@ -82,6 +92,15 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                           uint64_t rowID,
                                           paddle_real** rawRowBuffer);
 
+/**
+ * @brief paddle_matrix_get_value Copy the data of the matrix into result.
+ * @param [in] mat Target matrix
+ * @param [out] result Pointer to the buffer that receives the matrix data
+ * @return paddle_error
+ * @note The caller must allocate result (height * width elements) beforehand
+ */
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+                                          paddle_real* result);
 /**
  * @brief PDMatCreateNone Create None Matrix
  * @return
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
index 4bf9a9d6a9f9161561e9e5612edd2c93cab7ac5b..6940c28448a897cecd78b718fe720441086a5a99 100644
--- a/paddle/capi/tests/test_Matrix.cpp
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -45,3 +45,34 @@ TEST(CAPIMatrix, createNone) {
   paddle_matrix mat = paddle_matrix_create_none();
   ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
 }
+
+// Writes a known 128x32 buffer with paddle_matrix_set_value, reads it back
+// with paddle_matrix_get_value, and checks the values and the reported shape.
+// useGpu is forwarded to paddle_matrix_create to pick a CPU or GPU matrix.
+static void checkGetSetValue(bool useGpu) {
+  paddle_matrix mat = paddle_matrix_create(128, 32, useGpu);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+  sample.resize(128 * 32);
+  result.resize(128 * 32);
+  for (size_t i = 0; i < sample.size(); ++i) {
+    sample[i] = 1.0 / (i + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+  for (size_t i = 0; i < sample.size(); ++i) {
+    ASSERT_NEAR(sample[i], result[i], 1e-5);
+  }
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(128UL, height);
+  ASSERT_EQ(32UL, width);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+
+TEST(CAPIMatrix, cpu_get_set_value) { checkGetSetValue(false); }
+
+#ifdef PADDLE_WITH_CUDA
+TEST(CAPIMatrix, gpu_get_set_value) { checkGetSetValue(true); }
+#endif
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index b6a20615783c4dd9eea3dd4a8a189ea03acb2bef..b3b9c45ded95ce2e735b8898d47760956dcacdce 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -321,8 +321,6 @@ static void CreateGradVarInBlock(
         auto* param = block_desc->FindVarRecursive(pname);
         auto* grad = block_desc->FindVar(arg);
         if (param == nullptr) {
-          LOG(WARNING) << "Cannot find forward variable of " << arg
-                       << ". Set its gradient to FP32";
           grad->SetDataType(DataType::FP32);
         } else {
           grad->SetDataType(param->GetDataType());
@@ -379,6 +377,12 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
   return grad_op_descs;
 }
 
+static BlockDescBind* CreateStepBlock(
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx);
+
 std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     ProgramDescBind& program_desc, int block_idx,
     std::unordered_set<std::string>* no_grad_vars,
@@ -394,13 +398,13 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
 
     if ((*it)->Type() == "recurrent") {
       int step_block_idx = (*it)->GetBlockAttr("step_block");
-      auto backward_block_op_descs = MakeBlockBackward(
-          program_desc, step_block_idx, no_grad_vars, grad_to_var);
+      BlockDescBind* backward_block = CreateStepBlock(
+          program_desc, no_grad_vars, grad_to_var, step_block_idx);
+      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
+    } else if ((*it)->Type() == "conditional_block") {
       BlockDescBind* backward_block =
-          program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
-      for (auto& ptr : backward_block_op_descs) {
-        backward_block->AppendAllocatedOp(std::move(ptr));
-      }
+          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
+                          (*it)->GetBlockAttr("block"));
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
     } else {
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
@@ -451,6 +455,26 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
   return backward_descs;
 }
 
+// Builds the backward block for the sub-block at step_block_idx: generates the
+// backward ops of that sub-block via MakeBlockBackward, appends a new block to
+// program_desc (passing the forward sub-block to AppendBlock), and moves the
+// generated ops into it. Returns the newly appended block.
+static BlockDescBind* CreateStepBlock(
+    ProgramDescBind& program_desc,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx) {
+  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
+                                                   no_grad_vars, grad_to_var);
+  BlockDescBind* backward_block =
+      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
+  for (auto& ptr : backward_block_op_descs) {
+    // Qualified std::move: do not rely on ADL to find it.
+    backward_block->AppendAllocatedOp(std::move(ptr));
+  }
+  return backward_block;
+}
+
 ParamGradInfoMap AppendBackward(
     ProgramDescBind& program_desc, const VarDescBind& target,
     const std::unordered_set<std::string>& no_grad_vars) {
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 9e3d597f3a2c84623a1ce9e4b6f4b956cffde211..11764810e1d40e5e6eb3cd0d8e9b4b63a79855b4 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -50,6 +50,15 @@ VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
   return it->second.get();
 }
 
+VarDescBind *BlockDescBind::FindRecursiveOrCreateVar(
+    const std::string &name_bytes) {
+  // Prefer whatever FindVarRecursive locates for this name; only when it
+  // finds nothing, fall back to Var(name_bytes).
+  if (VarDescBind *found = FindVarRecursive(name_bytes)) {
+    return found;
+  }
+  return Var(name_bytes);
+}
 bool BlockDescBind::HasVarRecursive(const std::string &name) const {
   return FindVarRecursive(name) != nullptr;
 }
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 26adf6a20ff09483b84f479db08efcf402135053..8e967e5378eb47a7869efb59cc96a271f1cbb9a1 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -58,6 +58,8 @@ class BlockDescBind {
 
   VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
 
+  VarDescBind *FindRecursiveOrCreateVar(const std::string &name_bytes);
+
   bool HasVarRecursive(const std::string &var_name) const;
 
   std::set<std::string> LocalVarNames() const {
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index e7cba9e702ce0f96a9680169f0593130df2fd096..39c8def82e1ebb10a0e357a648af760099020c32 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -357,7 +357,8 @@ void OpDescBind::InferVarType(BlockDescBind *block) const {
                 "LOD_TENSOR";
     for (auto &out_pair : this->outputs_) {
       for (auto &out_var_name : out_pair.second) {
-        block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
+        block->FindRecursiveOrCreateVar(out_var_name)
+            ->SetType(VarDesc::LOD_TENSOR);
       }
     }
   }
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index fb2c69105627f663ddcce07d31526c9e4278e863..9428b8a07ea0af005f6e960ddaa02da624ad9d97 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -98,5 +98,36 @@ void Scope::DeleteScope(Scope* scope) {
   delete scope;
 }
 
+// Rebinds the variable stored under origin_name to new_name in this scope.
+// Enforces that origin_name exists here and that new_name is not yet used.
+void Scope::Rename(const std::string& origin_name,
+                   const std::string& new_name) const {
+  auto origin_it = vars_.find(origin_name);
+  PADDLE_ENFORCE(origin_it != vars_.end(),
+                 "Cannot find original variable with name %s", origin_name);
+  auto new_it = vars_.find(new_name);
+  PADDLE_ENFORCE(new_it == vars_.end(),
+                 "The variable with name %s is already in the scope", new_name);
+  // Detach the entry before inserting: operator[] may rehash the map, which
+  // invalidates origin_it and would make a later erase(origin_it) undefined.
+  Variable* var = origin_it->second;
+  vars_.erase(origin_it);
+  vars_[new_name] = var;
+}
+
+// Renames origin_name to a generated "<scope address>.<number>" name and
+// returns it. The number starts at the current variable count and is bumped
+// until the name is unused, because renaming does not change that count and
+// two consecutive calls would otherwise generate the same name.
+std::string Scope::Rename(const std::string& origin_name) const {
+  size_t suffix = vars_.size();
+  std::string var_name = string::Sprintf("%p.%d", this, suffix);
+  while (vars_.count(var_name) != 0) {
+    var_name = string::Sprintf("%p.%d", this, ++suffix);
+  }
+  Rename(origin_name, var_name);
+  return var_name;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index fb660949394149ebf2c6172a0ac3f4c7594f4286..c2aafb6ad825f9bd9ffef754923a15afdeaa8e5c 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -68,11 +68,18 @@ class Scope {
   // enumerate all the variables current contains.
   std::vector<std::string> GetAllNames(bool recursive = false) const;
 
+  // Rename variable to a new name
+  void Rename(const std::string& origin_name,
+              const std::string& new_name) const;
+
+  // Rename variable to a new name and return the new name
+  std::string Rename(const std::string& origin_name) const;
+
  private:
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
 
-  std::unordered_map<std::string, Variable*> vars_;
+  mutable std::unordered_map<std::string, Variable*> vars_;
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
 
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
index d060196bb2c478b776851288cb71a1880d60660d..0f19870bec3e69d07278507cc556a86bbd25d12d 100644
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -27,10 +27,32 @@ inline VarDesc::VarType ToVarType(std::type_index type) {
     return VarDesc_VarType_LOD_RANK_TABLE;
   } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
     return VarDesc_VarType_LOD_TENSOR_ARRAY;
+  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
+    return VarDesc_VarType_SELECTED_ROWS;
   } else {
     PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
   }
 }
 
+template <typename Visitor>
+inline void VisitVarType(const Variable& var, Visitor visitor) {
+  // Invoke visitor with the payload of var, typed according to the variable's
+  // runtime type. ToVarType maps the held C++ type to a VarDesc enum value;
+  // each supported value is paired with the matching Get<T>() call. Any other
+  // enum value is reported through PADDLE_THROW.
+  const auto var_type = ToVarType(var.Type());
+  if (var_type == VarDesc_VarType_LOD_TENSOR) {
+    visitor(var.Get<framework::LoDTensor>());
+  } else if (var_type == VarDesc_VarType_LOD_RANK_TABLE) {
+    visitor(var.Get<LoDRankTable>());
+  } else if (var_type == VarDesc_VarType_LOD_TENSOR_ARRAY) {
+    visitor(var.Get<LoDTensorArray>());
+  } else if (var_type == VarDesc_VarType_SELECTED_ROWS) {
+    visitor(var.Get<SelectedRows>());
+  } else {
+    PADDLE_THROW("Not supported visit type, %d", var_type);
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
index 6ffe4fbec643e50d27924a989875454d307f5b9b..0f2b67fd758ec1513f42c4cb1a36f2f3915f4740 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -54,7 +54,6 @@ void MKLDNNAddtoLayer::reshape(
   ow = iw;
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
-  printSizeInfo();
 }
 
 void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
index ed3887cbf653878623764a310c9f364f4d8be27f..071bdf54d5dc9538d5ced580a73b9c0fbcea41fb 100644
--- a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -125,7 +125,6 @@ void MKLDNNBatchNormLayer::reshape(
       << "Input channel can not be changed";
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
-  printSizeInfo();
 }
 
 void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index b8120eda1e2dadab943869a05546351a369af6fd..8aa54e0a9efa7adb766cbb6009f6a29410c6ae7d 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -102,8 +102,6 @@ void MKLDNNConvLayer::reshape(
 
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
-
-  printSizeInfo();
 }
 
 void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
index 1fed0e1c6565b763a3ee73a0853f560ddfbd44c6..9c69136684e5f9005860b476ec6ed1bbc9ceff6c 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -92,7 +92,7 @@ public:
   void printSizeInfo() override {
     MKLDNNLayer::printSizeInfo();
     VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
-                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
                        << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
   }
 
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 3429c53d2396e051d62fe0ae405934758e89f9c2..350ec65fffbc73c3a6e4245f763f4c6aa868f574 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -84,8 +84,6 @@ void MKLDNNFcLayer::reshape(
 
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc);
-
-  printSizeInfo();
 }
 
 void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index 6e89260f49979d4edb4da138507a73dc2bf120de..a18c455beab96ef25b5545281bae4d48cec98d9e 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -71,8 +71,6 @@ void MKLDNNPoolLayer::reshape(
   reshapeOutput(oh, ow);
 
   resizeOutput(bs, oc * oh * ow);
-
-  printSizeInfo();
 }
 
 void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..35d4b12d3d357800fe72899069b5377c252fac5f
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ROIPoolLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(roi_pool, ROIPoolLayer);
+
+bool ROIPoolLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  // Propagate a failed base-class initialization instead of ignoring it.
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  // The pooling geometry is read from the first input's roi_pool_conf.
+  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+  pooledWidth_ = layerConf.pooled_width();
+  pooledHeight_ = layerConf.pooled_height();
+  spatialScale_ = layerConf.spatial_scale();
+  return true;
+}
+
+void ROIPoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  // Max-pools every RoI (input 1) of the feature map (input 0) into a
+  // pooledHeight_ x pooledWidth_ grid per channel; argmaxes go to maxIdxs_.
+  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+  height_ = getInput(0).getFrameHeight();
+  if (!height_) height_ = layerConf.height();
+  width_ = getInput(0).getFrameWidth();
+  if (!width_) width_ = layerConf.width();
+  channels_ = getInputValue(0)->getWidth() / width_ / height_;
+
+  size_t batchSize = getInput(0).getBatchSize();
+  size_t numROIs = getInput(1).getBatchSize();
+  MatrixPtr dataValue = getInputValue(0);
+  MatrixPtr roiValue = getInputValue(1);
+  resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
+  MatrixPtr outputValue = getOutputValue();
+
+  if (useGpu_) {  // TODO(guosheng): implement on GPU later
+    MatrixPtr dataCpuBuffer;
+    Matrix::resizeOrCreate(dataCpuBuffer,
+                           dataValue->getHeight(),
+                           dataValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr roiCpuBuffer;
+    Matrix::resizeOrCreate(roiCpuBuffer,
+                           roiValue->getHeight(),
+                           roiValue->getWidth(),
+                           false,
+                           false);
+    dataCpuBuffer->copyFrom(*dataValue);
+    roiCpuBuffer->copyFrom(*roiValue);
+    dataValue = dataCpuBuffer;
+    roiValue = roiCpuBuffer;
+    MatrixPtr outputCpuBuffer;
+    Matrix::resizeOrCreate(outputCpuBuffer,
+                           outputValue->getHeight(),
+                           outputValue->getWidth(),
+                           false,
+                           false);
+    outputCpuBuffer->copyFrom(*outputValue);
+    outputValue = outputCpuBuffer;
+  }
+
+  real* bottomData = dataValue->getData();
+  size_t batchOffset = dataValue->getWidth();
+  size_t channelOffset = height_ * width_;
+  real* bottomROIs = roiValue->getData();
+  size_t roiOffset = roiValue->getWidth();
+  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+
+  real* outputData = outputValue->getData();
+  Matrix::resizeOrCreate(maxIdxs_,
+                         numROIs,
+                         channels_ * pooledHeight_ * pooledWidth_,
+                         false,
+                         false);
+  real* argmaxData = maxIdxs_->getData();
+
+  for (size_t n = 0; n < numROIs; ++n) {
+    // the first five elements of each RoI should be:
+    // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end
+    CHECK_GE(bottomROIs[0], 0);  // validate before the unsigned conversion
+    size_t roiBatchIdx = bottomROIs[0];
+    size_t roiStartW = round(bottomROIs[1] * spatialScale_);
+    size_t roiStartH = round(bottomROIs[2] * spatialScale_);
+    size_t roiEndW = round(bottomROIs[3] * spatialScale_);
+    size_t roiEndH = round(bottomROIs[4] * spatialScale_);
+    CHECK_LT(roiBatchIdx, batchSize);
+    size_t roiHeight = roiEndH >= roiStartH ? roiEndH - roiStartH + 1 : 1;
+    size_t roiWidth = roiEndW >= roiStartW ? roiEndW - roiStartW + 1 : 1;
+    real binSizeH =
+        static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
+    real binSizeW =
+        static_cast<real>(roiWidth) / static_cast<real>(pooledWidth_);
+    real* batchData = bottomData + batchOffset * roiBatchIdx;
+    for (size_t c = 0; c < channels_; ++c) {
+      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+          size_t hstart = static_cast<size_t>(std::floor(ph * binSizeH));
+          size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
+          size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
+          size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
+          hstart = std::min(hstart + roiStartH, height_);
+          wstart = std::min(wstart + roiStartW, width_);
+          hend = std::min(hend + roiStartH, height_);
+          wend = std::min(wend + roiStartW, width_);
+
+          bool isEmpty = (hend <= hstart) || (wend <= wstart);
+          size_t poolIndex = ph * pooledWidth_ + pw;
+          // Seed every bin here; do not rely on the buffers being cleared.
+          size_t seed = hstart * width_ + wstart;
+          outputData[poolIndex] = isEmpty ? 0 : batchData[seed];
+          argmaxData[poolIndex] = isEmpty ? -1 : static_cast<real>(seed);
+
+          for (size_t h = hstart; h < hend; ++h) {
+            for (size_t w = wstart; w < wend; ++w) {
+              size_t index = h * width_ + w;
+              if (batchData[index] > outputData[poolIndex]) {
+                outputData[poolIndex] = batchData[index];
+                argmaxData[poolIndex] = index;
+              }
+            }
+          }
+        }
+      }
+      batchData += channelOffset;
+      outputData += poolChannelOffset;
+      argmaxData += poolChannelOffset;
+    }
+    bottomROIs += roiOffset;
+  }
+  if (useGpu_) {
+    getOutputValue()->copyFrom(*outputValue);
+  }
+}
+
+void ROIPoolLayer::backward(const UpdateCallback& callback) {
+  // Add each pooled-output gradient to the input element saved in maxIdxs_.
+  MatrixPtr inGradValue = getInputGrad(0);
+  MatrixPtr outGradValue = getOutputGrad();
+  MatrixPtr roiValue = getInputValue(1);
+
+  if (useGpu_) {
+    MatrixPtr inGradCpuBuffer;
+    Matrix::resizeOrCreate(inGradCpuBuffer,
+                           inGradValue->getHeight(),
+                           inGradValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr outGradCpuBuffer;
+    Matrix::resizeOrCreate(outGradCpuBuffer,
+                           outGradValue->getHeight(),
+                           outGradValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr roiCpuBuffer;
+    Matrix::resizeOrCreate(roiCpuBuffer,
+                           roiValue->getHeight(),
+                           roiValue->getWidth(),
+                           false,
+                           false);
+    inGradCpuBuffer->copyFrom(*inGradValue);
+    outGradCpuBuffer->copyFrom(*outGradValue);
+    roiCpuBuffer->copyFrom(*roiValue);
+    inGradValue = inGradCpuBuffer;
+    outGradValue = outGradCpuBuffer;
+    roiValue = roiCpuBuffer;
+  }
+
+  real* bottomROIs = roiValue->getData();
+  size_t numROIs = getInput(1).getBatchSize();
+  size_t roiOffset = getInputValue(1)->getWidth();
+
+  real* inDiffData = inGradValue->getData();
+  size_t batchOffset = getInputValue(0)->getWidth();
+  size_t channelOffset = height_ * width_;
+  real* outDiffData = outGradValue->getData();
+  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+  real* argmaxData = maxIdxs_->getData();
+
+  for (size_t n = 0; n < numROIs; ++n) {
+    size_t roiBatchIdx = bottomROIs[0];
+    real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
+    for (size_t c = 0; c < channels_; ++c) {
+      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+          size_t poolIndex = ph * pooledWidth_ + pw;
+          if (argmaxData[poolIndex] >= 0) {  // -1: empty bin; 0 is valid
+            size_t index = static_cast<size_t>(argmaxData[poolIndex]);
+            batchDiffData[index] += outDiffData[poolIndex];
+          }
+        }
+      }
+      batchDiffData += channelOffset;
+      outDiffData += poolChannelOffset;
+      argmaxData += poolChannelOffset;
+    }
+    bottomROIs += roiOffset;
+  }
+
+  if (useGpu_) {
+    getInputGrad(0)->copyFrom(*inGradValue);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f07e49d6fd1eda9fa7bd46e4cec771a75f571be
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+ * feature map.
+ * - Input: This layer needs two input layers: the first input layer is a
+ *          convolution layer; the second input layer contains the ROI data,
+ *          which is the output of ProposalLayer in Faster R-CNN.
+ * - Output: The ROIs' feature map, which feeds the layers for generating
+ *           bbox location offset and the classification confidence.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ * Networks
+ */
+
+class ROIPoolLayer : public Layer {
+protected:
+  size_t channels_;      // number of channels walked per ROI in backward()
+  size_t width_;         // input plane width (height_ * width_ per channel)
+  size_t height_;        // input plane height
+  size_t pooledWidth_;   // width of each pooled output plane
+  size_t pooledHeight_;  // height of each pooled output plane
+  real spatialScale_;    // NOTE(review): presumably ROI coords -> map scale
+
+  // Argmax indices of the pooling; a real matrix since there is no int matrix.
+  MatrixPtr maxIdxs_;
+
+public:
+  explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index df73e6781533def5641635e9dfa9c9e4e8a0b57f..fcbcb5b0f1f4cb07066363c9fa93fb1726459f30 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2056,6 +2056,43 @@ TEST(Layer, CropLayer) {
   }
 }
 
+TEST(Layer, roi_pool) {
+  // Each ROI row holds [batch index, xMin, yMin, xMax, yMax]; the rest is 0.
+  const size_t roiNum = 10;
+  const size_t roiDim = 10;
+  const size_t batchSize = 5;
+  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+  roiValue->zeroMem();
+  real* roi = roiValue->getData();
+  for (size_t i = 0; i < roiNum; ++i, roi += roiDim) {
+    roi[0] = std::rand() % batchSize;
+    const size_t xMin = std::rand() % 224;
+    const size_t yMin = std::rand() % 224;
+    roi[1] = xMin;
+    roi[2] = yMin;
+    roi[3] = xMin + std::rand() % (224 - xMin);  // xMax
+    roi[4] = yMin + std::rand() % (224 - yMin);  // yMax
+  }
+
+  // Layer config: a 14x14 feature map pooled to 7x7 at 1/16 spatial scale.
+  TestConfig config;
+  config.layerConfig.set_type("roi_pool");
+  config.biasSize = 0;
+  ROIPoolConfig* poolConf =
+      config.layerConfig.add_inputs()->mutable_roi_pool_conf();
+  poolConf->set_pooled_width(7);
+  poolConf->set_pooled_height(7);
+  poolConf->set_spatial_scale(1. / 16);
+  poolConf->set_width(14);
+  poolConf->set_height(14);
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+  }
+}
+
 TEST(Layer, SwitchOrderLayer) {
   TestConfig config;
   // config input_0
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index a0e039c2a33b586e21775ad06c1278a10804d654..a859e34c8996d81f14bf1edcb6e23d5a4f687e6b 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -297,7 +297,7 @@ static void getAddtoConfig(TestConfig& cfg,
 }
 
 void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
-  CHECK_GE(nInputs, 1);
+  CHECK_GE(nInputs, 1UL);
   TestConfig dnnConfig;
   getAddtoConfig(dnnConfig, pm, nInputs);
   dnnConfig.layerConfig.set_type("mkldnn_addto");
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 21a8f73c3e650d4b3c3b86247594cd965f4ead35..a710479bab82ed52122cf59bb14a05ccbd4aa05c 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -152,12 +152,7 @@ void MKLDNNMatrix::downSpatial() {
   }
   memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
   memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
-  mkldnn_primitive_t result;
-  mkldnn::error::wrap_c_api(
-      mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
-      "could not create a memory primitive");
-  reset(result);
-  set_data_handle(data_);
+  resetMKLDNNMemory(pd, data_);
 }
 
 }  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index 54cfefe23b3dc70fd12fd2ca8886c941047b59f7..39d40a1f61609a649d3341c170d24b0604921ac2 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -145,6 +145,27 @@ public:
     m_.reset();
   }
 
+  /**
+   * Override CpuMatrix::resize: resize the backing m_, then rebind this matrix
+   * and its MKLDNN memory (2-D, format nc) to m_'s current buffer. Skipped
+   * only when buffer, height and width are all unchanged (2x6 -> 3x4 is not).
+   */
+  void resize(size_t newHeight, size_t newWidth) override {
+    m_->resize(newHeight, newWidth);
+    real* newData = m_->getData();  // m_ may have reallocated its buffer
+    if (data_ == newData && height_ == newHeight && width_ == newWidth) {
+      return;  // same buffer and same shape: nothing to rebuild
+    }
+    CpuMatrix::setData(newData);
+    height_ = newHeight;
+    width_ = newWidth;
+    elementCnt_ = newHeight * newWidth;
+    stride_ = width_;
+    auto md = mkldnn::memory::desc(
+        {(int)newHeight, (int)newWidth}, getDtype(), mkldnn::memory::format::nc);
+    resetMKLDNNMemory(mkldnn::memory::primitive_desc(md, getEngine()), newData);
+  }
+
   /**
    * override Matrix::getData
    * check data before return
@@ -215,6 +236,17 @@ protected:
                    memory::format srcFmt,
                    memory::format dstFmt,
                    memory::dims dm);
+  /**
+   * Recreate the memory primitive from primitive desc pd, then attach data.
+   */
+  void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
+    mkldnn_primitive_t result;
+    mkldnn::error::wrap_c_api(
+        mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+        "could not create a memory primitive");
+    reset(result);
+    set_data_handle(data);
+  }
 
 private:
   // save the CpuMatrixPtr in case the buffer released outside
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 29ce44c23308cb5ae1c1df5c9be1412c28abe96f..709f7de2e43093114d096cbfca5b5d49293a6d3e 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -214,6 +214,7 @@ set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
+cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
 cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
         rnn/recurrent_op_utils.cc
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
index 1968b53d19acfddaa514eca6e24d98a298d8d311..28dbc77f64842a62e88ae8df4ead7adc3b03764b 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 #include <algorithm>
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 
 namespace paddle {
@@ -22,18 +21,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-
 template <typename Place, typename T>
 class AccuracyKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..609e915b932e2bc4d5abee1e5f868cc07a7619d3
--- /dev/null
+++ b/paddle/operators/assign_op.cc
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/var_type.h"
+
+namespace paddle {
+namespace operators {
+// Visitor passed to framework::VisitVarType: copies the visited value to out.
+class AssignFunctor {
+ public:
+  AssignFunctor(framework::Variable *out,
+                const platform::DeviceContext &dev_ctx)
+      : out_(out), dev_ctx_(dev_ctx) {}
+
+  void operator()(const framework::LoDTensor &lod_tensor) const {
+    copy_tensor(lod_tensor, out_->GetMutable<framework::LoDTensor>());
+  }
+
+  // LoDTensorArray: resize the output array, then copy element by element.
+  void operator()(const framework::LoDTensorArray &array) const {
+    auto *dst = out_->GetMutable<framework::LoDTensorArray>();
+    dst->resize(array.size());
+    for (size_t i = 0; i < array.size(); ++i) {
+      copy_tensor(array[i], &(*dst)[i]);
+    }
+  }
+
+  // SelectedRows: copy the rows, the height and the value tensor.
+  void operator()(const framework::SelectedRows &rows) const {
+    auto *dst = out_->GetMutable<framework::SelectedRows>();
+    dst->set_rows(rows.rows());
+    dst->set_height(rows.height());
+    const auto &value = rows.value();
+    dst->mutable_value()->CopyFrom(value, value.place(), dev_ctx_);
+  }
+
+  template <typename T>
+  void operator()(const T &v) const {  // every other variable type
+    PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
+  }
+
+ private:
+  void copy_tensor(const framework::LoDTensor &lod_tensor,
+                   framework::LoDTensor *out) const {
+    out->CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_);
+    out->set_lod(lod_tensor.lod());
+  }
+
+  framework::Variable *out_;
+  const platform::DeviceContext &dev_ctx_;
+};
+
+class AssignOp : public framework::OperatorBase {
+ public:
+  AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
+           const framework::VariableNameMap &outputs,
+           const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  // Copy the variable named by Input(X) into the one named by Output(Out).
+  // A missing X turns the op into a no-op; a missing Out is then an error.
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto *x = scope.FindVar(Input("X"));
+    if (!x) return;
+    auto *out = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(
+        out != nullptr,
+        "The Output(Out) should not be null if the Input(X) is set.");
+    framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
+  }
+};
+
+class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:  // declares the input, the output and the doc text of the assign op
+  AssignOpProtoMaker(framework::OpProto *proto,
+                     framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
+             "could be LoDTensor, SelectedRows or LoDTensorArray.")
+        .AsDispensable();  // AssignOp::Run returns early when X is not found
+    AddOutput("Out",
+              "(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
+              "is the same as input X.");
+    AddComment(R"DOC(Assign Operator
+
+Out = X,  when type in [LoDTensor/SelectedRows/LoDTensorArray]
+raise error if the type is not listed above.
+)DOC");
+  }
+};
+
+class AssignInferShape : public framework::InferShapeBase {
+ public:
+  // Out takes the dims of X when X is a LOD_TENSOR or SELECTED_ROWS; for a
+  // missing X or any other variable type nothing is set.
+  void operator()(framework::InferShapeContext *context) const override {
+    if (!context->HasInput("X")) return;
+    const auto type = context->GetInputsVarType("X")[0];
+    const bool has_dims = type == framework::VarDesc_VarType_LOD_TENSOR ||
+                          type == framework::VarDesc_VarType_SELECTED_ROWS;
+    if (has_dims) context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+};
+
+class AssignGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:  // the grad op is another assign: grad of X = assign(grad of Out)
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> op(new framework::OpDescBind());
+    op->SetType("assign");
+    op->SetInput("X", OutputGrad("Out"));
+    op->SetOutput("Out", InputGrad("X"));
+    return op;  // owned from creation, so a throwing setter cannot leak it
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker,
+                  ops::AssignInferShape, ops::AssignOpProtoMaker);
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 8721ca352848fc4d69b206d4ea0ab7c581c8d055..f884e6efa917ce3f8554dce0e248f2b29273e3f3 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -19,9 +19,6 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
 using EigenArrayMap =
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3904a97d58166cfeeb2be7d2144700dbd8bc5721
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BeamSearchDecodeOp : public framework::OperatorBase {
+ public:
+  BeamSearchDecodeOp(const std::string& type,
+                     const framework::VariableNameMap& inputs,
+                     const framework::VariableNameMap& outputs,
+                     const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  // Validate the per-step Ids array, then pack Ids/Scores into both outputs.
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    framework::ExecutionContext ctx(*this, scope, dev_ctx);
+
+    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
+    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
+    const size_t step_num = ids->size();
+    PADDLE_ENFORCE_GT(step_num, 0UL,
+                      "beam search steps should be larger than 0");
+    // Check the LoD depth first so the source-level read below is in range.
+    for (size_t i = 0; i < step_num; ++i) {
+      PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL,
+                        "Level of LodTensor should be 2");
+    }
+    const size_t source_num = ids->at(0).lod().at(0).size() - 1;
+    PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0");
+
+    // prepare output
+    LoDTensor* sentenceIds = ctx.Output<LoDTensor>("SentenceIds");
+    LoDTensor* sentenceScores = ctx.Output<LoDTensor>("SentenceScores");
+
+    BeamSearchDecoder<float> decoder;
+    decoder.PackAllSteps(*ids, *scores, sentenceIds, sentenceScores);
+  }
+};
+
+// Declares the inputs, outputs and doc text of the beam_search_decode op.
+class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BeamSearchDecodeOpProtoMaker(framework::OpProto* proto,
+                               framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Ids carries the word ids and Scores the matching scores of every step.
+    AddInput("Ids", "(LodTensorArray)ids of the candidate words in each step");
+    AddInput("Scores",
+             "(LodTensorArray)"
+             "score of the candidate words in each step");
+    AddOutput("SentenceIds",
+              "(LodTensor)"
+              "All possible result sentences of word ids");
+    AddOutput("SentenceScores",
+              "(LodTensor)"
+              "All possible result sentences of word scores");
+    AddComment(R"DOC(
+Pack the result of Beam search op into SentenceIds and SentenceScores.
+)DOC");
+  }
+};
+
+class BeamSearchDecodeInferShape : public framework::InferShapeBase {
+ public:  // only enforces that both inputs and both outputs are present
+  void operator()(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("Ids"),
+                   "BeamSearchDecodeOp must has input Ids");
+    PADDLE_ENFORCE(context->HasInput("Scores"),
+                   "BeamSearchDecodeOp must has input Scores");
+    PADDLE_ENFORCE(context->HasOutput("SentenceIds"),
+                   "BeamSearchDecodeOp must has output SentenceIds");
+    PADDLE_ENFORCE(context->HasOutput("SentenceScores"),
+                   "BeamSearchDecodeOp must has output SentenceScores");
+  }
+};
+
+class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
+ public:
+  // Mark every variable bound to either output slot as a LOD_TENSOR.
+  void operator()(const framework::OpDescBind& op_desc,
+                  framework::BlockDescBind* block) const override {
+    for (const char* slot : {"SentenceIds", "SentenceScores"}) {
+      for (auto& var_name : op_desc.Output(slot)) {
+        block->Var(var_name)->SetType(framework::VarDesc::LOD_TENSOR);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp,
+                  paddle::operators::BeamSearchDecodeOpProtoMaker,
+                  paddle::operators::BeamSearchDecodeInferShape,
+                  paddle::operators::BeamSearchDecodeInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/operators/beam_search_decode_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f007ec22f9a66572971516a711317f348e1ec5a
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.h
@@ -0,0 +1,280 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using LoDTensorArray = framework::LoDTensorArray;
+
+// all the lod have 2 levels.
+// The First is source level, the second is sentence level.
+// source level describe how many candidate words for this source.
+// sentence level describe these candidates belong to which prefix
+const size_t kSourceLevel = 0;
+const size_t kSentenceLevel = 1;
+
+template <typename T>
+struct BeamNode {  // a (word id, score) node linked to its parent and kids
+  BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {}
+
+  ~BeamNode() {
+    if (parent_) {
+      parent_->DropKid(this);  // unlink this node from its parent
+      if (parent_->kids_.size() == 0UL) {
+        delete parent_;  // no kid left: this node frees the parent as well
+      }
+    }
+    VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_;
+  }
+
+  void AppendTo(BeamNode* parent) {  // link this node under parent
+    parent_ = parent;
+    parent->kids_.insert(this);
+  }
+
+  void DropKid(BeamNode* kid) { kids_.erase(kid); }
+
+  BeamNode* parent_ = nullptr;  // raw pointer; deleted by the last kid's dtor
+  std::unordered_set<BeamNode*> kids_;  // raw pointers; erased via DropKid
+  int64_t word_id_;
+  T score_;
+};
+
+template <typename T>
+using BeamNodeVector = std::vector<std::unique_ptr<BeamNode<T>>>;
+
+template <typename T>
+struct Sentence {
+  std::vector<int64_t> word_ids;
+  std::vector<T> scores;
+};
+
+template <typename T>
+using SentenceVector = std::vector<Sentence<T>>;
+
+template <typename T>
+struct BeamSearchDecoder {
+  /**
+   * Make a BeamNode and all of its prefix BeamNodes into a Sentence.
+   */
+  Sentence<T> MakeSentence(const BeamNode<T>* node) const;
+
+  /**
+   * Param:
+   *  cur_ids: LoDTensor of one step for word ids
+   *  cur_scores: LoDTensor of one step for word scores
+   *  prefixes_list: prefixes for each source sentence.
+   *  sentence_vector_list: result sentence_vector for each source sentence.
+   * Return:
+   *  a new prefixes list for each source of the current step
+   */
+  std::vector<BeamNodeVector<T>> PackTwoSteps(
+      const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+      std::vector<BeamNodeVector<T>>& prefixes_list,
+      std::vector<SentenceVector<T>>* sentence_vector_list) const;
+
+  /**
+   * Convert the result sentence_vector for each source sentence into two
+   * LoDTensors.
+   * One holds all candidate sentences as word ids, the other holds all
+   * candidate sentences as word scores.
+   * Param:
+   *  sentence_vector_list: sentence_vector for each source sentence.
+   *  id_tensor: result LoDTensor for sentences of id.
+   *  score_tensor: result LoDTensor for sentences of score.
+   */
+  void ConvertSentenceVectorToLodTensor(
+      std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
+      LoDTensor* score_tensor) const;
+
+  /**
+   * Pack all steps of id/score LoDTensors into the sentence LoDTensors.
+   * Its main logic is:
+   * ```python
+   *   prefix
+   *   result_sentence
+   *   result_lod_tensor
+   *
+   *   for (step in steps):
+   *     prefix = PackTwoSteps(prefix, step, &result_sentence)
+   *   ConvertSentenceVectorToLodTensor(result_sentence, &result_lod_tensor)
+   * ```
+   */
+  void PackAllSteps(const LoDTensorArray& step_ids,
+                    const LoDTensorArray& step_scores, LoDTensor* id_tensor,
+                    LoDTensor* score_tensor) const;
+};
+
+// Collect the ids and scores on the path from node up to the root, returned
+// in root-to-node order.
+template <typename T>
+Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
+  Sentence<T> result;
+  for (const BeamNode<T>* cur = node; cur != nullptr; cur = cur->parent_) {
+    result.word_ids.push_back(cur->word_id_);
+    result.scores.push_back(cur->score_);
+  }
+  // The walk above is leaf-first, so flip both sequences.
+  std::reverse(result.word_ids.begin(), result.word_ids.end());
+  std::reverse(result.scores.begin(), result.scores.end());
+  return result;
+}
+
+template <typename T>
+std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
+    const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+    std::vector<BeamNodeVector<T>>& prefixes_list,
+    std::vector<SentenceVector<T>>* sentence_vector_list) const {
+  std::vector<BeamNodeVector<T>> result;
+  // One BeamNodeVector is produced per source, i.e. per source-level LoD span.
+  for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1;
+       ++src_idx) {
+    size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx];
+    size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
+
+    BeamNodeVector<T> beam_nodes;
+
+    // An empty prefixes_list means this is the first step: every candidate id
+    // in [src_start, src_end) becomes the start of a candidate sentence.
+    if (prefixes_list.empty()) {
+      PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
+                        cur_ids.lod().at(kSentenceLevel).back(),
+                        "in the first step");
+      for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) {
+        beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(new BeamNode<T>(
+            cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
+      }
+    } else {
+      BeamNodeVector<T>& prefixes = prefixes_list[src_idx];
+      SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
+      // Each prefix of this source owns one span of the sentence-level LoD.
+      PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
+                        "prefix and candidate set number should be the same");
+
+      auto candidate_offset = cur_ids.lod()[kSentenceLevel];
+      for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) {
+        std::unique_ptr<BeamNode<T>>& prefix = prefixes[prefix_idx];
+        size_t candidate_start = candidate_offset[src_start + prefix_idx];
+        size_t candidate_end = candidate_offset[src_start + prefix_idx + 1];
+        if (candidate_start == candidate_end) {
+          VLOG(3) << "this sentence has no more candidate, "
+                     "add to result sentence and rm it from beam tree";
+          sentence_vector.push_back(MakeSentence(prefix.get()));
+          prefix.reset();  // the finished prefix is destroyed here
+        } else {
+          for (size_t candidate_idx = candidate_start;
+               candidate_idx < candidate_end; ++candidate_idx) {
+            auto* candidate =
+                new BeamNode<T>(cur_ids.data<int64_t>()[candidate_idx],
+                                cur_scores.data<T>()[candidate_idx]);
+            candidate->AppendTo(prefix.get());
+            beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(candidate));
+          }
+          prefix.release();  // kept alive by its kids now (see ~BeamNode)
+        }
+      }
+    }
+    result.push_back(std::move(beam_nodes));
+  }
+  return result;
+}
+
+// Flatten the per-source sentence lists into id_tensor and score_tensor. Both
+// get the same 2-level LoD: sentences per source, then words per sentence.
+template <typename T>
+void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
+    std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
+    LoDTensor* score_tensor) const {
+  size_t src_num = sentence_vector_list.size();
+  PADDLE_ENFORCE_NE(src_num, 0UL, "src_num should not be 0");
+  std::vector<size_t> source_level_lod = {0};
+  std::vector<size_t> sentence_level_lod = {0};
+  std::vector<int64_t> id_data;
+  std::vector<T> score_data;
+
+  for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+    for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
+      id_data.insert(id_data.end(), sentence.word_ids.begin(),
+                     sentence.word_ids.end());
+      score_data.insert(score_data.end(), sentence.scores.begin(),
+                        sentence.scores.end());
+      sentence_level_lod.push_back(sentence_level_lod.back() +
+                                   sentence.word_ids.size());
+    }
+    source_level_lod.push_back(source_level_lod.back() +
+                               sentence_vector_list[src_idx].size());
+  }
+
+  // Plain locals: nothing here needs a heap-allocated place.
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CPUDeviceContext cpu_ctx(cpu_place);
+  framework::LoD lod;
+  lod.push_back(source_level_lod);
+  lod.push_back(sentence_level_lod);
+
+  id_tensor->set_lod(lod);
+  id_tensor->Resize({static_cast<int64_t>(id_data.size())});
+  id_tensor->mutable_data<int64_t>(cpu_place);
+  id_tensor->CopyFromVector<int64_t>(id_data, cpu_ctx);
+
+  score_tensor->set_lod(lod);
+  score_tensor->Resize({static_cast<int64_t>(score_data.size())});
+  score_tensor->mutable_data<T>(cpu_place);
+  score_tensor->CopyFromVector<T>(score_data, cpu_ctx);
+}
+
+// Run PackTwoSteps over every step, turn the beam nodes that are left after
+// the last step into sentences, and convert all collected sentences into
+// id_tensor and score_tensor.
+template <typename T>
+void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
+                                        const LoDTensorArray& step_scores,
+                                        LoDTensor* id_tensor,
+                                        LoDTensor* score_tensor) const {
+  PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
+  PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
+                    "step_ids and step_scores should be the same");
+  const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
+  PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0");
+
+  // Prefixes carried between steps; empty marks the first step.
+  std::vector<BeamNodeVector<T>> prefixes;
+  std::vector<SentenceVector<T>> sentences(src_num);
+
+  // Each step consumes the previous prefixes and yields the next ones.
+  for (size_t step = 0; step != step_ids.size(); ++step) {
+    prefixes = PackTwoSteps(step_ids.at(step), step_scores.at(step), prefixes,
+                            &sentences);
+  }
+
+  // Whatever is still in the beam after the last step ends a sentence.
+  for (size_t src = 0; src < src_num; ++src) {
+    SentenceVector<T>& finished = sentences[src];
+    for (auto& leaf : prefixes.at(src)) {
+      finished.push_back(MakeSentence(leaf.get()));
+      leaf.reset();
+    }
+  }
+
+  ConvertSentenceVectorToLodTensor(sentences, id_tensor, score_tensor);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/beam_search_decode_op_test.cc b/paddle/operators/beam_search_decode_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ac23991f3c7768abaf94f3a4b750697de0ef114
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op_test.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+#include "gtest/gtest.h"
+
+using CPUPlace = paddle::platform::CPUPlace;
+using LoD = paddle::framework::LoD;
+using LoDTensor = paddle::framework::LoDTensor;
+using LoDTensorArray = paddle::framework::LoDTensorArray;
+
+template <typename T>
+using BeamNode = paddle::operators::BeamNode<T>;
+template <typename T>
+using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
+template <typename T>
+using Sentence = paddle::operators::Sentence<T>;
+template <typename T>
+using BeamNodeVector = paddle::operators::BeamNodeVector<T>;
+template <typename T>
+using SentenceVector = paddle::operators::SentenceVector<T>;
+
+namespace paddle {
+namespace test {
+
+void GenerateExample(const std::vector<size_t>& level_0,
+                     const std::vector<size_t>& level_1,
+                     const std::vector<int>& data, LoDTensorArray* ids,
+                     LoDTensorArray* scores) {
+  // Appends one decoding step to `ids` and `scores`. Both new tensors carry
+  // the two-level LoD {level_0, level_1}; `data` is stored as int64_t ids and,
+  // with the same values, as float scores.
+  PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1,
+                    "source level is used to describe candidate set");
+  PADDLE_ENFORCE_EQ(level_1.back(), data.size(),
+                    "the lowest level is used to describe data"
+                    ", so it's last element should be data length");
+
+  const size_t num_items = data.size();
+  CPUPlace cpu_place;
+  LoD step_lod;
+  step_lod.push_back(level_0);
+  step_lod.push_back(level_1);
+
+  // Shape and allocate the id tensor.
+  LoDTensor id_step;
+  id_step.set_lod(step_lod);
+  id_step.Resize({static_cast<int64_t>(num_items)});
+  int64_t* id_buf = id_step.mutable_data<int64_t>(cpu_place);
+
+  // Shape and allocate the score tensor.
+  LoDTensor score_step;
+  score_step.set_lod(step_lod);
+  score_step.Resize({static_cast<int64_t>(num_items)});
+  float* score_buf = score_step.mutable_data<float>(cpu_place);
+
+  // Fill both buffers in a single pass over the data.
+  for (size_t pos = 0; pos < num_items; ++pos) {
+    id_buf[pos] = static_cast<int64_t>(data.at(pos));
+    score_buf[pos] = static_cast<float>(data.at(pos));
+  }
+  ids->push_back(id_step);
+  scores->push_back(score_step);
+}
+
+}  // namespace test
+}  // namespace paddle
+
+TEST(BeamSearchDecodeOp, DeleteBeamNode) {
+  auto* root = new BeamNode<float>(0, 0);
+  auto* b1 = new BeamNode<float>(1, 1);
+  auto* b2 = new BeamNode<float>(2, 2);
+  auto* b3 = new BeamNode<float>(3, 3);
+  // Chain the nodes (presumably parent links): root -> {b1 -> b3, b2}.
+  b1->AppendTo(root);
+  b2->AppendTo(root);
+  b3->AppendTo(b1);
+  // Only b3 and b2 are deleted; root and b1 are never deleted explicitly.
+  delete b3;  // NOTE(review): presumably ~BeamNode frees childless parents
+  delete b2;  // -- confirm in beam_search_decode_op.h. No assertions here.
+}
+
+TEST(BeamSearchDecodeOp, MakeSentence) {
+  auto* root = new BeamNode<float>(0, 0);
+  auto* b1 = new BeamNode<float>(1, 1);
+  auto* end = new BeamNode<float>(2, 2);
+  b1->AppendTo(root);
+  end->AppendTo(b1);
+  // Build the sentence starting from the last node of the chain.
+  BeamSearchDecoder<float> helper;
+  Sentence<float> sentence = helper.MakeSentence(end);
+  delete end;  // sentence is held by value and is still checked below
+  // Ids and scores are expected in chain order: root, b1, end.
+  std::vector<int64_t> expect_ids = {0, 1, 2};
+  ASSERT_EQ(sentence.word_ids, expect_ids);
+
+  std::vector<float> expect_scores = {0, 1, 2};
+  ASSERT_EQ(sentence.scores, expect_scores);
+}
+
+TEST(BeamSearchDecodeOp, PackTwoStepsFistStep) {
+  // First step: starts without prefixes; expects 2 and 4 beam nodes back.
+  CPUPlace place;
+  const std::vector<size_t> source_level{0, 2, 6};
+  const std::vector<size_t> sentence_level{0, 1, 2, 3, 4, 5, 6};
+  const std::vector<int> step_data{1, 2, 3, 4, 5, 6};
+  LoDTensorArray step_ids;
+  LoDTensorArray step_scores;
+  paddle::test::GenerateExample(source_level, sentence_level, step_data,
+                                &step_ids, &step_scores);
+
+  std::vector<BeamNodeVector<float>> prefixes;
+  std::vector<SentenceVector<float>> finished(2, SentenceVector<float>());
+
+  BeamSearchDecoder<float> decoder;
+  prefixes =
+      decoder.PackTwoSteps(step_ids[0], step_scores[0], prefixes, &finished);
+  ASSERT_EQ(prefixes.size(), 2UL);
+  ASSERT_EQ(prefixes[0].size(), 2UL);
+  ASSERT_EQ(prefixes[1].size(), 4UL);
+}
+
+TEST(BeamSearchDecodeOp, PackTwoSteps) {
+  // Later step: existing prefixes receive one step of candidates. The checks
+  // expect one finished sentence for source 0 and 3 / 2 beam nodes afterwards.
+  CPUPlace place;
+  // Wraps a freshly allocated node so it can be moved into a BeamNodeVector.
+  auto make_node = [](int id, int score) {
+    return std::unique_ptr<BeamNode<float>>(new BeamNode<float>(id, score));
+  };
+
+  // prefixes of the first source: three nodes
+  BeamNodeVector<float> first_source;
+  first_source.push_back(make_node(1, 1));
+  first_source.push_back(make_node(0, 0));
+  first_source.push_back(make_node(3, 3));
+
+  // prefixes of the second source: two nodes
+  BeamNodeVector<float> second_source;
+  second_source.push_back(make_node(4, 4));
+  second_source.push_back(make_node(5, 5));
+
+  std::vector<BeamNodeVector<float>> prefixes;
+  prefixes.push_back(std::move(first_source));
+  prefixes.push_back(std::move(second_source));
+  std::vector<SentenceVector<float>> finished(2, SentenceVector<float>());
+
+  // one step of candidates for the prefixes above
+  const std::vector<size_t> source_level{0, 3, 5};
+  const std::vector<size_t> sentence_level{0, 1, 1, 3, 4, 5};
+  const std::vector<int> step_data{0, 1, 2, 3, 4};
+  LoDTensorArray step_ids;
+  LoDTensorArray step_scores;
+  paddle::test::GenerateExample(source_level, sentence_level, step_data,
+                                &step_ids, &step_scores);
+
+  BeamSearchDecoder<float> decoder;
+  prefixes =
+      decoder.PackTwoSteps(step_ids[0], step_scores[0], prefixes, &finished);
+
+  ASSERT_EQ(finished[0].size(), 1UL);
+  ASSERT_EQ(finished[1].size(), 0UL);
+  ASSERT_EQ(prefixes[0].size(), 3UL);
+  ASSERT_EQ(prefixes[1].size(), 2UL);
+}
+
+TEST(BeamSearchDecodeOp, PackAllSteps) {
+  CPUPlace place;
+
+  // we will construct a sample data with 3 steps and 2 source sentences
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+  // each GenerateExample call appends one step to both ids and scores
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
+      std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 1, 3, 5, 5, 6},
+      std::vector<int>{0, 1, 2, 3, 4, 5}, &ids, &scores);
+  paddle::test::GenerateExample(std::vector<size_t>{0, 3, 6},
+                                std::vector<size_t>{0, 0, 1, 2, 3, 4, 5},
+                                std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
+
+  ASSERT_EQ(ids.size(), 3UL);
+  ASSERT_EQ(scores.size(), 3UL);
+
+  BeamSearchDecoder<float> helper;
+  // pack the three steps into one id tensor and one score tensor
+  LoDTensor id_tensor;
+  LoDTensor score_tensor;
+  helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor);
+  // LoD level 0 is checked as the source level, level 1 as the sentence level
+  LoD lod = id_tensor.lod();
+  std::vector<size_t> expect_source_lod = {0, 4, 8};
+  EXPECT_EQ(lod[0], expect_source_lod);
+  std::vector<size_t> expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19};
+  EXPECT_EQ(lod[1], expect_sentence_lod);
+  // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4
+  std::vector<int> expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5,
+                                  4, 3, 2, 4, 4, 3, 6, 5, 4};
+  ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size()));
+  for (size_t i = 0; i < expect_data.size(); ++i) {
+    ASSERT_EQ(id_tensor.data<int64_t>()[i],
+              static_cast<int64_t>(expect_data[i]));
+  }  // input scores were generated equal to the ids, so they must match below
+  for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) {
+    ASSERT_EQ(score_tensor.data<float>()[i],
+              static_cast<float>(id_tensor.data<int64_t>()[i]));
+  }
+}
diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c65ba7eb262f3aabe2c00837b79806c0b40b60fd
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class BilinearTensorProductOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Forward op: InferShape validates X, Y, Weight, optional Bias; shapes Out.
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    // X is [batch, x_dim], Y is [batch, y_dim], Weight is [out, x_dim, y_dim].
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL,
+                      "The input(Weight) must be a 3D tensor.");
+    PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
+                      "The first dimension(batch_size) of input(X) must be "
+                      "equal to the first dimension of the input(Y).");
+    PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1],
+                      "The second dimension of input(X) must be equal to "
+                      "the second dimension of the input(Weight).");
+    PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2],
+                      "The second dimension of input(Y) must be equal to "
+                      "the third dimension of the input(Weight).");
+    // Bias is optional; when present it must be a [1, out] row vector.
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL,
+                     "The Input(Bias) must be a 2-D tensor with "
+                     "the 1st dimension fixed to 1 (a row vector).");
+      PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0],
+                        "The second dimension of input(Bias) must be equal "
+                        "to the first dimension of the input(Weight).");
+    }
+    // Out is [batch, out] and shares X's LoD.
+    ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:  // declares the op's inputs, output and user-facing description
+  BilinearTensorProductOpMaker(framework::OpProto* proto,
+                               framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of bilinear_tensor_product operator.");
+    AddInput("Y", "The second input of bilinear_tensor_product operator.");
+    AddInput("Weight",
+             "The learnable parameters of bilinear_tensor_product operator.");
+    AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.")
+        .AsDispensable();
+    AddOutput("Out", "The output of bilinear_tensor_product operator.");
+    AddComment(R"DOC(
+Bilinear Tensor Product operator.
+Given input X and Y, a 3D tensor weight, and bias. Each column of the
+output is computed by one slice i = 1, . . . , k of the tensor:
+
+    M =  (X W_i) \cdot Y
+    Out_i = \sum_j {M_j} + Bias_i
+
+)DOC");
+  }
+};
+
+class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Backward op: checks Out@GRAD and shapes each gradient like its input.
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    // Out@GRAD must be 2-D with shape [x_dims[0], weight_dims[0]].
+    PADDLE_ENFORCE_EQ(out_dims.size(), 2UL,
+                      "The input(Out@GRAD) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[0], out_dims[0],
+        "The first dimension(batch_size) of input(Out@GRAD) must be "
+        "equal to the first dimension of the Input(X).");
+    PADDLE_ENFORCE_EQ(
+        weight_dims[0], out_dims[1],
+        "The second dimension of input(Out@GRAD) must be equal to "
+        "the first dimension of the Input(Weight).");
+    // Bias is optional; its gradient, if requested, takes Bias's shape.
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      PADDLE_ENFORCE_EQ(
+          bias_dims[1], out_dims[1],
+          "The second dimension of input(Out@GRAD) must be equal to "
+          "the second dimension of the Input(Bias).");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name)) {
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+      }
+    }
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    auto weight_grad_name = framework::GradVarName("Weight");
+    // Gradients are optional outputs; only the requested ones get a shape.
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+    if (ctx->HasOutput(weight_grad_name)) {
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
+            ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
+            ops::BilinearTensorProductOpGrad);  // forward op and its grad op
+REGISTER_OP_CPU_KERNEL(  // CPU forward kernels for float and double
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(  // CPU backward kernels for float and double
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..858d2668d01379afe8082cd1eda32a2a5d09bd18
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(  // GPU forward kernels for float and double
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(  // GPU backward kernels for float and double
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffa4f43a327418498c1f110504127e7d2878409d
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.h
@@ -0,0 +1,184 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class BilinearTensorProductKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    // Forward pass: column i of Out is the row-wise sum of (X Weight_i) * Y.
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto output_mat = EigenMatrix<T>::From(*out);
+    // Sizes are read from Weight, indexed here as [out_dim, x_dim, y_dim].
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // Create the intermediate variable to calculate the result of
+    // Input(X) multiplied by Input(Weight_i), the formula is:
+    // left_mul = X Weight_i.
+    Tensor left_mul;
+    left_mul.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                             ctx.GetPlace());
+    auto left_mul_mat = EigenMatrix<T>::From(left_mul);
+    // One output column per Weight slice; left_mul is reused for every slice.
+    for (int i = 0; i < out_dim; ++i) {
+      auto output_col_vec = output_mat.chip(i, 1);
+      Tensor weight_mat =
+          weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
+      math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+                           batch_size, y_dim, x_dim, 1, x->data<T>(),
+                           weight_mat.data<T>(), 0, left_mul.data<T>());
+      output_col_vec.device(place) =
+          (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
+    }
+    if (bias) {  // optional input: broadcast over the batch and add to Out
+      auto bias_vec = EigenMatrix<T>::From(*bias);
+      Eigen::DSizes<int, 2> bcast(batch_size, 1);
+      output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
+    }
+  }
+};
+
+template <typename Place, typename T>
+class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor* weight = ctx.Input<Tensor>("Weight");
+    Tensor* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* d_y = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    Tensor* d_weight = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    Tensor* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    const Tensor* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    // Backward pass; each gradient is computed only if its output is non-null.
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+
+    auto x_mat = EigenMatrix<T>::From(*x);
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto d_out_mat = EigenMatrix<T>::From(*d_out);
+    auto place = ctx.GetEigenDevice<Place>();
+    // x_scale holds X scaled row-wise by one column of Out@Grad.
+    // Create the intermediate variable to calculate the Output(Y@Grad).
+    Tensor x_scale;
+    x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
+                            ctx.GetPlace());
+    auto x_scale_mat = EigenMatrix<T>::From(x_scale);
+    // y_scale holds Y scaled row-wise by one column of Out@Grad.
+    // Create the intermediate variable to calculate the Output(X@Grad).
+    Tensor y_scale;
+    y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                            ctx.GetPlace());
+    auto y_scale_mat = EigenMatrix<T>::From(y_scale);
+
+    math::SetConstant<Place, T> set_zero;
+    // NOTE(review): the gemm calls below presumably accumulate into d_x/d_y.
+    // Set Output(X@Grad) be zero.
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      set_zero(ctx.device_context(), d_x, static_cast<T>(0));
+    }
+
+    // Set Output(Y@Grad) be zero.
+    if (d_y) {
+      d_y->mutable_data<T>(ctx.GetPlace());
+      set_zero(ctx.device_context(), d_y, static_cast<T>(0));
+    }
+
+    // Calculate the Output(X@Grad) and Output(Y@Grad).
+    if (d_x || d_y) {
+      Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
+      Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        Tensor weight_i = weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        auto output_vec = d_out_mat.chip(i, 1);
+        if (d_x) {
+          y_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_x) *
+              y_mat;
+          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasTrans,
+                               batch_size, x_dim, y_dim, 1, y_scale.data<T>(),
+                               weight_i.data<T>(), 1, d_x->data<T>());
+        }
+        if (d_y) {
+          x_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_y) *
+              x_mat;
+          math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
+                               batch_size, y_dim, x_dim, 1, x_scale.data<T>(),
+                               weight_i.data<T>(), 1, d_y->data<T>());
+        }
+      }
+    }
+
+    // Calculate the gradient of Input(Weight); x_scale is reused as scratch.
+    if (d_weight) {
+      d_weight->mutable_data<T>(ctx.GetPlace());
+      Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        auto output_vec = d_out_mat.chip(i, 1);
+        x_scale_mat.device(place) =
+            output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                .broadcast(bcast_for_weight) *
+            x_mat;
+        math::gemm<Place, T>(ctx.device_context(), CblasTrans, CblasNoTrans,
+                             x_dim, y_dim, batch_size, 1, x_scale.data<T>(),
+                             y->data<T>(), 0, d_weight_i.data<T>());
+      }
+    }
+
+    // Calculate the gradient of Input(Bias): Out@Grad summed over dimension 0.
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      auto d_bias_mat = EigenMatrix<T>::From(*d_bias);
+      d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..309660b01fe7052de2f9300acdf00779d0228221
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/chunk_eval_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ChunkEvalOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks the inputs/outputs and gives each of the three metrics shape {1}.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Inference"),
+                   "Input(Inference) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input(Label) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Precision"),
+                   "Output(Precision) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Recall"),
+                   "Output(Recall) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
+                   "Output(F1-Score) of ChunkEvalOp should not be null.");
+
+    auto inference_dim = ctx->GetInputDim("Inference");
+    auto label_dim = ctx->GetInputDim("Label");
+    // Predictions and labels are compared as a whole, so shapes must agree.
+    PADDLE_ENFORCE(inference_dim == label_dim,
+                   "Inference's shape must be the same as Label's shape.");
+    // Each metric is a single value.
+    ctx->SetOutputDim("Precision", {1});
+    ctx->SetOutputDim("Recall", {1});
+    ctx->SetOutputDim("F1-Score", {1});
+  }
+  // The kernel type is fixed to FP32; the input tensors are not consulted.
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(framework::DataType::FP32,
+                                   ctx.device_context());
+  }
+};
+
+class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:  // declares inputs, outputs, attributes and the user-facing doc
+  ChunkEvalOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Inference",
+             "(Tensor, default: Tensor<int>). Predictions from the network.");
+    AddInput("Label",
+             "(Tensor, default: Tensor<int>). The true tag sequences.");
+    AddOutput("Precision",
+              "(float). The evaluated precision (called positive predictive "
+              "value) of chunks on the given mini-batch.");
+    AddOutput("Recall",
+              "(float). The evaluated recall (true positive rate or "
+              "sensitivity) of chunks on the given mini-batch.");
+    AddOutput("F1-Score",
+              "(float). The evaluated F1-Score on the given mini-batch.");
+    AddAttr<int>("num_chunk_types",
+                 "(int). The number of chunk type. See below for details.");
+    AddAttr<std::string>(
+        "chunk_scheme",
+        "(string, default IOB). The labeling scheme indicating "
+        "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
+        "for details.")
+        .SetDefault("IOB");
+    AddAttr<std::vector<int>>("excluded_chunk_types",
+                              "(list<int>) A list including chunk type ids "
+                              "indicating chunk types that are not counted. "
+                              "See below for details.")
+        .SetDefault(std::vector<int>{});
+    AddComment(R"DOC(
+For some basics of chunking, please refer to
+'Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>'.
+
+
+ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
+and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
+Here is a NER example of labeling for these tagging schemes:
+
+         Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
+  IO:    I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
+  IOB:   B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
+  IOE:   I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
+  IOBES: B-PER  E-PER   O      O   B-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
+
+There are three chunk types(named entity types) including PER(person), ORG(organization)
+and LOC(LOCATION), and we can see that the labels have the form <tag type>-<chunk type>.
+
+Since the calculations actually use label ids rather than labels, extra attention
+should be paid when mapping labels to ids to make ChunkEvalOp work. The key point
+is that the listed equations are satisfied by ids.
+
+    tag_type = label % num_tag_type
+    chunk_type = label / num_tag_type
+
+where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type`
+is the num of chunk types, and `tag_type` get its value from the following table.
+
+    Scheme Begin Inside End   Single
+     plain   0     -      -     -
+     IOB     0     1      -     -
+     IOE     -     0      1     -
+     IOBES   0     1      2     3
+
+Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG,
+PER and LOC. To satisfy the above equations, the label map can be like this:
+
+    B-ORG  0
+    I-ORG  1
+    B-PER  2
+    I-PER  3
+    B-LOC  4
+    I-LOC  5
+    O      6
+
+It's not hard to verify the equations noting that the num of chunk types
+is 3 and the num of tag types in IOB scheme is 2. For example, the label
+id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of
+I-LOC is 2, which is consistent with the results from the equations.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp,
+                             ops::ChunkEvalOpMaker);
+REGISTER_OP_CPU_KERNEL(chunk_eval,
+                       ops::ChunkEvalKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..81aa07817b673b2ff85a35a51cc43742b7ad7fed
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.h
@@ -0,0 +1,219 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <set>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename Place, typename T>
+class ChunkEvalKernel : public framework::OpKernel<T> {
+ public:
+  struct Segment {
+    int begin;
+    int end;
+    int type;
+    bool operator==(const Segment& y) const {
+      return begin == y.begin && end == y.end && type == y.type;
+    }
+  };
+
+  // Splits the `length` label ids at `label` into chunks. Each chunk is
+  // appended to `segments` (cleared first) as an inclusive [begin, end] index
+  // range together with its chunk type id.
+  void GetSegments(const int* label, int length, std::vector<Segment>& segments,
+                   int num_chunk_types, int num_tag_types, int other_chunk_type,
+                   int tag_begin, int tag_inside, int tag_end,
+                   int tag_single) const {
+    segments.clear();
+    segments.reserve(length);
+    int chunk_start = 0;
+    bool in_chunk = false;
+    int tag = -1;
+    int type = other_chunk_type;
+    for (int i = 0; i < length; ++i) {
+      const int prev_tag = tag;
+      const int prev_type = type;
+      // Compute() passes -1 for tags a scheme lacks; a negative id would give
+      // a negative remainder below that could be mistaken for such a tag.
+      PADDLE_ENFORCE_GE(label[i], 0, "Label id should not be negative.");
+      PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types,
+                        "Label id is out of range.");
+      tag = label[i] % num_tag_types;
+      type = label[i] / num_tag_types;
+      // Close the chunk that was open before position i if it stops here.
+      if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type,
+                               tag_begin, tag_inside, tag_end, tag_single)) {
+        segments.push_back(Segment{chunk_start, i - 1, prev_type});
+        in_chunk = false;
+      }
+      // Position i may open a new chunk, possibly right after closing one.
+      if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
+                     tag_begin, tag_inside, tag_end, tag_single)) {
+        chunk_start = i;
+        in_chunk = true;
+      }
+    }
+    // A chunk still open after the last label runs to the end of the input.
+    if (in_chunk) {
+      segments.push_back(Segment{chunk_start, length - 1, type});
+    }
+  }
+
+  // True when the chunk containing the previous label stops before the
+  // current one. Tags a scheme does not use are passed as -1.
+  bool ChunkEnd(int prev_tag, int prev_type, int tag, int type,
+                int other_chunk_type, int tag_begin, int tag_inside,
+                int tag_end, int tag_single) const {
+    if (prev_type == other_chunk_type) return false;
+    if (type == other_chunk_type || type != prev_type) return true;
+    if (prev_tag == tag_begin || prev_tag == tag_inside) {
+      return tag == tag_begin || tag == tag_single;
+    }
+    return prev_tag == tag_end || prev_tag == tag_single;
+  }
+
+  // True when the current label opens a new chunk, judged from the previous
+  // and current (tag, type) pairs.
+  bool ChunkBegin(int prev_tag, int prev_type, int tag, int type,
+                  int other_chunk_type, int tag_begin, int tag_inside,
+                  int tag_end, int tag_single) const {
+    if (prev_type == other_chunk_type) return type != other_chunk_type;
+    if (type == other_chunk_type) return false;
+    if (type != prev_type || tag == tag_begin) return true;
+    const bool prev_closed = prev_tag == tag_end || prev_tag == tag_single;
+    if (tag == tag_inside || tag == tag_end) return prev_closed;
+    return tag == tag_single;
+  }
+
+  // Reads the tagging scheme from the attributes, splits every sequence of
+  // Inference and Label into chunks, then writes Precision, Recall, F1-Score.
+  void Compute(const framework::ExecutionContext& context) const override {
+    // Tag ids of the selected scheme; -1 marks a tag the scheme does not use.
+    int num_tag_types, tag_begin, tag_inside, tag_end, tag_single;
+    const std::string& scheme = context.Attr<std::string>("chunk_scheme");
+    if (scheme == "IOB") {
+      num_tag_types = 2;
+      tag_begin = 0;
+      tag_inside = 1;
+      tag_end = -1;
+      tag_single = -1;
+    } else if (scheme == "IOE") {
+      num_tag_types = 2;
+      tag_begin = -1;
+      tag_inside = 0;
+      tag_end = 1;
+      tag_single = -1;
+    } else if (scheme == "IOBES") {
+      num_tag_types = 4;
+      tag_begin = 0;
+      tag_inside = 1;
+      tag_end = 2;
+      tag_single = 3;
+    } else if (scheme == "plain") {
+      num_tag_types = 1;
+      tag_begin = -1;
+      tag_inside = -1;
+      tag_end = -1;
+      tag_single = -1;
+    } else {
+      PADDLE_THROW("Unknown chunk scheme.");
+    }
+    // The id following the last real chunk type means "not in any chunk".
+    const int num_chunk_types = context.Attr<int>("num_chunk_types");
+    const int other_chunk_type = num_chunk_types;
+    const auto& excluded =
+        context.Attr<std::vector<int>>("excluded_chunk_types");
+    const std::set<int> excluded_chunk_types(excluded.begin(), excluded.end());
+    auto* inference = context.Input<LoDTensor>("Inference");
+    auto* label = context.Input<LoDTensor>("Label");
+    auto* precision = context.Output<Tensor>("Precision");
+    auto* recall = context.Output<Tensor>("Recall");
+    auto* f1 = context.Output<Tensor>("F1-Score");
+    const int* inference_data = inference->data<int>();
+    const int* label_data = label->data<int>();
+    T* precision_data = precision->mutable_data<T>(context.GetPlace());
+    T* recall_data = recall->mutable_data<T>(context.GetPlace());
+    T* f1_data = f1->mutable_data<T>(context.GetPlace());
+    auto lod = label->lod();
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE(lod == inference->lod(),
+                   "LoD must be same between Inference and Label.");
+    std::vector<Segment> output_segments, label_segments;
+    int64_t num_output_segments = 0, num_label_segments = 0, num_correct = 0;
+    for (size_t seq = 0; seq + 1 < lod[0].size(); ++seq) {
+      const int seq_length = lod[0][seq + 1] - lod[0][seq];
+      EvalOneSeq(inference_data + lod[0][seq], label_data + lod[0][seq],
+                 seq_length, output_segments, label_segments,
+                 num_output_segments, num_label_segments, num_correct,
+                 num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
+                 tag_inside, tag_end, tag_single, excluded_chunk_types);
+    }
+    *precision_data = *recall_data = *f1_data = 0;
+    if (num_output_segments != 0) {
+      *precision_data = static_cast<T>(num_correct) / num_output_segments;
+    }
+    if (num_label_segments != 0) {
+      *recall_data = static_cast<T>(num_correct) / num_label_segments;
+    }
+    if (num_correct != 0) {
+      const T pr_sum = (*precision_data) + (*recall_data);
+      *f1_data = 2 * (*precision_data) * (*recall_data) / pr_sum;
+    }
+  }
+
+  // Chunks one sequence of predictions and labels, then adds the number of
+  // predicted, labelled and exactly matching chunks to the running totals.
+  void EvalOneSeq(const int* output, const int* label, int length,
+                  std::vector<Segment>& output_segments,
+                  std::vector<Segment>& label_segments,
+                  int64_t& num_output_segments, int64_t& num_label_segments,
+                  int64_t& num_correct, int num_chunk_types, int num_tag_types,
+                  int other_chunk_type, int tag_begin, int tag_inside,
+                  int tag_end, int tag_single,
+                  const std::set<int>& excluded_chunk_types) const {
+    GetSegments(output, length, output_segments, num_chunk_types, num_tag_types,
+                other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+    GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
+                other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+    // Both lists are ordered by position, so advance whichever ends first.
+    auto out_it = output_segments.begin();
+    auto lbl_it = label_segments.begin();
+    while (out_it != output_segments.end() && lbl_it != label_segments.end()) {
+      if (*out_it == *lbl_it && excluded_chunk_types.count(out_it->type) == 0) {
+        ++num_correct;
+      }
+      const int out_end = out_it->end;
+      const int lbl_end = lbl_it->end;
+      if (out_end <= lbl_end) ++out_it;
+      if (out_end >= lbl_end) ++lbl_it;
+    }
+    // Chunks of excluded types are left out of both totals.
+    for (const Segment& segment : label_segments) {
+      if (!excluded_chunk_types.count(segment.type)) ++num_label_segments;
+    }
+    for (const Segment& segment : output_segments) {
+      if (!excluded_chunk_types.count(segment.type)) ++num_output_segments;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5b124682d755ffb39f32c9f001a3cf113a01a2c
--- /dev/null
+++ b/paddle/operators/conditional_block_op.cc
@@ -0,0 +1,197 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <algorithm>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Shared base of the conditional block operators: forwards the constructor
+// arguments and looks up the tensors bound to input "X".
+class ConditionalOp : public framework::OperatorBase {
+ public:
+  ConditionalOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  // Returns one LoDTensor pointer per name in input "X", in order; fails if
+  // a name cannot be found in `scope`.
+  std::vector<const framework::LoDTensor *> InputTensors(
+      const framework::Scope &scope) const {
+    std::vector<const framework::LoDTensor *> tensors;
+    for (const std::string &var_name : Inputs("X")) {
+      auto *var = scope.FindVar(var_name);
+      PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name);
+      tensors.push_back(&var->Get<framework::LoDTensor>());
+    }
+    return tensors;
+  }
+};
+
+// Forward operator: executes the sub-block in a fresh child scope when no
+// tensor of input "X" is empty.
+class ConditionalBlockOp : public ConditionalOp {
+ public:
+  ConditionalBlockOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    for (const framework::LoDTensor *t : InputTensors(scope)) {
+      if (t->numel() == 0) return;  // an empty condition skips the block
+    }
+
+    auto *scope_var = scope.FindVar(Output("Scope"));
+    PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+    // Output "Scope" ends up holding exactly one entry: the new child scope.
+    auto *scopes = scope_var->GetMutable<std::vector<framework::Scope *>>();
+    scopes->resize(1);
+    framework::Scope &cur_scope = scope.NewScope();
+    (*scopes)[0] = &cur_scope;
+
+    auto *block = Attr<framework::BlockDescBind *>("block");
+    framework::Executor exec(dev_ctx);
+    exec.Run(*block->Program(), &cur_scope, block->ID(), false);
+  }
+};
+
+class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConditionalBlockOpProtoMaker(framework::OpProto *proto,
+                               framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The conditional variable of this operator. If X is empty, the "
+             "whole sub-block will not be executed.")
+        .AsDuplicable();
+    AddInput("Params", "The input variables of the sub-block.").AsDuplicable();
+    AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
+    AddOutput("Scope",
+              "(std::vector<Scope*>) The step scope of conditional block. To "
+              "unify the conditional block, rnn and while op, the type of "
+              "scope is std::vector<Scope*>");
+    AddAttr<framework::BlockDescBind *>(
+        "block", "The step block of conditional block operator");
+    AddComment(R"DOC(Conditional block operator
+
+Run the sub-block if X is not empty. Params is the other inputs and Out is the
+outputs of the sub-block.
+)DOC");
+  }
+};
+
+// Backward operator: runs the gradient block inside the scope saved by the
+// forward pass and copies the resulting gradients to the outer outputs.
+class ConditionalBlockGradOp : public ConditionalOp {
+ public:
+  ConditionalBlockGradOp(const std::string &type,
+                         const framework::VariableNameMap &inputs,
+                         const framework::VariableNameMap &outputs,
+                         const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto xs = this->InputTensors(scope);
+    bool need_run = std::all_of(
+        xs.begin(), xs.end(),
+        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+    if (!need_run) return;
+
+    auto *scope_var = scope.FindVar(Input("Scope"));
+    PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+    auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
+    framework::Scope &cur_scope = *scopes[0];
+
+    auto *block = Attr<framework::BlockDescBind *>("block");
+    framework::Executor exec(dev_ctx);
+    exec.Run(*block->Program(), &cur_scope, block->ID(), false);
+    AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("Params"),
+                                Outputs(framework::GradVarName("Params")));
+    AssignLocalGradientToGlobal(dev_ctx, cur_scope, Inputs("X"),
+                                Outputs(framework::GradVarName("X")));
+  }
+
+ private:
+  // For every name in `p_names` whose gradient exists in `cur_scope`, runs an
+  // "assign" op copying it to the entry of `pg_names` at the same position.
+  void AssignLocalGradientToGlobal(
+      const platform::DeviceContext &dev_ctx, const framework::Scope &cur_scope,
+      const std::vector<std::string> &p_names,
+      const std::vector<std::string> &pg_names) const {
+    PADDLE_ENFORCE_EQ(p_names.size(), pg_names.size(),
+                      "Each input should have exactly one gradient output");
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      auto in_grad_name = framework::GradVarName(p_names[i]);
+      if (cur_scope.FindVar(in_grad_name) == nullptr) continue;
+      // The local gradient is renamed while assign runs, then renamed back.
+      auto new_in_grad_name = cur_scope.Rename(in_grad_name);
+      auto assign =
+          framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}},
+                                          {{"Out", {pg_names[i]}}}, {});
+      assign->Run(cur_scope, dev_ctx);
+      cur_scope.Rename(new_in_grad_name, in_grad_name);
+    }
+  }
+};
+
+class ConditionalBlockGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInputs("X"));
+    if (context->HasInputs("Params")) {
+      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params")));
+      context->SetOutputsDim(framework::GradVarName("Params"),
+                             context->GetInputsDim("Params"));
+    }
+    PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X")));
+    context->SetOutputsDim(framework::GradVarName("X"),
+                           context->GetInputsDim("X"));
+  }
+};
+
+class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    // Owned from the start so the desc is released if a setter throws.
+    std::unique_ptr<framework::OpDescBind> grad_op(new framework::OpDescBind());
+    grad_op->SetType("conditional_block_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput("Params", Input("Params"));
+    grad_op->SetInput("Out", Output("Out"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetInput("Scope", Output("Scope"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    grad_op->SetOutput(framework::GradVarName("Params"), InputGrad("Params"));
+    grad_op->SetBlockAttr("block", *this->grad_block_[0]);
+    return grad_op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp,
+                  ops::ConditionalBlockOpProtoMaker,
+                  ops::ConditionalBlockGradMaker);
+REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp,
+                  ops::ConditionalBlockGradInferShape);
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..282775fcda45fe3bbd72bf04a7ae828f2c840ab7
--- /dev/null
+++ b/paddle/operators/expand_op.cc
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+// Forward operator of expand; it only infers the shape of Output(Out).
+class ExpandOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Checks the inputs and sets dim i of Out to x_dims[i] * expand_times[i].
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+    const std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto x_dims = ctx->GetInputDim("X");
+    const size_t rank = static_cast<size_t>(x_dims.size());
+    PADDLE_ENFORCE_EQ(rank, expand_times.size(),
+                      "The number of Attr(expand_times)'s value must be equal "
+                      "to the rank of Input(X).");
+    PADDLE_ENFORCE_LE(x_dims.size(), 6,
+                      "The rank of Input(X) must not be greater than 6.");
+    std::vector<int64_t> out_shape;
+    for (size_t i = 0; i < rank; ++i) {
+      PADDLE_ENFORCE_GE(expand_times[i], 1,
+                        "Each value of Attr(expand_times) should not be "
+                        "less than 1.");
+      out_shape.push_back(x_dims[i] * expand_times[i]);
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    // LoD is shared only when dimension 0 keeps its size.
+    if (out_shape[0] == x_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+};
+
+class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
+             "X is the input tensor to be expanded.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
+              "The rank of Output(Out) is same as Input(X) except that each "
+              "dimension size of Output(Out) is equal to corresponding "
+              "dimension size of Input(X) multiplying corresponding value of "
+              "Attr(expand_times).");
+    AddAttr<std::vector<int>>("expand_times",
+                              "Expand times number for each dimension.");
+    AddComment(R"DOC(
+Expand operator tiles the input by given times number. You should set times
+number for each dimension by providing attribute 'expand_times'. The rank of X
+should be in [1, 6]. Please notice that size of 'expand_times' must be same with
+X's rank. Following is a using case:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+
+Attr(expand_times):  [1, 2, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+
+)DOC");
+  }
+};
+
+// Backward operator of expand; it infers the shape of Output(X@GRAD).
+class ExpandGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims.size()), expand_times.size(),
+                      "The number of Attr(expand_times)'s value must be equal "
+                      "to the rank of Input(X).");
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
+                        "Each dimension size of Input(Out@GRAD) should be "
+                        "equal to multiplication of corresponding dimension "
+                        "size of Input(X) and Attr(expand_times) value.");
+    }
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
+            ops::ExpandGradOp);
+REGISTER_OP_CPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    expand_grad, ops::ExpandGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6744562b6c21dd8bfeb7e4cb6b809dc7913aa3a5
--- /dev/null
+++ b/paddle/operators/expand_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/expand_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    expand_grad, ops::ExpandGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ae2c11a5d31dafc1b90d129054ebfabfb761bfe
--- /dev/null
+++ b/paddle/operators/expand_op.h
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   You may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_TEMPLATE(z, n, data) \
+  case n + 1: {                     \
+    Expand<n + 1>(context);         \
+    break;                          \
+  }
+#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
+#define COND(n)                                               \
+  BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \
+                         BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_GRAD_CASE(n)                                        \
+  case n: {                                                        \
+    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                         \
+  }
+#define EXPAND_GRAD_TEMPLATE(z, n, data) \
+  BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
+#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+// Tiles Input(X) along each dimension i by expand_times[i] into Output(Out).
+template <typename Place, typename T>
+class ExpandKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {
+      REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED)
+      default:
+        PADDLE_ENFORCE(false,
+                       "Only support tensor with rank being between 1 and 6.");
+    }
+  }
+
+ protected:
+  template <int Rank>
+  void Expand(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      bcast_dims[i] = expand_times[i];
+    }
+    auto x = EigenTensor<T, Rank>::From(*in0);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0);
+    auto place = context.GetEigenDevice<Place>();
+    y.device(place) = x.broadcast(bcast_dims);
+  }
+};
+
+template <typename Place, typename T>
+class ExpandGradKernel : public framework::OpKernel<T> {
+ public:
+  // Computes X@GRAD by summing Out@GRAD over the copies that expand made of
+  // each element of X.
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto x_dims = context.Input<Tensor>("X")->dims();
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    // 1. reshape_dims_vec is the shape Out@GRAD is viewed with. A dimension i
+    //    with expand_times[i] > 1 and x_dims[i] > 1 is split into the two
+    //    dimensions [expand_times[i], x_dims[i]].
+    // 2. reduce_dims_vec lists the positions in reshape_dims_vec that hold
+    //    the copies; summing over them restores the original size.
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      if (expand_times[i] == 1) {
+        reshape_dims_vec.push_back(x_dims[i]);
+        continue;
+      }
+      reduce_dims_vec.push_back(reshape_dims_vec.size());
+      reshape_dims_vec.push_back(expand_times[i]);
+      if (x_dims[i] != 1) {
+        reshape_dims_vec.push_back(x_dims[i]);
+      }
+    }
+
+    if (reduce_dims_vec.empty()) {
+      // no need reduce, just copy
+      auto* out_grad = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+      x_grad->mutable_data<T>(context.GetPlace());
+      x_grad->CopyFrom(*out_grad, context.GetPlace(), context.device_context());
+      return;
+    }
+
+    // Packs both sizes into one switch key:
+    // (reshape size - 1) * MAX_RANK_SUPPORTED + (reduce size - 1).
+    int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED +
+               reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1;
+    switch (dims) {
+      REP_EXPAND_GRAD_TEMPLATE(72)
+      default:
+        PADDLE_ENFORCE(
+            false, "Only support tensor with rank being between 1 and 6.");
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void ExpandBackward(const framework::ExecutionContext& context,
+                      const std::vector<int>& reshape_dims_vec,
+                      const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1;
+    size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1;
+    PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reshape dimensions.");
+    PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reduce dimensions.");
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto x = EigenVector<T>::Flatten(*(context.Input<Tensor>("X")));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims / MAX_RANK_SUPPORTED + 1> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims % MAX_RANK_SUPPORTED + 1> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(context.GetEigenDevice<Place>()) =
+        out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu b/paddle/operators/fill_constant_batch_size_like_op.cu
index cfa5df001e9d6c606751e3ca3cddda02812ef180..298c196f1dfef388640e34153264986bd518a11a 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cu
+++ b/paddle/operators/fill_constant_batch_size_like_op.cu
@@ -12,7 +12,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_constant_batch_size_like_op.h"
 
diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/operators/fill_constant_batch_size_like_op.h
index a360e6683ec7204ea5bdbe27ca88a0ac51c983ac..339d97a30a5819ab488e83990651ba99212239ec 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/operators/fill_constant_batch_size_like_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -27,9 +27,8 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(ctx.GetPlace());
     auto value = ctx.Attr<float>("value");
 
-    auto out_eigen = framework::EigenVector<T>::Flatten(*out);
-    auto place = ctx.GetEigenDevice<Place>();
-    out_eigen.device(place) = out_eigen.constant(static_cast<T>(value));
+    math::SetConstant<Place, T> setter;
+    setter(ctx.device_context(), out, static_cast<T>(value));
   }
 };
 
diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu
index fdbcf520a0d7b4ddfe3fc1837a21e0ce88b8e8fa..a6d4ba64bde534ea76867c456537b130a45b9496 100644
--- a/paddle/operators/fill_zeros_like_op.cu
+++ b/paddle/operators/fill_zeros_like_op.cu
@@ -12,7 +12,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_zeros_like_op.h"
 
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index cdf56a723b117fe7b08ef2749aa2c2978c923d44..7e7d78eea2bce427d6ad4dfb77bcb4ace35cd287 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -23,10 +23,11 @@ template <typename Place, typename T>
 class FillZerosLikeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* output = context.Output<framework::Tensor>("Y");
-    output->mutable_data<T>(context.GetPlace());
-    auto t = framework::EigenVector<T>::Flatten(*output);
-    t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+    auto* out = context.Output<framework::Tensor>("Y");
+    out->mutable_data<T>(context.GetPlace());
+
+    math::SetConstant<Place, T> setter;
+    setter(context.device_context(), out, static_cast<T>(0));
   }
 };
 
diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h
index de459818ad83d389e5a95e0303ae40b32743c4e7..3c60dc3dc7415f34ed9d238e6f41b197ec404883 100644
--- a/paddle/operators/l1_norm_op.h
+++ b/paddle/operators/l1_norm_op.h
@@ -29,7 +29,7 @@ class L1NormKernel : public framework::OpKernel<T> {
     Out->mutable_data<T>(context.GetPlace());
 
     auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto out = framework::EigenScalar<T>::From(*Out);
     auto place = context.GetEigenDevice<Place>();
 
     out.device(place) = x.abs().sum();
diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc
index ce010fcb91873b3099f6bf52cfe20c1ff61846ea..f7d4db1947b83fecf57575e17fafe26795c92bdd 100644
--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -66,7 +66,8 @@ class LoDRankTableInferVarType : public framework::VarTypeInference {
   void operator()(const framework::OpDescBind &op_desc,
                   framework::BlockDescBind *block) const override {
     for (auto &o : op_desc.Output("Out")) {
-      block->Var(o)->SetType(framework::VarDesc::LOD_RANK_TABLE);
+      block->FindRecursiveOrCreateVar(o)->SetType(
+          framework::VarDesc::LOD_RANK_TABLE);
     }
   }
 };
diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32831cb1e2cf188a507773ef1e00b22de98d82ab
--- /dev/null
+++ b/paddle/operators/lod_reset_op.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LoDResetOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // input check
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LoDResetOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LoDResetOp should not be null.");
+    // If target LoD is not set from Input(), then it must be set from Attr().
+    if (!ctx->HasInput("TargetLoD")) {
+      auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
+      PADDLE_ENFORCE(level0.size() > 1,
+                     "Target LoD is not found, should be set to be a valid one "
+                     "through Input() or Attr().");
+    }
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDResetOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
+    AddInput("TargetLoD",
+             "(Tensor, optional) The target level 0 LoD from Input().")
+        .AsDispensable();
+    AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator.");
+    AddAttr<std::vector<int>>("target_lod",
+                              "The target level 0 LoD from Attr().")
+        .SetDefault(std::vector<int>{});
+    AddComment(R"DOC(LoDReset operator
+
+Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or
+Attr(target_lod), or set LoD for Input(X) if it doesn't have one.
+Currently the lod_reset operator only supports the reset of level 0 LoD.
+At least one of Input(TargetLoD) and Attr(target_lod) must be set,
+and if both of them are set, Input(TargetLoD) will be chosen as the
+target LoD.
+
+An example:
+Given a float LoDTensor X with shape (6, 1), its transpose form represents
+
+    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+
+with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like
+
+    [1.0, 2.0], [3.0, 4.0, 5.0], [6.0].
+
+If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and
+the sequences that the LoDTensor Output(Out) contains becomes:
+
+    [1.0, 2.0, 3.0, 4.0], [5.0, 6.0].
+
+)DOC");
+  }
+};
+
+class LoDResetGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
+            ops::LoDResetGradOp);
+REGISTER_OP_CPU_KERNEL(lod_reset,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5244a17c3aad01909e3b8cf5f4d5abf8a44edc7f
--- /dev/null
+++ b/paddle/operators/lod_reset_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(lod_reset,
+                       ops::LoDResetKernel<paddle::platform::GPUPlace, float>,
+                       ops::LoDResetKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::GPUPlace, float>,
+    ops::LoDResetGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2bb916ccee80c83a02ea429fe95f5fafc86ccfa6
--- /dev/null
+++ b/paddle/operators/lod_reset_op.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Out shares X's data; its level-0 LoD comes from TargetLoD or target_lod.
+template <typename Place, typename T>
+class LoDResetKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    auto* in = ctx.Input<framework::LoDTensor>("X");
+    auto* lod_t = ctx.Input<framework::Tensor>("TargetLoD");
+
+    std::vector<int> level0;
+    if (lod_t) {
+      // Host copy lives at this scope so `lod` never outlives its storage.
+      framework::Tensor lod_cpu;
+      auto* lod = lod_t->data<int>();
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        lod_cpu.CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context());
+        lod = lod_cpu.data<int>();
+      }
+      level0 = std::vector<int>(lod, lod + lod_t->numel());
+    } else {
+      level0 = ctx.Attr<std::vector<int>>("target_lod");
+    }
+
+    PADDLE_ENFORCE(level0.size() > 1UL,
+                   "The size of target LoD should be greater than 1.");
+    PADDLE_ENFORCE(level0[0] == 0,
+                   "Target LoD should be a vector starting from 0.");
+    PADDLE_ENFORCE(level0.back() == in->dims()[0],
+                   "Target LoD should be a vector end with the "
+                   "first dimension of Input(X).");
+    for (size_t i = 0; i < level0.size() - 1; ++i) {
+      PADDLE_ENFORCE(level0[i + 1] > level0[i],
+                     "Target LoD should be an ascending vector.");
+    }
+
+    out->ShareDataWith(*in);
+    // Offsets were validated as non-negative above, so widen to size_t.
+    std::vector<size_t> ulevel0(level0.begin(), level0.end());
+    framework::LoD target_lod;
+    target_lod.push_back(ulevel0);
+    out->set_lod(target_lod);
+  }
+};
+
+// Backward kernel of lod_reset: X@GRAD simply shares Out@GRAD's data.
+template <typename Place, typename T>
+class LoDResetGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    d_x->ShareDataWith(*d_out);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 90bc9f4f922e7aa09523bad8ffb3ef477dd89857..ab7f23f57043844d45c36acc475422613164bee1 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -13,7 +13,7 @@ if(WITH_GPU)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
-    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index 09c3f0b1e6f787547b9253d3aeadf70674708ba0..1b0d4c8bdc683b5203a4bc4b3838560cffe00bc8 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -234,8 +234,8 @@ void gemv<platform::CPUPlace, double>(const platform::DeviceContext& context,
 
 template struct SetConstant<platform::CPUPlace, float>;
 
-struct TensorSetConstant {
-  TensorSetConstant(framework::Tensor* tensor, float value)
+struct TensorSetConstantCPU {
+  TensorSetConstantCPU(framework::Tensor* tensor, float value)
       : tensor_(tensor), value_(value) {}
   template <typename T>
   void operator()() const {
@@ -252,7 +252,7 @@ void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
   framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstant(tensor, value));
+                           TensorSetConstantCPU(tensor, value));
 }
 
 struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 255e480680499877ff599b96b8336a968cccbb34..817deec94314bdfd2ed7e4b0ba5212c72b813455 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -233,8 +233,8 @@ void gemv<platform::GPUPlace, double>(const platform::DeviceContext& context,
 
 template struct SetConstant<platform::GPUPlace, float>;
 
-struct TensorSetConstant {
-  TensorSetConstant(const platform::DeviceContext& context,
+struct TensorSetConstantGPU {
+  TensorSetConstantGPU(const platform::DeviceContext& context,
                     framework::Tensor* tensor, float value)
       : context_(context), tensor_(tensor), value_(value) {}
 
@@ -254,7 +254,7 @@ void set_constant_with_place<platform::GPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
   framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstant(context, tensor, value));
+                           TensorSetConstantGPU(context, tensor, value));
 }
 
 }  // namespace math
diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h
index 5ce30740c90b5cd0bd4f8ab183cf985ed5d827c1..4f565946d596b5e5fbf90f16c0c13c780c36886c 100644
--- a/paddle/operators/matmul_op.h
+++ b/paddle/operators/matmul_op.h
@@ -74,11 +74,10 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
   Tensor output;
   auto in_dims = input.dims();
   if (in_dims.size() == 3) {
-    output.Resize(in_dims);
+    output.Resize({in_dims[1], in_dims[0], in_dims[2]});
     output.mutable_data<T>(context.GetPlace());
     EigenTranspose<Place, T, 3>(context, input, output, {1, 0, 2});
-    std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
-    output.Resize(make_ddim(out_dims));
+    output.Resize({in_dims[1], in_dims[0] * in_dims[2]});
   } else {
     output.ShareDataWith(input);
   }
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80460c476921b63ec5228a9780880c7db3c85217
--- /dev/null
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -0,0 +1,182 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+// Operator that interleaves the sequences of InTrue and InFalse into Out:
+// each Mask row selects which branch supplies the next output sequence.
+class MergeLoDTensorOp : public framework::OperatorBase {
+ public:
+  MergeLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
+    auto &in_false =
+        scope.FindVar(Input("InFalse"))->Get<framework::LoDTensor>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+    auto &mask_dim = mask.dims();
+    // The mask is read element by element below, so it must be in CPU memory.
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+    // Out takes its place, element type and non-batch dims from InTrue.
+    int rank = in_true.dims().size();
+    platform::Place place = in_true.place();
+    std::type_index data_type = in_true.type();
+    framework::DDim in_true_dims =
+        framework::slice_ddim(in_true.dims(), 1, rank);
+    // The leading dim of Out is the sum of both branches' leading dims.
+    int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
+    auto in_true_dim_vec = framework::vectorize(in_true_dims);
+    in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size);
+    framework::DDim out_dims = framework::make_ddim(in_true_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+
+    // Build LoDTensor `out`: walk the mask and, for each row, append the next
+    // unread sequence of InFalse (mask == 0) or InTrue (otherwise).
+    size_t in_true_idx = 0;
+    size_t in_false_idx = 0;
+    for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+      const framework::LoDTensor *input = nullptr;
+      size_t *in_idx = nullptr;
+      if (static_cast<int>(mask_data[i]) == 0) {
+        input = &in_false;
+        in_idx = &in_false_idx;
+      } else {
+        input = &in_true;
+        in_idx = &in_true_idx;
+      }
+      auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+          input->lod(), *in_idx, (*in_idx) + 1, 0);
+      auto &lod_length = lod_and_offset.first;
+
+      framework::AppendLoD(out_lod, lod_length);
+
+      size_t start_offset = lod_and_offset.second.first;
+      size_t end_offset = lod_and_offset.second.second;
+      // NOTE(review): len == 0 skips the *in_idx increment; confirm intent.
+      PADDLE_ENFORCE_GE(end_offset, start_offset);
+      size_t len = end_offset - start_offset;
+      if (len == 0) {
+        continue;
+      }
+      out->Slice(out_offset, out_offset + len)
+          .CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx);
+      out_offset += len;
+      (*in_idx) += 1;
+    }
+    // Put X's first `level` LoD levels in front, keeping their original order
+    // (inserting each at begin() would reverse them when level > 1).
+    for (size_t i = 0; i < level; i++) {
+      out_lod->insert(out_lod->begin() + i, x.lod()[i]);
+    }
+  }
+};
+
+class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MergeLoDTensorOpProtoMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input LoDTensor, contains complete lod information to "
+             "construct the output");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddInput("InTrue", "The True branch to be merged");
+    AddInput("InFalse", "The False branch to be merged");
+    AddOutput("Out", "The merged output LoDTensor");
+    AddAttr<int>("level", "(int) the specific lod level to rank.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Merge True and False branches of LoDTensor into a single Output,
+        with a mask at certain lod level. X is used to obtain complete
+        lod information. Please refer to SplitLoDTensorOp.)DOC");
+  }
+};
+
+class MergeLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "MergeLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "MergeLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasInput("InTrue"),
+                   "MergeLoDTensorOp must has input InTrue.");
+    PADDLE_ENFORCE(context->HasInput("InFalse"),
+                   "MergeLoDTensorOp must has input InFalse.");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "MergeLoDTensorOp must has output Out");
+
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("Out", context->GetInputDim("InTrue"));
+  }
+};
+
+// Grad of merge_lod_tensor: a split_lod_tensor op applied to Out@GRAD.
+class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> grad_op(new framework::OpDescBind());
+    grad_op->SetType("split_lod_tensor");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
+    grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
+    grad_op->SetAttrMap(Attrs());
+    return grad_op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp,
+                  ops::MergeLoDTensorOpProtoMaker,
+                  ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker);
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu
index a81444dbe63edeecedc5d822c65ff56c42b5db90..66dc3d6d106a18640adad413d4e967fa101abcfc 100644
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -12,7 +12,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index bd1bdb4f81b88256822d663fe42ad314338c91ff..0eb9df41e9415845f88af283de63856158b447f9 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -16,16 +16,12 @@
 
 #include "paddle/operators/math/math_function.h"
 
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
 class MulKernel : public framework::OpKernel<T> {
diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index 5858cd4839d367bb888b2b98cde2225751391162..48e322f99398a7f1d6af9cab653d0cc92d981fe0 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -35,6 +35,7 @@ constexpr int kInvalidGPUId = -1;
 struct Communicator {
   std::vector<ncclComm_t> comms_;
   std::unordered_map<int, int> comm_id_map_;
+  bool inited_ = false;  // true only after ncclCommInitAll succeeds
 
   Communicator() {}
 
@@ -42,17 +43,21 @@ struct Communicator {
 
   void InitAll(const std::vector<int>& gpus) {
     comms_.resize(gpus.size());
+    inited_ = false;
     for (size_t i = 0; i < gpus.size(); ++i) {
       comm_id_map_[gpus[i]] = i;
     }
     PADDLE_ENFORCE(
         dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data()));
+    inited_ = true;
   }
 
   ~Communicator() {
-    for (size_t i = 0; i < comms_.size(); ++i) {
-      // FIXME(dzh) : PADDLE_ENFORCE return void
-      dynload::ncclCommDestroy(comms_[i]);
+    if (inited_) {
+      for (size_t i = 0; i < comms_.size(); ++i) {
+        // FIXME(dzh) : PADDLE_ENFORCE return void
+        dynload::ncclCommDestroy(comms_[i]);
+      }
     }
   }
 
diff --git a/paddle/operators/nccl_op_test.cu b/paddle/operators/nccl_op_test.cu
index e5927d56ae7cfbd09e941c993041af46ecd8d70d..56ba57854955c08031214d1f751c17fbb8bb882c 100644
--- a/paddle/operators/nccl_op_test.cu
+++ b/paddle/operators/nccl_op_test.cu
@@ -26,7 +26,6 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/program_desc.h"
 #include "paddle/framework/var_desc.h"
-#include "paddle/operators/math/math_function.h"
 #include "paddle/operators/nccl/nccl_gpu_common.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index b0e87b7059eab3772c179fe31cdb09477b589ed1..0075ccd24271bf83f139e121efad00c2316cc11b 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -387,8 +387,8 @@ class RecurrentGradOp : public RecurrentBase {
         auto &p_names = Inputs(kParameters);
         PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
 
-        for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) {
-          auto inside_grad_name = framework::GradVarName(p_names[prog_id]);
+        for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
+          auto inside_grad_name = framework::GradVarName(p_names[param_id]);
 
           // If does not compute gradient of that variable inside rnn, just
           // continue
@@ -406,27 +406,19 @@ class RecurrentGradOp : public RecurrentBase {
             attrs["value"] = 0.0f;
 
             auto zero_op = framework::OpRegistry::CreateOp(
-                "fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs);
+                "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs);
             zero_op->Run(scope, dev_ctx);
           }
 
+          auto new_inside_name = cur_scope.Rename(inside_grad_name);
           // sum gradient
-          auto *outside_var = scope.FindVar(pg_names[prog_id]);
-          PADDLE_ENFORCE(outside_var != nullptr);
-          auto &outside_tensor =
-              *outside_var->GetMutable<framework::LoDTensor>();
-
-          std::string result_var_name;
-          auto *local_result_var = cur_scope.Var(&result_var_name);
-          auto &local_result_tensor =
-              *local_result_var->GetMutable<framework::LoDTensor>();
-
-          local_result_tensor.ShareDataWith(outside_tensor);
 
           auto sum_op = framework::OpRegistry::CreateOp(
-              "sum", {{"X", {result_var_name, inside_grad_name}}},
-              {{"Out", {result_var_name}}}, {});
+              "sum", {{"X", {pg_names[param_id], new_inside_name}}},
+              {{"Out", {pg_names[param_id]}}}, {});
           sum_op->Run(cur_scope, dev_ctx);
+
+          cur_scope.Rename(new_inside_name, inside_grad_name);
         }
       }
       VLOG(5) << "Accumulate Parameter finished ";
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
index 64097ef2525d734f79f22ddd7957b3216b06ee7b..d1de0b444712a8c304c33bd194e306dfe3c41f02 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -47,7 +47,7 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
                         framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X",
-             "(vector<LoDTensor>) Input is a vector of LoDTensor, "
+             "(LodTensorArray) Input is a vector of LoDTensor, "
              "each of which is a variable-length sequence or nested sequence.")
         .AsDuplicable();
     AddOutput("Out",
@@ -68,38 +68,42 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
                  "The level should be less than the level number of inputs.")
         .SetDefault(0);
     AddComment(R"DOC(
-Sequence Concat Operator.
-
-The sequence_concat operator concatenates multiple LoDTensors.
-It supports a sequence (LoD Tensor with level number is 1)
+The sequence_concat operator concatenates multiple LoDTensors. 
+It only supports sequence (LoD Tensor with level number is 1) 
 or a nested sequence (LoD tensor with level number is 2) as its input.
-The following examples explain how the operator works:
 - Case1:
   If the axis is other than 0(here, axis is 1 and level is 1),
-  each input should have the same LoD information and the LoD
+  each input should have the same LoD information and the LoD 
   information of the output keeps the same as the input.
 
-    LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
-    LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4)
-    LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4)
+  LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
 
 - Case2:
-  If the axis is 0(here, leve is 0), the inputs are concatenated along
+  If the axis is 0(here, level is 0), the inputs are concatenated along
   time steps, the LoD information of the output need to re-compute.
+  The LoD information of level-1 should be same.
 
-    LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
-    LoD(x1) = {{0,3,5}, {0,1,2,3,5}}; Dims(x1) = (5,3,4)
-    LoD(Out) = {{0,5,9}, {0,1,2,3,4,5,6,7,9}}; Dims(Out) = (9,3,4)
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,2,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,2,4}, {0,2,5,8,11}}; Dims(Out) = (11,3,4)
 
 - Case3:
   If the axis is 0(here, level is 1).
 
-    LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
-    LoD(x1) = {{0,3,5}, {0,1,3,4,5}}; Dims(x1) = (5,3,4)
-    LoD(Out) = {{0,5,9}, {0,2,5,7,9}}; Dims(Out) = (9,3,4)
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,3,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,5,8}, {0,1,2,3,5,7,8,9,11}}; Dims(Out) = (11,3,4)
 
-NOTE: The levels of all the inputs should be the same.
+- Case4:
+  If the LoD number is 1, axis is 0, level is 0
 
+  LoD(x0) = {{0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,2,5,8,11}}; Dims(Out) = (11,3,4)
+
+NOTE: The levels of all the inputs should be the same.
     )DOC");
   }
 };
diff --git a/paddle/operators/sequence_concat_op.cu b/paddle/operators/sequence_concat_op.cu
index 8dc4764785871262d21a5631cc9e8b805ba84244..9ca99c2258f547e6f9c23be0d394bc3ea2bb6678 100644
--- a/paddle/operators/sequence_concat_op.cu
+++ b/paddle/operators/sequence_concat_op.cu
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
-
 #include "paddle/operators/sequence_concat_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h
index 6adf96120c99f9b84a1ff947058e65ac3ddff1d4..09212070aa90b0f080f6140a312924229162aaec 100644
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/operators/sequence_concat_op.h
@@ -24,28 +24,38 @@ using LoDTensor = framework::LoDTensor;
 using LoD = framework::LoD;
 
 template <typename T>
-LoD concatLoD(const std::vector<const T*> ins, const size_t axis,
-              const size_t level) {
+LoD ConcatLoD(const std::vector<const T*> ins, const size_t level) {
   auto out_lod = ins[0]->lod();
+  auto numLevels = ins[0]->NumLevels();
   const size_t n = ins.size();
-  if (axis == 0UL) {
-    for (size_t i = 1; i < n; ++i) {
-      for (size_t j = 0; j < ins[i]->lod()[0].size(); ++j) {
-        out_lod[0][j] += ins[i]->lod()[0][j];
-      }
+  const size_t level_idx = ins[0]->NumLevels() - 1 - level;
+  for (size_t i = 1; i < n; ++i) {
+    for (size_t j = 0; j < ins[i]->lod()[level_idx].size(); ++j) {
+      out_lod[level_idx][j] += ins[i]->lod()[level_idx][j];
+    }
+  }
 
-      if (ins[0]->NumLevels() == 2) {
-        for (size_t j = 1; j < ins[i]->lod()[1].size(); ++j) {
-          if (level == 0UL) {
-            out_lod[1].push_back(out_lod[1].back() + ins[i]->lod()[1][j] -
-                                 ins[i]->lod()[1][j - 1]);
-          } else if (level == 1UL) {
-            out_lod[1][j] += ins[1]->lod()[1][j];
-          }
+  for (size_t i = level_idx; i < numLevels - 1; ++i) {
+    size_t lod_len = 1;
+    for (size_t j = 0; j < n; ++j) {
+      lod_len += ins[j]->lod()[i + 1].size() - 1;
+    }
+    out_lod[i + 1].clear();
+    out_lod[i + 1].resize(lod_len);
+
+    size_t idx = 1;
+    for (size_t j = 0; j < ins[0]->lod()[i].size() - 1; ++j) {
+      for (size_t k = 0; k < n; ++k) {
+        for (size_t m = ins[k]->lod()[i][j]; m < ins[k]->lod()[i][j + 1]; ++m) {
+          out_lod[i + 1][idx] = out_lod[i + 1][idx - 1] +
+                                ins[k]->lod()[i + 1][m + 1] -
+                                ins[k]->lod()[i + 1][m];
+          idx++;
         }
       }
     }
   }
+
   return out_lod;
 }
 
@@ -82,18 +92,21 @@ class SequenceConcatOpKernel : public framework::OpKernel<T> {
                       "should be greater than the specify level");
 
     out->mutable_data<T>(ctx.GetPlace());
-    auto out_lod = concatLoD<LoDTensor>(ins, axis, level);
+    auto out_lod = ins[0]->lod();
+    if (axis == 0) {
+      out_lod = ConcatLoD<LoDTensor>(ins, level);
+    }
     out->set_lod(out_lod);
 
-    auto out_lod_level = out_lod[level];
+    const size_t level_idx = out_lod.size() - level - 1;
+    auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx];
     for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
       Tensor out_t = out->Slice(static_cast<int>(out_lod_level[i]),
                                 static_cast<int>(out_lod_level[i + 1]));
       auto out_stride = framework::stride(out_t.dims());
       size_t offset = 0;
-
       for (size_t j = 0; j < n; ++j) {
-        auto in_lod_level = ins[j]->lod()[level];
+        auto in_lod_level = framework::ToAbsOffset(ins[j]->lod())[level_idx];
         auto in_stride = framework::stride(ins[j]->dims());
         Tensor in_t = ins[j]->Slice(static_cast<int>(in_lod_level[i]),
                                     static_cast<int>(in_lod_level[i + 1]));
@@ -124,9 +137,12 @@ class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
       x_grads[i]->set_lod(ins[i]->lod());
       x_grads[i]->mutable_data<T>(ctx.GetPlace());
     }
-
-    auto out_lod = concatLoD<LoDTensor>(ins, axis, level);
-    auto out_lod_level = out_lod[level];
+    auto out_lod = ins[0]->lod();
+    if (axis == 0UL) {
+      out_lod = ConcatLoD<LoDTensor>(ins, level);
+    }
+    const size_t level_idx = out_lod.size() - level - 1;
+    auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx];
 
     for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
       Tensor out_grad_t =
@@ -136,7 +152,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
       size_t offset = 0;
 
       for (size_t j = 0; j < n; ++j) {
-        auto x_grad_lod_level = x_grads[j]->lod()[level];
+        auto x_grad_lod_level =
+            framework::ToAbsOffset(x_grads[j]->lod())[level_idx];
         auto x_grad_stride = framework::stride(x_grads[j]->dims());
         Tensor x_grad_t =
             x_grads[j]->Slice(static_cast<int>(x_grad_lod_level[i]),
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index 2b8a25c2414c20efaffedfc8603697b3a104634f..7f136d8cf0e1eaae7b4de32988b60ae8a5034cc6 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -126,6 +126,7 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
       int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
       auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
       auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
+      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
       Eigen::DSizes<int, 2> bcast(h, 1);
 
       if (pooltype == "AVERAGE") {
@@ -136,9 +137,9 @@ class SequencePoolGradKernel : public framework::OpKernel<T> {
         in_g_e.device(place) =
             (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
       } else if (pooltype == "LAST") {
-        in_g_e.chip(h - 1, 0).device(place) = out_g_e;
+        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
       } else if (pooltype == "FIRST") {
-        in_g_e.chip(0, 0).device(place) = out_g_e;
+        in_g_e.chip(0, 0).device(place) = out_g_e_v;
       } else {
         PADDLE_THROW("unsupported pooling pooltype");
       }
diff --git a/paddle/operators/sequence_softmax_op.cu b/paddle/operators/sequence_softmax_op.cu
index f2a1e3d5e31ef21b95a51b287bdd1d4aa9221e89..7023795a3b5777c250a9323a304a54849d763e9e 100644
--- a/paddle/operators/sequence_softmax_op.cu
+++ b/paddle/operators/sequence_softmax_op.cu
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#define EIGEN_USE_GPU
-
 #include "paddle/operators/sequence_softmax_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h
index 3eb1e2844dff6ac94e86dcf4586bb51bc33adbec..1b68dd0662ddfffc57b187945fe131e202c55174 100644
--- a/paddle/operators/sequence_softmax_op.h
+++ b/paddle/operators/sequence_softmax_op.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/softmax.h"
 
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu
index 2e99a89699dbdcafc8055c47debf9e49f10507e6..013ace19ae3d4a1af29b570ba33fea3e4595fe5b 100644
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
@@ -12,7 +12,6 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#define EIGEN_USE_GPU
 #include "paddle/operators/softmax_op.h"
 
 namespace ops = paddle::operators;
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 2c08853f4f615bfe95f51aa20776ddddcdaa8f61..44d1e63f1bb4798144218cd1caf01f133825bcff 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/softmax.h"
 
@@ -21,9 +20,6 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
 class SoftmaxKernel : public framework::OpKernel<T> {
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..db635f2ba0804143c9a2e04ff006dfbc8744f3fc
--- /dev/null
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -0,0 +1,205 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <numeric>
+#include <vector>
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+// Half-open row interval [begin, end) of the input tensor that is copied
+// into one of the two outputs.
+struct CopyRange {
+  size_t begin;
+  size_t end;
+};
+
+using LoD = framework::LoD;
+
+// Splits input X into two outputs according to Mask: the part of X selected
+// for index i at LoD level `level` goes to OutTrue when Mask[i] is true and
+// to OutFalse otherwise.
+class SplitLoDTensorOp : public framework::OperatorBase {
+ public:
+  SplitLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  // Looks up X, Mask, OutTrue and OutFalse in `scope`, rebuilds the LoD of
+  // both outputs and copies the selected rows of X into them via `dev_ctx`.
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto *out_true =
+        scope.FindVar(Output("OutTrue"))->GetMutable<framework::LoDTensor>();
+    auto *out_false =
+        scope.FindVar(Output("OutFalse"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+    auto &x_lod = x.lod();
+    auto &mask_dim = mask.dims();
+
+    // The mask is read element by element on the host below, so a GPU mask
+    // is copied to CPU memory first.
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx);
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+
+    // One list of ranges per branch: index 0 is the false branch, index 1
+    // the true branch. The outer size is therefore always 2; sizing it by
+    // the number of mask rows would make copy_ranges[1] out of bounds for a
+    // mask with a single row.
+    std::vector<std::vector<CopyRange>> copy_ranges(2);
+
+    // set out_true/out_false lod
+    for (size_t t = 0; t < 2; t++) {
+      LoD *lod = nullptr;
+      if (t == 0) {
+        lod = out_false->mutable_lod();
+      } else {
+        lod = out_true->mutable_lod();
+      }
+      lod->clear();
+      for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+        if (static_cast<size_t>(mask_data[i]) == t) {
+          size_t start_idx = i;
+          auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+              x_lod, start_idx, start_idx + 1, level);
+
+          auto &lod_length = lod_and_offset.first;
+          framework::AppendLoD(lod, lod_length);
+
+          size_t start_offset = lod_and_offset.second.first;
+          size_t end_offset = lod_and_offset.second.second;
+          copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
+        }
+      }
+    }
+
+    // Resize each output to the total number of selected rows, then copy
+    // the ranges into it back to back.
+    for (size_t t = 0; t < 2; ++t) {
+      framework::LoDTensor *out;
+      if (t == 0) {
+        out = out_false;
+      } else {
+        out = out_true;
+      }
+      auto &ranges = copy_ranges[t];
+      size_t height = std::accumulate(
+          ranges.begin(), ranges.end(), 0UL,
+          [](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
+      auto x_dim = x.dims();
+      x_dim[0] = static_cast<int64_t>(height);
+      out->Resize(x_dim);
+      out->mutable_data(x.place(), x.type());
+      size_t offset = 0;
+      for (auto &each_range : ranges) {
+        size_t len = each_range.end - each_range.begin;
+        if (len == 0) {
+          continue;
+        }
+        // out[offset: offset+len] = x[each_range.begin: each_range.end]
+        out->Slice(static_cast<int>(offset), static_cast<int>(offset + len))
+            .CopyFrom(x.Slice(static_cast<int>(each_range.begin),
+                              static_cast<int>(each_range.end)),
+                      x.place(), dev_ctx);
+        offset += len;
+      }
+    }
+  }
+};
+
+class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitLoDTensorOpProtoMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input LoDTensor");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddOutput("OutTrue", "True branch of input LoDTensor");
+    AddOutput("OutFalse", "False branch of input LoDTensor");
+    AddAttr<int>("level", "(int) the specific lod level to split.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Split a LoDTensor with a Mask at certain level. The input LoDTensor
+        has 3 sequence at certain lod level. The Mask is a bool column vector,
+        such as [0, 1, 0] at the same level. The first and third sequence will
+        be send to False Output LoDTensor; whereas the second sequence will
+        be send to True Output LoDTensor. Please refer to MergeLoDTensorOp.)DOC");
+  }
+};
+
+class SplitLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "SplitLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "SplitLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasOutput("OutTrue"),
+                   "SplitLoDTensorOp must has output OutTrue.");
+    PADDLE_ENFORCE(context->HasOutput("OutFalse"),
+                   "SplitLoDTensorOp must has output OutFalse.");
+
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("OutTrue", context->GetInputDim("X"));
+    context->SetOutputDim("OutFalse", context->GetInputDim("X"));
+  }
+};
+
+class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    auto *grad_op = new framework::OpDescBind();
+    grad_op->SetType("merge_lod_tensor");
+    grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
+    grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDescBind>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp,
+                  ops::SplitLoDTensorOpProtoMaker,
+                  ops::SplitLoDTensorInferShape,
+                  ops::SplitLoDTensorArrayGradMaker);
diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h
index c8d37ac40c1533a77acf78e6a42e1659555127e1..48d7b1c2d56882f04330dbf27b0a92e37cb8874c 100644
--- a/paddle/operators/squared_l2_norm_op.h
+++ b/paddle/operators/squared_l2_norm_op.h
@@ -29,7 +29,7 @@ class SquaredL2NormKernel : public framework::OpKernel<T> {
     Out->mutable_data<T>(context.GetPlace());
 
     auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto out = framework::EigenScalar<T>::From(*Out);
     auto place = context.GetEigenDevice<Place>();
 
     out.device(place) = x.square().sum();
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 750f96296a8414019265b26095d50eefb7dbb2dd..57b99bdb3a9359bbfdbe62a6fc9afca6c4d5df9e 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -99,11 +99,12 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
 
     bool any_input_is_lod_tensor = std::any_of(
         inputs.begin(), inputs.end(), [block](const std::string& name) {
-          return block->Var(name)->GetType() == framework::VarDesc::LOD_TENSOR;
+          return block->FindRecursiveOrCreateVar(name)->GetType() ==
+                 framework::VarDesc::LOD_TENSOR;
         });
 
     auto is_tensor_array = [block](const std::string& name) {
-      return block->Var(name)->GetType() ==
+      return block->FindRecursiveOrCreateVar(name)->GetType() ==
              framework::VarDesc::LOD_TENSOR_ARRAY;
     };
 
@@ -120,7 +121,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
     }
 
     auto out_var_name = op_desc.Output("Out").front();
-    block->Var(out_var_name)->SetType(var_type);
+    block->FindRecursiveOrCreateVar(out_var_name)->SetType(var_type);
   }
 };
 
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
index eaf6352748729fa04ccc9b9901608cb89f489c28..62e15604c47f25c458abc69ecd1cabf964de39bb 100644
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -87,7 +87,8 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
                   framework::BlockDescBind *block) const override {
     for (auto &out_var : op_desc.OutputArgumentNames()) {
       VLOG(10) << "Set Variable " << out_var << " as LOD_TENSOR_ARRAY";
-      block->Var(out_var)->SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
+      block->FindRecursiveOrCreateVar(out_var)->SetType(
+          framework::VarDesc::LOD_TENSOR_ARRAY);
     }
   }
 };
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ca6c8507a48507fd29a9c9acae2bdf36ed936ee
--- /dev/null
+++ b/paddle/operators/while_op.cc
@@ -0,0 +1,197 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <vector>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+using StepScopeVar = std::vector<framework::Scope *>;
+using LoDTensor = framework::LoDTensor;
+
+constexpr char kStepBlock[] = "step_block";   // attribute: block run per step
+constexpr char kCondition[] = "Condition";    // input: loop condition
+constexpr char kStepScopes[] = "StepScopes";  // while output, while_grad input
+constexpr char kParamGrads[] = "X@GRAD";      // == GradVarName(kParameters)
+constexpr char kParameters[] = "X";           // input: variables used in block
+
+class WhileOp : public framework::OperatorBase {
+ public:
+  WhileOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+    auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+    framework::Executor executor(dev_ctx);
+    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    auto *program = block->Program();
+
+    auto step_scopes =
+        scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
+
+    while (cond.data<bool>()[0]) {
+      auto &current_scope = scope.NewScope();
+      step_scopes->push_back(&current_scope);
+
+      executor.Run(*program, &current_scope, block->ID(),
+                   false /*create_local_scope*/);
+    }
+  }
+};
+
+class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  WhileOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kParameters,
+             "A set of variables, which are required by operators inside the "
+             "block of While Op.")
+        .AsDuplicable();
+    AddInput(
+        kCondition,
+        "(Bool) An scalar. When it's False, the While Op will be terminated.")
+        .AsDuplicable();
+    AddOutput("Out",
+              "A set of variables, which will be assigned with values "
+              "generated by operators inside the block of While Op.")
+        .AsDuplicable();
+    AddOutput(kStepScopes,
+              "(StepScopeVar) A vector of local scope, which size equals the "
+              "step number of While Op. The i'th scope storages temporary "
+              "variables generated in the i'th step.");
+    AddAttr<framework::BlockDescBind *>(kStepBlock,
+                                        "The step block inside WhileOp");
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+
+class WhileGradOp : public framework::OperatorBase {
+ public:
+  WhileGradOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    //    PADDLE_ENFORCE(...)
+
+    framework::Executor executor(dev_ctx);
+    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
+    auto *program = block->Program();
+
+    auto *step_scopes =
+        scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
+
+    for (auto cur_scope_iter = step_scopes->rbegin();
+         cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
+      executor.Run(*program, *cur_scope_iter, block->ID(), false);
+
+      auto &pg_names = Outputs(kParamGrads);
+      auto &p_names = Inputs(kParameters);
+      PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
+      for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) {
+        auto inside_grad_name = framework::GradVarName(p_names[prog_id]);
+
+        //  // TODO(tonyyang-savil: Not sure we need the following
+        //  // If does not compute gradient of that variable inside rnn,
+        //  just
+        //  // continue
+        //  if (local_var_names.find(inside_grad_name) ==
+        //  local_var_names.end()) {
+        //    continue;
+        //  }
+
+        // zero gradient variable in step 0
+        if (cur_scope_iter == step_scopes->rbegin()) {
+          auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
+          PADDLE_ENFORCE_NOT_NULL(var);
+          if (var->IsType<LoDTensor>()) {
+            auto &inside_tensor = var->Get<framework::LoDTensor>();
+            framework::AttributeMap attrs;
+            attrs["data_type"] = framework::ToDataType(inside_tensor.type());
+            attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
+            attrs["value"] = 0.0f;
+
+            auto zero_op = framework::OpRegistry::CreateOp(
+                "fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs);
+            zero_op->Run(scope, dev_ctx);
+          }
+        }
+
+        // sum gradient
+        auto *outside_var = scope.FindVar(pg_names[prog_id]);
+        PADDLE_ENFORCE_NOT_NULL(outside_var);
+        auto &outside_tensor = *outside_var->GetMutable<framework::LoDTensor>();
+
+        std::string result_var_name;
+        auto *local_result_var = (*cur_scope_iter)->Var(&result_var_name);
+        auto &local_result_tensor =
+            *local_result_var->GetMutable<framework::LoDTensor>();
+
+        local_result_tensor.ShareDataWith(outside_tensor);
+
+        auto sum_op = framework::OpRegistry::CreateOp(
+            "sum", {{"X", {result_var_name, inside_grad_name}}},
+            {{"Out", {result_var_name}}}, {});
+        sum_op->Run(**cur_scope_iter, dev_ctx);
+      }
+    }
+  }
+};
+
+class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  virtual std::unique_ptr<framework::OpDescBind> Apply() const {
+    auto *grad = new framework::OpDescBind();
+    grad->SetType("while_grad");
+    for (auto &input_param : this->InputNames()) {
+      grad->SetInput(input_param, this->Input(input_param));
+      grad->SetOutput(framework::GradVarName(input_param),
+                      this->InputGrad(input_param));
+    }
+
+    for (auto &output_param : this->OutputNames()) {
+      grad->SetInput(output_param, this->Output(output_param));
+      if (output_param != kStepScopes) {
+        grad->SetInput(framework::GradVarName(output_param),
+                       this->OutputGrad(output_param));
+      }
+    }
+    grad->SetAttrMap(this->Attrs());
+    grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
+
+    return std::unique_ptr<framework::OpDescBind>(grad);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(while, paddle::operators::WhileOp,
+                  paddle::operators::WhileOpMaker,
+                  paddle::operators::WhileGradOpDescMaker);
diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9f49527dcf150fcb35d3af512088f75dec0b5c6
--- /dev/null
+++ b/paddle/platform/call_once.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <exception>
+#include <mutex>
+#include <stdexcept>
+
+namespace paddle {
+namespace platform {
+
+/*
+ The current implementation of std::call_once has a bug described in
+ https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
+ This is likely caused by a deeper bug of pthread_once, which is discussed in
+ https://patchwork.ozlabs.org/patch/482350/
+
+ This wrapper works around the bug: an exception thrown by `f` is caught
+ inside the callable handed to std::call_once, so std::call_once itself
+ always completes normally, and the exception is re-thrown to the caller
+ afterwards.
+
+ Consequences visible to callers:
+ - `flag` is consumed even when `f` throws; later calls neither run `f`
+   again nor throw.
+ - An exception derived from std::exception is re-thrown unchanged; any
+   other exception is reported as a std::runtime_error.
+*/
+template <typename Callable, typename... Args>
+inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
+  // Empty means "no failure". It has to start out empty: when `flag` is
+  // already set the lambda below is never executed, and that case must
+  // return normally instead of throwing.
+  std::exception_ptr failure;
+  std::call_once(flag,
+                 [&](Args&&... args) {
+                   try {
+                     f(args...);
+                   } catch (const std::exception&) {
+                     // Preserves the dynamic type and what() message.
+                     failure = std::current_exception();
+                   } catch (...) {
+                     failure = std::make_exception_ptr(
+                         std::runtime_error("exception caught in call_once"));
+                   }
+                 },
+                 args...);
+  if (failure) {
+    std::rethrow_exception(failure);
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
index 0618c7414fd1235e81ee9d92a3a07b53d6ad6ebc..981b2ab258a34ce92f02ee12b5957f88ba61d1c0 100644
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
@@ -17,6 +17,7 @@
 #include <dlfcn.h>
 #include <nccl.h>
 #include <mutex>
+#include "paddle/platform/call_once.h"
 #include "paddle/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -27,18 +28,18 @@ extern std::once_flag nccl_dso_flag;
 extern void* nccl_dso_handle;
 
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                    \
-  struct DynLoad__##__name {                                      \
-    template <typename... Args>                                   \
-    auto operator()(Args... args) -> decltype(__name(args...)) {  \
-      using nccl_func = decltype(__name(args...)) (*)(Args...);   \
-      std::call_once(nccl_dso_flag,                               \
-                     paddle::platform::dynload::GetNCCLDsoHandle, \
-                     &nccl_dso_handle);                           \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);    \
-    }                                                             \
-  };                                                              \
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                         \
+  struct DynLoad__##__name {                                           \
+    template <typename... Args>                                        \
+    auto operator()(Args... args) -> decltype(__name(args...)) {       \
+      using nccl_func = decltype(__name(args...)) (*)(Args...);        \
+      platform::call_once(nccl_dso_flag,                               \
+                          paddle::platform::dynload::GetNCCLDsoHandle, \
+                          &nccl_dso_handle);                           \
+      void* p_##__name = dlsym(nccl_dso_handle, #__name);              \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);         \
+    }                                                                  \
+  };                                                                   \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 0f906e0e470b7f95bb2103ae55330fc1831aa78f..3d8d3f1d2fd3977f945928c723db5fcafffeae85 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -42,6 +42,9 @@ limitations under the License. */
 #include "paddle/platform/gpu_info.h"
 #endif
 
+// disable auto conversion to list in Python
+PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
+
 namespace paddle {
 namespace pybind {
 static size_t UniqueIntegerGenerator(const std::string &prefix) {
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 2d7ff1df98a9a448b447890537f20dd416a9ae9d..2c2cc6245932d4af56a68d6399ce31f008bf3748 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -321,6 +321,14 @@ message ClipConfig {
   required double max = 2;
 }
 
+message ROIPoolConfig {
+  required uint32 pooled_width = 1;
+  required uint32 pooled_height = 2;
+  required float spatial_scale = 3;
+  optional uint32 height = 4 [ default = 1 ];
+  optional uint32 width = 5 [ default = 1 ];
+}
+
 message ScaleSubRegionConfig {
   required ImageConfig image_conf = 1;
   required float value = 2;
@@ -348,6 +356,7 @@ message LayerInputConfig {
   optional DetectionOutputConfig detection_output_conf = 17;
   optional ClipConfig clip_conf = 18;
   optional ScaleSubRegionConfig scale_sub_region_conf = 19;
+  optional ROIPoolConfig roi_pool_conf = 20;
 }
 
 message LayerConfig {
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 32578ad7799c0a276972ccef7770c2eae8438069..c8632295a25b160513a8e154bf1a5453c0005031 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -37,10 +37,10 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
 
-add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
+        COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
         DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so)
 
 
 add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
@@ -66,7 +66,7 @@ if (WITH_TESTING)
     add_subdirectory(paddle/v2/tests)
     add_subdirectory(paddle/v2/reader/tests)
     add_subdirectory(paddle/v2/plot/tests)
-    add_subdirectory(paddle/v2/framework/tests)
+    add_subdirectory(paddle/v2/fluid/tests)
   endif()
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 9e2c6f59bd0af1627c79a8a29bd1515ae5c9c6b5..43d02bf70e74c3903d50a4a2177059f4f474045a 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1969,6 +1969,18 @@ class DetectionOutputLayer(LayerBase):
         self.config.size = size
 
 
+@config_layer('roi_pool')
+class ROIPoolLayer(LayerBase):
+    def __init__(self, name, inputs, pooled_width, pooled_height, spatial_scale,
+                 num_channels, **xargs):
+        super(ROIPoolLayer, self).__init__(name, 'roi_pool', 0, inputs)
+        config_assert(len(inputs) == 2, 'ROIPoolLayer must have 2 inputs')
+        self.config.inputs[0].roi_pool_conf.pooled_width = pooled_width
+        self.config.inputs[0].roi_pool_conf.pooled_height = pooled_height
+        self.config.inputs[0].roi_pool_conf.spatial_scale = spatial_scale
+        self.set_cnn_layer(name, pooled_height, pooled_width, num_channels)
+
+
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self,
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 36406ef86b812bcb4c671a0e1b1f29e391d79b99..93ea5815d89ee866c382086c629638005002ad46 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -122,6 +122,7 @@ __all__ = [
     'cross_channel_norm_layer',
     'multibox_loss_layer',
     'detection_output_layer',
+    'roi_pool_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -221,6 +222,7 @@ class LayerType(object):
     PRIORBOX_LAYER = 'priorbox'
     MULTIBOX_LOSS_LAYER = 'multibox_loss'
     DETECTION_OUTPUT_LAYER = 'detection_output'
+    ROI_POOL_LAYER = 'roi_pool'
 
     CTC_LAYER = 'ctc'
     WARP_CTC_LAYER = 'warp_ctc'
@@ -1305,6 +1307,50 @@ def detection_output_layer(input_loc,
         name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
 
 
+@wrap_name_default("roi_pool")
+def roi_pool_layer(input,
+                   rois,
+                   pooled_width,
+                   pooled_height,
+                   spatial_scale,
+                   num_channels=None,
+                   name=None):
+    """
+    A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+    feature map.
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput.
+    :param rois: The input ROIs' data.
+    :type rois: LayerOutput.
+    :param pooled_width: The width after pooling.
+    :type pooled_width: int
+    :param pooled_height: The height after pooling.
+    :type pooled_height: int
+    :param spatial_scale: The spatial scale between the image and feature map.
+    :type spatial_scale: float
+    :param num_channels: Input channel count; defaults to input.num_filters.
+    :type num_channels: int
+    :return: LayerOutput
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+    size = num_channels * pooled_width * pooled_height
+    Layer(
+        name=name,
+        type=LayerType.ROI_POOL_LAYER,
+        inputs=[input.name, rois.name],
+        pooled_width=pooled_width,
+        pooled_height=pooled_height,
+        spatial_scale=spatial_scale,
+        num_channels=num_channels)
+    return LayerOutput(
+        name, LayerType.ROI_POOL_LAYER, parents=[input, rois], size=size)
+
+
 @wrap_name_default("cross_channel_norm")
 def cross_channel_norm_layer(input, name=None, param_attr=None):
     """
@@ -3546,10 +3592,9 @@ def lstm_step_layer(input,
     :type gate_act: BaseActivation
     :param state_act: State Activation Type. TanhActivation is the default.
     :type state_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param bias_attr: The parameter attribute for bias. If this parameter is
+                     set to True or None, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | True
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -3604,9 +3649,10 @@ def gru_step_layer(input,
     :param name: The name of this layer. It is optional.
     :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
     :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: the parameter_attribute for transforming the output_mem
                        from previous step.
@@ -3666,9 +3712,10 @@ def gru_step_naive_layer(input,
     :type act: BaseActivation
     :param gate_act: Activation type of this layer's two gates. Default is Sigmoid.
     :type gate_act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr:
     :param layer_attr:
@@ -3798,9 +3845,10 @@ def recurrent_layer(input,
     :type input: LayerOutput
     :param act: Activation type. TanhActivation is the default.
     :type act: BaseActivation
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: parameter attribute.
     :type param_attr: ParameterAttribute
@@ -4790,9 +4838,10 @@ def tensor_layer(a,
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer config.
     :type layer_attr: ExtraLayerAttribute | None
@@ -4854,9 +4903,10 @@ def selective_fc_layer(input,
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The bias attribute. If the parameter is set to False or an object
-                      whose type is not ParameterAttribute, no bias is defined. If the
-                      parameter is set to True, the bias is initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer config.
     :type layer_attr: ExtraLayerAttribute | None
@@ -5539,10 +5589,10 @@ def nce_layer(input,
                              to the num_classes. Each member of the list defines
                              the probability of a class given input x.
     :type neg_distribution: list | tuple | collections.Sequence | None
-    :param bias_attr: The attribute for bias. If this parameter is set False or
-                      any object whose type is not ParameterAttribute, no bias
-                      is added. If this parameter is set True, the bias is
-                      initialized to zero.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
     :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -6452,9 +6502,9 @@ def gated_unit_layer(input,
     :param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
                             for details.
     :type gate_param_attr: ParameterAttribute
-    :param gate_bias_attr: The bias attribute of the gate. If the parameter is set to False or
+    :param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
                            an object whose type is not ParameterAttribute, no bias is defined.
-                           If the parameter is set to True, the bias is initialized to zero.
+                           If this parameter is set to True, the bias is initialized to zero.
     :type gate_bias_attr: ParameterAttribute | bool | None | Any
     :param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
                         details.
@@ -6462,9 +6512,9 @@ def gated_unit_layer(input,
     :param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
                               for details.
     :type inproj_param_attr: ParameterAttribute
-    :param inproj_bias_attr: The bias attribute of the projection. If the parameter is set to False
+    :param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
                              or an object whose type is not ParameterAttribute, no bias is defined.
-                             If the parameter is set to True, the bias is initialized to zero.
+                             If this parameter is set to True, the bias is initialized to zero.
     :type inproj_bias_attr: ParameterAttribute | bool | None | Any
     :param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
                        details.
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 3821d075cba5d39b5808a39093b8570d9302b667..d323d34c3ff47614342934c2a02492f66d27dc10 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -681,34 +681,42 @@ def lstmemory_unit(input,
                                    state_act=TanhActivation())
 
 
-    :param input: input layer.
+    :param input: Input layer.
     :type input: LayerOutput
-    :param out_memory: output of previous time step
+    :param out_memory: The output of previous time step.
     :type out_memory: LayerOutput | None
-    :param name: lstmemory unit name.
+    :param name: The lstmemory unit name.
     :type name: basestring
-    :param size: lstmemory unit size.
+    :param size: The lstmemory unit size.
     :type size: int
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
     :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activation type of lstm.
     :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activation type of lstm.
     :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activation type of lstm.
     :type state_act: BaseActivation
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
     :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param lstm_layer_attr: extra attribute of lstm layer.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|True|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
     :type lstm_layer_attr: ExtraLayerAttribute
-    :return: lstmemory unit name.
+    :return: The lstmemory unit name.
     :rtype: LayerOutput
     """
     if size is None:
@@ -786,34 +794,42 @@ def lstmemory_group(input,
                                     gate_act=SigmoidActivation(),
                                     state_act=TanhActivation())
 
-    :param input: input layer.
+    :param input: Input layer.
     :type input: LayerOutput
-    :param size: lstmemory group size.
+    :param size: The lstmemory group size.
     :type size: int
-    :param name: name of lstmemory group.
+    :param name: The name of lstmemory group.
     :type name: basestring
-    :param out_memory: output of previous time step.
+    :param out_memory: The output of previous time step.
     :type out_memory: LayerOutput | None
-    :param reverse: process the input in a reverse order or not.
+    :param reverse: Process the input in a reverse order or not.
     :type reverse: bool
-    :param param_attr: parameter attribute, None means default attribute.
+    :param param_attr: The parameter attribute for the weights in
+                     input to hidden projection.
+                     None means default attribute.
     :type param_attr: ParameterAttribute
-    :param act: last activiation type of lstm.
+    :param act: The last activation type of lstm.
     :type act: BaseActivation
-    :param gate_act: gate activiation type of lstm.
+    :param gate_act: The gate activation type of lstm.
     :type gate_act: BaseActivation
-    :param state_act: state activiation type of lstm.
+    :param state_act: The state activation type of lstm.
     :type state_act: BaseActivation
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                           False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False|None
-    :param input_proj_bias_attr: bias attribute for input to hidden projection.
-                False means no bias, None means default bias.
-    :type input_proj_bias_attr: ParameterAttribute|False|None
-    :param input_proj_layer_attr: extra layer attribute for input to hidden
-                projection of the LSTM unit, such as dropout, error clipping.
+    :param input_proj_bias_attr: The parameter attribute for the bias in
+                      input to hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for
+                     input to hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
     :type input_proj_layer_attr: ExtraLayerAttribute
-    :param lstm_layer_attr: lstm layer's extra attribute.
+    :param lstm_bias_attr: The parameter attribute for the bias in lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|True|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
     :type lstm_layer_attr: ExtraLayerAttribute
     :return: the lstmemory group.
     :rtype: LayerOutput
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 42aaed7a6469342086b8273eb5b80eaea905f851..1c7451e0abf5dc1b99671f292e2ffc2d2282abe9 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -9,7 +9,7 @@ test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
-test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
+test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
 test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..f1bc65b3aee7488700a9d24e049adb510649c475
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
@@ -0,0 +1,98 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 588
+  active_type: ""
+  height: 14
+  width: 14
+}
+layers {
+  name: "rois"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__conv_0__"
+  type: "exconv"
+  size: 3136
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___conv_0__.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 1
+      padding: 1
+      groups: 1
+      filter_channels: 3
+      output_x: 14
+      img_size: 14
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 1
+      output_y: 14
+      img_size_y: 14
+    }
+  }
+  bias_parameter_name: "___conv_0__.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 14
+  width: 14
+}
+layers {
+  name: "__roi_pool_0__"
+  type: "roi_pool"
+  size: 784
+  active_type: ""
+  inputs {
+    input_layer_name: "__conv_0__"
+    roi_pool_conf {
+      pooled_width: 7
+      pooled_height: 7
+      spatial_scale: 0.0625
+    }
+  }
+  inputs {
+    input_layer_name: "rois"
+  }
+  height: 7
+  width: 7
+}
+parameters {
+  name: "___conv_0__.w0"
+  size: 432
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___conv_0__.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+input_layer_names: "rois"
+output_layer_names: "__roi_pool_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "rois"
+  layer_names: "__conv_0__"
+  layer_names: "__roi_pool_0__"
+  input_layer_names: "data"
+  input_layer_names: "rois"
+  output_layer_names: "__roi_pool_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b739a81b8505c94a2312ac735647fb114982f1f7
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
@@ -0,0 +1,23 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=3 * 14 * 14, height=14, width=14)
+
+rois = data_layer(name='rois', size=10)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=3,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+roi_pool = roi_pool_layer(
+    input=conv,
+    rois=rois,
+    pooled_width=7,
+    pooled_height=7,
+    spatial_scale=1. / 16)
+
+outputs(roi_pool)
diff --git a/python/paddle/v2/framework/.gitignore b/python/paddle/v2/fluid/.gitignore
similarity index 100%
rename from python/paddle/v2/framework/.gitignore
rename to python/paddle/v2/fluid/.gitignore
diff --git a/python/paddle/v2/framework/__init__.py b/python/paddle/v2/fluid/__init__.py
similarity index 100%
rename from python/paddle/v2/framework/__init__.py
rename to python/paddle/v2/fluid/__init__.py
diff --git a/python/paddle/v2/framework/backward.py b/python/paddle/v2/fluid/backward.py
similarity index 97%
rename from python/paddle/v2/framework/backward.py
rename to python/paddle/v2/fluid/backward.py
index 678efd5d20585355a684bb2df16fdb57a69e0eeb..f188582178f667125ec95cd230100fdb10ce7e88 100644
--- a/python/paddle/v2/framework/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -1,4 +1,4 @@
-from paddle.v2.framework import framework as framework
+from paddle.v2.fluid import framework as framework
 
 __all__ = ['append_backward_ops']
 
diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/fluid/default_scope_funcs.py
similarity index 92%
rename from python/paddle/v2/framework/default_scope_funcs.py
rename to python/paddle/v2/fluid/default_scope_funcs.py
index c07f9a6ab96ac86fd6d20fbe0bc560845107f063..60c6165b6bd959f7bb3d92afed667f00f73f144f 100644
--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/fluid/default_scope_funcs.py
@@ -13,7 +13,7 @@ A `scoped_function` will take a `function` as input. That function will be
 invoked in a new local scope. 
 """
 
-import paddle.v2.framework.core
+import paddle.v2.fluid.core
 import threading
 
 __tl_scope__ = threading.local()
@@ -27,13 +27,13 @@ __all__ = [
 def get_cur_scope():
     """
     Get current scope.
-    :rtype: paddle.v2.framework.core.Scope
+    :rtype: paddle.v2.fluid.core.Scope
     """
     cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
     if cur_scope_stack is None:
         __tl_scope__.cur_scope = list()
     if len(__tl_scope__.cur_scope) == 0:
-        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope())
+        __tl_scope__.cur_scope.append(paddle.v2.fluid.core.Scope())
     return __tl_scope__.cur_scope[-1]
 
 
diff --git a/python/paddle/v2/framework/evaluator.py b/python/paddle/v2/fluid/evaluator.py
similarity index 94%
rename from python/paddle/v2/framework/evaluator.py
rename to python/paddle/v2/fluid/evaluator.py
index 254dd5f1a33eef17ad7a0117541255a4399ef23c..180d0135ffe8fa8982cfcde242033b5a69eed1cf 100644
--- a/python/paddle/v2/framework/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -1,6 +1,6 @@
-import paddle.v2.framework.op as op
+import paddle.v2.fluid.op as op
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 def avg_accumulate(accumulated_var, per_eval, num_batches, place):
@@ -22,7 +22,7 @@ class Evaluator(object):
         NOTE: default run on CPUPlace(), running on GPUPlace doesn't improve performance much.
 
         :param scope: the scope instance contains the input.
-        :type scope: paddle.v2.framework.core.scope
+        :type scope: paddle.v2.fluid.core.scope
         :param operator: operator name for caculating the evaluation for each mini-batch.
         :type operator: string
         :param input: output variable name of forward network.
diff --git a/python/paddle/v2/framework/executor.py b/python/paddle/v2/fluid/executor.py
similarity index 94%
rename from python/paddle/v2/framework/executor.py
rename to python/paddle/v2/fluid/executor.py
index f5c833190e73a277bef2509e02c4be051768933d..ed1c2c06daa7ede97e138049a1f7044d071c31e8 100644
--- a/python/paddle/v2/framework/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Block, Program, g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import Block, Program, g_main_program
 
 g_scope = core.Scope()
 
diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/fluid/framework.py
similarity index 99%
rename from python/paddle/v2/framework/framework.py
rename to python/paddle/v2/fluid/framework.py
index 8fb3cca91e5f8759b8a83b12428c78d222f382ac..e2587b4f74506c6eb2b253fa9b24db4838bfedbc 100644
--- a/python/paddle/v2/framework/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 import collections
 import numpy as np
 import copy
@@ -285,7 +285,7 @@ class Operator(object):
         self.desc.check_attrs()
         no_kernel_op_set = {
             'feed', 'fetch', 'save', 'load', 'recurrent',
-            'rnn_memory_helper_grad'
+            'rnn_memory_helper_grad', 'conditional_block', 'while'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)
diff --git a/python/paddle/v2/framework/initializer.py b/python/paddle/v2/fluid/initializer.py
similarity index 99%
rename from python/paddle/v2/framework/initializer.py
rename to python/paddle/v2/fluid/initializer.py
index 98a87bfa86efb39f381b9f99b2b1f0d7ec7d9833..ded144ecd5db83ce50ca0dc6243fdc52ac0b7a2f 100644
--- a/python/paddle/v2/framework/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.framework as framework
+import paddle.v2.fluid.framework as framework
 import numpy as np
 
 __all__ = [
diff --git a/python/paddle/v2/framework/io.py b/python/paddle/v2/fluid/io.py
similarity index 98%
rename from python/paddle/v2/framework/io.py
rename to python/paddle/v2/fluid/io.py
index 5c247904a330e25b1a9f53db431947840db3f615..394a171c67a99ffb0c7caaf71e850fe541f8286e 100644
--- a/python/paddle/v2/framework/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -1,7 +1,7 @@
 import os
 import cPickle as pickle
 
-from paddle.v2.framework.framework import Program, Parameter, g_main_program, \
+from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
     Variable
 
 __all__ = [
diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
similarity index 94%
rename from python/paddle/v2/framework/layer_helper.py
rename to python/paddle/v2/fluid/layer_helper.py
index c38346b79fecfb2f82a60b360c505da16ecdf3c0..9dc3c119ea47ca11956d85119ce1ec6d3d6bb7e8 100644
--- a/python/paddle/v2/framework/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -1,10 +1,10 @@
 import copy
 import itertools
 
-from paddle.v2.framework.framework import Variable, g_main_program, \
+from paddle.v2.fluid.framework import Variable, g_main_program, \
     g_startup_program, unique_name, Program
-from paddle.v2.framework.initializer import ConstantInitializer, \
-    UniformInitializer
+from paddle.v2.fluid.initializer import ConstantInitializer, \
+    UniformInitializer, XavierInitializer
 
 
 class LayerHelper(object):
@@ -61,7 +61,7 @@ class LayerHelper(object):
 
     @property
     def param_attr(self):
-        default = {'name': None, 'initializer': UniformInitializer()}
+        default = {'name': None, 'initializer': XavierInitializer()}
         actual = self.kwargs.get('param_attr', None)
         if actual is None:
             actual = default
@@ -70,10 +70,11 @@ class LayerHelper(object):
                 actual[default_field] = default[default_field]
         return actual
 
+    @property
     def bias_attr(self):
-        default = {'name': None, 'initializer': ConstantInitializer()}
+        default = {'name': None, 'initializer': XavierInitializer()}
         bias_attr = self.kwargs.get('bias_attr', None)
-        if bias_attr is True:
+        if bias_attr is None:
             bias_attr = default
 
         if isinstance(bias_attr, dict):
@@ -166,7 +167,7 @@ class LayerHelper(object):
                 num_flatten_dims = 1
 
         size = list(input_var.shape[num_flatten_dims:])
-        bias_attr = self.bias_attr()
+        bias_attr = self.bias_attr
         if not bias_attr:
             return input_var
 
diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/fluid/layers.py
similarity index 76%
rename from python/paddle/v2/framework/layers.py
rename to python/paddle/v2/fluid/layers.py
index f40c3cf43a6a400f67732ebd4f55afd35f98c01c..8a1aa1c42d5a006539d221f96e3535434c9a4c43 100644
--- a/python/paddle/v2/framework/layers.py
+++ b/python/paddle/v2/fluid/layers.py
@@ -1,22 +1,24 @@
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, \
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
+from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \
     Operator
-from paddle.v2.framework.initializer import ConstantInitializer, \
+from paddle.v2.fluid.initializer import ConstantInitializer, \
     NormalInitializer
-from paddle.v2.framework.layer_helper import LayerHelper, unique_name
+from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 import re
+import cStringIO
 
 __all__ = [
     'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
     'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim',
-    'batch_norm', 'accuracy'
+    'batch_norm', 'accuracy', 'split_lod_tensor'
 ]
 
 
 def fc(input,
        size,
        param_attr=None,
-       bias_attr=True,
+       bias_attr=None,
        name=None,
        act=None,
        num_flatten_dims=1,
@@ -125,6 +127,55 @@ def embedding(input,
     return tmp
 
 
+# TODO(qijun): expose H0 and C0
+def dynamic_lstm(input,
+                 size,
+                 data_type='float32',
+                 param_attr=None,
+                 bias_attr=None,
+                 use_peepholes=True,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 cell_activation='tanh',
+                 candidate_activation='tanh',
+                 main_program=None,
+                 startup_program=None):
+    helper = LayerHelper('lstm', **locals())
+    size = size / 4  # the argument is 4x the width used for the shapes below
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 4 * size], dtype=data_type)
+    bias_size = [1, 7 * size]  # shrunk to 4 * size when peepholes are disabled
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=data_type, suffix='b')
+    # Temporary variables that receive the four outputs of the 'lstm' op.
+    hidden = helper.create_tmp_variable(data_type)
+    cell = helper.create_tmp_variable(data_type)
+    batch_gate = helper.create_tmp_variable(data_type)
+    batch_cell_pre_act = helper.create_tmp_variable(data_type)
+    # Append the op; only Hidden and Cell are handed back to the caller.
+    helper.append_op(
+        type='lstm',
+        inputs={'Input': input,
+                'Weight': weight,
+                'Bias': bias},
+        outputs={
+            'Hidden': hidden,
+            'Cell': cell,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation
+        })
+    return hidden, cell
+
+
 def data(name,
          shape,
          data_type='float32',
@@ -175,6 +226,11 @@ def data(name,
         stop_gradient=stop_gradient)
 
 
+def create_tensor(dtype, name=None, main_program=None):
+    helper = LayerHelper("create_tensor", **locals())  # gets all three args
+    return helper.create_variable(name=helper.name, dtype=dtype)  # helper-named
+
+
 def _convert_(name):
     """
     Formatting.
@@ -191,6 +247,58 @@ def _convert_(name):
     return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
 
 
+def _generate_doc_string_(op_proto):
+    """
+    Build a docstring from an OpProto: its comment, inputs, attrs and output.
+
+    Args:
+        op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
+
+    Returns:
+        str: the document string
+    """
+
+    def _type_to_str_(tp):
+        return framework_pb2.AttrType.Name(tp)
+    # Anything that is not an OpProto message is rejected with TypeError.
+    if not isinstance(op_proto, framework_pb2.OpProto):
+        raise TypeError("OpProto should be `framework_pb2.OpProto`")
+
+    buf = cStringIO.StringIO()
+    buf.write(op_proto.comment)
+    buf.write('\nArgs:\n')
+    for each_input in op_proto.inputs:
+        line_begin = '    {0}: '.format(_convert_(each_input.name))
+        buf.write(line_begin)
+        buf.write(each_input.comment)
+        buf.write('\n')
+        buf.write(' ' * len(line_begin))  # align under the comment text
+        buf.write('Duplicable: ')
+        buf.write(str(each_input.duplicable))
+        buf.write('  Optional: ')
+        buf.write(str(each_input.dispensable))
+        buf.write('\n')
+    # Attributes follow the inputs under the same 'Args:' heading.
+    for each_attr in op_proto.attrs:
+        buf.write('    ')
+        buf.write(each_attr.name)
+        buf.write(' (')
+        buf.write(_type_to_str_(each_attr.type))
+        buf.write('): ')
+        buf.write(each_attr.comment)
+        buf.write('\n')
+    # Returns: the first non-intermediate output (or the last output if none).
+    if len(op_proto.outputs) != 0:
+        buf.write('\nReturns:\n')
+        buf.write('    ')
+        for each_opt in op_proto.outputs:
+            if not each_opt.intermediate:
+                break
+        buf.write(each_opt.comment)  # uses the loop variable left by the scan
+
+    return buf.getvalue()
+
+
 def _create_op_func_(op_type):
     """
     Create an Operator for a Function.
@@ -249,11 +357,6 @@ def _create_op_func_(op_type):
         return dtype
 
     def func(**kwargs):
-        """
-        This function implements the function for the operator. This process
-        involves doing the sanity check (using the function above), reading
-        inputs from protobuf and applying the activations on top.
-        """
         helper = LayerHelper(op_type, **kwargs)
 
         dtype = infer_and_check_data_type(op_proto, **kwargs)
@@ -277,6 +380,7 @@ def _create_op_func_(op_type):
 
     func.__name__ = op_type
     globals()[op_type] = func
+    func.__doc__ = _generate_doc_string_(op_proto)
     global __all__
     __all__.append(op_type)
 
@@ -352,6 +456,56 @@ def sums(input, main_program=None, startup_program=None):
     return out
 
 
+def assign(input, output, main_program=None):
+    helper = LayerHelper('assign', **locals())
+    helper.append_op(
+        type='scale',  # appended as a 'scale' op with factor 1.0
+        inputs={'X': [input]},
+        outputs={'Out': [output]},
+        attrs={'scale': 1.0})
+    return output  # the same `output` object that was passed in
+
+
+def split_lod_tensor(input,
+                     mask,
+                     level,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('split_lod_tensor', **locals())
+    out_true = helper.create_tmp_variable(dtype=input.data_type)  # OutTrue
+    out_false = helper.create_tmp_variable(dtype=input.data_type)  # OutFalse
+    helper.append_op(
+        type='split_lod_tensor',
+        inputs={
+            'X': input,
+            'Mask': mask,
+        },
+        outputs={'OutTrue': out_true,
+                 'OutFalse': out_false},
+        attrs={'level': level})
+    return out_true, out_false  # both temporaries share input's data type
+
+
+def merge_lod_tensor(in_true,
+                     in_false,
+                     x,
+                     mask,
+                     level,
+                     main_program=None,
+                     startup_program=None):
+    helper = LayerHelper('merge_lod_tensor', **locals())
+    out = helper.create_tmp_variable(dtype=x.data_type)  # dtype follows `x`
+    helper.append_op(
+        type='merge_lod_tensor',
+        inputs={'X': x,
+                'Mask': mask,
+                'InTrue': in_true,
+                'InFalse': in_false},
+        outputs={'Out': out},
+        attrs={'level': level})
+    return out  # the single 'Out' variable of the appended op
+
+
 def cos_sim(X, Y, **kwargs):
     """
     This function performs the cosine similarity between two tensors
@@ -685,6 +839,23 @@ def batch_norm(input,
     return helper.append_activation(batch_norm_out)
 
 
+def beam_search_decode(ids, scores, main_program=None, startup_program=None):
+    helper = LayerHelper('beam_search_decode', **locals())
+    sentence_ids = helper.create_tmp_variable(dtype=ids.data_type)
+    sentence_scores = helper.create_tmp_variable(dtype=scores.data_type)
+    # Each output takes the data type of its own input (scores are not ids).
+    helper.append_op(
+        type="beam_search_decode",
+        inputs={"Ids": ids,
+                "Scores": scores},
+        outputs={
+            "SentenceIds": sentence_ids,
+            "SentenceScores": sentence_scores
+        })
+    # Hand back the two output variables of the appended op, ids first.
+    return sentence_ids, sentence_scores
+
+
 class BlockGuard(object):
     """
     BlockGuard class.
@@ -717,7 +888,7 @@ class StaticRNNGuard(BlockGuard):
 
     def __init__(self, rnn):
         if not isinstance(rnn, StaticRNN):
-            raise TypeError("StaticRNNGuard takes an StaticRNN")
+            raise TypeError("StaticRNNGuard takes a StaticRNN")
         super(StaticRNNGuard, self).__init__(rnn.helper.main_program)
         self.rnn = rnn
 
@@ -964,6 +1135,82 @@ class StaticRNN(object):
             })
 
 
+class WhileGuard(BlockGuard):
+    def __init__(self, while_op):
+        if not isinstance(while_op, While):
+            raise TypeError("WhileGuard takes a while op")
+        super(WhileGuard, self).__init__(while_op.helper.main_program)
+        self.while_op = while_op  # the While whose status this guard drives
+
+    def __enter__(self):
+        self.while_op.status = While.IN_WHILE_BLOCK  # mark before entering
+        return super(WhileGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False  # skip complete(); False lets the exception propagate
+        self.while_op.status = While.AFTER_WHILE_BLOCK
+        self.while_op.complete()  # appends the 'while' op to the parent block
+        return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class While(object):
+    BEFORE_WHILE_BLOCK = 0  # initial status, set in __init__
+    IN_WHILE_BLOCK = 1  # set by WhileGuard.__enter__
+    AFTER_WHILE_BLOCK = 2  # set by WhileGuard.__exit__ when no exception
+
+    def __init__(self, cond, name=None, main_program=None):
+        self.helper = LayerHelper("while", name=name, main_program=main_program)
+        self.status = While.BEFORE_WHILE_BLOCK
+        if not isinstance(cond, Variable):
+            raise TypeError("condition should be a variable")
+        assert isinstance(cond, Variable)  # redundant after the check above
+        if cond.data_type != core.DataType.BOOL:
+            raise TypeError("condition should be a bool variable")
+        if reduce(lambda a, b: a * b, cond.shape, 1) != 1:  # element count
+            raise TypeError("condition should be a bool scalar")
+        self.cond_var = cond
+
+    def block(self):
+        return WhileGuard(self)  # intended for use as `with w.block():`
+
+    def complete(self):
+        main_program = self.helper.main_program
+        while_block = main_program.current_block()
+        parent_block = main_program.block(main_program.current_block()
+                                          .parent_idx)
+        # The condition is seeded as written, so it is never collected as 'X'.
+        inner_outputs = {self.cond_var.name}
+        x_name_list = set()
+        for op in while_block.ops:
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in inner_outputs:
+                        x_name_list.add(in_var_name)  # read before any write
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    inner_outputs.add(out_var_name)
+        # Of the names written inside, only parent-block variables become 'Out'.
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in parent_block.vars:
+                out_vars.append(parent_block.var(inner_out_name))
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        parent_block.append_op(
+            type='while',
+            inputs={
+                'X': [parent_block.var(x_name) for x_name in x_name_list],
+                'Condition': [self.cond_var]
+            },
+            outputs={'Out': out_vars,
+                     'StepScopes': [step_scope]},
+            attrs={'step_block': while_block})
+
+
 def lstm(x,
          c_pre_init,
          hidden_dim,
@@ -1102,10 +1349,10 @@ def increment(x, value=1.0, in_place=True, main_program=None):
     operation is performed in-place by default.
     """
     helper = LayerHelper("increment", **locals())
-    if in_place:
-        out = x
-    else:
+    if not in_place:
         out = helper.create_tmp_variable(dtype=x.data_type)
+    else:
+        out = x
     helper.append_op(
         type='increment',
         inputs={'X': [x]},
@@ -1133,6 +1380,26 @@ def array_write(x, i, array=None, main_program=None):
     return array
 
 
+def create_array(dtype, main_program=None):
+    helper = LayerHelper("array", **locals())
+    return helper.create_variable(
+        name="{0}.out".format(helper.name),  # "<helper name>.out"
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=dtype)
+
+
+def less_than(x, y, cond=None, main_program=None):
+    helper = LayerHelper("less_than", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True  # only set on the variable created here
+    # Append the op; its 'Out' is `cond`, which is also the return value.
+    helper.append_op(
+        type='less_than', inputs={'X': [x],
+                                  'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
 def array_read(array, i, main_program=None):
     """
     This function creates an operator to read the data in as a
@@ -1180,3 +1447,73 @@ def array_length(array, main_program=None):
     helper.append_op(
         type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
     return tmp
+
+
+class ConditionalBlockGuard(BlockGuard):
+    def __init__(self, block):
+        if not isinstance(block, ConditionalBlock):
+            raise TypeError("block should be conditional block")
+        super(ConditionalBlockGuard, self).__init__(block.helper.main_program)
+        self.block = block
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        # Only emit the conditional_block op when the body finished cleanly,
+        # mirroring WhileGuard; __enter__ is inherited from BlockGuard as-is.
+        if exc_type is None:
+            self.block.complete()
+        return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val,
+                                                           exc_tb)
+
+
+class ConditionalBlock(object):
+    def __init__(self, inputs, name=None, main_program=None):
+        for each_input in inputs:
+            if not isinstance(each_input, Variable):
+                raise TypeError("Each input should be variable")
+        self.inputs = inputs
+        self.helper = LayerHelper(
+            'conditional_block', name=name, main_program=main_program)
+
+    def block(self):
+        return ConditionalBlockGuard(self)
+
+    def complete(self):
+        inside_block = self.helper.main_program.current_block()
+        parent_block = self.helper.main_program.block(inside_block.parent_idx)
+        # intermediate: names written in the block; params: names read first.
+        intermediate = set()
+        params = set()
+
+        for each_op in inside_block.ops:
+            assert isinstance(each_op, Operator)
+            for iname in each_op.input_names:
+                for in_var_name in each_op.input(iname):
+                    if in_var_name not in intermediate:
+                        params.add(in_var_name)
+
+            for oname in each_op.output_names:
+                for out_var_name in each_op.output(oname):
+                    intermediate.add(out_var_name)
+        input_set = set([ipt.name for ipt in self.inputs])
+        # Names already passed as 'X' are not repeated in 'Params'.
+        param_list = [
+            parent_block.var(each_name) for each_name in params
+            if each_name not in input_set
+        ]
+        # 'Out' lists the parent-block variables that ops in this block write.
+        out_list = [
+            parent_block.var(var_name) for var_name in parent_block.vars
+            if var_name in intermediate
+        ]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+        parent_block.append_op(
+            type='conditional_block',
+            inputs={
+                'X': self.inputs,
+                'Params': param_list,
+            },
+            outputs={'Out': out_list,
+                     'Scope': [step_scope]},
+            attrs={'block': inside_block})
diff --git a/python/paddle/v2/framework/net_drawer.py b/python/paddle/v2/fluid/net_drawer.py
similarity index 96%
rename from python/paddle/v2/framework/net_drawer.py
rename to python/paddle/v2/fluid/net_drawer.py
index 045e267c253e2485e75df3fb95cc0e591ee29ea5..17ad547c2bb5b79ef8225dd1a8f1ef49a6572508 100644
--- a/python/paddle/v2/framework/net_drawer.py
+++ b/python/paddle/v2/fluid/net_drawer.py
@@ -3,8 +3,8 @@ import json
 import logging
 from collections import defaultdict
 
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
diff --git a/python/paddle/v2/framework/nets.py b/python/paddle/v2/fluid/nets.py
similarity index 98%
rename from python/paddle/v2/framework/nets.py
rename to python/paddle/v2/fluid/nets.py
index 725d2fa7f5e7a862eea0ef9172a9e63858ebd0dd..5e14ca594bc7965dc29039ba57bb7b26b1ce6871 100644
--- a/python/paddle/v2/framework/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.layers as layers
+import paddle.v2.fluid.layers as layers
 
 __all__ = ["simple_img_conv_pool", "sequence_conv_pool"]
 
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/fluid/op.py
similarity index 98%
rename from python/paddle/v2/framework/op.py
rename to python/paddle/v2/fluid/op.py
index bc771a964adf9f97cbeae87c06ce954c76051150..5828803497ec06bc7644da18ca752f61469ca53f 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/fluid/op.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 
 def get_all_op_protos():
diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/fluid/optimizer.py
similarity index 84%
rename from python/paddle/v2/framework/optimizer.py
rename to python/paddle/v2/fluid/optimizer.py
index 5b4cdecf2c4285618131657a09fbe437191ea75a..4252a6f08509fec92ac5c45d32169232e1dd190f 100644
--- a/python/paddle/v2/framework/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -1,11 +1,11 @@
 from collections import defaultdict
 
-import paddle.v2.framework.framework as framework
-from paddle.v2.framework.framework import unique_name, Program
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.initializer import ConstantInitializer
-from paddle.v2.framework.regularizer import append_regularization_ops
-from paddle.v2.framework.layer_helper import LayerHelper
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.framework import unique_name, Program
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.initializer import ConstantInitializer
+from paddle.v2.fluid.regularizer import append_regularization_ops
+from paddle.v2.fluid.layer_helper import LayerHelper
 
 __all__ = [
     'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
@@ -35,15 +35,21 @@ class Optimizer(object):
         """
         raise NotImplementedError()
 
-    def _initialize_tensors(self, block):
-        """Create all necessary tensors, that will be shared for all parameter updates.
-
-        Tensors like learning rate should be initialized here.
-
-        Args:
-            block: the block in which the loss variable is present
-        """
-        pass
+    def _create_param_lr(self, param_and_grad):
+        # Build a separate learning-rate variable for this parameter.
+        param = param_and_grad[0]
+        param_lr = param.optimize_attr['learning_rate']  # per-parameter factor
+        param_lr_shape = [1]
+        param_lr_var = self.helper.create_global_variable(
+            name=unique_name("learning_rate"),
+            dtype='float32',
+            shape=param_lr_shape,
+            lod_level=1,
+            persistable=True)
+        param_lr = param_lr * self._learning_rate  # value used to initialize it
+        self.helper.set_variable_initializer(
+            var=param_lr_var, initializer=ConstantInitializer(param_lr))
+        return param_lr_var
 
     def _create_accumulators(self, block, parameters):
         """Create all accumulators needed by the parameters
@@ -161,8 +167,6 @@ class Optimizer(object):
             startup_program=startup_program)
         self._create_accumulators(loss.block,
                                   [p[0] for p in parameters_and_grads])
-        # Create any necessary tensors
-        self._initialize_tensors(loss.block)
 
         optimize_ops = []
         for param_and_grad in parameters_and_grads:
@@ -214,27 +218,16 @@ class SGDOptimizer(Optimizer):
         self.type = "sgd"
         self._learning_rate = learning_rate
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
+
         # create the optimize op
         sgd_op = block.append_op(
             type=self.type,
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
             },
             outputs={"ParamOut": param_and_grad[0]})
 
@@ -259,19 +252,6 @@ class MomentumOptimizer(Optimizer):
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)
 
-    def _initialize_tensors(self, block):
-        assert isinstance(block, framework.Block)
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
@@ -290,7 +270,7 @@ class MomentumOptimizer(Optimizer):
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
                 "Velocity": velocity_acc,
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
             },
             outputs={
                 "ParamOut": param_and_grad[0],
@@ -315,18 +295,6 @@ class AdagradOptimizer(Optimizer):
         self._learning_rate = learning_rate
         self._epsilon = epsilon
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
@@ -346,7 +314,7 @@ class AdagradOptimizer(Optimizer):
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
                 "Moment": moment_acc,
-                "LearningRate": self._lr
+                "LearningRate": self._create_param_lr(param_and_grad)
             },
             outputs={"ParamOut": param_and_grad[0],
                      "MomentOut": moment_acc},
@@ -378,18 +346,6 @@ class AdamOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
 
@@ -433,7 +389,7 @@ class AdamOptimizer(Optimizer):
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": self._lr,
+                "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment1": moment1,
                 "Moment2": moment2,
                 "Beta1Pow": self._beta1_pow_acc,
@@ -495,18 +451,6 @@ class AdamaxOptimizer(Optimizer):
         self._beta2 = beta2
         self._epsilon = epsilon
 
-    def _initialize_tensors(self, block):
-        lr_shape = [1]
-        # create a variable for learning_rate
-        self._lr = self.helper.create_global_variable(
-            name=unique_name("learning_rate"),
-            dtype='float32',
-            shape=lr_shape,
-            lod_level=1,
-            persistable=True)
-        self.helper.set_variable_initializer(
-            var=self._lr, initializer=ConstantInitializer(self._learning_rate))
-
     def _create_accumulators(self, block, parameters):
         # Create beta1 power accumulator tensor
         beta_shape = [1]
@@ -536,7 +480,7 @@ class AdamaxOptimizer(Optimizer):
             inputs={
                 "Param": param_and_grad[0],
                 "Grad": param_and_grad[1],
-                "LearningRate": self._lr,
+                "LearningRate": self._create_param_lr(param_and_grad),
                 "Moment": moment,
                 "InfNorm": inf_norm,
                 "Beta1Pow": self._beta1_pow_acc
diff --git a/python/paddle/v2/framework/regularizer.py b/python/paddle/v2/fluid/regularizer.py
similarity index 98%
rename from python/paddle/v2/framework/regularizer.py
rename to python/paddle/v2/fluid/regularizer.py
index 5111ac5566feb7d334ff4cd8e70daa0cfbd6e552..098cd0dd6439554f49e429ab75fb11bfa2c9d28c 100644
--- a/python/paddle/v2/framework/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.framework as framework
+import paddle.v2.fluid.framework as framework
 
 __all__ = [
     'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer'
diff --git a/python/paddle/v2/framework/tests/.gitignore b/python/paddle/v2/fluid/tests/.gitignore
similarity index 100%
rename from python/paddle/v2/framework/tests/.gitignore
rename to python/paddle/v2/fluid/tests/.gitignore
diff --git a/python/paddle/v2/fluid/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e795627bfe9e8ad0c196349a332e62e975f20aa3
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
@@ -0,0 +1,7 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")  # file names relative to this dir
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")  # removes every ".py" occurrence in the list
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)  # one py_test call per globbed name
+endforeach()
+# The book/ subdirectory is processed through its own CMakeLists.txt.
+add_subdirectory(book)
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
similarity index 100%
rename from python/paddle/v2/framework/tests/CMakeLists.txt
rename to python/paddle/v2/fluid/tests/book/CMakeLists.txt
diff --git a/python/paddle/v2/framework/tests/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
similarity index 87%
rename from python/paddle/v2/framework/tests/test_fit_a_line.py
rename to python/paddle/v2/fluid/tests/book/test_fit_a_line.py
index 6e09b88dca34de2579131e7bdc16b26cf6cde49c..5ef963bffa4e4fa3992e1f811d7f514662809410 100644
--- a/python/paddle/v2/framework/tests/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.io import save_persistables, load_persistables
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.io import save_persistables, load_persistables
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_image_classification_train.py
rename to python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index a4165da9703c55ae3347123409407f0cae30856f..e253b8d27fd29746b41d82a63b11485032e77ebb 100644
--- a/python/paddle/v2/framework/tests/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -1,12 +1,12 @@
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.framework.core as core
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_startup_program, g_main_program
-from paddle.v2.framework.initializer import XavierInitializer
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import g_startup_program, g_main_program
+from paddle.v2.fluid.initializer import XavierInitializer
 
 
 def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
similarity index 90%
rename from python/paddle/v2/framework/tests/test_recognize_digits_conv.py
rename to python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
index 66c629eb4261a9b971f25611d8e49f0cb671304a..2b723125412c17f3805ee3cae046b0788aa34997 100644
--- a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
similarity index 88%
rename from python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
rename to python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
index 076cf882160cd53f45ef291d82ba57ada843a287..2e1a9f236b6621c7334a9eb04272a6eb69c86fab 100644
--- a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -1,12 +1,12 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
-
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.regularizer import L2DecayRegularizer
-from paddle.v2.framework.initializer import UniformInitializer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.regularizer import L2DecayRegularizer
+from paddle.v2.fluid.initializer import UniformInitializer
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_recommender_system.py
rename to python/paddle/v2/fluid/tests/book/test_recommender_system.py
index 31562b4391d16b831d53801cfa21c7bdf8c3ab8d..4708dfe3e9209a3254a9e1903cbedf07ebc5d2d0 100644
--- a/python/paddle/v2/framework/tests/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
similarity index 90%
rename from python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
rename to python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
index eb377e9264b6031e9bf484a90b7c2b39442407f1..dc4b63da9b37aff55fc6362f239e3e61004a3866 100644
--- a/python/paddle/v2/framework/tests/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program, g_main_program, g_startup_program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program, g_main_program, g_startup_program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d507f4c8e39ba039603a5b7618e7a82d1dcb21b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
@@ -0,0 +1,110 @@
+import paddle.v2 as paddle
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
+
+from paddle.v2.fluid.framework import Program, g_main_program, g_startup_program
+from paddle.v2.fluid.executor import Executor
+
+import numpy as np
+
+
+def stacked_lstm_net(input_dim,  # first dimension of the embedding table
+                     class_dim=2,  # size of the final softmax layer
+                     emb_dim=128,  # second dimension of the embedding table
+                     hid_dim=512,  # size of every fc / dynamic_lstm layer
+                     stacked_num=3):  # number of fc+lstm pairs; must be odd
+    assert stacked_num % 2 == 1
+    data = layers.data(name="words", shape=[1], data_type="int64")
+    label = layers.data(name="label", shape=[1], data_type="int64")
+
+    emb = layers.embedding(input=data, size=[input_dim, emb_dim])
+    # add bias attr
+
+    # TODO(qijun) linear act
+    fc1 = layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = layers.dynamic_lstm(input=fc1, size=hid_dim)  # cell1 is not used afterwards
+
+    inputs = [fc1, lstm1]  # each further layer consumes the previous [fc, lstm] pair
+
+    for i in range(2, stacked_num + 1):
+        fc = layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)  # even-numbered layers run reversed
+        inputs = [fc, lstm]
+
+    fc_last = layers.sequence_pool(input=inputs[0], pool_type='max')  # max-pool the last fc output
+    lstm_last = layers.sequence_pool(input=inputs[1], pool_type='max')  # max-pool the last lstm output
+
+    prediction = layers.fc(input=[fc_last, lstm_last],
+                           size=class_dim,
+                           act='softmax')
+    cost = layers.cross_entropy(input=prediction, label=label)
+    avg_cost = layers.mean(x=cost)
+    adam_optimizer = optimizer.AdamOptimizer(learning_rate=0.002)
+    opts = adam_optimizer.minimize(avg_cost)  # the returned value is not used below
+    acc = layers.accuracy(input=prediction, label=label)
+    return avg_cost, acc  # the mean cross-entropy variable and the accuracy variable
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    print "load word dict successfully"
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    cost, acc = stacked_lstm_net(input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(g_startup_program)
+
+    for pass_id in xrange(PASS_NUM):
+        for data in train_data():
+            tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
+
+            label = np.array(map(lambda x: x[1], data)).astype("int64")
+            label = label.reshape([BATCH_SIZE, 1])
+
+            tensor_label = core.LoDTensor()
+            tensor_label.set(label, place)
+
+            outs = exe.run(g_main_program,
+                           feed={"words": tensor_words,
+                                 "label": tensor_label},
+                           fetch_list=[cost, acc])
+            cost_val = np.array(outs[0])
+            acc_val = np.array(outs[1])
+
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+            if cost_val < 1.0 and acc_val > 0.7:
+                exit(0)
+    exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
rename to python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
index 26cbd01bc04916e53554e6f70bee7bcf25d6371c..848dcce974a107402c33013e9f84211fd4979e21 100644
--- a/python/paddle/v2/framework/tests/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -1,10 +1,10 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import g_main_program, g_startup_program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import g_main_program, g_startup_program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_word2vec.py
rename to python/paddle/v2/fluid/tests/book/test_word2vec.py
index cb9fc2ab62b56348db7a320f7d40d2f0a7bf9d21..054dbd5a3d090ba8a08e8f101de11c69ddd36d8a 100644
--- a/python/paddle/v2/framework/tests/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -1,10 +1,10 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
 
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
similarity index 96%
rename from python/paddle/v2/framework/tests/op_test.py
rename to python/paddle/v2/fluid/tests/op_test.py
index 2e6710b5fcfe5a531067498e38a4cb93d3165602..90269e308a31d2606b23d741ce0d0fa91a0a6aeb 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -2,12 +2,12 @@ import unittest
 import numpy as np
 import random
 import itertools
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import collections
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.op import Operator
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import Program, OpProtoHolder
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import Program, OpProtoHolder
 
 
 def randomize_probability(batch_size, class_num, dtype='float32'):
@@ -215,7 +215,11 @@ class OpTest(unittest.TestCase):
             if isinstance(input_vars[var_name], list):
                 for name, np_value in self.inputs[var_name]:
                     tensor = core.LoDTensor()
-                    tensor.set(np_value, place)
+                    if isinstance(np_value, tuple):
+                        tensor.set(np_value[0], place)
+                        tensor.set_lod(np_value[1])
+                    else:
+                        tensor.set(np_value, place)
                     feed_map[name] = tensor
             else:
                 tensor = core.LoDTensor()
@@ -236,7 +240,6 @@ class OpTest(unittest.TestCase):
 
         inputs = append_input_output(block, op_proto, self.inputs, True)
         outputs = append_input_output(block, op_proto, self.outputs, False)
-
         op = block.append_op(
             type=self.op_type,
             inputs=inputs,
@@ -397,9 +400,11 @@ class OpTest(unittest.TestCase):
                 if not isinstance(item[0], basestring):
                     item = [[param_name] + list(item)]
                 if len(item) == 2:
-                    # only set var name and value, set lod to None
-                    var[i] = list(item) + [None]
-
+                    if isinstance(item[1], tuple):
+                        var[i] = [item[0], item[1][0], item[1][1]]
+                    else:
+                        # only set var name and value, set lod to None
+                        var[i] = list(item) + [None]
             var_descs = [(block.create_var(
                 name=name, shape=each.shape, dtype=each.dtype), each, lod)
                          for name, each, lod in var]
diff --git a/python/paddle/v2/framework/tests/test_accuracy_op.py b/python/paddle/v2/fluid/tests/test_accuracy_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_accuracy_op.py
rename to python/paddle/v2/fluid/tests/test_accuracy_op.py
diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_activation_op.py
rename to python/paddle/v2/fluid/tests/test_activation_op.py
diff --git a/python/paddle/v2/framework/tests/test_adadelta_op.py b/python/paddle/v2/fluid/tests/test_adadelta_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adadelta_op.py
rename to python/paddle/v2/fluid/tests/test_adadelta_op.py
diff --git a/python/paddle/v2/framework/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adagrad_op.py
rename to python/paddle/v2/fluid/tests/test_adagrad_op.py
diff --git a/python/paddle/v2/framework/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adam_op.py
rename to python/paddle/v2/fluid/tests/test_adam_op.py
diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/fluid/tests/test_adamax_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_adamax_op.py
rename to python/paddle/v2/fluid/tests/test_adamax_op.py
diff --git a/python/paddle/v2/framework/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
similarity index 91%
rename from python/paddle/v2/framework/tests/test_array_read_write_op.py
rename to python/paddle/v2/fluid/tests/test_array_read_write_op.py
index 79e9938216e2abda5432e525804b0bcb9a655655..e019a4e15f0e25deaedf30911b44e576c8f89013 100644
--- a/python/paddle/v2/framework/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -1,9 +1,9 @@
 import unittest
-import paddle.v2.framework.core as core
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.framework import g_main_program
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.framework import g_main_program
 import numpy
 
 
diff --git a/python/paddle/v2/fluid/tests/test_assign_op.py b/python/paddle/v2/fluid/tests/test_assign_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b0c145f1a69678b228bc70e4e4e273f5bcf9888
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_assign_op.py
@@ -0,0 +1,21 @@
+import op_test
+import numpy
+import unittest
+
+
+class TestAssignOp(op_test.OpTest):  # output and gradient checks for the "assign" op
+    def setUp(self):
+        self.op_type = "assign"
+        x = numpy.random.random(size=(100, 10))  # random 100 x 10 input
+        self.inputs = {'X': x}
+        self.outputs = {'Out': x}  # the expected output is the input array itself
+
+    def test_forward(self):
+        self.check_output()
+
+    def test_backward(self):
+        self.check_grad(['X'], 'Out')  # gradient check with input 'X' and output 'Out'
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_auc_op.py b/python/paddle/v2/fluid/tests/test_auc_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_auc_op.py
rename to python/paddle/v2/fluid/tests/test_auc_op.py
diff --git a/python/paddle/v2/framework/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
similarity index 99%
rename from python/paddle/v2/framework/tests/test_batch_norm_op.py
rename to python/paddle/v2/fluid/tests/test_batch_norm_op.py
index dee339f43c2ee33fc8a691e0915bddf2c1679285..71f9599e0de83c86808f7e62547f80d3d50ffc7d 100644
--- a/python/paddle/v2/framework/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -1,8 +1,8 @@
 import unittest
 import numpy as np
 from op_test import OpTest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 
 
 def grad_var_name(var_name):
diff --git a/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..080ca43b8269e0f6a9f4d0ce3973f4d4a07a8e2a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
@@ -0,0 +1,37 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestBilinearTensorProductOp(OpTest):  # checks "bilinear_tensor_product" against a numpy reference
+    def setUp(self):
+        self.op_type = "bilinear_tensor_product"
+        batch_size = 6
+        size0 = 3  # width of X
+        size1 = 4  # width of Y
+        size2 = 5  # width of the output
+        a = np.random.random((batch_size, size0)).astype("float32")
+        b = np.random.random((batch_size, size1)).astype("float32")
+        w = np.random.random((size2, size0, size1)).astype("float32")
+        bias = np.random.random((1, size2)).astype("float32")
+        output = np.zeros((batch_size, size2)).astype("float32")
+        for i in range(size2):
+            w_i = w[i, :, :]  # the (size0, size1) weight slice for output column i
+            output[:, i] = np.sum(np.matmul(a, w_i) * b, axis=1)  # per row n: a[n] . w_i . b[n]
+        self.inputs = {
+            'X': a,
+            'Y': b,
+            'Weight': w,
+            'Bias': bias,
+        }
+        self.outputs = {'Out': output + bias}  # bias of shape (1, size2) broadcasts over the batch
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y', 'Weight', 'Bias'], 'Out')  # gradient check over all four inputs
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_cast_op.py b/python/paddle/v2/fluid/tests/test_cast_op.py
similarity index 93%
rename from python/paddle/v2/framework/tests/test_cast_op.py
rename to python/paddle/v2/fluid/tests/test_cast_op.py
index 52ee71a8a4058a1367d9e493e02d8f2469ccfc9f..0c4b6310652e84d3dd7f281a8b98ae0435072afb 100644
--- a/python/paddle/v2/framework/tests/test_cast_op.py
+++ b/python/paddle/v2/fluid/tests/test_cast_op.py
@@ -1,7 +1,7 @@
 import op_test
 import unittest
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestCastOp(op_test.OpTest):
diff --git a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..48673296a67716c4de804da533f0fd2567f10e2e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
@@ -0,0 +1,179 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class Segment(object):
+    def __init__(self, chunk_type, start_idx, end_idx):
+        self.chunk_type = chunk_type
+        self.start_idx = start_idx
+        self.end_idx = end_idx
+
+    def __str__(self):
+        return '(Segment: %s, %s, %s)' % (self.chunk_type, self.start_idx,
+                                          self.end_idx)
+
+    __repr__ = __str__
+
+
+class TestChunkEvalOp(OpTest):  # checks "chunk_eval" precision/recall/F1 on randomly generated chunks
+    num_sequences = 5  # number of sequences the batch is split into
+    batch_size = 50  # total number of positions over all sequences
+
+    def parse_scheme(self):  # derive num_tag_types from self.scheme
+        if self.scheme == 'IOB':
+            self.num_tag_types = 2
+        elif self.scheme == 'IOE':
+            self.num_tag_types = 2
+
+    def fill_with_chunks(self, data, chunks):  # write the tag ids of every chunk into data, in place
+        for chunk in chunks:
+            if self.scheme == 'IOB':
+                data[chunk.start_idx] = chunk.chunk_type * self.num_tag_types  # first tag of the type at the start
+                data[chunk.start_idx + 1:
+                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                         self.num_tag_types - 1)
+                data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                    self.num_tag_types - 1
+                ) if chunk.start_idx < chunk.end_idx else data[chunk.start_idx]  # one-position chunk keeps its start tag
+            elif self.scheme == 'IOE':
+                data[chunk.start_idx:
+                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types
+                data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                    self.num_tag_types - 1)  # last tag of the type at the (inclusive) end
+
+    def rand_chunks(self, starts, num_chunks):  # draw num_chunks non-overlapping Segments within the sequences
+        if num_chunks < 0:  # negative count: draw a random one below starts[-1]
+            num_chunks = np.random.randint(starts[-1])
+        chunks = []
+        # generate chunk beginnings
+        chunk_begins = sorted(
+            np.random.choice(
+                range(starts[-1]), num_chunks, replace=False))  # distinct positions, ascending
+        seq_chunk_begins = []
+        begin_idx = 0
+        # divide chunks into sequences
+        for i in range(len(starts) - 1):
+            tmp_chunk_begins = []
+            while begin_idx < len(chunk_begins) and chunk_begins[
+                    begin_idx] < starts[i + 1]:
+                tmp_chunk_begins.append(chunk_begins[begin_idx])
+                begin_idx += 1
+            seq_chunk_begins.append(tmp_chunk_begins)
+        # generate chunk ends
+        chunk_ends = []
+        for i in range(len(seq_chunk_begins)):
+            for j in range(len(seq_chunk_begins[i])):
+                low = seq_chunk_begins[i][j]
+                high = seq_chunk_begins[i][j + 1] if j < len(seq_chunk_begins[
+                    i]) - 1 else starts[i + 1]  # next begin in this sequence, else the sequence end
+                chunk_ends.append(np.random.randint(low, high))  # end lies in [low, high)
+        # generate chunks
+        for chunk_pos in zip(chunk_begins, chunk_ends):
+            chunk_type = np.random.randint(self.num_chunk_types)
+            chunks.append(Segment(chunk_type, *chunk_pos))
+        return chunks
+
+    def gen_chunks(self, infer, label, starts):  # fill infer/label in place; return the (correct, infer, label) counts
+        chunks = self.rand_chunks(starts,
+                                  self.num_infer_chunks + self.num_label_chunks
+                                  - self.num_correct_chunks)  # size of the union of infer and label chunks
+        correct_chunks = np.random.choice(
+            range(len(chunks)), self.num_correct_chunks, replace=False)  # written to both infer and label
+        infer_chunks = np.random.choice(
+            [x for x in range(len(chunks)) if x not in correct_chunks],
+            self.num_infer_chunks - self.num_correct_chunks,
+            replace=False)
+        infer_chunks = sorted(correct_chunks.tolist() + infer_chunks.tolist())
+        label_chunks = np.random.choice(
+            [x for x in range(len(chunks)) if x not in infer_chunks],
+            self.num_label_chunks - self.num_correct_chunks,
+            replace=False)
+        label_chunks = sorted(correct_chunks.tolist() + label_chunks.tolist())
+        self.fill_with_chunks(infer, [chunks[idx] for idx in infer_chunks])
+        self.fill_with_chunks(label, [chunks[idx] for idx in label_chunks])
+        # exclude types in excluded_chunk_types
+        if len(self.excluded_chunk_types) > 0:
+            for idx in correct_chunks:
+                if chunks[idx].chunk_type in self.excluded_chunk_types:
+                    self.num_correct_chunks -= 1
+            for idx in infer_chunks:
+                if chunks[idx].chunk_type in self.excluded_chunk_types:
+                    self.num_infer_chunks -= 1
+            for idx in label_chunks:
+                if chunks[idx].chunk_type in self.excluded_chunk_types:
+                    self.num_label_chunks -= 1
+        return self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks
+
+    def set_confs(self):  # scheme, op attributes and target chunk counts; overridden by subclasses
+        # Use the IOB scheme and labels with 2 chunk types
+        self.scheme = 'IOB'
+        self.num_chunk_types = 2
+        self.excluded_chunk_types = []
+        self.other_chunk_type = self.num_chunk_types  # not read anywhere else in this class
+        self.attrs = {
+            'num_chunk_types': self.num_chunk_types,
+            'chunk_scheme': self.scheme,
+            'excluded_chunk_types': self.excluded_chunk_types
+        }
+        self.parse_scheme()
+        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9
+
+    def set_data(self):  # build the op inputs and the expected Precision / Recall / F1-Score
+        infer = np.zeros((self.batch_size, )).astype('int32')
+        infer.fill(self.num_chunk_types * self.num_tag_types)  # tag id of positions not covered by any chunk
+        label = np.copy(infer)
+        starts = np.random.choice(
+            range(1, self.batch_size), self.num_sequences - 1,
+            replace=False).tolist()  # distinct interior sequence boundaries
+        starts.extend([0, self.batch_size])
+        starts = sorted(starts)
+        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
+            infer, label, starts)
+        self.inputs = {
+            'Inference': (infer, [starts]),  # (data, lod) pairs
+            'Label': (label, [starts])
+        }
+        precision = float(
+            self.num_correct_chunks
+        ) / self.num_infer_chunks if self.num_infer_chunks else 0  # 0 when there are no infer chunks
+        recall = float(self.num_correct_chunks
+                       ) / self.num_label_chunks if self.num_label_chunks else 0  # 0 when there are no label chunks
+        f1 = float(2 * precision * recall) / (
+            precision + recall) if self.num_correct_chunks else 0  # 0 when nothing is correct
+        self.outputs = {
+            'Precision': np.asarray(
+                [precision], dtype='float32'),
+            'Recall': np.asarray(
+                [recall], dtype='float32'),
+            'F1-Score': np.asarray(
+                [f1], dtype='float32')
+        }
+
+    def setUp(self):
+        self.op_type = 'chunk_eval'
+        self.set_confs()
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestChunkEvalOpWithExclude(TestChunkEvalOp):  # same test with the IOE scheme and an excluded chunk type
+    def set_confs(self):
+        # Use the IOE scheme and labels with 3 chunk types
+        self.scheme = 'IOE'
+        self.num_chunk_types = 3
+        self.excluded_chunk_types = [1]  # chunks of type 1 are subtracted from the expected counts
+        self.other_chunk_type = self.num_chunk_types
+        self.attrs = {
+            'num_chunk_types': self.num_chunk_types,
+            'chunk_scheme': self.scheme,
+            'excluded_chunk_types': self.excluded_chunk_types
+        }
+        self.parse_scheme()
+        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 15, 18, 20
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_clip_by_norm_op.py b/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_clip_by_norm_op.py
rename to python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
diff --git a/python/paddle/v2/framework/tests/test_clip_op.py b/python/paddle/v2/fluid/tests/test_clip_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_clip_op.py
rename to python/paddle/v2/fluid/tests/test_clip_op.py
diff --git a/python/paddle/v2/framework/tests/test_compare_op.py b/python/paddle/v2/fluid/tests/test_compare_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_compare_op.py
rename to python/paddle/v2/fluid/tests/test_compare_op.py
diff --git a/python/paddle/v2/framework/tests/test_concat_op.py b/python/paddle/v2/fluid/tests/test_concat_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_concat_op.py
rename to python/paddle/v2/fluid/tests/test_concat_op.py
diff --git a/python/paddle/v2/framework/tests/test_cond_op.py b/python/paddle/v2/fluid/tests/test_cond_op.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_cond_op.py
rename to python/paddle/v2/fluid/tests/test_cond_op.py
index 09a3f5dc97c342fc61cd407bb338c1696e8d6c76..9d1df44b9065f8101e90b87815660f8c0818645f 100644
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/fluid/tests/test_cond_op.py
@@ -1,8 +1,8 @@
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
-from paddle.v2.framework.op import Operator, CondOp
+from paddle.v2.fluid.op import Operator, CondOp
 
 
 class PySimpleCond(object):
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..293803f004a1513611fba30634d5552e1da84fef
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -0,0 +1,40 @@
+import unittest
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import g_startup_program, g_main_program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+import numpy
+
+
+class ConditionalBlock(unittest.TestCase):
+    def test_forward(self):
+        data = layers.data(name='X', shape=[1], data_type='float32')
+        data.stop_gradient = False
+        cond = layers.ConditionalBlock(inputs=[data])
+        out = layers.create_tensor(dtype='float32')
+        with cond.block():
+            hidden = layers.fc(input=data, size=10)
+            layers.assign(hidden, out)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        exe.run(g_startup_program)
+
+        x = core.LoDTensor()
+        x.set(numpy.random.random(size=(10, 1)).astype('float32'), cpu)
+
+        outs = map(numpy.array, exe.run(feed={'X': x}, fetch_list=[out]))[0]
+        print outs
+        loss = layers.mean(x=out)
+        append_backward_ops(loss=loss)
+        outs = map(numpy.array,
+                   exe.run(feed={'X': x},
+                           fetch_list=[
+                               g_main_program.block(0).var(data.name + "@GRAD")
+                           ]))[0]
+        print outs
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv2d_op.py
rename to python/paddle/v2/fluid/tests/test_conv2d_op.py
diff --git a/python/paddle/v2/framework/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv2d_transpose_op.py
rename to python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
diff --git a/python/paddle/v2/framework/tests/test_conv3d_op.py b/python/paddle/v2/fluid/tests/test_conv3d_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv3d_op.py
rename to python/paddle/v2/fluid/tests/test_conv3d_op.py
diff --git a/python/paddle/v2/framework/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv3d_transpose_op.py
rename to python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
diff --git a/python/paddle/v2/framework/tests/test_conv_shift_op.py b/python/paddle/v2/fluid/tests/test_conv_shift_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_conv_shift_op.py
rename to python/paddle/v2/fluid/tests/test_conv_shift_op.py
diff --git a/python/paddle/v2/framework/tests/test_cos_sim_op.py b/python/paddle/v2/fluid/tests/test_cos_sim_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_cos_sim_op.py
rename to python/paddle/v2/fluid/tests/test_cos_sim_op.py
diff --git a/python/paddle/v2/fluid/tests/test_create_op_doc_string.py b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
new file mode 100644
index 0000000000000000000000000000000000000000..42b6f7a3616bbce53a8cae68a5fc1eda411a7422
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
@@ -0,0 +1,11 @@
+import unittest
+import paddle.v2.fluid.layers as layers
+
+
+class TestDocString(unittest.TestCase):
+    def test_layer_doc_string(self):
+        print layers.dropout.__doc__
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_crf_decoding_op.py b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_crf_decoding_op.py
rename to python/paddle/v2/fluid/tests/test_crf_decoding_op.py
diff --git a/python/paddle/v2/framework/tests/test_crop_op.py b/python/paddle/v2/fluid/tests/test_crop_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_crop_op.py
rename to python/paddle/v2/fluid/tests/test_crop_op.py
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_cross_entropy_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_cross_entropy_op.py
rename to python/paddle/v2/fluid/tests/test_cross_entropy_op.py
diff --git a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py b/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
rename to python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
similarity index 94%
rename from python/paddle/v2/framework/tests/test_default_scope_funcs.py
rename to python/paddle/v2/fluid/tests/test_default_scope_funcs.py
index 09a9850d054e3d7e6bf6db363fc577bdff8e9f43..738e69529ea447e87516d5e0efc098910b966ded 100644
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
@@ -1,4 +1,4 @@
-from paddle.v2.framework.default_scope_funcs import *
+from paddle.v2.fluid.default_scope_funcs import *
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_dropout_op.py
rename to python/paddle/v2/fluid/tests/test_dropout_op.py
diff --git a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
rename to python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
index 70af9dbc49f5ff3222cf3d549a110931140b43c4..c2d8b48ea944ae40a451492b8e9fad38dda0835c 100644
--- a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py
@@ -1,7 +1,7 @@
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
-from paddle.v2.framework.op import Operator, DynamicRecurrentOp
+from paddle.v2.fluid.op import Operator, DynamicRecurrentOp
 import numpy as np
 
 # for siplicity, just one level LoD
diff --git a/python/paddle/v2/framework/tests/test_elementwise_add_op.py b/python/paddle/v2/fluid/tests/test_elementwise_add_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_add_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_add_op.py
diff --git a/python/paddle/v2/framework/tests/test_elementwise_div_op.py b/python/paddle/v2/fluid/tests/test_elementwise_div_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_div_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_div_op.py
diff --git a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py b/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_mul_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
diff --git a/python/paddle/v2/framework/tests/test_elementwise_sub_op.py b/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_elementwise_sub_op.py
rename to python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
diff --git a/python/paddle/v2/framework/tests/test_evaluator.py b/python/paddle/v2/fluid/tests/test_evaluator.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_evaluator.py
rename to python/paddle/v2/fluid/tests/test_evaluator.py
index 37dbfbc06bcd0da7e11924a048679c74a1cfb373..1d51205b703f83ec32c2e948394e5d3f5c87d1d9 100644
--- a/python/paddle/v2/framework/tests/test_evaluator.py
+++ b/python/paddle/v2/fluid/tests/test_evaluator.py
@@ -1,6 +1,6 @@
-from paddle.v2.framework.evaluator import Evaluator
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.evaluator import Evaluator
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 import unittest
 import op_test
 import numpy as np
diff --git a/python/paddle/v2/framework/tests/test_exception.py b/python/paddle/v2/fluid/tests/test_exception.py
similarity index 89%
rename from python/paddle/v2/framework/tests/test_exception.py
rename to python/paddle/v2/fluid/tests/test_exception.py
index 5ae048817cfcc1ec85e0d0e0c5db749da4521012..b871f40c4a07ae2db7559e5a0f15664b21e94402 100644
--- a/python/paddle/v2/framework/tests/test_exception.py
+++ b/python/paddle/v2/fluid/tests/test_exception.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_executor_and_mul.py b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
similarity index 83%
rename from python/paddle/v2/framework/tests/test_executor_and_mul.py
rename to python/paddle/v2/fluid/tests/test_executor_and_mul.py
index c885cfbebd4b665ddf50adbc43673942dc949a0b..709250d0c86dde84ac22c37d8e2385ca4a80a40a 100644
--- a/python/paddle/v2/framework/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
@@ -1,8 +1,8 @@
 import unittest
-from paddle.v2.framework.layers import mul, data
-import paddle.v2.framework.core as core
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_main_program
+from paddle.v2.fluid.layers import mul, data
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import g_main_program
 import numpy
 
 
diff --git a/python/paddle/v2/fluid/tests/test_expand_op.py b/python/paddle/v2/fluid/tests/test_expand_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440f7a2bb159bab4923683b5d0980e59e0a69c9
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_expand_op.py
@@ -0,0 +1,97 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestExpandOpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random(12).astype("float32")}
+        self.attrs = {'expand_times': [2]}
+        output = np.tile(self.inputs['X'], 2)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank2_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank2(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [2, 3]}
+        output = np.tile(self.inputs['X'], (2, 3))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank4(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5, 7)).astype("float32")}
+        self.attrs = {'expand_times': [3, 2, 1, 2]}
+        output = np.tile(self.inputs['X'], (3, 2, 1, 2))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_feed_fetch_method.py b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_feed_fetch_method.py
rename to python/paddle/v2/fluid/tests/test_feed_fetch_method.py
index fbd659ece0188140e197982ea818d7c3897daf4e..178c85b0dd50df61b1fd35ef5d53ebbf39445cb4 100644
--- a/python/paddle/v2/framework/tests/test_feed_fetch_method.py
+++ b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_fill_constant_batch_size_like_op.py
rename to python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
diff --git a/python/paddle/v2/framework/tests/test_fill_constant_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_fill_constant_op.py
rename to python/paddle/v2/fluid/tests/test_fill_constant_op.py
diff --git a/python/paddle/v2/framework/tests/test_fill_zeros_like_op.py b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_fill_zeros_like_op.py
rename to python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
diff --git a/python/paddle/v2/framework/tests/test_framework_debug_str.py b/python/paddle/v2/fluid/tests/test_framework_debug_str.py
similarity index 85%
rename from python/paddle/v2/framework/tests/test_framework_debug_str.py
rename to python/paddle/v2/fluid/tests/test_framework_debug_str.py
index 8fdf8f91171ee334fac93c05a4d49056fa0e803d..a4cbabdb36362c4ca14b76f366b648d6dbdbf7b3 100644
--- a/python/paddle/v2/framework/tests/test_framework_debug_str.py
+++ b/python/paddle/v2/fluid/tests/test_framework_debug_str.py
@@ -1,5 +1,5 @@
 import unittest
-from paddle.v2.framework.framework import Program
+from paddle.v2.fluid.framework import Program
 
 
 class TestDebugStringFramework(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_gather_op.py b/python/paddle/v2/fluid/tests/test_gather_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_gather_op.py
rename to python/paddle/v2/fluid/tests/test_gather_op.py
diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
similarity index 91%
rename from python/paddle/v2/framework/tests/test_gaussian_random_op.py
rename to python/paddle/v2/fluid/tests/test_gaussian_random_op.py
index 0dc7e091a5c8dd046f36cab7f79a15b2281cdd90..627ab4e23562f14538d85f2e21edeb7d72d940bb 100644
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
@@ -1,6 +1,6 @@
 import unittest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_gru_op.py b/python/paddle/v2/fluid/tests/test_gru_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_gru_op.py
rename to python/paddle/v2/fluid/tests/test_gru_op.py
diff --git a/python/paddle/v2/framework/tests/test_gru_unit_op.py b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_gru_unit_op.py
rename to python/paddle/v2/fluid/tests/test_gru_unit_op.py
diff --git a/python/paddle/v2/framework/tests/test_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_huber_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_huber_loss_op.py
rename to python/paddle/v2/fluid/tests/test_huber_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_image_classification_layer.py
rename to python/paddle/v2/fluid/tests/test_image_classification_layer.py
index b1a267ec32b1c937b946bee82e41b846ebbf1288..bf5444107fa1609e67b09823b82e5fb92234b0a4 100644
--- a/python/paddle/v2/framework/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -1,8 +1,8 @@
 import unittest
 
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-from paddle.v2.framework.framework import Program
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program
 
 
 def conv_block(input,
diff --git a/python/paddle/v2/framework/tests/test_infer_shape.py b/python/paddle/v2/fluid/tests/test_infer_shape.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_infer_shape.py
rename to python/paddle/v2/fluid/tests/test_infer_shape.py
index 2b2995f5e22d8c50d67498688c069252bf6e02fc..9f6695ce02de749178046fbb613a58ba591b3dbc 100644
--- a/python/paddle/v2/framework/tests/test_infer_shape.py
+++ b/python/paddle/v2/fluid/tests/test_infer_shape.py
@@ -1,6 +1,6 @@
 import unittest
 
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestInferShape(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_inference_model_io.py b/python/paddle/v2/fluid/tests/test_inference_model_io.py
similarity index 90%
rename from python/paddle/v2/framework/tests/test_inference_model_io.py
rename to python/paddle/v2/fluid/tests/test_inference_model_io.py
index 48984f86a1864baade58aeb8e35c6065cc2a4bbb..98b95713b73e8eba93bd6a58eaaed603cfae7952 100644
--- a/python/paddle/v2/framework/tests/test_inference_model_io.py
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
@@ -1,11 +1,11 @@
 import paddle.v2 as paddle
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.core as core
-import paddle.v2.framework.optimizer as optimizer
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.optimizer as optimizer
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.io import save_inference_model, load_inference_model
-import paddle.v2.framework.executor as executor
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.io import save_inference_model, load_inference_model
+import paddle.v2.fluid.executor as executor
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_initializer.py b/python/paddle/v2/fluid/tests/test_initializer.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_initializer.py
rename to python/paddle/v2/fluid/tests/test_initializer.py
index bd4d2e39d770aebb7468d516f463533185ea8680..f2eb79b209627f5814847db6d96c0a17300d9b5a 100644
--- a/python/paddle/v2/framework/tests/test_initializer.py
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
@@ -1,8 +1,8 @@
 import numpy as np
 import unittest
 
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.initializer as initializer
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.initializer as initializer
 
 DELTA = 0.00001
 
diff --git a/python/paddle/v2/framework/tests/test_l1_norm_op.py b/python/paddle/v2/fluid/tests/test_l1_norm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_l1_norm_op.py
rename to python/paddle/v2/fluid/tests/test_l1_norm_op.py
diff --git a/python/paddle/v2/framework/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_layers.py
rename to python/paddle/v2/fluid/tests/test_layers.py
index b42af5ea45d54723e96279f9e16f82a1d52ad236..3d18e7ce3a4dc6c6b917a1000de39fca71f6ac18 100644
--- a/python/paddle/v2/framework/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -1,7 +1,7 @@
-import paddle.v2.framework.layers as layers
-import paddle.v2.framework.nets as nets
-from paddle.v2.framework.framework import Program
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program
+import paddle.v2.fluid.core as core
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_linear_chain_crf_op.py b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_linear_chain_crf_op.py
rename to python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
diff --git a/python/paddle/v2/framework/tests/test_lod_array_length_op.py b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
similarity index 79%
rename from python/paddle/v2/framework/tests/test_lod_array_length_op.py
rename to python/paddle/v2/fluid/tests/test_lod_array_length_op.py
index af2b4d705e7ec121bd5f1350f0a642ae8c44bf1e..a01ae83772185df218b8c453557dc0cac719673b 100644
--- a/python/paddle/v2/framework/tests/test_lod_array_length_op.py
+++ b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
@@ -1,7 +1,7 @@
 import unittest
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.executor import Executor
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
similarity index 78%
rename from python/paddle/v2/framework/tests/test_lod_rank_table.py
rename to python/paddle/v2/fluid/tests/test_lod_rank_table.py
index 408145c10f46e24e8a54b05b4f3afa9231b6ffd6..bbc11930b9e804c2769cc590c298c6e90dc36ca6 100644
--- a/python/paddle/v2/framework/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
@@ -1,7 +1,7 @@
-from paddle.v2.framework.layers import lod_rank_table, data
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.framework import g_main_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.layers import lod_rank_table, data
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import g_main_program
+import paddle.v2.fluid.core as core
 import numpy
 import unittest
 
diff --git a/python/paddle/v2/fluid/tests/test_lod_reset_op.py b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..652ccecfa443fc95f08f52df766709cb550f4049
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
@@ -0,0 +1,64 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLodResetOpByAttr(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0 = [0, 7, 10]
+        self.inputs = {'X': (x, lod)}
+        self.attrs = {'target_lod': target_lod_0}
+        self.outputs = {'Out': (x, [target_lod_0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestLodResetOpByInput(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0 = [0, 4, 7, 10]
+        self.inputs = {
+            'X': (x, lod),
+            'TargetLoD': np.array([target_lod_0]).astype('int32')
+        }
+        self.outputs = {'Out': (x, [target_lod_0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set(["TargetLoD"]))
+
+
+class TestLodResetOpBoth(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0_attr = [0, 7, 10]
+        target_lod_0_in = [0, 4, 7, 10]
+        self.inputs = {
+            'X': (x, lod),
+            'TargetLoD': np.array(target_lod_0_in).astype('int32')
+        }
+        self.attrs = {'target_lod': target_lod_0_attr}
+        self.outputs = {'Out': (x, [target_lod_0_in])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set(["TargetLoD"]))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lod_tensor_array.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_lod_tensor_array.py
rename to python/paddle/v2/fluid/tests/test_lod_tensor_array.py
index a433bcf622b14a1d2d33b5b98d555e1a21e4b9e8..d6d3e23fd8898a62528d63795d1bff1b72752477 100644
--- a/python/paddle/v2/framework/tests/test_lod_tensor_array.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
@@ -1,5 +1,5 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
rename to python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
index e9713666b3f64d7a39afadab7da6b22f149b8cf8..b18cb6b49fa41f26e1b6de1128690507c5a2f099 100644
--- a/python/paddle/v2/framework/tests/test_lod_tensor_array_ops.py
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -1,10 +1,10 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import numpy
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 
 
 class TestCPULoDTensorArrayOps(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_lookup_table_op.py b/python/paddle/v2/fluid/tests/test_lookup_table_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lookup_table_op.py
rename to python/paddle/v2/fluid/tests/test_lookup_table_op.py
diff --git a/python/paddle/v2/framework/tests/test_lrn_op.py b/python/paddle/v2/fluid/tests/test_lrn_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lrn_op.py
rename to python/paddle/v2/fluid/tests/test_lrn_op.py
diff --git a/python/paddle/v2/framework/tests/test_lstm_op.py b/python/paddle/v2/fluid/tests/test_lstm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lstm_op.py
rename to python/paddle/v2/fluid/tests/test_lstm_op.py
diff --git a/python/paddle/v2/framework/tests/test_lstm_unit_op.py b/python/paddle/v2/fluid/tests/test_lstm_unit_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_lstm_unit_op.py
rename to python/paddle/v2/fluid/tests/test_lstm_unit_op.py
diff --git a/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
rename to python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_matmul_op.py b/python/paddle/v2/fluid/tests/test_matmul_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_matmul_op.py
rename to python/paddle/v2/fluid/tests/test_matmul_op.py
diff --git a/python/paddle/v2/framework/tests/test_mean_op.py b/python/paddle/v2/fluid/tests/test_mean_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_mean_op.py
rename to python/paddle/v2/fluid/tests/test_mean_op.py
diff --git a/python/paddle/v2/framework/tests/test_minus_op.py b/python/paddle/v2/fluid/tests/test_minus_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_minus_op.py
rename to python/paddle/v2/fluid/tests/test_minus_op.py
diff --git a/python/paddle/v2/framework/tests/test_modified_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_modified_huber_loss_op.py
rename to python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_momentum_op.py b/python/paddle/v2/fluid/tests/test_momentum_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_momentum_op.py
rename to python/paddle/v2/fluid/tests/test_momentum_op.py
diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/fluid/tests/test_mul_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_mul_op.py
rename to python/paddle/v2/fluid/tests/test_mul_op.py
diff --git a/python/paddle/v2/framework/tests/test_multiplex_op.py b/python/paddle/v2/fluid/tests/test_multiplex_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_multiplex_op.py
rename to python/paddle/v2/fluid/tests/test_multiplex_op.py
diff --git a/python/paddle/v2/framework/tests/test_nccl_init_op.py b/python/paddle/v2/fluid/tests/test_nccl_init_op.py
similarity index 91%
rename from python/paddle/v2/framework/tests/test_nccl_init_op.py
rename to python/paddle/v2/fluid/tests/test_nccl_init_op.py
index 054909fdf5517a68c6a07971c65a1d5bdc20d4fa..a536800ccd81fdc2f3b7c8320cede4f8ecf3a8cb 100644
--- a/python/paddle/v2/framework/tests/test_nccl_init_op.py
+++ b/python/paddle/v2/fluid/tests/test_nccl_init_op.py
@@ -1,8 +1,8 @@
 import unittest, os
 import numpy as np
 import paddle.v2 as paddle
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 from op_test import OpTest, create_op, set_input
 
 if not core.is_compile_gpu():
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/fluid/tests/test_net.py
similarity index 93%
rename from python/paddle/v2/framework/tests/test_net.py
rename to python/paddle/v2/fluid/tests/test_net.py
index 8503257feb8e1a5802f3f889f72c559a2aaa583a..318df08a9e73ac95cab73c34182bc6220ef6c681 100644
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/fluid/tests/test_net.py
@@ -1,5 +1,5 @@
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_op_support_gpu.py b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
similarity index 84%
rename from python/paddle/v2/framework/tests/test_op_support_gpu.py
rename to python/paddle/v2/fluid/tests/test_op_support_gpu.py
index dd36c666c440a5c378dfceac4502cd8277417412..a0eb4bd5fd2cc178ffe0763efdee61524ad6d4bd 100644
--- a/python/paddle/v2/framework/tests/test_op_support_gpu.py
+++ b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
@@ -1,5 +1,5 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestOpSupportGPU(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/fluid/tests/test_operator.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_operator.py
rename to python/paddle/v2/fluid/tests/test_operator.py
index 98f6b2f5ee639120557cb85b3ada6d2931f7d0d2..4aa022ef90159cd96eed4e4dbe30cf5d1e8a41a7 100644
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/fluid/tests/test_operator.py
@@ -1,7 +1,7 @@
 import unittest
-import paddle.v2.framework.op as op
-import paddle.v2.framework.core as core
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.op as op
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 
 
 class TestGetAllProtos(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_operator_desc.py
rename to python/paddle/v2/fluid/tests/test_operator_desc.py
index a0bc4e0b91602cfc90f91a1e2dd4bce22c0dbf6d..e8362d2e9c6038c04c24dce35de8c53bfde78142 100644
--- a/python/paddle/v2/framework/tests/test_operator_desc.py
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
@@ -1,6 +1,6 @@
 import unittest
-from paddle.v2.framework.framework import Variable, Program, g_main_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import Variable, Program, g_main_program
+import paddle.v2.fluid.core as core
 
 
 class TestOperator(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_optimizer.py
rename to python/paddle/v2/fluid/tests/test_optimizer.py
index a39e7402600c7a94301de030c90ea51264248cf1..0ebf7cdf208c41eacfdff88f59455584eff4ff8f 100644
--- a/python/paddle/v2/framework/tests/test_optimizer.py
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -1,8 +1,8 @@
 import unittest
 
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.optimizer as optimizer
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.backward import append_backward_ops
 
 
 class TestOptimizer(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_pad_op.py b/python/paddle/v2/fluid/tests/test_pad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_pad_op.py
rename to python/paddle/v2/fluid/tests/test_pad_op.py
diff --git a/python/paddle/v2/framework/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py
similarity index 87%
rename from python/paddle/v2/framework/tests/test_parameter.py
rename to python/paddle/v2/fluid/tests/test_parameter.py
index f04eb4cf27276b0f7da0793c97742ac42e4583be..71a1bd2aaf5a9c6362ce0d35c256ed228e942fce 100644
--- a/python/paddle/v2/framework/tests/test_parameter.py
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
@@ -1,6 +1,6 @@
 import unittest
-from paddle.v2.framework.framework import g_main_program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import g_main_program
+import paddle.v2.fluid.core as core
 
 
 class TestParameter(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_pool2d_op.py b/python/paddle/v2/fluid/tests/test_pool2d_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_pool2d_op.py
rename to python/paddle/v2/fluid/tests/test_pool2d_op.py
diff --git a/python/paddle/v2/framework/tests/test_pool3d_op.py b/python/paddle/v2/fluid/tests/test_pool3d_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_pool3d_op.py
rename to python/paddle/v2/fluid/tests/test_pool3d_op.py
diff --git a/python/paddle/v2/framework/tests/test_pool_max_op.py b/python/paddle/v2/fluid/tests/test_pool_max_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_pool_max_op.py
rename to python/paddle/v2/fluid/tests/test_pool_max_op.py
diff --git a/python/paddle/v2/framework/tests/test_positive_negative_pair_op.py b/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_positive_negative_pair_op.py
rename to python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
diff --git a/python/paddle/v2/framework/tests/test_precision_recall_op.py b/python/paddle/v2/fluid/tests/test_precision_recall_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_precision_recall_op.py
rename to python/paddle/v2/fluid/tests/test_precision_recall_op.py
diff --git a/python/paddle/v2/framework/tests/test_prelu_op.py b/python/paddle/v2/fluid/tests/test_prelu_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_prelu_op.py
rename to python/paddle/v2/fluid/tests/test_prelu_op.py
diff --git a/python/paddle/v2/framework/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_program.py
rename to python/paddle/v2/fluid/tests/test_program.py
index 7be67b6614ee3302a319289b821a214a81b6f64e..ef2daf6916e14c015a39ae0193948e7ff6531449 100644
--- a/python/paddle/v2/framework/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
@@ -1,8 +1,8 @@
 import unittest
 
-import paddle.v2.framework.core as core
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.framework import g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import g_main_program
 
 
 class TestProgram(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/fluid/tests/test_protobuf.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_protobuf.py
rename to python/paddle/v2/fluid/tests/test_protobuf.py
index 848a396b3b6eec57d500b464780b64f339b09e94..e064374176fa221cfd042b7dbd2ddcb3b5ec41ec 100644
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
 import unittest
 
 
diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
similarity index 99%
rename from python/paddle/v2/framework/tests/test_protobuf_descs.py
rename to python/paddle/v2/fluid/tests/test_protobuf_descs.py
index 2fd3d5d165ada5026510e0dc3e2c55b6e0596ff3..098a9802dfc6763ce2a2356b7267a439145b7939 100644
--- a/python/paddle/v2/framework/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -1,5 +1,5 @@
 import unittest
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class TestOpDesc(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_proximal_adagrad_op.py b/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_proximal_adagrad_op.py
rename to python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
diff --git a/python/paddle/v2/framework/tests/test_proximal_gd_op.py b/python/paddle/v2/fluid/tests/test_proximal_gd_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_proximal_gd_op.py
rename to python/paddle/v2/fluid/tests/test_proximal_gd_op.py
diff --git a/python/paddle/v2/framework/tests/test_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_rank_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_rank_loss_op.py
rename to python/paddle/v2/fluid/tests/test_rank_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_recurrent_op.py
rename to python/paddle/v2/fluid/tests/test_recurrent_op.py
index 16100429dd4010eb5c9a3e8896212f39295a4c8a..b623d1231838faff9e91c9234befb1f647fe8ec2 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -1,11 +1,11 @@
 import unittest
 
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 class PyRNNBase(object):
diff --git a/python/paddle/v2/framework/tests/test_reduce_op.py b/python/paddle/v2/fluid/tests/test_reduce_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_reduce_op.py
rename to python/paddle/v2/fluid/tests/test_reduce_op.py
diff --git a/python/paddle/v2/framework/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py
similarity index 92%
rename from python/paddle/v2/framework/tests/test_regularizer.py
rename to python/paddle/v2/fluid/tests/test_regularizer.py
index b21dceb584bdc660e48598a600f57cb6095b3802..f5d1eb3b96211bd7c7335dbe116a1d765d7bae50 100644
--- a/python/paddle/v2/framework/tests/test_regularizer.py
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -1,9 +1,9 @@
 import unittest
 
-import paddle.v2.framework.framework as framework
-import paddle.v2.framework.optimizer as optimizer
-import paddle.v2.framework.regularizer as regularizer
-from paddle.v2.framework.backward import append_backward_ops
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.regularizer as regularizer
+from paddle.v2.fluid.backward import append_backward_ops
 
 
 class TestL2DecayRegularizer(unittest.TestCase):
diff --git a/python/paddle/v2/framework/tests/test_reshape_op.py b/python/paddle/v2/fluid/tests/test_reshape_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_reshape_op.py
rename to python/paddle/v2/fluid/tests/test_reshape_op.py
diff --git a/python/paddle/v2/framework/tests/test_rmsprop_op.py b/python/paddle/v2/fluid/tests/test_rmsprop_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_rmsprop_op.py
rename to python/paddle/v2/fluid/tests/test_rmsprop_op.py
diff --git a/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
similarity index 95%
rename from python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
rename to python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
index 731beff17cc96d26c2d9390a956c774b8676b179..a3cba92504a28590083df57e69f7662a887d94a6 100644
--- a/python/paddle/v2/framework/tests/test_rnn_memory_helper_op.py
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
@@ -1,10 +1,10 @@
 import unittest
 
-from paddle.v2.framework.framework import Program
-from paddle.v2.framework.executor import Executor
-from paddle.v2.framework.backward import append_backward_ops
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
 import numpy as np
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 
 
 def create_tensor(np_data, place):
diff --git a/python/paddle/v2/framework/tests/test_scale_op.py b/python/paddle/v2/fluid/tests/test_scale_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_scale_op.py
rename to python/paddle/v2/fluid/tests/test_scale_op.py
diff --git a/python/paddle/v2/framework/tests/test_scatter_op.py b/python/paddle/v2/fluid/tests/test_scatter_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_scatter_op.py
rename to python/paddle/v2/fluid/tests/test_scatter_op.py
diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/fluid/tests/test_scope.py
similarity index 81%
rename from python/paddle/v2/framework/tests/test_scope.py
rename to python/paddle/v2/fluid/tests/test_scope.py
index 14743654792716e4a7ebce5238b142addc86337e..e4857b590aa6e09f1fa37c4a8a70a3ec9495b085 100644
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/fluid/tests/test_scope.py
@@ -1,22 +1,22 @@
-import paddle.v2.framework.core
+import paddle.v2.fluid.core
 import unittest
 
 
 class TestScope(unittest.TestCase):
     def test_create_destroy(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         self.assertIsNotNone(scope)
         scope_with_parent = scope.new_scope()
         self.assertIsNotNone(scope_with_parent)
 
     def test_none_variable(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         self.assertIsNone(scope.find_var("test"))
 
     def test_create_var_get_var(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         var_a = scope.var("var_a")
         self.assertIsNotNone(var_a)
@@ -25,7 +25,7 @@ class TestScope(unittest.TestCase):
         self.assertIsNotNone(scope2.find_var('var_a'))
 
     def test_var_get_int(self):
-        paddle_c = paddle.v2.framework.core
+        paddle_c = paddle.v2.fluid.core
         scope = paddle_c.Scope()
         var = scope.var("test_int")
         var.set_int(10)
diff --git a/python/paddle/v2/framework/tests/test_selected_rows.py b/python/paddle/v2/fluid/tests/test_selected_rows.py
similarity index 96%
rename from python/paddle/v2/framework/tests/test_selected_rows.py
rename to python/paddle/v2/fluid/tests/test_selected_rows.py
index e8a930cb08c42b48f678bdd7bdb7698923535d4f..93daf37aa2ceb8a599973f7b02874f23fe0763ff 100644
--- a/python/paddle/v2/framework/tests/test_selected_rows.py
+++ b/python/paddle/v2/fluid/tests/test_selected_rows.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/fluid/tests/test_seq_concat_op.py b/python/paddle/v2/fluid/tests/test_seq_concat_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..dccc6ed8afe2315da74f6886878b15d58b26b3c9
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_seq_concat_op.py
@@ -0,0 +1,104 @@
+import unittest
+import numpy as np
+import sys
+from op_test import OpTest
+sys.exit(0)  # NOTE(review): exits on import, so no test below runs; confirm
+
+
+def to_abs_lod(lod):
+    # Rewrites level 0 so each entry becomes the level-1 offset it indexes.
+    if len(lod) < 2:
+        return lod
+    import copy
+    abs_lod = copy.deepcopy(lod)
+    abs_lod[0][:] = [lod[1][pos] for pos in lod[0]]
+    return abs_lod
+
+
+def seq_concat(inputs, level):
+    # Expected output: per sequence at `level`, x0's rows followed by x1's.
+    lod0, lod1 = inputs['X'][0][1][1], inputs['X'][1][1][1]
+    x0, x1 = inputs['X'][0][1][0], inputs['X'][1][1][0]
+    level_idx = len(lod0) - level - 1
+    # The absolute offsets are loop-invariant, so build them once per input.
+    bounds0 = to_abs_lod(lod0)[level_idx]
+    bounds1 = to_abs_lod(lod1)[level_idx]
+    outs = []
+    for i in range(len(lod0[level_idx]) - 1):
+        sub_x0 = x0[bounds0[i]:bounds0[i + 1], :]
+        sub_x1 = x1[bounds1[i]:bounds1[i + 1], :]
+        outs.append(np.concatenate((sub_x0, sub_x1), axis=0))
+    return np.concatenate(outs, axis=0)
+
+
+class TestSeqConcatOp(OpTest):
+    def set_data(self):
+        # Same two-level LoD on both inputs; the data is joined along axis 1.
+        lhs = np.random.random((4, 6, 3)).astype('float32')
+        lhs_lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        rhs = np.random.random((4, 8, 3)).astype('float32')
+        rhs_lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        axis = 1
+        self.attrs = {'axis': axis, 'level': 1}
+        self.inputs = {'X': [('x0', (lhs, lhs_lod)), ('x1', (rhs, rhs_lod))]}
+        joined = np.concatenate([lhs, rhs], axis=axis)
+        self.outputs = {'Out': (joined, lhs_lod)}
+
+    def setUp(self):
+        self.op_type = "sequence_concat"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['x0'], 'Out')
+
+
+class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp):
+    def set_data(self):
+        # Nested (two-level) LoD inputs, concatenated at level 0 along axis 0.
+        lhs = np.random.random((4, 6, 3)).astype('float32')
+        lhs_lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        rhs = np.random.random((7, 6, 3)).astype('float32')
+        rhs_lod = [[0, 2, 4], [0, 1, 3, 5, 7]]
+        level = 0
+        self.attrs = {'axis': 0, 'level': level}
+        self.inputs = {'X': [('x0', (lhs, lhs_lod)), ('x1', (rhs, rhs_lod))]}
+        expected = seq_concat(self.inputs, level)
+        expected_lod = [[0, 2, 4], [0, 2, 5, 8, 11]]
+        self.outputs = {'Out': (expected, expected_lod)}
+
+
+class TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp):
+    def set_data(self):
+        # Nested (two-level) LoD inputs, concatenated at level 1 along axis 0.
+        lhs = np.random.random((4, 6, 3)).astype('float32')
+        lhs_lod = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        rhs = np.random.random((7, 6, 3)).astype('float32')
+        rhs_lod = [[0, 3, 4], [0, 1, 3, 5, 7]]
+        level = 1
+        self.attrs = {'axis': 0, 'level': level}
+        self.inputs = {'X': [('x0', (lhs, lhs_lod)), ('x1', (rhs, rhs_lod))]}
+        expected = seq_concat(self.inputs, level)
+        expected_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]]
+        self.outputs = {'Out': (expected, expected_lod)}
+
+
+class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp):
+    def set_data(self):
+        # Single-level LoD inputs, concatenated at level 0 along axis 0.
+        lhs = np.random.random((4, 3, 4)).astype('float32')
+        lhs_lod = [[0, 1, 2, 3, 4]]
+        rhs = np.random.random((7, 3, 4)).astype('float32')
+        rhs_lod = [[0, 1, 3, 5, 7]]
+        level = 0
+        self.attrs = {'axis': 0, 'level': level}
+        self.inputs = {'X': [('x0', (lhs, lhs_lod)), ('x1', (rhs, rhs_lod))]}
+        expected = seq_concat(self.inputs, level)
+        expected_lod = [[0, 2, 5, 8, 11]]
+        self.outputs = {'Out': (expected, expected_lod)}
+
+
+if __name__ == '__main__':
+    unittest.main()  # NOTE(review): never reached; module exits on import
diff --git a/python/paddle/v2/framework/tests/test_seq_conv.py b/python/paddle/v2/fluid/tests/test_seq_conv.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_seq_conv.py
rename to python/paddle/v2/fluid/tests/test_seq_conv.py
diff --git a/python/paddle/v2/framework/tests/test_seq_expand.py b/python/paddle/v2/fluid/tests/test_seq_expand.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_seq_expand.py
rename to python/paddle/v2/fluid/tests/test_seq_expand.py
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/fluid/tests/test_seq_pool.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_seq_pool.py
rename to python/paddle/v2/fluid/tests/test_seq_pool.py
diff --git a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sequence_softmax_op.py
rename to python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py
similarity index 97%
rename from python/paddle/v2/framework/tests/test_sgd_op.py
rename to python/paddle/v2/fluid/tests/test_sgd_op.py
index 01262bba4d43adaed179baef88ccab6e69b0884b..ca05a381f06cfd40b7939dbda8d4f1f4aacd0271 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
@@ -1,7 +1,7 @@
 import unittest
 import numpy as np
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
 from op_test import OpTest
 
 
diff --git a/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
similarity index 86%
rename from python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
rename to python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
index 2090455b969806685b525f1e588b6570e3072430..1a3b88e18e38b88d75ad17a0bb6a2965d1e60406 100644
--- a/python/paddle/v2/framework/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -1,9 +1,9 @@
 import unittest
-import paddle.v2.framework.core as core
-from paddle.v2.framework.executor import Executor
-import paddle.v2.framework.layers as layers
-from paddle.v2.framework.backward import append_backward_ops
-from paddle.v2.framework.framework import g_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.backward import append_backward_ops
+from paddle.v2.fluid.framework import g_main_program
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
rename to python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
diff --git a/python/paddle/v2/framework/tests/test_sign_op.py b/python/paddle/v2/fluid/tests/test_sign_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sign_op.py
rename to python/paddle/v2/fluid/tests/test_sign_op.py
diff --git a/python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py b/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_smooth_l1_loss_op.py
rename to python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/fluid/tests/test_softmax_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_softmax_op.py
rename to python/paddle/v2/fluid/tests/test_softmax_op.py
diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
rename to python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3aed83b2ea3418c54f9540279ae6e2e0045421fa
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -0,0 +1,181 @@
+import unittest
+import paddle.v2.fluid.core as core
+import numpy as np
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward_ops
+
+
+class TestCPULoDTensorArrayOps(unittest.TestCase):
+    def place(self):  # device used by every tensor and executor in this class
+        return core.CPUPlace()
+
+    def test_split_and_merge_lod_tensor_no_lod(self):
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
+        # No LoD is set: the mask holds one entry per row, True for rows 2-5.
+        mask_np = np.array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, self.place())
+        # Rows whose mask entry is True are expected in the "true" output.
+        expect_true_tensor = np.array([2, 3, 4, 5]).astype('int32')
+        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
+        expect_true = core.LoDTensor()
+        expect_true.set(expect_true_tensor, self.place())
+        # The remaining rows are expected in the "false" output.
+        expect_false_tensor = np.array([0, 1, 6, 7, 8, 9]).astype('int32')
+        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
+
+        expect_false = core.LoDTensor()
+        expect_false.set(expect_false_tensor, self.place())
+        # Merging both parts back is expected to reproduce the input tensor.
+        self.main(
+            tensor=tensor,
+            mask=mask,
+            expect_true=expect_true,
+            expect_false=expect_false,
+            expect_out=tensor)
+
+    def test_split_and_merge_lod_tensor_level_0(self):
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 10]])
+        # The LoD defines 3 sequences; the mask holds one entry per sequence.
+        mask_np = np.array([0, 1, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, self.place())
+        # Only the second sequence (rows 3-8) is masked True.
+        expect_true_tensor = np.array([3, 4, 5, 6, 7, 8]).astype('int32')
+        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
+        expect_true = core.LoDTensor()
+        expect_true.set(expect_true_tensor, self.place())
+        expect_true.set_lod([[0, 6]])
+        # Sequences 1 and 3 (rows 0-2 and row 9) form the "false" output.
+        expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32')
+        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
+        expect_false_lod = [[0, 3, 4]]
+
+        expect_false = core.LoDTensor()
+        expect_false.set(expect_false_tensor, self.place())
+        expect_false.set_lod(expect_false_lod)
+        # The merged result is expected to equal the original tensor, LoD too.
+        self.main(
+            tensor=tensor,
+            mask=mask,
+            expect_true=expect_true,
+            expect_false=expect_false,
+            expect_out=tensor)
+
+    def main(self, tensor, mask, expect_true, expect_false, expect_out,
+             level=0):
+        place = self.place()
+        program = Program()
+        x = layers.data(name='x', shape=[1], main_program=program)
+        x.persistable = True
+        # NOTE(review): y is fed the bool mask but sets no data_type -- confirm.
+        y = layers.data(name='y', shape=[1], main_program=program)
+        y.persistable = True
+        # Split x by the mask y into a "true" part and a "false" part.
+        out_true, out_false = layers.split_lod_tensor(
+            input=x, mask=y, level=level, main_program=program)
+        out_true.persistable = True
+        out_false.persistable = True
+        # Merge the two parts back using the same mask and level.
+        out = layers.merge_lod_tensor(
+            in_true=out_true,
+            in_false=out_false,
+            mask=y,
+            x=x,
+            level=level,
+            main_program=program)
+        # presumably persistable keeps vars readable from the scope -- confirm
+        out.persistable = True
+        # Run the program in a fresh scope, feeding x and y from the arguments.
+        exe = Executor(place)
+        scope = core.Scope()
+        exe.run(program, feed={'x': tensor, 'y': mask}, scope=scope)
+        # Read the three results back from the scope by variable name.
+        var_true = scope.find_var(out_true.name).get_tensor()
+
+        var_false = scope.find_var(out_false.name).get_tensor()
+
+        var_out = scope.find_var(out.name).get_tensor()
+        # Compare values and LoD of each result against its expectation.
+        self.check_tensor_same(var_true, expect_true)
+        self.check_tensor_same(var_false, expect_false)
+        self.check_tensor_same(var_out, expect_out)
+
+    def check_tensor_same(self, actual, expect):
+        self.assertTrue(np.allclose(np.array(actual), np.array(expect)))
+        self.assertEqual(actual.lod(), expect.lod())  # LoD must match exactly
+
+
+class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
+    def test_grad(self):
+        place = core.CPUPlace()
+        program = Program()
+        # Float input x and bool mask y, both declared with stop_gradient=False.
+        x = layers.data(
+            name='x',
+            shape=[1],
+            data_type='float32',
+            main_program=program,
+            stop_gradient=False)
+        y = layers.data(
+            name='y',
+            shape=[1],
+            data_type='bool',
+            main_program=program,
+            stop_gradient=False)
+
+        level = 0
+        # Split x by the mask, merge it back, then reduce the result to a mean.
+        out_true, out_false = layers.split_lod_tensor(
+            input=x, mask=y, level=level, main_program=program)
+        out = layers.merge_lod_tensor(
+            in_true=out_true,
+            in_false=out_false,
+            mask=y,
+            x=x,
+            level=level,
+            main_program=program)
+        mean = layers.mean(x=out, main_program=program)
+        # presumably appends the gradient ops for `mean` (x@GRAD is fetched below)
+        append_backward_ops(mean)
+        # 10 rows grouped into 3 sequences by the level-0 LoD.
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
+        tensor.set_lod([[0, 3, 9, 10]])
+        # One mask entry per sequence; only the second one is True.
+        mask_np = np.array([0, 1, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, place)
+
+        exe = Executor(place)
+        scope = core.Scope()
+        # Fetch the variable named x@GRAD and sum the entries of each result.
+        g_vars = program.global_block().var(x.name + "@GRAD")
+        g_out = [
+            item.sum()
+            for item in map(np.array,
+                            exe.run(program,
+                                    feed={'x': tensor,
+                                          'y': mask},
+                                    fetch_list=[g_vars],
+                                    scope=scope))
+        ]
+
+        g_out_sum = np.array(g_out).sum()
+        # presumably each of the 10 rows gets gradient 1/10, so the sum is ~1.0
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()  # runs this module's TestCase classes when executed directly
diff --git a/python/paddle/v2/framework/tests/test_split_op.py b/python/paddle/v2/fluid/tests/test_split_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_split_op.py
rename to python/paddle/v2/fluid/tests/test_split_op.py
diff --git a/python/paddle/v2/framework/tests/test_squared_l2_distance_op.py b/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_squared_l2_distance_op.py
rename to python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
diff --git a/python/paddle/v2/framework/tests/test_squared_l2_norm_op.py b/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_squared_l2_norm_op.py
rename to python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
diff --git a/python/paddle/v2/framework/tests/test_sum_op.py b/python/paddle/v2/fluid/tests/test_sum_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_sum_op.py
rename to python/paddle/v2/fluid/tests/test_sum_op.py
diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/fluid/tests/test_tensor.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_tensor.py
rename to python/paddle/v2/fluid/tests/test_tensor.py
index e0cd2fa8aaf2db2991ad2b9a3053f0d00b509cd4..9f870d9eb3485aa0b54eb781b906f4232d12c49e 100644
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/fluid/tests/test_tensor.py
@@ -1,4 +1,4 @@
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy
 
diff --git a/python/paddle/v2/framework/tests/test_tensor_array.py b/python/paddle/v2/fluid/tests/test_tensor_array.py
similarity index 98%
rename from python/paddle/v2/framework/tests/test_tensor_array.py
rename to python/paddle/v2/fluid/tests/test_tensor_array.py
index 50b3e09162a24201ee45cbd017dfef8a60f0da78..d6929ba16e4dae0c57adcceb4f0e78c094eee55c 100644
--- a/python/paddle/v2/framework/tests/test_tensor_array.py
+++ b/python/paddle/v2/fluid/tests/test_tensor_array.py
@@ -1,5 +1,5 @@
 import logging
-import paddle.v2.framework.core as core
+import paddle.v2.fluid.core as core
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/framework/tests/test_top_k_op.py b/python/paddle/v2/fluid/tests/test_top_k_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_top_k_op.py
rename to python/paddle/v2/fluid/tests/test_top_k_op.py
diff --git a/python/paddle/v2/framework/tests/test_transpose_op.py b/python/paddle/v2/fluid/tests/test_transpose_op.py
similarity index 100%
rename from python/paddle/v2/framework/tests/test_transpose_op.py
rename to python/paddle/v2/fluid/tests/test_transpose_op.py
diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
similarity index 90%
rename from python/paddle/v2/framework/tests/test_uniform_random_op.py
rename to python/paddle/v2/fluid/tests/test_uniform_random_op.py
index ded777105e0fc64eb82bf4013bfba7ba9d0ddefa..f736dfb2e85552b321403c961da517f3b3efb100 100644
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
@@ -1,6 +1,6 @@
 import unittest
-from paddle.v2.framework.op import Operator
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
 import numpy
 
 
diff --git a/python/paddle/v2/framework/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py
similarity index 93%
rename from python/paddle/v2/framework/tests/test_variable.py
rename to python/paddle/v2/fluid/tests/test_variable.py
index 03115f10a5a494424c6f8310c544c569be818e5b..a3e60a751719666bdca56a3096b688125d09f4b2 100644
--- a/python/paddle/v2/framework/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
@@ -1,6 +1,6 @@
 import unittest
-from paddle.v2.framework.framework import Variable, g_main_program, Program
-import paddle.v2.framework.core as core
+from paddle.v2.fluid.framework import Variable, g_main_program, Program
+import paddle.v2.fluid.core as core
 import numpy as np
 
 
diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f01acb3b94dc55a3536e751108e785ddc6e47bb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -0,0 +1,85 @@
+import unittest
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
+import numpy
+
+
+class TestWhileOp(unittest.TestCase):
+    """Forward-only check of the `While` layer over a 3-element array."""
+
+    def test_simple_forward(self):
+        """Sum d0, d1 and d2 inside a While block and verify the total.
+
+        Each iteration reads data_array[i] and mem_array[i], increments i,
+        and writes the sum of the two reads to mem_array[i]. The loop runs
+        while i < 3; mem_array[3] is then fetched and its total compared
+        with the total of the fed inputs.
+        """
+        d0 = layers.data(
+            "d0", shape=[10], append_batch_size=False, data_type='float32')
+        d1 = layers.data(
+            "d1", shape=[10], append_batch_size=False, data_type='float32')
+        d2 = layers.data(
+            "d2", shape=[10], append_batch_size=False, data_type='float32')
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = True
+        init = layers.zeros(shape=[10], dtype='float32')
+        mem_array = layers.array_write(init, i=i)
+        data_array = layers.array_write(x=d0, i=i)
+
+        i = layers.increment(i)
+        layers.array_write(d1, i, array=data_array)
+
+        i = layers.increment(i)
+        layers.array_write(d2, i, array=data_array)
+
+        # Fresh zero counter used as the loop index.
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = True
+
+        array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
+        cond = layers.less_than(x=i, y=array_len)
+
+        while_op = layers.While(cond=cond)
+        with while_op.block():
+            d = layers.array_read(array=data_array, i=i)
+            prev = layers.array_read(array=mem_array, i=i)
+            i = layers.increment(x=i, in_place=True)
+            result = layers.sums(input=[d, prev])
+            layers.array_write(result, i=i, array=mem_array)
+            # Recompute the loop condition, passing the existing `cond`.
+            layers.less_than(x=i, y=array_len, cond=cond)
+        sum_result = layers.array_read(mem_array, i=array_len)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+
+        # Host-side names are kept distinct from the graph variables `i`
+        # and `d` defined above.
+        inputs = [
+            numpy.random.random(size=[10]).astype('float32')
+            for _ in range(3)
+        ]
+
+        feed_tensors = []
+        for item in inputs:
+            t = core.LoDTensor()
+            t.set(item, cpu)
+            feed_tensors.append(t)
+
+        # Build a real list so outs[0] also works where map() is lazy.
+        fetched = exe.run(
+            feed={
+                'd0': feed_tensors[0],
+                'd1': feed_tensors[1],
+                'd2': feed_tensors[2]
+            },
+            fetch_list=[sum_result])
+        outs = [numpy.array(out) for out in fetched]
+        self.assertAlmostEqual(
+            numpy.sum(inputs), numpy.sum(outs[0]), delta=0.01)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9f180bbaea0f5922bee0a3e2a8c715d683c0d16
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
@@ -0,0 +1,80 @@
+import unittest
+
+import numpy as np
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+
+
+class TestBeamSearchDecodeOp(unittest.TestCase):
+    """Runs `beam_search_decode` on hand-built ids/scores arrays."""
+
+    def setUp(self):
+        self.scope = core.Scope()
+        self.cpu_place = core.CPUPlace()
+
+    def append_lod_tensor(self, tensor_array, lod, data):
+        """Wrap `data`/`lod` in a LoDTensor and append it to `tensor_array`."""
+        lod_tensor = core.LoDTensor()
+        lod_tensor.set_lod(lod)
+        lod_tensor.set(data, self.cpu_place)
+        tensor_array.append(lod_tensor)
+
+    def test_get_set(self):
+        """Decode three steps of ids/scores and check the output LoD/data."""
+        ids = self.scope.var("ids").get_lod_tensor_array()
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="int64"))
+
+        scores = self.scope.var("scores").get_lod_tensor_array()
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="float32"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="float32"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="float32"))
+
+        sentence_ids = self.scope.var("sentence_ids").get_tensor()
+        sentence_scores = self.scope.var("sentence_scores").get_tensor()
+
+        beam_search_decode_op = Operator(
+            "beam_search_decode",
+            # inputs
+            Ids="ids",
+            Scores="scores",
+            # outputs
+            SentenceIds="sentence_ids",
+            SentenceScores="sentence_scores")
+
+        ctx = core.DeviceContext.create(self.cpu_place)
+        beam_search_decode_op.run(self.scope, ctx)
+
+        expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
+        self.assertEqual(sentence_ids.lod(), expected_lod)
+        self.assertEqual(sentence_scores.lod(), expected_lod)
+
+        expected_data = np.array(
+            [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64")
+        self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
+        # scores carry the same values as ids, so one expectation serves both
+        self.assertTrue(
+            np.array_equal(np.array(sentence_scores), expected_data))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_seq_concat_op.py b/python/paddle/v2/framework/tests/test_seq_concat_op.py
deleted file mode 100644
index abd2ebf0b21a953b76155eb04c57a7b65ac53cbc..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_seq_concat_op.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import unittest
-import numpy as np
-import sys
-from op_test import OpTest
-
-
-class TestConcatOp(OpTest):
-    def set_data(self):
-        # two level, batch size is 3
-        x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        x1 = np.random.random((4, 8, 3)).astype('float32')
-        lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        axis = 1
-        level = 1
-        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
-        self.attrs = {'axis': axis, 'level': level}
-        outs = []
-        for i in range(4):
-            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
-            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
-            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
-
-        self.outputs = {'Out': np.concatenate(outs, axis=0)}
-
-    def setUp(self):
-        self.op_type = "sequence_concat"
-        self.set_data()
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['x0'], 'Out')
-
-
-class TestConcatOpDiffLod(TestConcatOp):
-    def set_data(self):
-        # two level, batch size is 3
-        x0 = np.random.random((4, 6, 3)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        x1 = np.random.random((5, 6, 3)).astype('float32')
-        lod1 = [[0, 3, 5], [0, 1, 2, 3, 5]]
-        axis = 0
-        level = 1
-        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
-        self.attrs = {'axis': axis, 'level': level}
-        outs = []
-        for i in range(4):
-            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
-            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
-            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
-
-        self.outputs = {'Out': np.concatenate(outs, axis=0)}
-
-
-class TestConcatOpLevelZero(TestConcatOp):
-    def set_data(self):
-        # two level, batch size is 3
-        x0 = np.random.random((4, 3, 4)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        x1 = np.random.random((5, 3, 4)).astype('float32')
-        lod1 = [[0, 3, 5], [0, 1, 3, 4, 5]]
-        axis = 0
-        level = 0
-        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
-        self.attrs = {'axis': axis, 'level': level}
-        outs = []
-        for i in range(2):
-            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
-            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
-            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
-
-        self.outputs = {'Out': np.concatenate(outs, axis=0)}
-
-
-if __name__ == '__main__':
-    sys.exit(0)
-    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index 5348c2d8d7e9b5adc5fe93e2943bef149ba047cc..fe91df10daf303bb14d1e5f28817984d261e0880 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -13,8 +13,8 @@ packages=['paddle',
           'paddle.v2.reader',
           'paddle.v2.master',
           'paddle.v2.plot',
-          'paddle.v2.framework',
-          'paddle.v2.framework.proto',
+          'paddle.v2.fluid',
+          'paddle.v2.fluid.proto',
           'py_paddle']
 
 with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
@@ -44,14 +44,14 @@ setup(name='paddlepaddle',
       ext_modules=[Extension('_foo', ['stub.cc'])],
       package_data={
         'paddle.v2.master': ['libpaddle_master.so'],
-        'paddle.v2.framework': ['core.so'],
+        'paddle.v2.fluid': ['core.so'],
         'py_paddle':['*.py','_swig_paddle.so']
       },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
-          # The paddle.v2.framework.proto will be generated while compiling.
+          # The paddle.v2.fluid.proto will be generated while compiling.
           # So that package points to other directory.
-          'paddle.v2.framework.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
+          'paddle.v2.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
           'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
       scripts=paddle_bins,