diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c311783aa3187678c31c27ddbbd074790ca444f3..b9c1dde97bc444d793d67ff622fd6b13c6435a9a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 if(NOT APPLE AND NOT ANDROID)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
-    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)
 
 function(merge_static_libs TARGET_NAME)
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2..203506d7ab84e5a5be2232b077eac2d433a99766 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -82,6 +82,11 @@ maxout
 ..  autoclass:: paddle.v2.layer.maxout
     :noindex:
 
+roi_pool
+--------
+..  autoclass:: paddle.v2.layer.roi_pool
+    :noindex:
+
 Norm Layer
 ==========
 
diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD
index fe8da907d9d45a2164031430ac5b7a3d5523967a..16236763a73770f3fe5eadf67645765d0456f875 100644
--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
@@ -15,6 +15,7 @@
  	- [CMake](#cmake)
 	- [Layers](#layers)
 	- [Activations](#activations)
+	- [Weights](#weights)
 	- [Unit Tests](#unit-tests)
 	- [Protobuf Messages](#protobuf-messages)
 	- [Python API](#python-api)
@@ -45,17 +46,23 @@ Figure 1. PaddlePaddle on IA.
 
 ### Layers
 所有MKL-DNN相关的C++ layers，都会按照PaddlePaddle的目录结构存放在
-`paddle/gserver/layers`中，并且文件名都会一以*Mkldnn*开头。
+`paddle/gserver/layers`中，并且文件名都会一以*MKLDNN*开头。
 
-所有MKL-DNN的layers都会继承于一个叫做`MkldnnLayer`的父类，该父类继承于PaddlePaddle的基类`Layer`。
+所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类，该父类继承于PaddlePaddle的基类`Layer`。
+
+在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑。部分函数定义为纯虚函数，子类只需要实现这些函数即可。
 
 ### Activations
-由于在PaddlePaddle中，激活函数是独立于layer概念的，所以会在`paddle/gserver/activations`目录下添加一个`MkldnnActivation.h`文件定义一些用于MKL-DNN的接口，实现方法还是会在`ActivationFunction.cpp`文件。
+由于在PaddlePaddle中，激活函数是独立于layer概念的，所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h`和`MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。
 
-### Unit Tests
-会在`paddle/gserver/test`目录下添加`test_Mkldnn.cpp`和`MkldnnTester.*`用于MKL-DNN的测试。
+### Weights
+由于有些layer是含有参数的，我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。
+同时，由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致，我们会在网络训练的开始和结束时分别转换这个layout，使得最终保存的参数格式与PaddlePaddle一致。
 
-Activation的测试，计划在PaddlePaddle原有的测试文件上直接添加新的测试type。
+### Unit Tests
+会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
+测试分为每个layer(或activation)的单元测试和简单网络的整体测试。
+每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
 
 ### Protobuf Messages
 根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
@@ -82,7 +89,7 @@ if use_mkldnn
 会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹，里面放入一些用于MKL-DNN测试的demo脚本。
 
 ### Benchmarking
-会考虑添加部分逻辑在`benchmark/paddle/image/run.sh`，添加使用MKL-DNN的测试。
+会添加`benchmark/paddle/image/run_mkldnn.sh`，用于测试使用MKL-DNN之后的性能。
 
 ### Others
 1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为64。
@@ -94,14 +101,16 @@ if use_mkldnn
 
 我们总结出一些特别需要注意的点：
 
-1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MkldnnLayer`特有的设备ID。
+1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
 2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
-3. 创建`MkldnnMatrix`，用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
-4. 创建`MkldnnBase`，定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MkldnnStream`和`CpuEngine`，和未来可能还会用到`FPGAEngine`等。
-5. 在**Argument**里添加两个`MkldnnMatrixPtr`，取名为`mkldnnValue`和`mkldnnGrad`，用于存放`MkldnnLayer`会用到的memory buffer。 并且添加函数cvt(会修改为一个更加合适的函数名)，用于处理"CPU device"和"MKL-DNN device"之间memory的相互转化。
-6. 在父类`Layer`中的`getOutput`函数中添加一段逻辑，用于判断`deviceId`，并针对device在MKL-DNN和CPU之间不统一的情况，做一个前期转换。 也就是调用`Argument`的cvt函数把output统一到需要的device上。
-7. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
-8. 关于MKLDNN参数的保存。由于MKLDNN参数的格式与PaddlePaddle原有的格式存在不一样的情况，所以需要在保存参数时同时保存该格式信息。目前准备扩展[Header](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/parameter/Parameter.h#L247)里面的`int32_t version`。这个值不管是在v1还是在v2里面，一直保存的是0，所以可以充分利用这个信息，定义一个枚举处理所有MKLDNN的参数格式，从而`MKLDNNLayer`就可以从输入的参数中获取需要的格式信息。
+3. 创建`MKLDNNMatrix`，同时继承`CpuMatrix`和`mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
+4. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
+5. 每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表input value， input gradient，output value和output gradient。他们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory)，主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时，用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好，每个子类只需要调用定义好的reset buffer函数即可。
+6. 每个`MKLDNNlayer`的resetbuffer相关的函数（包括reset input、output的Value和grad），他们会根据输入参数reset internal和external的memory，当然这两者也可以相等，即表示不需要转换。只需要把握一个原则，每个`MKLDNNlayer`的子类，只需要使用internal的memory就可以了，所有external的转换工作在父类的reset函数中都提前准备好了。
+7. 一般来说，external的memory会尽量与PaddlePaddle中的`value`和`grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_`和`extOutGrad_`)必须分别与`output_.value`和`output_.grad`共享内存，因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`。如果不需要external的buffer用于转换，那么internal的buffer也会与他们共享内存。
+8. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，同时数据格式就是`nchw`，这样下一个cpu device就能拿到正确的数据。在有cpu device的时候，external的memory的格式始终是`nchw`或者`nc`。
+9. 由于MKL-DNN的输出操作都是覆盖data的，不是在原来的数据上累加，所以当网络出现分支时，在`backward`时会需要merge不同layer的梯度。`MKLDNNlayer`中会实现merge的方法，此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中，由分支处的layer负责求和，并把结果放到这个layer的`output_.grad`中。所以整体上，每个子类并不会需要关心分支的事情，也是在父类都实现好了。
+10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
 
 ## References
 
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index 4547afaf1dc9af8bc7909a684db766fdd7b159c0..53a36f8f20d1143470928f57eda6f575d9048236 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -54,6 +54,46 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
   return kPD_NO_ERROR;
 }
 
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+                                          paddle_real* value) {
+  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* buf = ptr->mat->getRowBuf(0);
+  size_t width = ptr->mat->getWidth();
+  size_t height = ptr->mat->getHeight();
+  if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+    hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
+#else
+    return kPD_NOT_SUPPORTED;
+#endif
+  } else {
+    std::copy(value, value + width * height, buf);
+  }
+  return kPD_NO_ERROR;
+}
+
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+                                          paddle_real* result) {
+  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* buf = ptr->mat->getRowBuf(0);
+  size_t width = ptr->mat->getWidth();
+  size_t height = ptr->mat->getHeight();
+  if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+    hl_memcpy(result, buf, width * height * sizeof(paddle::real));
+#else
+    return kPD_NOT_SUPPORTED;
+#endif
+  } else {
+    std::copy(buf, buf + width * height, result);
+  }
+  return kPD_NO_ERROR;
+}
+
 paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                    uint64_t rowID,
                                    paddle_real** rawRowBuffer) {
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 3e6bd5285058a297c4574631e2a5c033b83936e8..876af2aa7615c098d225b56ce2ea0b1529a6e3c6 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -27,18 +27,20 @@ int main() {
   CHECK(paddle_arguments_resize(in_args, 1));
 
   // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
                                            /* size */ 784,
                                            /* useGPU */ false);
   srand(time(0));
-  paddle_real* array;
 
-  // Get First row.
-  CHECK(paddle_matrix_get_row(mat, 0, &array));
+  std::vector<paddle_real> input;
+  input.resize(784 * 10);
 
-  for (int i = 0; i < 784; ++i) {
-    array[i] = rand() / ((float)RAND_MAX);
+  for (int i = 0; i < input.size(); ++i) {
+    input[i] = rand() / ((float)RAND_MAX);
   }
+  
+  // Set value for the input matrix
+  CHECK(paddle_matrix_set_value(mat, input.data()));
 
   CHECK(paddle_arguments_set_value(in_args, 0, mat));
 
@@ -51,11 +53,17 @@ int main() {
 
   CHECK(paddle_arguments_get_value(out_args, 0, prob));
 
-  CHECK(paddle_matrix_get_row(prob, 0, &array));
+  std::std::vector<paddle_real> result;
+  int height;
+  int width;
+
+  CHECK(paddle_matrix_get_shape(prob, &height, &width);
+  result.resize(height * width);
+  CHECK(paddle_matrix_get_value(prob, result.data()));
 
   printf("Prob: ");
-  for (int i = 0; i < 10; ++i) {
-    printf("%.2f ", array[i]);
+  for (int i = 0; i < height * width; ++i) {
+    printf("%.2f ", result[i]);
   }
   printf("\n");
 
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index f15f7f3bbbd1457617111f827d2182ae6b7d9fdb..bb5223f8a275fa2550bf8b7e94a9c4333de4c8c9 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -70,6 +70,16 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
                                           uint64_t rowID,
                                           paddle_real* rowArray);
 
+/**
+ * @brief paddle_matrix_set_value Set value to matrix.
+ * @param mat Target Matrix
+ * @param value Row data.
+ * @return paddle_error
+ * @note  value should contain enough element of data to init the mat
+ */
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+                                          paddle_real* value);
+
 /**
  * @brief PDMatGetRow Get raw row buffer from matrix
  * @param [in] mat Target matrix
@@ -81,6 +91,15 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                           uint64_t rowID,
                                           paddle_real** rawRowBuffer);
 
+/**
+ * @brief copy data from the matrix 
+ * @param [in] mat Target matrix
+ * @param [out] result pointer to store the matrix data 
+ * @return paddle_error
+ * @note the space of the result should allocated before invoke this API
+ */
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+                                          paddle_real* result);
 /**
  * @brief PDMatCreateNone Create None Matrix
  * @return
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
index 4bf9a9d6a9f9161561e9e5612edd2c93cab7ac5b..6940c28448a897cecd78b718fe720441086a5a99 100644
--- a/paddle/capi/tests/test_Matrix.cpp
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -45,3 +45,49 @@ TEST(CAPIMatrix, createNone) {
   paddle_matrix mat = paddle_matrix_create_none();
   ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
 }
+
+TEST(CAPIMatrix, cpu_get_set_value) {
+  paddle_matrix mat = paddle_matrix_create(128, 32, false);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+  sample.resize(128 * 32);
+  result.resize(128 * 32);
+  for (size_t i = 0; i < sample.size(); ++i) {
+    sample[i] = 1.0 / (i + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+  for (size_t i = 0; i < sample.size(); ++i) {
+    ASSERT_NEAR(sample[i], result[i], 1e-5);
+  }
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(128UL, height);
+  ASSERT_EQ(32UL, width);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(CAPIMatrix, gpu_get_set_value) {
+  paddle_matrix mat = paddle_matrix_create(128, 32, true);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+  sample.resize(128 * 32);
+  result.resize(128 * 32);
+  for (size_t i = 0; i < sample.size(); ++i) {
+    sample[i] = 1.0 / (i + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+  for (size_t i = 0; i < sample.size(); ++i) {
+    ASSERT_NEAR(sample[i], result[i], 1e-5);
+  }
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(128UL, height);
+  ASSERT_EQ(32UL, width);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+#endif
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index fb2c69105627f663ddcce07d31526c9e4278e863..9428b8a07ea0af005f6e960ddaa02da624ad9d97 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -98,5 +98,23 @@ void Scope::DeleteScope(Scope* scope) {
   delete scope;
 }
 
+void Scope::Rename(const std::string& origin_name,
+                   const std::string& new_name) const {
+  auto origin_it = vars_.find(origin_name);
+  PADDLE_ENFORCE(origin_it != vars_.end(),
+                 "Cannot find original variable with name %s", origin_name);
+  auto new_it = vars_.find(new_name);
+  PADDLE_ENFORCE(new_it == vars_.end(),
+                 "The variable with name %s is already in the scope", new_name);
+  vars_[new_name] = origin_it->second;
+  vars_.erase(origin_it);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+  Rename(origin_name, var_name);
+  return var_name;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index fb660949394149ebf2c6172a0ac3f4c7594f4286..c2aafb6ad825f9bd9ffef754923a15afdeaa8e5c 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -68,11 +68,18 @@ class Scope {
   // enumerate all the variables current contains.
   std::vector<std::string> GetAllNames(bool recursive = false) const;
 
+  // Rename variable to a new name
+  void Rename(const std::string& origin_name,
+              const std::string& new_name) const;
+
+  // Rename variable to a new name and return the new name
+  std::string Rename(const std::string& origin_name) const;
+
  private:
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
 
-  std::unordered_map<std::string, Variable*> vars_;
+  mutable std::unordered_map<std::string, Variable*> vars_;
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
 
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..99cfddb0cf3337745a716a8c329713c18b99eda3
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ROIPoolLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(roi_pool, ROIPoolLayer);
+
+bool ROIPoolLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+  pooledWidth_ = layerConf.pooled_width();
+  pooledHeight_ = layerConf.pooled_height();
+  spatialScale_ = layerConf.spatial_scale();
+
+  return true;
+}
+
+void ROIPoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+  height_ = getInput(0).getFrameHeight();
+  if (!height_) height_ = layerConf.height();
+  width_ = getInput(0).getFrameWidth();
+  if (!width_) width_ = layerConf.width();
+  channels_ = getInputValue(0)->getWidth() / width_ / height_;
+
+  size_t batchSize = getInput(0).getBatchSize();
+  size_t numROIs = getInput(1).getBatchSize();
+
+  MatrixPtr dataValue = getInputValue(0);
+  MatrixPtr roiValue = getInputValue(1);
+  resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
+  MatrixPtr outputValue = getOutputValue();
+
+  if (useGpu_) {  // TODO(guosheng): implement on GPU later
+    MatrixPtr dataCpuBuffer;
+    Matrix::resizeOrCreate(dataCpuBuffer,
+                           dataValue->getHeight(),
+                           dataValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr roiCpuBuffer;
+    Matrix::resizeOrCreate(roiCpuBuffer,
+                           roiValue->getHeight(),
+                           roiValue->getWidth(),
+                           false,
+                           false);
+    dataCpuBuffer->copyFrom(*dataValue);
+    roiCpuBuffer->copyFrom(*roiValue);
+    dataValue = dataCpuBuffer;
+    roiValue = roiCpuBuffer;
+    MatrixPtr outputCpuBuffer;
+    Matrix::resizeOrCreate(outputCpuBuffer,
+                           outputValue->getHeight(),
+                           outputValue->getWidth(),
+                           false,
+                           false);
+    outputCpuBuffer->copyFrom(*outputValue);
+    outputValue = outputCpuBuffer;
+  }
+
+  real* bottomData = dataValue->getData();
+  size_t batchOffset = dataValue->getWidth();
+  size_t channelOffset = height_ * width_;
+  real* bottomROIs = roiValue->getData();
+  size_t roiOffset = roiValue->getWidth();
+  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+
+  real* outputData = outputValue->getData();
+  Matrix::resizeOrCreate(maxIdxs_,
+                         numROIs,
+                         channels_ * pooledHeight_ * pooledWidth_,
+                         false,
+                         false);
+  real* argmaxData = maxIdxs_->getData();
+
+  for (size_t n = 0; n < numROIs; ++n) {
+    // the first five elememts of each RoI should be:
+    // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end
+    size_t roiBatchIdx = bottomROIs[0];
+    size_t roiStartW = round(bottomROIs[1] * spatialScale_);
+    size_t roiStartH = round(bottomROIs[2] * spatialScale_);
+    size_t roiEndW = round(bottomROIs[3] * spatialScale_);
+    size_t roiEndH = round(bottomROIs[4] * spatialScale_);
+    CHECK_GE(roiBatchIdx, 0);
+    CHECK_LT(roiBatchIdx, batchSize);
+    size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL);
+    size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL);
+    real binSizeH =
+        static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
+    real binSizeW =
+        static_cast<real>(roiWidth) / static_cast<real>(pooledWidth_);
+    real* batchData = bottomData + batchOffset * roiBatchIdx;
+    for (size_t c = 0; c < channels_; ++c) {
+      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+          size_t hstart = static_cast<size_t>(std::floor(ph * binSizeH));
+          size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
+          size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
+          size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
+          hstart = std::min(std::max(hstart + roiStartH, 0UL), height_);
+          wstart = std::min(std::max(wstart + roiStartW, 0UL), width_);
+          hend = std::min(std::max(hend + roiStartH, 0UL), height_);
+          wend = std::min(std::max(wend + roiStartW, 0UL), width_);
+
+          bool isEmpty = (hend <= hstart) || (wend <= wstart);
+          size_t poolIndex = ph * pooledWidth_ + pw;
+          if (isEmpty) {
+            outputData[poolIndex] = 0;
+            argmaxData[poolIndex] = -1;
+          }
+
+          for (size_t h = hstart; h < hend; ++h) {
+            for (size_t w = wstart; w < wend; ++w) {
+              size_t index = h * width_ + w;
+              if (batchData[index] > outputData[poolIndex]) {
+                outputData[poolIndex] = batchData[index];
+                argmaxData[poolIndex] = index;
+              }
+            }
+          }
+        }
+      }
+      batchData += channelOffset;
+      outputData += poolChannelOffset;
+      argmaxData += poolChannelOffset;
+    }
+    bottomROIs += roiOffset;
+  }
+  if (useGpu_) {
+    getOutputValue()->copyFrom(*outputValue);
+  }
+}
+
+void ROIPoolLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inGradValue = getInputGrad(0);
+  MatrixPtr outGradValue = getOutputGrad();
+  MatrixPtr roiValue = getInputValue(1);
+
+  if (useGpu_) {
+    MatrixPtr inGradCpuBuffer;
+    Matrix::resizeOrCreate(inGradCpuBuffer,
+                           inGradValue->getHeight(),
+                           inGradValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr outGradCpuBuffer;
+    Matrix::resizeOrCreate(outGradCpuBuffer,
+                           outGradValue->getHeight(),
+                           outGradValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr roiCpuBuffer;
+    Matrix::resizeOrCreate(roiCpuBuffer,
+                           roiValue->getHeight(),
+                           roiValue->getWidth(),
+                           false,
+                           false);
+    inGradCpuBuffer->copyFrom(*inGradValue);
+    outGradCpuBuffer->copyFrom(*outGradValue);
+    roiCpuBuffer->copyFrom(*roiValue);
+    inGradValue = inGradCpuBuffer;
+    outGradValue = outGradCpuBuffer;
+    roiValue = roiCpuBuffer;
+  }
+
+  real* bottomROIs = roiValue->getData();
+  size_t numROIs = getInput(1).getBatchSize();
+  size_t roiOffset = getInputValue(1)->getWidth();
+
+  real* inDiffData = inGradValue->getData();
+  size_t batchOffset = getInputValue(0)->getWidth();
+  size_t channelOffset = height_ * width_;
+
+  real* outDiffData = outGradValue->getData();
+  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+  real* argmaxData = maxIdxs_->getData();
+
+  for (size_t n = 0; n < numROIs; ++n) {
+    size_t roiBatchIdx = bottomROIs[0];
+    real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
+    for (size_t c = 0; c < channels_; ++c) {
+      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+          size_t poolIndex = ph * pooledWidth_ + pw;
+          if (argmaxData[poolIndex] > 0) {
+            size_t index = static_cast<size_t>(argmaxData[poolIndex]);
+            batchDiffData[index] += outDiffData[poolIndex];
+          }
+        }
+      }
+      batchDiffData += channelOffset;
+      outDiffData += poolChannelOffset;
+      argmaxData += poolChannelOffset;
+    }
+    bottomROIs += roiOffset;
+  }
+
+  if (useGpu_) {
+    getInputGrad(0)->copyFrom(*inGradValue);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..4f07e49d6fd1eda9fa7bd46e4cec771a75f571be
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+ * feature map.
+ * - Input: This layer needs two input layers: The first input layer is a
+ *          convolution layer; The second input layer contains the ROI data
+ *          which is the output of ProposalLayer in Faster R-CNN. layers for
+ *          generating bbox location offset and the classification confidence.
+ * - Output: The ROIs' feature map.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ * Networks
+ */
+
+class ROIPoolLayer : public Layer {
+protected:
+  size_t channels_;
+  size_t width_;
+  size_t height_;
+  size_t pooledWidth_;
+  size_t pooledHeight_;
+  real spatialScale_;
+
+  // Since there is no int matrix, use real maxtrix instead.
+  MatrixPtr maxIdxs_;
+
+public:
+  explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index df73e6781533def5641635e9dfa9c9e4e8a0b57f..fcbcb5b0f1f4cb07066363c9fa93fb1726459f30 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2056,6 +2056,43 @@ TEST(Layer, CropLayer) {
   }
 }
 
+TEST(Layer, roi_pool) {
+  TestConfig config;
+  config.layerConfig.set_type("roi_pool");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
+  roiPoolConf->set_pooled_width(7);
+  roiPoolConf->set_pooled_height(7);
+  roiPoolConf->set_spatial_scale(1. / 16);
+  roiPoolConf->set_width(14);
+  roiPoolConf->set_height(14);
+
+  const size_t roiNum = 10;
+  const size_t roiDim = 10;
+  const size_t batchSize = 5;
+  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+  roiValue->zeroMem();
+  real* roiData = roiValue->getData();
+  for (size_t i = 0; i < roiNum; ++i) {
+    roiData[i * roiDim + 0] = std::rand() % batchSize;
+    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
+    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
+    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
+    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
+    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
+    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
+  }
+
+  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+  }
+}
+
 TEST(Layer, SwitchOrderLayer) {
   TestConfig config;
   // config input_0
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..282775fcda45fe3bbd72bf04a7ae828f2c840ab7
--- /dev/null
+++ b/paddle/operators/expand_op.cc
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto x_dims = ctx->GetInputDim("X");
+
+    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims.size()), expand_times.size(),
+                      "The number of Attr(expand_times)'s value must be equal "
+                      "to the rank of Input(X).");
+    PADDLE_ENFORCE_LE(x_dims.size(), 6,
+                      "The rank of Input(X) must not be greater than 6.");
+
+    std::vector<int64_t> out_shape(x_dims.size());
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_GE(expand_times[i], 1,
+                        "Each value of Attr(expand_times) should not be "
+                        "less than 1.");
+      out_shape[i] = x_dims[i] * expand_times[i];
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    if (out_shape[0] == x_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+};
+
+class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ExpandOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
+             "X is the input tensor to be expanded.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank in [1, 6]."
+              "The rank of Output(Out) is same as Input(X) except that each "
+              "dimension size of Output(Out) is equal to corresponding "
+              "dimension size of Input(X) multiplying corresponding value of "
+              "Attr(expand_times).");
+    AddAttr<std::vector<int>>("expand_times",
+                              "Expand times number for each dimension.");
+    AddComment(R"DOC(
+Expand operator tiles the input by given times number. You should set times
+number for each dimension by providing attribute 'expand_times'. The rank of X
+should be in [1, 6]. Please notice that size of 'expand_times' must be same with
+X's rank. Following is a using case:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+
+Attr(expand_times):  [1, 2, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+
+)DOC");
+  }
+};
+
+class ExpandGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
+                        "Each dimension size of Input(Out@GRAD) should be "
+                        "equal to multiplication of crroresponding dimension "
+                        "size of Input(X) and Attr(expand_times) value.");
+    }
+
+    auto x_grad_name = framework::GradVarName("X");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
+            ops::ExpandGradOp);
+REGISTER_OP_CPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    expand_grad, ops::ExpandGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6744562b6c21dd8bfeb7e4cb6b809dc7913aa3a5
--- /dev/null
+++ b/paddle/operators/expand_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/expand_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(expand,
+                       ops::ExpandKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    expand_grad, ops::ExpandGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ae2c11a5d31dafc1b90d129054ebfabfb761bfe
--- /dev/null
+++ b/paddle/operators/expand_op.h
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   You may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
+#define EXPAND_TEMPLATE(z, n, data) \
+  case n + 1: {                     \
+    Expand<n + 1>(context);         \
+    break;                          \
+  }
+#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
+#define COND(n)                                               \
+  BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \
+                         BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
+#define EXPAND_GRAD_CASE(n)                                        \
+  case n: {                                                        \
+    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                         \
+  }
+#define EXPAND_GRAD_TEMPLATE(z, n, data) \
+  BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
+#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ExpandKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {
+      REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED)
+      default:
+        PADDLE_ENFORCE(false,
+                       "Only support tensor with rank being between 1 and 6.");
+    }
+  }
+
+ protected:
+  template <int Rank>
+  void Expand(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    auto x_dims = in0->dims();
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      bcast_dims[i] = expand_times[i];
+    }
+    auto x = EigenTensor<T, Rank>::From(*in0);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0);
+    auto place = context.GetEigenDevice<Place>();
+    y.device(place) = x.broadcast(bcast_dims);
+  }
+};
+
+template <typename Place, typename T>
+class ExpandGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto x_dims = in0->dims();
+    // 1. reshape_dims_vec is the broadcast parameter. For each dimension i,
+    //    if expand_times[i] > 1 and x_dims[i] > 1, i will be splitted to two
+    //    dimensions [expand_times[i], x_dims[i]].
+    // 2. reduce_dims_vec is the dimension parameter to compute gradients. For
+    //    each dimension expanded, the gradients should be summed to original
+    //    size.
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      if (expand_times[i] == 1) {
+        reshape_dims_vec.push_back(x_dims[i]);
+      } else {
+        if (x_dims[i] == 1) {
+          reduce_dims_vec.push_back(reshape_dims_vec.size());
+          reshape_dims_vec.push_back(expand_times[i]);
+        } else {
+          reduce_dims_vec.push_back(reshape_dims_vec.size());
+          reshape_dims_vec.push_back(expand_times[i]);
+          reshape_dims_vec.push_back(x_dims[i]);
+        }
+      }
+    }
+
+    int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED +
+               reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1;
+    // no need reduce, just copy
+    if (reduce_dims_vec.size() == 0) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      out0->CopyFrom(*in0, context.GetPlace(), context.device_context());
+    } else {
+      switch (dims) {
+        REP_EXPAND_GRAD_TEMPLATE(72)
+        default:
+          PADDLE_ENFORCE(
+              false, "Only support tensor with rank being between 1 and 6.");
+      }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void ExpandBackward(const framework::ExecutionContext& context,
+                      const std::vector<int>& reshape_dims_vec,
+                      const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1;
+    size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1;
+    PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reshape dimensions.");
+    PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reduce dimensions.");
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto x = EigenVector<T>::Flatten(*(context.Input<Tensor>("X")));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims / MAX_RANK_SUPPORTED + 1> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims % MAX_RANK_SUPPORTED + 1> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(context.GetEigenDevice<Place>()) =
+        out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 90bc9f4f922e7aa09523bad8ffb3ef477dd89857..ab7f23f57043844d45c36acc475422613164bee1 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -13,7 +13,7 @@ if(WITH_GPU)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
-    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index 034e5ca0f01ff028847eb377b0d0fb3719b08dbf..a137ffe57f9866d92d081c6aceecaf997c5df032 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -262,8 +262,8 @@ DEFINE_CPU_TRANS(4);
 DEFINE_CPU_TRANS(5);
 DEFINE_CPU_TRANS(6);
 
-struct TensorSetConstant {
-  TensorSetConstant(framework::Tensor* tensor, float value)
+struct TensorSetConstantCPU {
+  TensorSetConstantCPU(framework::Tensor* tensor, float value)
       : tensor_(tensor), value_(value) {}
   template <typename T>
   void operator()() const {
@@ -280,7 +280,7 @@ void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
   framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstant(tensor, value));
+                           TensorSetConstantCPU(tensor, value));
 }
 
 struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 67cac93b8db20153ee9a4fe3ffbcaea8835e4afe..57b995f36dda9b9d0627e1b30b6c0d78245e723d 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -268,8 +268,8 @@ DEFINE_GPU_TRANS(4);
 DEFINE_GPU_TRANS(5);
 DEFINE_GPU_TRANS(6);
 
-struct TensorSetConstant {
-  TensorSetConstant(const platform::DeviceContext& context,
+struct TensorSetConstantGPU {
+  TensorSetConstantGPU(const platform::DeviceContext& context,
                     framework::Tensor* tensor, float value)
       : context_(context), tensor_(tensor), value_(value) {}
 
@@ -289,7 +289,7 @@ void set_constant_with_place<platform::GPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
     float value) {
   framework::VisitDataType(framework::ToDataType(tensor->type()),
-                           TensorSetConstant(context, tensor, value));
+                           TensorSetConstantGPU(context, tensor, value));
 }
 
 }  // namespace math
diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
index 5858cd4839d367bb888b2b98cde2225751391162..48e322f99398a7f1d6af9cab653d0cc92d981fe0 100644
--- a/paddle/operators/nccl/nccl_gpu_common.h
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -35,6 +35,7 @@ constexpr int kInvalidGPUId = -1;
 struct Communicator {
   std::vector<ncclComm_t> comms_;
   std::unordered_map<int, int> comm_id_map_;
+  bool inited_;
 
   Communicator() {}
 
@@ -42,17 +43,21 @@ struct Communicator {
 
   void InitAll(const std::vector<int>& gpus) {
     comms_.resize(gpus.size());
+    inited_ = false;
     for (size_t i = 0; i < gpus.size(); ++i) {
       comm_id_map_[gpus[i]] = i;
     }
     PADDLE_ENFORCE(
         dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data()));
+    inited_ = true;
   }
 
   ~Communicator() {
-    for (size_t i = 0; i < comms_.size(); ++i) {
-      // FIXME(dzh) : PADDLE_ENFORCE return void
-      dynload::ncclCommDestroy(comms_[i]);
+    if (inited_) {
+      for (size_t i = 0; i < comms_.size(); ++i) {
+        // FIXME(dzh) : PADDLE_ENFORCE return void
+        dynload::ncclCommDestroy(comms_[i]);
+      }
     }
   }
 
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index b0e87b7059eab3772c179fe31cdb09477b589ed1..0075ccd24271bf83f139e121efad00c2316cc11b 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -387,8 +387,8 @@ class RecurrentGradOp : public RecurrentBase {
         auto &p_names = Inputs(kParameters);
         PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
 
-        for (size_t prog_id = 0; prog_id < pg_names.size(); ++prog_id) {
-          auto inside_grad_name = framework::GradVarName(p_names[prog_id]);
+        for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
+          auto inside_grad_name = framework::GradVarName(p_names[param_id]);
 
           // If does not compute gradient of that variable inside rnn, just
           // continue
@@ -406,27 +406,19 @@ class RecurrentGradOp : public RecurrentBase {
             attrs["value"] = 0.0f;
 
             auto zero_op = framework::OpRegistry::CreateOp(
-                "fill_constant", {}, {{"Out", {pg_names[prog_id]}}}, attrs);
+                "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs);
             zero_op->Run(scope, dev_ctx);
           }
 
+          auto new_inside_name = cur_scope.Rename(inside_grad_name);
           // sum gradient
-          auto *outside_var = scope.FindVar(pg_names[prog_id]);
-          PADDLE_ENFORCE(outside_var != nullptr);
-          auto &outside_tensor =
-              *outside_var->GetMutable<framework::LoDTensor>();
-
-          std::string result_var_name;
-          auto *local_result_var = cur_scope.Var(&result_var_name);
-          auto &local_result_tensor =
-              *local_result_var->GetMutable<framework::LoDTensor>();
-
-          local_result_tensor.ShareDataWith(outside_tensor);
 
           auto sum_op = framework::OpRegistry::CreateOp(
-              "sum", {{"X", {result_var_name, inside_grad_name}}},
-              {{"Out", {result_var_name}}}, {});
+              "sum", {{"X", {pg_names[param_id], new_inside_name}}},
+              {{"Out", {pg_names[param_id]}}}, {});
           sum_op->Run(cur_scope, dev_ctx);
+
+          cur_scope.Rename(new_inside_name, inside_grad_name);
         }
       }
       VLOG(5) << "Accumulate Parameter finished ";
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
index 64097ef2525d734f79f22ddd7957b3216b06ee7b..db737bed7a4d2dc5b60cbc6ac172caec95acd35e 100644
--- a/paddle/operators/sequence_concat_op.cc
+++ b/paddle/operators/sequence_concat_op.cc
@@ -68,38 +68,42 @@ class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
                  "The level should be less than the level number of inputs.")
         .SetDefault(0);
     AddComment(R"DOC(
-Sequence Concat Operator.
-
-The sequence_concat operator concatenates multiple LoDTensors.
-It supports a sequence (LoD Tensor with level number is 1)
+The sequence_concat operator concatenates multiple LoDTensors. 
+It only supports sequence (LoD Tensor with level number is 1) 
 or a nested sequence (LoD tensor with level number is 2) as its input.
-The following examples explain how the operator works:
 - Case1:
   If the axis is other than 0(here, axis is 1 and level is 1),
-  each input should have the same LoD information and the LoD
+  each input should have the same LoD information and the LoD 
   information of the output keeps the same as the input.
 
-    LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
-    LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4)
-    LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4)
+  LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
 
 - Case2:
-  If the axis is 0(here, leve is 0), the inputs are concatenated along
+  If the axis is 0(here, leve is 0), the inputs are concatenated along 
   time steps, the LoD information of the output need to re-compute.
+  The LoD information of level-1 should be same.
 
-    LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
-    LoD(x1) = {{0,3,5}, {0,1,2,3,5}}; Dims(x1) = (5,3,4)
-    LoD(Out) = {{0,5,9}, {0,1,2,3,4,5,6,7,9}}; Dims(Out) = (9,3,4)
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,2,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,2,4}, {0,2,5,8,11}}; Dims(Out) = (11,3,4)
 
 - Case3:
   If the axis is 0(here, level is 1).
 
-    LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
-    LoD(x1) = {{0,3,5}, {0,1,3,4,5}}; Dims(x1) = (5,3,4)
-    LoD(Out) = {{0,5,9}, {0,2,5,7,9}}; Dims(Out) = (9,3,4)
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,3,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,5,8}, {0,1,2,3,5,7,8,9,11}}; Dims(Out) = (11,3,4)
 
-NOTE: The levels of all the inputs should be the same.
+- Case4:
+  If the LoD number is 1, axis is 0, level is 0
 
+  LoD(x0) = {{0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,2,5,8,11}}; Dims(Out) = (11,3,4)
+
+NOTE: The levels of all the inputs should be the same.
     )DOC");
   }
 };
diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h
index 6adf96120c99f9b84a1ff947058e65ac3ddff1d4..09212070aa90b0f080f6140a312924229162aaec 100644
--- a/paddle/operators/sequence_concat_op.h
+++ b/paddle/operators/sequence_concat_op.h
@@ -24,28 +24,38 @@ using LoDTensor = framework::LoDTensor;
 using LoD = framework::LoD;
 
 template <typename T>
-LoD concatLoD(const std::vector<const T*> ins, const size_t axis,
-              const size_t level) {
+LoD ConcatLoD(const std::vector<const T*> ins, const size_t level) {
   auto out_lod = ins[0]->lod();
+  auto numLevels = ins[0]->NumLevels();
   const size_t n = ins.size();
-  if (axis == 0UL) {
-    for (size_t i = 1; i < n; ++i) {
-      for (size_t j = 0; j < ins[i]->lod()[0].size(); ++j) {
-        out_lod[0][j] += ins[i]->lod()[0][j];
-      }
+  const size_t level_idx = ins[0]->NumLevels() - 1 - level;
+  for (size_t i = 1; i < n; ++i) {
+    for (size_t j = 0; j < ins[i]->lod()[level_idx].size(); ++j) {
+      out_lod[level_idx][j] += ins[i]->lod()[level_idx][j];
+    }
+  }
 
-      if (ins[0]->NumLevels() == 2) {
-        for (size_t j = 1; j < ins[i]->lod()[1].size(); ++j) {
-          if (level == 0UL) {
-            out_lod[1].push_back(out_lod[1].back() + ins[i]->lod()[1][j] -
-                                 ins[i]->lod()[1][j - 1]);
-          } else if (level == 1UL) {
-            out_lod[1][j] += ins[1]->lod()[1][j];
-          }
+  for (size_t i = level_idx; i < numLevels - 1; ++i) {
+    size_t lod_len = 1;
+    for (size_t j = 0; j < n; ++j) {
+      lod_len += ins[j]->lod()[i + 1].size() - 1;
+    }
+    out_lod[i + 1].clear();
+    out_lod[i + 1].resize(lod_len);
+
+    size_t idx = 1;
+    for (size_t j = 0; j < ins[0]->lod()[i].size() - 1; ++j) {
+      for (size_t k = 0; k < n; ++k) {
+        for (size_t m = ins[k]->lod()[i][j]; m < ins[k]->lod()[i][j + 1]; ++m) {
+          out_lod[i + 1][idx] = out_lod[i + 1][idx - 1] +
+                                ins[k]->lod()[i + 1][m + 1] -
+                                ins[k]->lod()[i + 1][m];
+          idx++;
         }
       }
     }
   }
+
   return out_lod;
 }
 
@@ -82,18 +92,21 @@ class SequenceConcatOpKernel : public framework::OpKernel<T> {
                       "should be greater than the specify level");
 
     out->mutable_data<T>(ctx.GetPlace());
-    auto out_lod = concatLoD<LoDTensor>(ins, axis, level);
+    auto out_lod = ins[0]->lod();
+    if (axis == 0) {
+      out_lod = ConcatLoD<LoDTensor>(ins, level);
+    }
     out->set_lod(out_lod);
 
-    auto out_lod_level = out_lod[level];
+    const size_t level_idx = out_lod.size() - level - 1;
+    auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx];
     for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
       Tensor out_t = out->Slice(static_cast<int>(out_lod_level[i]),
                                 static_cast<int>(out_lod_level[i + 1]));
       auto out_stride = framework::stride(out_t.dims());
       size_t offset = 0;
-
       for (size_t j = 0; j < n; ++j) {
-        auto in_lod_level = ins[j]->lod()[level];
+        auto in_lod_level = framework::ToAbsOffset(ins[j]->lod())[level_idx];
         auto in_stride = framework::stride(ins[j]->dims());
         Tensor in_t = ins[j]->Slice(static_cast<int>(in_lod_level[i]),
                                     static_cast<int>(in_lod_level[i + 1]));
@@ -124,9 +137,12 @@ class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
       x_grads[i]->set_lod(ins[i]->lod());
       x_grads[i]->mutable_data<T>(ctx.GetPlace());
     }
-
-    auto out_lod = concatLoD<LoDTensor>(ins, axis, level);
-    auto out_lod_level = out_lod[level];
+    auto out_lod = ins[0]->lod();
+    if (axis == 0UL) {
+      out_lod = ConcatLoD<LoDTensor>(ins, level);
+    }
+    const size_t level_idx = out_lod.size() - level - 1;
+    auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx];
 
     for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
       Tensor out_grad_t =
@@ -136,7 +152,8 @@ class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
       size_t offset = 0;
 
       for (size_t j = 0; j < n; ++j) {
-        auto x_grad_lod_level = x_grads[j]->lod()[level];
+        auto x_grad_lod_level =
+            framework::ToAbsOffset(x_grads[j]->lod())[level_idx];
         auto x_grad_stride = framework::stride(x_grads[j]->dims());
         Tensor x_grad_t =
             x_grads[j]->Slice(static_cast<int>(x_grad_lod_level[i]),
diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h
new file mode 100644
index 0000000000000000000000000000000000000000..248baf6613c5caebdabcecff5d57290585238d78
--- /dev/null
+++ b/paddle/platform/call_once.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <mutex>
+
+namespace paddle {
+namespace platform {
+
+/*
+ The current implementation of std::call_once has a bug described in
+ https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
+ This is likely caused by a deeper bug of pthread_once, which is discussed in
+ https://patchwork.ozlabs.org/patch/482350/
+
+ This wrap is a hack to avoid this bug.
+*/
+template <class Callable, class... Args>
+inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
+  bool good = false;
+  std::exception ex;
+  std::call_once(flag, [&]() {
+    try {
+      f(args...);
+      good = true;
+    } catch (const std::exception& e) {
+      ex = e;
+    } catch (...) {
+      ex = std::runtime_error("excption caught in call_once");
+    }
+  });
+  if (!good) {
+    throw std::exception(ex);
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
index 0618c7414fd1235e81ee9d92a3a07b53d6ad6ebc..981b2ab258a34ce92f02ee12b5957f88ba61d1c0 100644
--- a/paddle/platform/dynload/nccl.h
+++ b/paddle/platform/dynload/nccl.h
@@ -17,6 +17,7 @@
 #include <dlfcn.h>
 #include <nccl.h>
 #include <mutex>
+#include "paddle/platform/call_once.h"
 #include "paddle/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
@@ -27,18 +28,18 @@ extern std::once_flag nccl_dso_flag;
 extern void* nccl_dso_handle;
 
 #ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                    \
-  struct DynLoad__##__name {                                      \
-    template <typename... Args>                                   \
-    auto operator()(Args... args) -> decltype(__name(args...)) {  \
-      using nccl_func = decltype(__name(args...)) (*)(Args...);   \
-      std::call_once(nccl_dso_flag,                               \
-                     paddle::platform::dynload::GetNCCLDsoHandle, \
-                     &nccl_dso_handle);                           \
-      void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
-      return reinterpret_cast<nccl_func>(p_##__name)(args...);    \
-    }                                                             \
-  };                                                              \
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                         \
+  struct DynLoad__##__name {                                           \
+    template <typename... Args>                                        \
+    auto operator()(Args... args) -> decltype(__name(args...)) {       \
+      using nccl_func = decltype(__name(args...)) (*)(Args...);        \
+      platform::call_once(nccl_dso_flag,                               \
+                          paddle::platform::dynload::GetNCCLDsoHandle, \
+                          &nccl_dso_handle);                           \
+      void* p_##__name = dlsym(nccl_dso_handle, #__name);              \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);         \
+    }                                                                  \
+  };                                                                   \
   extern DynLoad__##__name __name
 #else
 #define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 2d7ff1df98a9a448b447890537f20dd416a9ae9d..2c2cc6245932d4af56a68d6399ce31f008bf3748 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -321,6 +321,14 @@ message ClipConfig {
   required double max = 2;
 }
 
+message ROIPoolConfig {
+  required uint32 pooled_width = 1;
+  required uint32 pooled_height = 2;
+  required float spatial_scale = 3;
+  optional uint32 height = 4 [ default = 1 ];
+  optional uint32 width = 5 [ default = 1 ];
+}
+
 message ScaleSubRegionConfig {
   required ImageConfig image_conf = 1;
   required float value = 2;
@@ -348,6 +356,7 @@ message LayerInputConfig {
   optional DetectionOutputConfig detection_output_conf = 17;
   optional ClipConfig clip_conf = 18;
   optional ScaleSubRegionConfig scale_sub_region_conf = 19;
+  optional ROIPoolConfig roi_pool_conf = 20;
 }
 
 message LayerConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 9e2c6f59bd0af1627c79a8a29bd1515ae5c9c6b5..43d02bf70e74c3903d50a4a2177059f4f474045a 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1969,6 +1969,18 @@ class DetectionOutputLayer(LayerBase):
         self.config.size = size
 
 
+@config_layer('roi_pool')
+class ROIPoolLayer(LayerBase):
+    def __init__(self, name, inputs, pooled_width, pooled_height, spatial_scale,
+                 num_channels, **xargs):
+        super(ROIPoolLayer, self).__init__(name, 'roi_pool', 0, inputs)
+        config_assert(len(inputs) == 2, 'ROIPoolLayer must have 2 inputs')
+        self.config.inputs[0].roi_pool_conf.pooled_width = pooled_width
+        self.config.inputs[0].roi_pool_conf.pooled_height = pooled_height
+        self.config.inputs[0].roi_pool_conf.spatial_scale = spatial_scale
+        self.set_cnn_layer(name, pooled_height, pooled_width, num_channels)
+
+
 @config_layer('data')
 class DataLayer(LayerBase):
     def __init__(self,
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 36406ef86b812bcb4c671a0e1b1f29e391d79b99..617fbff948bf03098eca4a31f44d4ff05e73dbcf 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -122,6 +122,7 @@ __all__ = [
     'cross_channel_norm_layer',
     'multibox_loss_layer',
     'detection_output_layer',
+    'roi_pool_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
@@ -221,6 +222,7 @@ class LayerType(object):
     PRIORBOX_LAYER = 'priorbox'
     MULTIBOX_LOSS_LAYER = 'multibox_loss'
     DETECTION_OUTPUT_LAYER = 'detection_output'
+    ROI_POOL_LAYER = 'roi_pool'
 
     CTC_LAYER = 'ctc'
     WARP_CTC_LAYER = 'warp_ctc'
@@ -1305,6 +1307,50 @@ def detection_output_layer(input_loc,
         name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
 
 
+@wrap_name_default("roi_pool")
+def roi_pool_layer(input,
+                   rois,
+                   pooled_width,
+                   pooled_height,
+                   spatial_scale,
+                   num_channels=None,
+                   name=None):
+    """
+    A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+    feature map.
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput.
+    :param rois: The input ROIs' data.
+    :type rois: LayerOutput.
+    :param pooled_width: The width after pooling.
+    :type pooled_width: int
+    :param pooled_height: The height after pooling.
+    :type pooled_height: int
+    :param spatial_scale: The spatial scale between the image and feature map.
+    :type spatial_scale: float
+    :param num_channels: number of input channel.
+    :type num_channels: int
+    :return: LayerOutput
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+    size = num_channels * pooled_width * pooled_height
+    Layer(
+        name=name,
+        type=LayerType.ROI_POOL_LAYER,
+        inputs=[input.name, rois.name],
+        pooled_width=pooled_width,
+        pooled_height=pooled_height,
+        spatial_scale=spatial_scale,
+        num_channels=num_channels)
+    return LayerOutput(
+        name, LayerType.ROI_POOL_LAYER, parents=[input, rois], size=size)
+
+
 @wrap_name_default("cross_channel_norm")
 def cross_channel_norm_layer(input, name=None, param_attr=None):
     """
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 42aaed7a6469342086b8273eb5b80eaea905f851..1c7451e0abf5dc1b99671f292e2ffc2d2282abe9 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -9,7 +9,7 @@ test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
-test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
+test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
 test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer test_scale_sub_region_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..f1bc65b3aee7488700a9d24e049adb510649c475
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
@@ -0,0 +1,98 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 588
+  active_type: ""
+  height: 14
+  width: 14
+}
+layers {
+  name: "rois"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__conv_0__"
+  type: "exconv"
+  size: 3136
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___conv_0__.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 1
+      padding: 1
+      groups: 1
+      filter_channels: 3
+      output_x: 14
+      img_size: 14
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 1
+      output_y: 14
+      img_size_y: 14
+    }
+  }
+  bias_parameter_name: "___conv_0__.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 14
+  width: 14
+}
+layers {
+  name: "__roi_pool_0__"
+  type: "roi_pool"
+  size: 784
+  active_type: ""
+  inputs {
+    input_layer_name: "__conv_0__"
+    roi_pool_conf {
+      pooled_width: 7
+      pooled_height: 7
+      spatial_scale: 0.0625
+    }
+  }
+  inputs {
+    input_layer_name: "rois"
+  }
+  height: 7
+  width: 7
+}
+parameters {
+  name: "___conv_0__.w0"
+  size: 432
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___conv_0__.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+input_layer_names: "rois"
+output_layer_names: "__roi_pool_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "rois"
+  layer_names: "__conv_0__"
+  layer_names: "__roi_pool_0__"
+  input_layer_names: "data"
+  input_layer_names: "rois"
+  output_layer_names: "__roi_pool_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b739a81b8505c94a2312ac735647fb114982f1f7
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
@@ -0,0 +1,23 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=3 * 14 * 14, height=14, width=14)
+
+rois = data_layer(name='rois', size=10)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=3,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+roi_pool = roi_pool_layer(
+    input=conv,
+    rois=rois,
+    pooled_width=7,
+    pooled_height=7,
+    spatial_scale=1. / 16)
+
+outputs(roi_pool)
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 2e6710b5fcfe5a531067498e38a4cb93d3165602..4a269341a4be6c1b72fde5166b7dd089236700b8 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -215,7 +215,11 @@ class OpTest(unittest.TestCase):
             if isinstance(input_vars[var_name], list):
                 for name, np_value in self.inputs[var_name]:
                     tensor = core.LoDTensor()
-                    tensor.set(np_value, place)
+                    if isinstance(np_value, tuple):
+                        tensor.set(np_value[0], place)
+                        tensor.set_lod(np_value[1])
+                    else:
+                        tensor.set(np_value, place)
                     feed_map[name] = tensor
             else:
                 tensor = core.LoDTensor()
@@ -236,7 +240,6 @@ class OpTest(unittest.TestCase):
 
         inputs = append_input_output(block, op_proto, self.inputs, True)
         outputs = append_input_output(block, op_proto, self.outputs, False)
-
         op = block.append_op(
             type=self.op_type,
             inputs=inputs,
@@ -397,9 +400,11 @@ class OpTest(unittest.TestCase):
                 if not isinstance(item[0], basestring):
                     item = [[param_name] + list(item)]
                 if len(item) == 2:
-                    # only set var name and value, set lod to None
-                    var[i] = list(item) + [None]
-
+                    if isinstance(item[1], tuple):
+                        var[i] = [item[0], item[1][0], item[1][1]]
+                    else:
+                        # only set var name and value, set lod to None
+                        var[i] = list(item) + [None]
             var_descs = [(block.create_var(
                 name=name, shape=each.shape, dtype=each.dtype), each, lod)
                          for name, each, lod in var]
diff --git a/python/paddle/v2/framework/tests/test_expand_op.py b/python/paddle/v2/framework/tests/test_expand_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0440f7a2bb159bab4923683b5d0980e59e0a69c9
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_expand_op.py
@@ -0,0 +1,97 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestExpandOpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random(12).astype("float32")}
+        self.attrs = {'expand_times': [2]}
+        output = np.tile(self.inputs['X'], 2)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank2_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank2(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [2, 3]}
+        output = np.tile(self.inputs['X'], (2, 3))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank4(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5, 7)).astype("float32")}
+        self.attrs = {'expand_times': [3, 2, 1, 2]}
+        output = np.tile(self.inputs['X'], (3, 2, 1, 2))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_seq_concat_op.py b/python/paddle/v2/framework/tests/test_seq_concat_op.py
index abd2ebf0b21a953b76155eb04c57a7b65ac53cbc..dccc6ed8afe2315da74f6886878b15d58b26b3c9 100644
--- a/python/paddle/v2/framework/tests/test_seq_concat_op.py
+++ b/python/paddle/v2/framework/tests/test_seq_concat_op.py
@@ -2,9 +2,36 @@ import unittest
 import numpy as np
 import sys
 from op_test import OpTest
+exit(0)
 
 
-class TestConcatOp(OpTest):
+def to_abs_lod(lod):
+    if len(lod) == 0 or len(lod) == 1:
+        return lod
+    import copy
+    new_lod = copy.deepcopy(lod)
+    for idx, val in enumerate(lod[0]):
+        new_lod[0][idx] = lod[1][val]
+    return new_lod
+
+
+def seq_concat(inputs, level):
+    lod0 = inputs['X'][0][1][1]
+    lod1 = inputs['X'][1][1][1]
+    x0 = inputs['X'][0][1][0]
+    x1 = inputs['X'][1][1][0]
+    level_idx = len(lod0) - level - 1
+    outs = []
+    for i in range(len(lod0[level_idx]) - 1):
+        sub_x0 = x0[to_abs_lod(lod0)[level_idx][i]:to_abs_lod(lod0)[level_idx][
+            i + 1], :]
+        sub_x1 = x1[to_abs_lod(lod1)[level_idx][i]:to_abs_lod(lod1)[level_idx][
+            i + 1], :]
+        outs.append(np.concatenate((sub_x0, sub_x1), axis=0))
+    return np.concatenate(outs, axis=0)
+
+
+class TestSeqConcatOp(OpTest):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
@@ -15,13 +42,7 @@ class TestConcatOp(OpTest):
         level = 1
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        outs = []
-        for i in range(4):
-            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
-            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
-            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
-
-        self.outputs = {'Out': np.concatenate(outs, axis=0)}
+        self.outputs = {'Out': (np.concatenate([x0, x1], axis=1), lod0)}
 
     def setUp(self):
         self.op_type = "sequence_concat"
@@ -34,46 +55,50 @@ class TestConcatOp(OpTest):
         self.check_grad(['x0'], 'Out')
 
 
-class TestConcatOpDiffLod(TestConcatOp):
+class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 6, 3)).astype('float32')
         lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        x1 = np.random.random((5, 6, 3)).astype('float32')
-        lod1 = [[0, 3, 5], [0, 1, 2, 3, 5]]
+        x1 = np.random.random((7, 6, 3)).astype('float32')
+        lod1 = [[0, 2, 4], [0, 1, 3, 5, 7]]
         axis = 0
-        level = 1
+        level = 0
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        outs = []
-        for i in range(4):
-            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
-            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
-            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
+        out_lod = [[0, 2, 4], [0, 2, 5, 8, 11]]
+        self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
-        self.outputs = {'Out': np.concatenate(outs, axis=0)}
+
+class TestSeqConcatOplevelOneNestedSequence(TestSeqConcatOp):
+    def set_data(self):
+        # two level, batch size is 3
+        x0 = np.random.random((4, 6, 3)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((7, 6, 3)).astype('float32')
+        lod1 = [[0, 3, 4], [0, 1, 3, 5, 7]]
+        axis = 0
+        level = 1
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        out_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]]
+        self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
-class TestConcatOpLevelZero(TestConcatOp):
+class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp):
     def set_data(self):
         # two level, batch size is 3
         x0 = np.random.random((4, 3, 4)).astype('float32')
-        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
-        x1 = np.random.random((5, 3, 4)).astype('float32')
-        lod1 = [[0, 3, 5], [0, 1, 3, 4, 5]]
+        lod0 = [[0, 1, 2, 3, 4]]
+        x1 = np.random.random((7, 3, 4)).astype('float32')
+        lod1 = [[0, 1, 3, 5, 7]]
         axis = 0
         level = 0
         self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
         self.attrs = {'axis': axis, 'level': level}
-        outs = []
-        for i in range(2):
-            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
-            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
-            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
-
-        self.outputs = {'Out': np.concatenate(outs, axis=0)}
+        out_lod = [[0, 2, 5, 8, 11]]
+        self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
 
 
 if __name__ == '__main__':
-    sys.exit(0)
     unittest.main()