Commit 740e1626 (机器未来 / Paddle, forked from PaddlePaddle/Paddle)
Authored Dec 17, 2018 by Yu Yang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/refine_w2v

test=develop

Parents: 7b10bf0e 3628d894

Showing 110 changed files with 5,468 additions and 1,125 deletions (+5468, -1125)

README.md  +81 -0
benchmark/fluid/fluid_benchmark.py  +3 -1
cmake/external/brpc.cmake  +12 -8
cmake/external/gtest.cmake  +7 -3
cmake/external/leveldb.cmake  +2 -2
paddle/fluid/API.spec  +3 -0
paddle/fluid/framework/CMakeLists.txt  +10 -3
paddle/fluid/framework/data_type_test.cc  +1 -1
paddle/fluid/framework/details/CMakeLists.txt  +14 -10
paddle/fluid/framework/details/computation_op_handle.cc  +4 -2
paddle/fluid/framework/details/computation_op_handle.h  +5 -1
paddle/fluid/framework/details/eager_deletion_op_handle.cc  +122 -0
paddle/fluid/framework/details/eager_deletion_op_handle.h  +58 -0
paddle/fluid/framework/details/eager_deletion_pass.cc  +101 -0
paddle/fluid/framework/details/eager_deletion_pass.h  +32 -0
paddle/fluid/framework/details/multi_devices_graph_pass.cc  +3 -3
paddle/fluid/framework/details/op_graph_view.cc  +3 -0
paddle/fluid/framework/details/op_graph_view.h  +28 -1
paddle/fluid/framework/details/reference_count_op_handle.h  +0 -138
paddle/fluid/framework/details/reference_count_pass.cc  +199 -147
paddle/fluid/framework/details/reference_count_pass.h  +0 -5
paddle/fluid/framework/details/reference_count_pass_helper.cc  +21 -0
paddle/fluid/framework/details/reference_count_pass_helper.h  +51 -0
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc  +0 -18
paddle/fluid/framework/executor.cc  +99 -44
paddle/fluid/framework/executor.h  +13 -40
paddle/fluid/framework/executor_thread_worker.cc  +3 -0
paddle/fluid/framework/garbage_collector.cc  +89 -0
paddle/fluid/framework/garbage_collector.h  +59 -92
paddle/fluid/framework/ir/CMakeLists.txt  +2 -0
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc  +106 -0
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc  +105 -0
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h  +33 -0
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc  +104 -0
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h  +33 -0
paddle/fluid/framework/ir/graph.h  +9 -2
paddle/fluid/framework/ir/graph_pattern_detector.cc  +112 -1
paddle/fluid/framework/ir/graph_pattern_detector.h  +45 -0
paddle/fluid/framework/ir/pass.h  +9 -2
paddle/fluid/framework/op_kernel_type_test.cc  +2 -1
paddle/fluid/framework/operator.cc  +2 -0
paddle/fluid/framework/parallel_executor.cc  +100 -40
paddle/fluid/framework/parallel_executor.h  +1 -23
paddle/fluid/framework/scope.cc  +6 -0
paddle/fluid/framework/scope.h  +1 -0
paddle/fluid/framework/tensor.h  +5 -2
paddle/fluid/inference/api/analysis_predictor_tester.cc  +6 -1
paddle/fluid/inference/api/api_impl_tester.cc  +1 -1
paddle/fluid/inference/api/paddle_pass_builder.h  +4 -1
paddle/fluid/inference/io.cc  +1 -1
paddle/fluid/inference/tests/api/trt_models_tester.cc  +24 -1
paddle/fluid/operators/controlflow/CMakeLists.txt  +1 -1
paddle/fluid/operators/controlflow/while_op.cc  +28 -2
paddle/fluid/operators/conv_op.cc  +3 -1
paddle/fluid/operators/cudnn_lstm_op.cu.cc  +2 -0
paddle/fluid/operators/distributed/CMakeLists.txt  +19 -12
paddle/fluid/operators/distributed/brpc_client.cc  +313 -58
paddle/fluid/operators/distributed/brpc_client.h  +82 -17
paddle/fluid/operators/distributed/brpc_rdma_pool.cc  +84 -0
paddle/fluid/operators/distributed/brpc_rdma_pool.h  +56 -0
paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc  +196 -0
paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h  +49 -0
paddle/fluid/operators/distributed/brpc_serde_test.cc  +175 -0
paddle/fluid/operators/distributed/brpc_server.cc  +235 -29
paddle/fluid/operators/distributed/brpc_variable_response.cc  +73 -0
paddle/fluid/operators/distributed/brpc_variable_response.h  +67 -0
paddle/fluid/operators/distributed/grpc_client.cc  +1 -2
paddle/fluid/operators/distributed/grpc_serde.cc  +0 -7
paddle/fluid/operators/distributed/rpc_server.h  +4 -0
paddle/fluid/operators/distributed/sendrecvop_utils.cc  +1 -1
paddle/fluid/operators/distributed/sendrecvop_utils.h  +7 -0
paddle/fluid/operators/distributed_ops/CMakeLists.txt  +2 -2
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc  +4 -3
paddle/fluid/operators/distributed_ops/send_op.cc  +2 -0
paddle/fluid/operators/math/pooling.cc  +153 -62
paddle/fluid/operators/math/pooling.cu  +268 -147
paddle/fluid/operators/math/pooling.h  +22 -10
paddle/fluid/operators/pool_op.cc  +62 -3
paddle/fluid/operators/pool_op.h  +10 -6
paddle/fluid/operators/pool_with_index_op.cc  +35 -3
paddle/fluid/operators/pool_with_index_op.h  +8 -4
paddle/fluid/operators/psroi_pool_op.cc  +171 -0
paddle/fluid/operators/psroi_pool_op.cu  +294 -0
paddle/fluid/operators/psroi_pool_op.h  +253 -0
paddle/fluid/operators/reader/ctr_reader.h  +6 -6
paddle/fluid/operators/spp_op.h  +3 -3
paddle/fluid/platform/CMakeLists.txt  +8 -1
paddle/fluid/platform/device_context.cc  +1 -0
paddle/fluid/platform/device_context.h  +2 -8
paddle/fluid/platform/stream_callback_manager.cc  +63 -0
paddle/fluid/platform/stream_callback_manager.h  +13 -48
paddle/fluid/pybind/pybind.cc  +9 -0
paddle/fluid/pybind/tensor_py.h  +6 -6
paddle/scripts/paddle_build.sh  +12 -0
python/paddle/fluid/__init__.py  +4 -3
python/paddle/fluid/layers/nn.py  +255 -0
python/paddle/fluid/tests/unittests/CMakeLists.txt  +7 -3
python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt  +6 -0
python/paddle/fluid/tests/unittests/ngraph/__init__.py  +13 -0
python/paddle/fluid/tests/unittests/test_dist_base.py  +12 -3
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py  +86 -0
python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py  +49 -0
python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py  +50 -0
python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py  +27 -0
python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py  +27 -0
python/paddle/fluid/tests/unittests/test_layers.py  +33 -0
python/paddle/fluid/tests/unittests/test_pool2d_op.py  +65 -26
python/paddle/fluid/tests/unittests/test_pool3d_op.py  +86 -35
python/paddle/fluid/tests/unittests/test_pool_max_op.py  +77 -18
python/paddle/fluid/tests/unittests/test_psroi_pool_op.py  +134 -0

README.md
@@ -19,6 +19,15 @@ Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
+
+欢迎来到 PaddlePaddle GitHub
+
+PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。
+
+我们的愿景是让每个人都能通过PaddlePaddle接触深度学习
+
+跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
+
 ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
 ### Install Latest Stable Release:
 ```
...
@@ -34,6 +43,23 @@ pip install paddlepaddle-gpu==1.2.0.post85
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
+### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2)
+### 安装最新稳定版本:
+```
+# Linux CPU
+pip install paddlepaddle
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu
+# Linux GPU cuda8cudnn7
+pip install paddlepaddle-gpu==1.2.0.post87
+# Linux GPU cuda8cudnn5
+pip install paddlepaddle-gpu==1.2.0.post85
+# 其他平台上的安装指引请参考 http://paddlepaddle.org/
+```
+
 ## Features
 - **Flexibility**
...
@@ -74,10 +100,38 @@ pip install paddlepaddle-gpu==1.2.0.post85
   Baidu and it has achieved a significant impact. We hope you can also explore
   the capability of PaddlePaddle to make an impact on your product.
+## 特点
+- **灵活性**
+    PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。
+- **高效性**
+    为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例:
+      - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。
+      - 通过MKL-DNN库优化CNN网络
+      - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列
+      - 针对高维稀疏数据模型,优化了局部和分布式训练。
+- **稳定性**
+    有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。
+- **连接产品**
+    另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。
+
 ## Installation
 It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website.
+## 安装
+推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html)
+
 ## Documentation
 We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and
...
@@ -99,10 +153,37 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte
   We appreciate your contributions!
+## 文档
+我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html)文档
+- [深度学习101](https://github.com/PaddlePaddle/book):或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行
+- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html):可以在MPI集群上运行分布式训练任务
+- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html):新的API支持代码更少更简洁的程序
+- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html):欢迎您的贡献!
+
 ## Ask Questions
 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
+## 答疑
+欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交
+
 ## Copyright and License
 PaddlePaddle is provided under the [Apache-2.0 license](LICENSE).
+## 版权和许可证
+PaddlePaddle由[Apache-2.0 license](LICENSE)提供

benchmark/fluid/fluid_benchmark.py
@@ -81,9 +81,11 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog):
     # the role, should be either PSERVER or TRAINER
     training_role = os.getenv("PADDLE_TRAINING_ROLE")
-    config = distribute_transpiler.DistributeTranspilerConfig()
+    config = fluid.DistributeTranspilerConfig()
     config.slice_var_up = not args.no_split_var
+    config.min_block_size = 1048576
     t = distribute_transpiler.DistributeTranspiler(config=config)
     t.transpile(trainer_id,
                 # NOTE: *MUST* use train_prog, for we are using with guard to
...

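The hunk above switches the benchmark to `fluid.DistributeTranspilerConfig` and sets `min_block_size`. For readers unfamiliar with that API, the sketch below shows the usual end-to-end pattern around it. It is an illustrative sketch, not code from this commit: the endpoint strings, environment-variable defaults, and the tiny regression model are invented for the example.

```python
import os
import paddle.fluid as fluid

# Minimal model so the program actually contains optimizer ops to transpile.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

# Placeholder cluster description; real jobs read these from the environment.
pserver_endpoints = "127.0.0.1:6170,127.0.0.1:6171"
current_endpoint = "127.0.0.1:6170"
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainers = int(os.getenv("PADDLE_TRAINERS", "1"))
training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")

config = fluid.DistributeTranspilerConfig()
config.slice_var_up = True          # split large variables across pservers
config.min_block_size = 1048576     # same threshold the benchmark sets above

t = fluid.DistributeTranspiler(config=config)
t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)

if training_role == "PSERVER":
    # Each parameter server runs its own transpiled program.
    pserver_prog = t.get_pserver_program(current_endpoint)
    startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
else:
    # Trainers run the rewritten trainer program.
    trainer_prog = t.get_trainer_program()
```
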
cmake/external/brpc.cmake
@@ -14,14 +14,16 @@
 INCLUDE(ExternalProject)
-find_library(SSL_LIBRARY NAMES ssl)
+find_package(OpenSSL REQUIRED)
+
+message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY})
+message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY})
+
 ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY})
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY})
 
-find_library(CRYPTO_LIBRARY NAMES crypto)
 ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY})
+SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY})
 
 SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc)
 SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc)
...
@@ -31,14 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr
 INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR})
 
 # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args
-set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib")
+set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog")
 
 # If minimal .a is need, you can set  WITH_DEBUG_SYMBOLS=OFF
 ExternalProject_Add(
     extern_brpc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     # TODO(gongwb): change to de newst repo when they changed.
     GIT_REPOSITORY  "https://github.com/gongweibao/brpc"
-    GIT_TAG         "7dc04defad1fd4173aae170c3fcbde131b65155a"
+    GIT_TAG         "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4"
     PREFIX          ${BRPC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
...
@@ -50,7 +53,7 @@ ExternalProject_Add(
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                     -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                     -DCMAKE_PREFIX_PATH=${prefix_path}
-                    -DBRPC_WITH_GLOG=ON
+                    -DWITH_GLOG=ON
                     -DIOBUF_WITH_HUGE_BLOCK=ON
                     -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA}
                     ${EXTERNAL_OPTIONAL_ARGS}
...
@@ -65,5 +68,6 @@ ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
 ADD_DEPENDENCIES(brpc extern_brpc)
+add_definitions(-DBRPC_WITH_GLOG)
 
 LIST(APPEND external_project_dependencies brpc)

cmake/external/gtest.cmake
@@ -12,8 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-IF(WITH_TESTING)
+#FIXME:(gongwb) Move brpc's gtest dependency.
+IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
+  IF(WITH_TESTING)
     ENABLE_TESTING()
+  ENDIF(WITH_TESTING)
   INCLUDE(ExternalProject)
 
   SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest)
...
@@ -76,4 +80,4 @@ IF(WITH_TESTING)
   ADD_DEPENDENCIES(gtest_main extern_gtest)
 
   LIST(APPEND external_project_dependencies gtest gtest_main)
-ENDIF(WITH_TESTING)
+ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))

cmake/external/leveldb.cmake
@@ -24,8 +24,8 @@ ExternalProject_Add(
     extern_leveldb
     ${EXTERNAL_PROJECT_LOG_ARGS}
     PREFIX ${LEVELDB_SOURCES_DIR}
-    URL "https://github.com/google/leveldb/archive/v1.18.tar.gz"
-    URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0"
+    GIT_REPOSITORY "https://github.com/google/leveldb"
+    GIT_TAG v1.18
     CONFIGURE_COMMAND ""
     BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a
     INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/
...

paddle/fluid/API.spec
@@ -77,6 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
 paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
+paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
+paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
...
@@ -198,6 +200,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act
 paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
+paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
 paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
...

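The two `adaptive_pool*` entries and the `psroi_pool` entry above are the new Python layers this merge registers. The sketch below is a hedged usage illustration based only on the ArgSpecs listed here: the tensor names and shapes are invented, and `psroi_pool` is assumed to follow the usual PSRoI constraint that the input channel count equals `output_channels * pooled_height * pooled_width`.

```python
import paddle.fluid as fluid

# Illustrative inputs only; names and shapes are not taken from the commit.
img = fluid.layers.data(name='img', shape=[3, 32, 32], dtype='float32')
feat = fluid.layers.data(name='feat', shape=[490, 28, 28], dtype='float32')
rois = fluid.layers.data(name='rois', shape=[4], dtype='float32', lod_level=1)

# adaptive_pool2d pools to a fixed output size regardless of the input's
# spatial size (ArgSpec: input, pool_size, pool_type='max', require_index=False).
pooled = fluid.layers.adaptive_pool2d(input=img, pool_size=[7, 7], pool_type='max')

# psroi_pool is position-sensitive RoI pooling; 490 input channels are chosen
# here so that output_channels (10) * pooled_height (7) * pooled_width (7) = 490.
ps = fluid.layers.psroi_pool(input=feat, rois=rois, output_channels=10,
                             spatial_scale=1.0, pooled_height=7, pooled_width=7)
```
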
paddle/fluid/framework/CMakeLists.txt
@@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 
+cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory)
+
 cc_library(reader SRCS reader.cc DEPS lod_tensor ddim)
 cc_test(reader_test SRCS reader_test.cc DEPS reader)
...
@@ -167,9 +169,12 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
 cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
 
 if(WITH_DISTRIBUTE)
-  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog
+    lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   if(WITH_NGRAPH)
     if(NOT WIN32)
...
@@ -183,6 +188,8 @@ else()
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
+
+target_link_libraries(executor garbage_collector)
 
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
   threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
   graph build_strategy
...

paddle/fluid/framework/data_type_test.cc
@@ -35,6 +35,6 @@ TEST(DataType, float16) {
   EXPECT_EQ(f::SizeOfType(dtype), 2u);
 
   // test debug info
-  std::string type = "float16";
+  std::string type = "::paddle::platform::float16";
   EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str());
 }

paddle/fluid/framework/details/CMakeLists.txt
@@ -12,12 +12,19 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc
 cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows)
 
+if(WITH_DISTRIBUTE)
+  if(NOT WITH_GRPC)
+    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  endif()
+endif()
+
 if(WITH_GPU)
   nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
     dynload_cuda variable_visitor)
   if(WITH_DISTRIBUTE)
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-      ddim dynload_cuda selected_rows_functor sendrecvop_grpc)
+      ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
   else()
     nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
       ddim dynload_cuda selected_rows_functor)
...
@@ -30,7 +37,7 @@ else()
     variable_visitor)
   if(WITH_DISTRIBUTE)
     cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
-      ddim selected_rows_functor sendrecvop_grpc)
+      ddim selected_rows_functor sendrecvop_rpc)
   else()
     cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
       ddim selected_rows_functor)
...
@@ -45,10 +52,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s
 cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
 
-if(WITH_GPU)
-  cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
-    all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
-endif()
+cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
+cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
+cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
+cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
 
 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
...
@@ -56,10 +63,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
   scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
 
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass)
-if(WITH_GPU)
-  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
-endif()
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass)
 
 cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
...

paddle/fluid/framework/details/computation_op_handle.cc
@@ -20,11 +20,13 @@ namespace paddle {
 namespace framework {
 namespace details {
 ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
-                                         platform::Place place)
+                                         platform::Place place,
+                                         size_t scope_idx)
     : OpHandleBase(node),
       op_(framework::OpRegistry::CreateOp(*node->Op())),
       scope_(scope),
-      place_(place) {}
+      place_(place),
+      scope_idx_(scope_idx) {}
 
 void ComputationOpHandle::RunImpl() {
   WaitInputVarGenerated(place_);
...

paddle/fluid/framework/details/computation_op_handle.h
@@ -28,7 +28,8 @@ namespace framework {
 namespace details {
 struct ComputationOpHandle : public OpHandleBase {
  public:
-  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);
+  ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
+                      size_t scope_idx);
 
   std::string Name() const override;
...
@@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase {
   void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }
 
+  size_t GetScopeIdx() const { return scope_idx_; }
+
  protected:
   void RunImpl() override;
...
@@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase {
   std::unique_ptr<OperatorBase> op_;
   Scope *scope_;
   platform::Place place_;
+  size_t scope_idx_;
   bool is_lock_and_record_event_free_{false};
 };
 }  // namespace details
...

paddle/fluid/framework/details/eager_deletion_op_handle.cc (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

namespace paddle {
namespace framework {
namespace details {

EagerDeletionOpHandle::EagerDeletionOpHandle(
    ir::Node *node, const Scope *scope, const platform::Place &place,
    const std::unordered_set<std::string> &var_names, GarbageCollector *gc,
    AtomicReferenceCountMap *ref_cnts)
    : OpHandleBase(node),
      scope_(scope),
      var_names_(var_names),
      gc_(gc),
      ref_cnts_(ref_cnts) {
#ifdef PADDLE_WITH_CUDA
  if (platform::is_gpu_place(place)) {
    dev_ctx_ = reinterpret_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    if (dynamic_cast<StreamGarbageCollector *>(gc_)) {
      platform::CUDADeviceGuard guard(
          boost::get<platform::CUDAPlace>(place).device);
      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
      PADDLE_ENFORCE_NOT_NULL(event_);
    }
  }
#endif
}

EagerDeletionOpHandle::~EagerDeletionOpHandle() {
#ifdef PADDLE_WITH_CUDA
  if (event_) {
    auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
    platform::CUDADeviceGuard guard(gpu_place.device);
    PADDLE_ENFORCE(cudaEventDestroy(event_));
  }
#endif
}

std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }

void EagerDeletionOpHandle::RunImpl() {
  auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
  std::deque<std::shared_ptr<memory::Allocation>> garbages;
  for (auto &name : var_names_) {
    auto it = ref_cnts_->find(name);
    // Var not found, or reference count has not decreased to 0
    if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
      continue;
    }

    auto *var = exec_scope->FindVar(name);
    if (var == nullptr) {
      continue;
    }

    VLOG(2) << "Erase variable " << name;

    if (var->IsType<LoDTensor>()) {
      garbages.emplace_back(var->GetMutable<LoDTensor>()->MoveMemoryHolder());
    } else if (var->IsType<SelectedRows>()) {
      garbages.emplace_back(
          var->GetMutable<SelectedRows>()->mutable_value()->MoveMemoryHolder());
    } else if (var->IsType<LoDTensorArray>()) {
      auto *tensor_arr = var->GetMutable<LoDTensorArray>();
      for (auto &t : *tensor_arr) {
        garbages.emplace_back(t.MoveMemoryHolder());
      }
    } else {
      PADDLE_THROW("Type %s of %s is not supported eager deletion",
                   var->Type().name(), name);
    }
  }

  if (!garbages.empty()) {
    ClearGarbages(&garbages);
  }
}

void EagerDeletionOpHandle::ClearGarbages(
    std::deque<std::shared_ptr<memory::Allocation>> *garbages) {
#ifdef PADDLE_WITH_CUDA
  if (event_) {
    auto compute_stream = dev_ctx_->stream();
    auto callback_stream =
        reinterpret_cast<StreamGarbageCollector *>(gc_)->stream();
    auto callback_func = [=]() {
      PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
      PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
    };
    gc_->Add(std::move(*garbages), callback_func);
  } else {
#endif
    gc_->Add(std::move(*garbages));
#ifdef PADDLE_WITH_CUDA
  }
#endif
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
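`EagerDeletionOpHandle::RunImpl` above frees a variable only once its atomic reference count drops to zero, and then hands the freed buffers to a garbage collector. The following is a minimal Python sketch of that bookkeeping idea, with invented variable names and without the CUDA stream handling; it is illustrative only and not part of the commit.

```python
# Hypothetical reference counts: how many downstream ops still need each variable.
ref_cnts = {"hidden_0": 2, "hidden_1": 1}
garbage = []  # stands in for the garbage collector's queue


def eager_delete(var_name, scope):
    """Decrement the count for var_name; collect its buffer once it hits zero."""
    remaining = ref_cnts.get(var_name)
    if remaining is None:
        return
    ref_cnts[var_name] = remaining - 1
    if ref_cnts[var_name] == 0:
        buf = scope.pop(var_name, None)  # analogous to MoveMemoryHolder()
        if buf is not None:
            garbage.append(buf)


scope = {"hidden_0": bytearray(1024), "hidden_1": bytearray(2048)}
eager_delete("hidden_1", scope)  # count drops to 0, buffer moves to the garbage queue
eager_delete("hidden_0", scope)  # count drops to 1, nothing is freed yet
```
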
paddle/fluid/framework/details/eager_deletion_op_handle.h (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#pragma once

#include <deque>
#include <string>

#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"

namespace paddle {
namespace framework {
class Scope;

namespace details {

class EagerDeletionOpHandle : public OpHandleBase {
 public:
  EagerDeletionOpHandle(ir::Node *node, const Scope *scope,
                        const platform::Place &place,
                        const std::unordered_set<std::string> &var_names,
                        GarbageCollector *gc,
                        AtomicReferenceCountMap *ref_cnts);

  ~EagerDeletionOpHandle();

  std::string Name() const override;

 protected:
  void RunImpl() override;

 private:
  void ClearGarbages(std::deque<std::shared_ptr<memory::Allocation>> *garbages);

  const Scope *scope_;
  std::unordered_set<std::string> var_names_;
  GarbageCollector *gc_;               // not own
  AtomicReferenceCountMap *ref_cnts_;  // not own

#ifdef PADDLE_WITH_CUDA
  platform::CUDADeviceContext *dev_ctx_{nullptr};
  cudaEvent_t event_{nullptr};
#endif
};

}  // namespace details
}  // namespace framework
}  // namespace paddle

paddle/fluid/framework/details/eager_deletion_pass.cc (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#include <queue>
#include <string>
#include <vector>

#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_pass.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"

namespace paddle {
namespace framework {
namespace details {

std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  auto &ref_cnts =
      Get<std::vector<AtomicReferenceCountMap>>(kRuntimeReferenceCount);
  PADDLE_ENFORCE(ref_cnts.empty(),
                 "kRuntimeReferenceCount should be initialized here!");

  const auto &vars = graph->Get<GraphVars>(kGraphVars);
  ref_cnts.resize(vars.size());

  const auto &last_live_ops =
      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);
  const auto &gcs = Get<GarbageCollectorMap>(kGarbageCollector);
  const auto &places = Get<std::vector<platform::Place>>(kAllPlaces);

  // a reverse map of last_live_ops
  //   i.e., last op --> variable names which can be deleted.
  std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
      op_vars_map;

  for (auto &var_ops_map : last_live_ops) {
    for (auto &var_ops_pair : var_ops_map) {
      const std::string &var_name = var_ops_pair.first;
      for (auto *op : var_ops_pair.second) {
        op_vars_map[op].insert(var_name);
      }
    }
  }

  for (auto &pair : op_vars_map) {
    auto *op = pair.first;
    auto &var_names = pair.second;

    auto *eager_deletion_node =
        graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation);
    auto *eager_deletion_op = new EagerDeletionOpHandle(
        eager_deletion_node, op->GetScope(), op->GetPlace(), var_names,
        gcs.at(places[op->GetScopeIdx()]).get(),
        &(ref_cnts[op->GetScopeIdx()]));

    auto it = std::find_if(
        op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) {
          return dynamic_cast<DummyVarHandle *>(var) != nullptr;
        });

    if (it != op->Outputs().end()) {
      eager_deletion_op->AddInput(*it);
    } else {
      auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
      op->AddOutput(dep_var);
      eager_deletion_op->AddInput(dep_var);
    }

    auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
    eager_deletion_op->AddOutput(dummy_leaf);
  }

  VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
  return graph;
}

}  // namespace details
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(eager_deletion_pass,
              paddle::framework::details::EagerDeletionPass)
    .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount)
    .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
    .RequirePassAttr(paddle::framework::details::kAllPlaces)
    .RequirePassAttr(paddle::framework::details::kGarbageCollector);

paddle/fluid/framework/details/eager_deletion_pass.h (new file, 0 → 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#pragma once

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

namespace paddle {
namespace framework {
namespace details {

class EagerDeletionPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
      std::unique_ptr<ir::Graph> graph) const override;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle

paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -565,7 +565,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result,
                                                     int dev_id) const {
   result->Get<GraphOps>(kGraphOps).emplace_back(
       new ComputationOpHandle(result->CreateOpNode(node->Op()),
-                              local_scopes_[dev_id], places_[dev_id]));
+                              local_scopes_[dev_id], places_[dev_id], dev_id));
   CreateOpHandleIOs(result, node, dev_id);
 }
...
@@ -688,8 +688,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result,
   for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
     auto p = places_[scope_idx];
     auto s = local_scopes_[scope_idx];
-    result->Get<GraphOps>(kGraphOps).emplace_back(
-        new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
+    result->Get<GraphOps>(kGraphOps).emplace_back(new ComputationOpHandle(
+        result->CreateOpNode(node->Op()), s, p, scope_idx));
     CreateOpHandleIOs(result, node, scope_idx);
   }
 }
...

paddle/fluid/framework/details/op_graph_view.cc
@@ -23,6 +23,8 @@ namespace details {
 OpGraphView::OpGraphView(const std::vector<OpHandleBase *> &ops) { Build(ops); }
 
 void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
+  preceding_ops_.clear();
+  pending_ops_.clear();
   for (auto &op : ops) {
     preceding_ops_[op];
     pending_ops_[op];
...
@@ -40,6 +42,7 @@ void OpGraphView::Build(const std::vector<OpHandleBase *> &ops) {
 std::unordered_set<OpHandleBase *> OpGraphView::AllOps() const {
   std::unordered_set<OpHandleBase *> ret;
+  ret.reserve(preceding_ops_.size());
   for (auto &pair : preceding_ops_) {
     ret.insert(pair.first);
   }
...

paddle/fluid/framework/details/op_graph_view.h
@@ -14,7 +14,7 @@
 #pragma once
 
-#include <memory>
+#include <queue>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
...
@@ -34,6 +34,11 @@ class OpGraphView {
   bool HasOp(OpHandleBase *op) const;
 
+  // Use a visitor to visit all pending ops of op
+  // Stop when callback returns false
+  template <typename Callback>
+  bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const;
+
  private:
   void Build(const std::vector<OpHandleBase *> &ops);
   void EnforceHasOp(OpHandleBase *op) const;
...
@@ -44,6 +49,28 @@ class OpGraphView {
       pending_ops_;
 };
 
+template <typename Callback>
+bool OpGraphView::VisitAllPendingOps(OpHandleBase *op,
+                                     Callback &&callback) const {
+  EnforceHasOp(op);
+  std::unordered_set<OpHandleBase *> visited;
+  std::queue<OpHandleBase *> q;
+  q.push(op);
+  do {
+    op = q.front();
+    q.pop();
+    for (auto &pending_op : pending_ops_.at(op)) {
+      if (visited.count(pending_op) == 0) {
+        visited.insert(pending_op);
+        if (!callback(pending_op)) {
+          return false;
+        }
+      }
+    }
+  } while (!q.empty());
+  return true;
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle

paddle/fluid/framework/details/reference_count_op_handle.h (deleted, 100644 → 0; previous content at 7b10bf0e)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// see http://www.apache.org/licenses/LICENSE-2.0.

#pragma once

#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"

namespace paddle {
namespace framework {
namespace details {

using ReferenceCountMap = std::unordered_map<std::string, int>;
using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<int>>;
using DeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
using AtomicDeviceReferenceCountMap =
    std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
using DeviceGarbageCollectorMap =
    std::unordered_map<int,
                       std::unique_ptr<GarbageCollector<framework::Tensor>>>;

class ReferenceCountOpHandle : public OpHandleBase {
 public:
  ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
                         const platform::CUDAPlace &place,
                         const std::vector<std::string> &var_names,
                         GarbageCollector<Tensor> *gc,
                         AtomicReferenceCountMap *ref_cnts)
      : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) {
    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
        platform::DeviceContextPool::Instance().Get(place));
    if (IsStreamGarabageCollector()) {
      platform::SetDeviceId(place.device);
      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
    }

    for (auto &name : var_names) AddVar(name);
  }

  ~ReferenceCountOpHandle() {
    if (IsStreamGarabageCollector()) {
      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
      platform::SetDeviceId(gpu_place.device);
      PADDLE_ENFORCE(cudaEventDestroy(event_));
    }
  }

  std::string Name() const override { return "reference_count"; }

  void AddVar(const std::string &name) {
    auto it = var_names_.find(name);
    if (it != var_names_.end())
      ++(it->second);
    else
      var_names_[name] = 1;
  }

 protected:
  void RunImpl() override {
    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
    std::vector<Tensor *> tensors;
    for (auto &pair : var_names_) {
      auto &name = pair.first;
      auto it = ref_cnts_->find(name);
      if (it == ref_cnts_->end()) continue;

      auto *var = exec_scope->FindVar(name);
      if (var == nullptr) continue;

      if (var->IsType<LoDTensor>()) {
        if (it->second.fetch_sub(pair.second) <= pair.second) {
          tensors.emplace_back(var->GetMutable<LoDTensor>());
        }
      } else if (var->IsType<SelectedRows>()) {
        if (it->second.fetch_sub(pair.second) <= pair.second) {
          tensors.emplace_back(
              var->GetMutable<SelectedRows>()->mutable_value());
        }
      }
    }

    if (!tensors.empty()) {
      ClearTensors(tensors);
    }
  }

 private:
  void ClearTensors(const std::vector<Tensor *> &tensors) {
    auto *gc = dynamic_cast<StreamGarbageCollector<Tensor> *>(gc_);
    if (gc != nullptr) {
      auto compute_stream = dev_ctx_->stream();
      auto callback_stream = gc->stream();
      auto callback_func = [=]() {
        PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream));
        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0));
      };
      gc_->Add(tensors, callback_func);
    } else {
      gc_->Add(tensors);
    }
  }

  bool IsStreamGarabageCollector() const {
    return dynamic_cast<const StreamGarbageCollector<Tensor> *>(gc_) != nullptr;
  }

  const Scope *scope_;
  platform::CUDADeviceContext *dev_ctx_;
  std::unordered_map<std::string, int> var_names_;
  GarbageCollector<Tensor> *gc_;       // not own
  AtomicReferenceCountMap *ref_cnts_;  // not own
  cudaEvent_t event_;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle

paddle/fluid/framework/details/reference_count_pass.cc
浏览文件 @
740e1626
...
...
@@ -14,187 +14,240 @@
#include <queue>
#include <string>
#include <type_traits>
#include <vector>
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/op_graph_view.h"
#include "paddle/fluid/framework/details/reference_count_pass.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace
paddle
{
namespace
framework
{
namespace
details
{
static
ComputationOpHandle
*
FindNextComputationOpHandle
(
VarHandle
*
var_in
)
{
std
::
queue
<
VarHandleBase
*>
queue
;
queue
.
push
(
var_in
);
// A functor to shrink/remove operators who depend on other operators in a set
class
ShrinkDepsOpFunctor
{
private:
enum
RelationShip
{
kSame
=
0
,
kNoDeps
=
1
,
kBefore
=
2
,
kAfter
=
3
};
public:
explicit
ShrinkDepsOpFunctor
(
const
std
::
vector
<
OpHandleBase
*>
&
all_ops
)
:
graph_
(
all_ops
)
{}
template
<
typename
OpSet
>
OpSet
operator
()(
const
OpSet
&
op_set
)
const
{
using
KeyType
=
typename
OpSet
::
key_type
;
static_assert
(
std
::
is_base_of
<
OpHandleBase
,
typename
std
::
remove_pointer
<
KeyType
>::
type
>::
value
,
"Key type of OpSet must be OpHandleBase, or derived of OpHandleBase"
);
if
(
op_set
.
size
()
<=
1
)
return
op_set
;
std
::
vector
<
OpHandleBase
*>
ops
(
op_set
.
begin
(),
op_set
.
end
());
OpSet
ret
;
auto
rels
=
GetRelations
(
ops
);
auto
not_before
=
[](
RelationShip
r
)
{
return
r
!=
kBefore
;
};
for
(
size_t
i
=
0
;
i
<
rels
.
size
();
++
i
)
{
if
(
std
::
all_of
(
rels
[
i
].
begin
(),
rels
[
i
].
end
(),
not_before
))
{
ret
.
emplace
(
static_cast
<
KeyType
>
(
ops
[
i
]));
}
}
return
ret
;
}
private:
std
::
vector
<
std
::
vector
<
RelationShip
>>
GetRelations
(
const
std
::
vector
<
OpHandleBase
*>
&
ops
)
const
{
std
::
unordered_map
<
OpHandleBase
*
,
size_t
>
op_to_idx
;
for
(
size_t
i
=
0
;
i
<
ops
.
size
();
++
i
)
{
PADDLE_ENFORCE
(
graph_
.
HasOp
(
ops
[
i
]),
"Op does not exist in graph"
);
op_to_idx
[
ops
[
i
]]
=
i
;
}
PADDLE_ENFORCE
(
op_to_idx
.
size
()
==
ops
.
size
(),
"Duplicate ops"
);
std
::
vector
<
std
::
vector
<
RelationShip
>>
ret
(
ops
.
size
());
for
(
auto
&
e
:
ret
)
{
e
.
assign
(
ops
.
size
(),
kSame
);
}
size_t
found_num
=
ops
.
size
();
size_t
total_num
=
ops
.
size
()
*
ops
.
size
();
auto
visitor
=
[
&
](
OpHandleBase
*
op
,
size_t
i
)
{
auto
it
=
op_to_idx
.
find
(
op
);
if
(
it
!=
op_to_idx
.
end
())
{
size_t
j
=
it
->
second
;
if
(
i
!=
j
&&
ret
[
i
][
j
]
==
kSame
)
{
ret
[
i
][
j
]
=
kBefore
;
ret
[
j
][
i
]
=
kAfter
;
found_num
+=
2
;
if
(
found_num
==
total_num
)
{
return
false
;
}
}
}
return
true
;
};
for
(
size_t
i
=
0
;
i
<
ops
.
size
();
++
i
)
{
auto
sub_visitor
=
[
&
,
i
](
OpHandleBase
*
op
)
{
return
visitor
(
op
,
i
);
};
if
(
!
graph_
.
VisitAllPendingOps
(
ops
[
i
],
sub_visitor
))
{
break
;
}
}
for
(
size_t
i
=
0
;
i
<
ops
.
size
();
++
i
)
{
for
(
size_t
j
=
i
+
1
;
j
<
ops
.
size
();
++
j
)
{
if
(
ret
[
i
][
j
]
!=
kSame
)
continue
;
ret
[
i
][
j
]
=
kNoDeps
;
ret
[
j
][
i
]
=
kNoDeps
;
}
}
return
ret
;
}
const
OpGraphView
graph_
;
};
/**
* Find the nearest downstream computation op handle. If the op is a
* computation op, just return itself.
*/
static
ComputationOpHandle
*
FindNextComputationOpHandleOrReturnItself
(
OpHandleBase
*
op
,
size_t
scope_idx
)
{
std
::
queue
<
OpHandleBase
*>
q
;
std
::
unordered_set
<
OpHandleBase
*>
visited
;
q
.
push
(
op
);
do
{
auto
*
var
=
queue
.
front
();
queue
.
pop
();
for
(
auto
*
op
:
var
->
PendingOps
())
{
auto
*
op
=
q
.
front
();
q
.
pop
();
auto
*
compute_op
=
dynamic_cast
<
ComputationOpHandle
*>
(
op
);
if
(
compute_op
!=
nullptr
&&
compute_op
->
GetPlace
()
==
var_in
->
place_
)
{
if
(
compute_op
!=
nullptr
&&
compute_op
->
GetScopeIdx
()
==
scope_idx
)
{
return
compute_op
;
}
for
(
auto
*
out_var
:
op
->
Outputs
())
{
queue
.
push
(
out_var
);
for
(
auto
*
pending_op
:
out_var
->
PendingOps
())
{
if
(
visited
.
count
(
pending_op
))
continue
;
visited
.
insert
(
pending_op
);
}
}
}
while
(
!
q
ueue
.
empty
());
}
while
(
!
q
.
empty
());
return
nullptr
;
}
static
void
AddDependencyBetween
(
OpHandleBase
*
in
,
OpHandleBase
*
out
,
ir
::
Graph
*
graph
)
{
auto
it
=
std
::
find_if
(
in
->
Outputs
().
begin
(),
in
->
Outputs
().
end
(),
[](
VarHandleBase
*
var
)
{
return
dynamic_cast
<
DummyVarHandle
*>
(
var
)
!=
nullptr
;
});
if
(
it
!=
in
->
Outputs
().
end
())
{
out
->
AddInput
(
*
it
);
static
std
::
unordered_set
<
ComputationOpHandle
*>
ExtractComputationOpFromLastLivedVar
(
VarHandle
*
var
,
size_t
scope_idx
,
const
ShrinkDepsOpFunctor
&
shrink_func
,
bool
*
ok
)
{
// stage one. Get last op for variable.
std
::
unordered_set
<
OpHandleBase
*>
candidates
;
{
if
(
var
->
PendingOps
().
empty
()
&&
var
->
GeneratedOp
())
{
// No operator depends on this variable. So the last operator is the op
// who generates this variable.
candidates
.
emplace
(
var
->
GeneratedOp
());
}
else
{
auto
*
dep_var
=
new
DummyVarHandle
(
graph
->
CreateControlDepVar
());
graph
->
Get
<
GraphDepVars
>
(
kGraphDepVars
).
emplace
(
dep_var
);
in
->
AddOutput
(
dep_var
);
out
->
AddInput
(
dep_var
);
candidates
=
var
->
PendingOps
();
}
// No pending ops or generated op is nullptr
if
(
candidates
.
empty
())
{
*
ok
=
false
;
return
{};
}
}
// stage two. Try to cast them to computation op.
// return (*ok=false) when failed.
//
// The reason why we cannot make any types of op handle to be the last lived
// op is:
// some op handle may operate on many DeviceContext, however, our garbage
// collector can only wait one DeviceContext for now. So currently, we wait
// the nearest compute op.
std
::
unordered_set
<
ComputationOpHandle
*>
computation_op
;
{
for
(
auto
*
op
:
candidates
)
{
auto
*
compute_op
=
FindNextComputationOpHandleOrReturnItself
(
op
,
scope_idx
);
if
(
compute_op
==
nullptr
)
{
*
ok
=
false
;
return
{};
}
computation_op
.
emplace
(
compute_op
);
}
}
// stage three. Try to shrink computation op if they depend on each other.
// Get the smallest set of the most ops.
*
ok
=
true
;
return
shrink_func
(
computation_op
);
}
static
VarDesc
*
TryGetLatestVarDesc
(
const
std
::
vector
<
VarHandle
*>
&
vars
)
{
VarDesc
*
var_desc
=
nullptr
;
std
::
find_if
(
vars
.
rbegin
(),
vars
.
rend
(),
[
&
](
VarHandle
*
var_handle
)
->
bool
{
var_desc
=
var_handle
->
Node
()
->
Var
();
return
var_desc
!=
nullptr
;
});
return
var_desc
;
}
std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
  auto &cur_ref_cnts = Get<AtomicDeviceReferenceCountMap>(kCurReferenceCount);
  auto &gcs = Get<DeviceGarbageCollectorMap>(kGarbageCollector);

  // It is not easy to find the right reference counts of varaibles in graph
  // Step 1: Find all variables in computation ops
  // Step 2: Find all variables in non-computation ops which refers to variables
  //         in computation ops
  std::unordered_set<std::string> names;
  std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
      compute_ref_cnt_map;

  auto get_ref_cnts_from_compute_op = [&](
      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
    std::vector<std::string> var_names_in_op;
    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
    if (compute_op == nullptr ||
        !platform::is_gpu_place(compute_op->GetPlace()))
      return var_names_in_op;
    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
    for (VarHandleBase *var_handle_base : vars) {
      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;

      if (!platform::is_gpu_place(var_handle->place_) ||
          boost::get<platform::CUDAPlace>(var_handle->place_) != place)
        continue;
  auto &ref_cnts = Get<std::vector<ReferenceCountMap>>(kGlobalReferenceCount);
  auto &last_live_ops_of_vars =
      Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars);

      VarDesc *var_desc = var_handle->Node()->Var();
      auto var_name = var_handle->Node()->Name();
  PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(),
                 "Last Live Ops and Reference Counts of vars should be "
                 "initialized at here.");

      // This is weird but there is really some variables without var_desc
      // in computation_op
      if (var_desc == nullptr) {
        var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
        if (var_desc == nullptr) continue;
      }
  const auto &vars = graph->Get<GraphVars>(kGraphVars);

      if (var_desc->Persistable()) continue;
      auto var_type = var_desc->Proto()->type().type();
      if (var_type != proto::VarType::LOD_TENSOR &&
          var_type != proto::VarType::SELECTED_ROWS) {
        continue;
      }
  last_live_ops_of_vars.resize(vars.size());
  ref_cnts.resize(vars.size());

      // compute op only runs in one device
      if (ref_cnts[place.device]->count(var_name))
        ++(*ref_cnts[place.device])[var_name];
      else
        (*ref_cnts[place.device])[var_name] = 1;
  ShrinkDepsOpFunctor shrink_func(
      ir::FilterByNodeWrapper<OpHandleBase>(*graph));

      names.insert(var_name);
      var_names_in_op.push_back(var_name);
    }
    return var_names_in_op;
  };

  for (size_t i = 0; i < vars.size(); ++i) {
    for (auto &name_var_pair : vars[i]) {
      // Whether this variable can be reused or deleted? If not, we do not
      // compute reference counts and dependencies.
      VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second);

  auto update_ref_cnts_from_non_compute_op = [&](
      OpHandleBase *op, const std::vector<VarHandleBase *> &vars) {
    if (dynamic_cast<ComputationOpHandle *>(op) != nullptr) return;
    for (VarHandleBase *var_handle_base : vars) {
      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;

      auto var_name = var_handle->Node()->Name();
      auto var_place = var_handle->place_;
      if (!platform::is_gpu_place(var_place)) continue;
      auto place = boost::get<platform::CUDAPlace>(var_place);
      if (names.count(var_name) == 0) continue;
      if (ref_cnts.count(place.device) &&
          ref_cnts[place.device]->count(var_name)) {
        ++(*ref_cnts[place.device])[var_name];

        auto *next_compute_op = FindNextComputationOpHandle(var_handle);
        if (next_compute_op != nullptr) {
          if (compute_ref_cnt_map.count(next_compute_op)) {
            compute_ref_cnt_map[next_compute_op]->AddVar(var_name);
            VLOG(5) << "Add reference count of " << var_name << " to Operator "
                    << next_compute_op->Name();
          } else {
            // Create new reference_count_op_handle
            ir::Node *ref_cnt_node = graph->CreateEmptyNode(
                "reference_count", ir::Node::Type::kOperation);
            auto *ref_cnt_handle = new ReferenceCountOpHandle(
                ref_cnt_node, next_compute_op->GetScope(), place, {var_name},
                gcs[place.device].get(), cur_ref_cnts[place.device].get());
            AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get());
            compute_ref_cnt_map[next_compute_op] = ref_cnt_handle;
          }
        }
      }
      if (var_desc == nullptr || var_desc->Persistable()) {
        continue;
      }
  };

  auto all_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
  for (auto &op : all_ops) {
    auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
    auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
    if (in_var_names.empty() && out_var_names.empty()) continue;
    in_var_names.insert(in_var_names.end(), out_var_names.begin(),
                        out_var_names.end());
    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op);
    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
    ir::Node *ref_cnt_node =
        graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
    auto *ref_cnt_handle = new ReferenceCountOpHandle(
        ref_cnt_node, compute_op->GetScope(), place, in_var_names,
        gcs[place.device].get(), cur_ref_cnts[place.device].get());
    AddDependencyBetween(compute_op, ref_cnt_handle, graph.get());
    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
      auto var_type = var_desc->Proto()->type().type();
      if (var_type != proto::VarType::LOD_TENSOR &&
          var_type != proto::VarType::SELECTED_ROWS &&
          var_type != proto::VarType::LOD_TENSOR_ARRAY) {
        // Var type cannot be deleted
        continue;
      }

  for (auto &op : all_ops) {
    update_ref_cnts_from_non_compute_op(op, op->Inputs());
    update_ref_cnts_from_non_compute_op(op, op->Outputs());
  }

      bool ok;
      auto result = ExtractComputationOpFromLastLivedVar(
          name_var_pair.second.back(), i, shrink_func, &ok);

  std::vector<OpHandleBase *> new_all_ops;
  new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
  for (auto &op : all_ops) {
    new_all_ops.emplace_back(std::move(op));
    auto it = compute_ref_cnt_map.find(new_all_ops.back());
    if (it != compute_ref_cnt_map.end()) {
      // Add LeafNode to ReferenceCountOpHandle
      auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar());
      graph->Get<GraphDepVars>(kGraphDepVars).emplace(dummy_leaf);
      it->second->AddOutput(dummy_leaf);
      new_all_ops.emplace_back(std::move(it->second));
      if (ok) {
        auto &var_name = name_var_pair.first;
        PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty",
                       var_name);
        ref_cnts[i].emplace(var_name, result.size());
        last_live_ops_of_vars[i].emplace(var_name, std::move(result));
      }
    }
  }

  all_ops.swap(new_all_ops);
  return graph;
}
...
...
@@ -205,5 +258,4 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
REGISTER_PASS(reference_count_pass,
              paddle::framework::details::ReferenceCountPass)
    .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount)
    .RequirePassAttr(paddle::framework::details::kCurReferenceCount)
    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
    .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars);
paddle/fluid/framework/details/reference_count_pass.h
...
...
@@ -14,7 +14,6 @@
#pragma once
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
...
...
@@ -22,10 +21,6 @@ namespace paddle {
namespace framework {
namespace details {

constexpr char kGlobalReferenceCount[] = "reference_count";
constexpr char kCurReferenceCount[] = "current_reference_count";
constexpr char kGarbageCollector[] = "garbage_collector";

class ReferenceCountPass : public ir::Pass {
 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(
...
...
paddle/fluid/framework/details/reference_count_pass_helper.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
namespace paddle {
namespace framework {
namespace details {}  // namespace details
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/details/reference_count_pass_helper.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <map>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/garbage_collector.h"
namespace paddle {
namespace framework {
namespace details {

class ComputationOpHandle;

using ReferenceCountMap = std::unordered_map<std::string, size_t>;
using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<size_t>>;
using GarbageCollectorMap =
    std::map<platform::Place, std::unique_ptr<GarbageCollector>>;

const char kGlobalReferenceCount[] = "global_reference_count";
const char kRuntimeReferenceCount[] = "runtime_reference_count";
const char kGarbageCollector[] = "garbage_collector";
const char kAllPlaces[] = "all_places";

using LastLiveOpsOfVars =
    std::unordered_map<std::string, std::unordered_set<ComputationOpHandle *>>;
const char kLastLiveOpsOfVars[] = "last_live_ops_of_var";

}  // namespace details
}  // namespace framework
}  // namespace paddle
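These aliases split the bookkeeping into a static map (computed once) and an atomic runtime map (reset and decremented every iteration). The following is a self-contained sketch of that split using only the standard library; it is illustrative, not Paddle code, and the variable and value names are made up.

  #include <atomic>
  #include <iostream>
  #include <string>
  #include <unordered_map>

  int main() {
    // Static counts: how many last-lived ops reference each variable.
    std::unordered_map<std::string, size_t> global_ref_cnts{{"x", 2}, {"y", 1}};
    // Runtime copy, decremented concurrently by finishing ops.
    std::unordered_map<std::string, std::atomic<size_t>> runtime_ref_cnts;

    // Reset runtime counts from the global ones before an iteration.
    for (auto &pair : global_ref_cnts) runtime_ref_cnts[pair.first] = pair.second;

    // Each op that consumes a variable decrements its runtime count;
    // the consumer that drives it to zero may hand it to the collector.
    auto release = [&](const std::string &name) {
      if (runtime_ref_cnts[name].fetch_sub(1) == 1) {
        std::cout << name << " is dead, can be collected\n";
      }
    };
    release("x");  // x still has one pending reader
    release("x");  // last reader finished -> collectable
    release("y");  // only reader finished -> collectable
    return 0;
  }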
paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
...
...
@@ -18,9 +18,6 @@
#include <vector>
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/reference_count_op_handle.h"
#endif
namespace paddle {
namespace framework {
...
...
@@ -69,27 +66,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
  drop_scope_counter_ += 1;
#ifdef PADDLE_WITH_CUDA
  const std::string gc_name = "garbage_collector";
  DeviceGarbageCollectorMap *gc =
      Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
                           : nullptr;
#endif
  if (!fetch_tensors.empty() ||
      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
    drop_scope_counter_ = 0;
    // Wait All computational streams
    for (auto p : places_) {
      platform::DeviceContextPool::Instance().Get(p)->Wait();
#ifdef PADDLE_WITH_CUDA
      if (gc != nullptr && platform::is_gpu_place(p)) {
        auto gpu_place = boost::get<platform::CUDAPlace>(p);
        auto &gc_at_place = gc->at(gpu_place.device);
        gc_at_place->Wait();
        gc_at_place->Reset();
      }
#endif
    }
    for (auto &scope : local_scopes_) {
      auto &local_scope =
...
...
paddle/fluid/framework/executor.cc
...
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/executor.h"
#include <deque>
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/lod_rank_table.h"
...
...
@@ -41,11 +42,43 @@ namespace {
int kProgramId = -1;
}  // namespace

static std::unordered_map<std::string, size_t> GetNonPersistableReferenceCounts(
    const BlockDesc &block, const std::vector<std::string> &skip_var_list) {
  std::unordered_map<std::string, size_t> ref_cnts;
  std::unordered_set<std::string> skip_vars(skip_var_list.begin(),
                                            skip_var_list.end());
  auto update_ref_cnts = [&](OpDesc *op_desc, const VariableNameMap &name_map) {
    for (auto &name_pair : name_map) {
      for (auto &name : name_pair.second) {
        if (skip_vars.count(name)) continue;
        auto *var_desc = block.FindVar(name);
        if (var_desc == nullptr || var_desc->Persistable()) continue;
        auto type = var_desc->Proto()->type().type();
        if (type != proto::VarType::LOD_TENSOR &&
            type != proto::VarType::SELECTED_ROWS &&
            type != proto::VarType::LOD_TENSOR_ARRAY) {
          continue;
        }
        ++ref_cnts[name];
      }
    }
  };
  for (auto op_desc : block.AllOps()) {
    update_ref_cnts(op_desc, op_desc->Inputs());
    update_ref_cnts(op_desc, op_desc->Outputs());
  }
  return ref_cnts;
}

ExecutorPrepareContext::ExecutorPrepareContext(
    const framework::ProgramDesc &prog, size_t block_id)
    const framework::ProgramDesc &prog, size_t block_id,
    const std::vector<std::string> &skip_ref_cnt_vars)
    : prog_(prog), block_id_(block_id) {
  if (GetEagerDeletionThreshold() >= 0) {
    ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
    global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id),
                                                        skip_ref_cnt_vars);
  }
}
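What the counting above amounts to, as a self-contained sketch with standard containers only: each appearance of a deletable, non-persistable variable as an op input or output adds one to its count, and names on the skip list (typically feed/fetch variables) are never counted, so they are never eagerly deleted. The OpIO struct and the variable names below are hypothetical stand-ins, not Paddle types.

  #include <iostream>
  #include <string>
  #include <unordered_map>
  #include <unordered_set>
  #include <vector>

  // Hypothetical stand-in for one op's input/output variable names.
  struct OpIO {
    std::vector<std::string> inputs;
    std::vector<std::string> outputs;
  };

  int main() {
    std::vector<OpIO> block = {{{"img"}, {"conv_out"}},
                               {{"conv_out"}, {"relu_out"}},
                               {{"relu_out"}, {"fetch_var"}}};
    std::unordered_set<std::string> skip_vars = {"fetch_var"};

    std::unordered_map<std::string, size_t> ref_cnts;
    auto count = [&](const std::vector<std::string> &names) {
      for (auto &name : names) {
        if (skip_vars.count(name)) continue;  // never eagerly delete these
        ++ref_cnts[name];
      }
    };
    for (auto &op : block) {
      count(op.inputs);
      count(op.outputs);
    }
    for (auto &pair : ref_cnts)
      std::cout << pair.first << " -> " << pair.second << "\n";
    return 0;
  }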
...
...
@@ -53,28 +86,40 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
}

template <typename RefCntMap>
static void DeleteUnusedTensors(const Scope &scope, const OperatorBase *op,
                                GarbageCollector<Tensor> *gc,
                                RefCntMap *ref_cnts) {
  std::unordered_set<Tensor *> erase_tensors;
static void DeleteUnusedTensors(
    const Scope &scope, const OperatorBase *op, GarbageCollector *gc,
    std::unordered_map<std::string, size_t> *ref_cnts) {
  std::deque<std::shared_ptr<memory::Allocation>> garbages;

  auto handler = [&](const VariableNameMap &name_map) {
    for (auto &name_pair : name_map) {
      for (auto &name : name_pair.second) {
        auto it = ref_cnts->find(name);
        if (it == ref_cnts->end()) continue;
        if ((it->second)-- == 1) {
        if (--(it->second) != 0) {
          continue;
        }
        auto *var = scope.FindVar(name);
        if (var != nullptr) {
          VLOG(10) << "Erase tensor \'" << name << "\'";
          continue;
        }

        VLOG(2) << "Erase variable " << name;
        if (var->IsType<LoDTensor>()) {
          erase_tensors.insert(var->GetMutable<LoDTensor>());
          garbages.emplace_back(
              var->GetMutable<LoDTensor>()->MoveMemoryHolder());
        } else if (var->IsType<SelectedRows>()) {
          erase_tensors.insert(
              var->GetMutable<SelectedRows>()->mutable_value());
          garbages.emplace_back(var->GetMutable<SelectedRows>()
                                    ->mutable_value()
                                    ->MoveMemoryHolder());
        } else if (var->IsType<LoDTensorArray>()) {
          auto *lod_tensor_arr = var->GetMutable<LoDTensorArray>();
          for (auto &t : *lod_tensor_arr) {
            garbages.emplace_back(t.MoveMemoryHolder());
          }
        } else {
          PADDLE_THROW("Type %s of %s is not supported eager deletion",
                       var->Type().name(), name);
        }
      }
    }
...
...
@@ -83,8 +128,8 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
  handler(op->Inputs());
  handler(op->Outputs());

  if (!erase_tensors.empty()) {
    gc->Add(erase_tensors);
  if (!garbages.empty()) {
    gc->Add(std::move(garbages));
  }
}
...
...
@@ -112,9 +157,9 @@ void Executor::Close() {
#ifdef PADDLE_WITH_DISTRIBUTE
// TODO(typhoonzero): complete message will need to use real trainer_id,
// except 0.
  ::paddle::operators::distributed::RPCClient::GetInstance<
      ::paddle::operators::distributed::GRPCClient>(0)
      ->SendComplete();
  auto client =
      paddle::operators::distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
  client->SendComplete();
#endif
}
...
...
@@ -325,9 +370,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope,
}

std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
    const ProgramDesc &program, int block_id) {
    const ProgramDesc &program, int block_id,
    const std::vector<std::string> &skip_ref_cnt_vars) {
  std::unique_ptr<ExecutorPrepareContext> ctx(
      new ExecutorPrepareContext(program, block_id));
      new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars));
  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), program.Size());
  auto &block = program.Block(block_id);
  for (auto &op_desc : block.AllOps()) {
...
...
@@ -338,16 +384,28 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
}

std::vector<std::shared_ptr<ExecutorPrepareContext>> Executor::Prepare(
    const ProgramDesc &program, const std::vector<int> &block_ids) {
    const ProgramDesc &program, const std::vector<int> &block_ids,
    const std::vector<std::vector<std::string>> &skip_ref_cnt_vars) {
  PADDLE_ENFORCE(
      skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(),
      "skip_ref_cnt_vars should be either empty or equals to block number %d",
      block_ids.size());
  std::vector<std::shared_ptr<ExecutorPrepareContext>> result;
  size_t idx = 0;
  for (auto &bid : block_ids) {
    auto *ctx = new ExecutorPrepareContext(program, bid);
    ExecutorPrepareContext *ctx;
    if (skip_ref_cnt_vars.empty()) {
      ctx = new ExecutorPrepareContext(program, bid);
    } else {
      ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]);
    }
    PADDLE_ENFORCE_LT(static_cast<size_t>(bid), program.Size());
    auto &block = program.Block(bid);
    for (auto &op_desc : block.AllOps()) {
      ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
    }
    result.push_back(std::shared_ptr<ExecutorPrepareContext>(ctx));
    ++idx;
  }
  return result;
}
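A hedged usage sketch of the two new Prepare overloads; only the signatures visible in this diff are assumed, and the program/variable names are illustrative.

  // Prepare block 0, keeping "feed" and "fetch" out of eager deletion.
  auto ctx = framework::Executor::Prepare(program, /*block_id=*/0,
                                          {"feed", "fetch"});

  // Prepare several blocks at once, with a per-block skip list.
  auto ctxs = framework::Executor::Prepare(program, {0, 1},
                                           {{"feed", "fetch"}, {}});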
...
...
@@ -365,22 +423,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  }

  int64_t max_memory_size = GetEagerDeletionThreshold();
  std::unique_ptr<GarbageCollector<Tensor>> gc;
  // WhileOp would set keep_kids to true,
  // because WhileGradOp needs the scopes created in WhileOp.
  // Perhaps, we should not perform eager deletion in WhileOp
  // The scopes and variables created by WhileOp would be deleted
  // in WhileGradOp.
  std::unique_ptr<GarbageCollector> gc;
  // skip while_op and while_grad_op temporarily
  if (max_memory_size >= 0 && !keep_kids) {
    ctx->ResetReferenceCount();
#ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
      gc.reset(new DefaultStreamGarbageCollector<Tensor>(
      if (IsFastEagerDeletionModeEnabled()) {
        gc.reset(new UnsafeFastGPUGarbageCollector(
            boost::get<platform::CUDAPlace>(place_), max_memory_size));
      } else {
        gc.reset(new DefaultStreamGarbageCollector(
            boost::get<platform::CUDAPlace>(place_), max_memory_size));
      }
    } else if (platform::is_cpu_place(place_)) {
#endif
      gc.reset(new CPUGarbageCollector<Tensor>(
          boost::get<platform::CPUPlace>(place_), max_memory_size));
      gc.reset(new CPUGarbageCollector(
          boost::get<platform::CPUPlace>(place_), max_memory_size));
#ifdef PADDLE_WITH_CUDA
    }
#endif
...
...
@@ -389,17 +448,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
  for (auto &op : ctx->ops_) {
    op->Run(*local_scope, place_);
    if (gc != nullptr) {
    if (gc) {
      DeleteUnusedTensors(*local_scope, op.get(), gc.get(),
                          &(ctx->cur_ref_cnts_));
                          &(ctx->runtime_ref_cnts_));
    }
  }

  if (gc != nullptr) {
    gc->Wait();
  } else {
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }

  if (local_scope != scope) {
    scope->DeleteScope(local_scope);
...
...
paddle/fluid/framework/executor.h
...
...
@@ -27,52 +27,21 @@ limitations under the License. */
namespace paddle {
namespace framework {

template <typename T>
std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
    const ProgramDesc &prog, size_t block_id) {
  auto &block = prog.Block(block_id);
  std::unordered_map<std::string, T> ref_cnts;

  auto update_ref_cnts = [&](OpDesc *op_desc, const VariableNameMap &name_map) {
    for (auto &name_pair : name_map) {
      for (auto &name : name_pair.second) {
        auto *var_desc = block.FindVar(name);
        if (var_desc == nullptr || var_desc->Persistable()) continue;
        auto type = var_desc->Proto()->type().type();
        if (type != proto::VarType::LOD_TENSOR &&
            type != proto::VarType::SELECTED_ROWS) {
          continue;
        }

        auto it = ref_cnts.find(name);
        if (it != ref_cnts.end()) {
          ++it->second;
        } else {
          ref_cnts[name] = 1;
        }
      }
    }
  };

  for (auto op_desc : block.AllOps()) {
    update_ref_cnts(op_desc, op_desc->Inputs());
    update_ref_cnts(op_desc, op_desc->Outputs());
  }
  return ref_cnts;
}

struct ExecutorPrepareContext {
  ExecutorPrepareContext(const framework::ProgramDesc &prog, size_t block_id);
  ExecutorPrepareContext(const framework::ProgramDesc &prog, size_t block_id,
                         const std::vector<std::string> &skip_ref_cnt_vars =
                             std::vector<std::string>());

  ~ExecutorPrepareContext();

  void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; }
  void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; }

  const framework::ProgramDesc &prog_;
  size_t block_id_;
  std::vector<std::unique_ptr<OperatorBase>> ops_;

  std::unordered_map<std::string, int> ref_cnts_;
  std::unordered_map<std::string, int> cur_ref_cnts_;
  std::unordered_map<std::string, size_t> global_ref_cnts_;
  std::unordered_map<std::string, size_t> runtime_ref_cnts_;
};

class Executor {
...
...
@@ -108,10 +77,14 @@ class Executor {
      const std::string &fetch_holder_name = "fetch");

  static std::unique_ptr<ExecutorPrepareContext> Prepare(
      const ProgramDesc &program, int block_id);
      const ProgramDesc &program, int block_id,
      const std::vector<std::string> &skip_ref_cnt_vars =
          std::vector<std::string>());

  static std::vector<std::shared_ptr<ExecutorPrepareContext>> Prepare(
      const ProgramDesc &program, const std::vector<int> &block_ids);
      const ProgramDesc &program, const std::vector<int> &block_ids,
      const std::vector<std::vector<std::string>> &skip_ref_cnt_vars =
          std::vector<std::vector<std::string>>());

  void CreateVariables(const ProgramDesc &pdesc, Scope *scope, int block_id);
...
...
paddle/fluid/framework/executor_thread_worker.cc
...
...
@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
...
...
@@ -154,6 +155,8 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
}

void ExecutorThreadWorker::TrainFiles() {
  platform::SetNumThreads(1);

  // todo: configurable
  SetDevice();
...
...
paddle/fluid/framework/garbage_collector.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#include "paddle/fluid/framework/garbage_collector.h"
namespace paddle {
namespace framework {

GarbageCollector::GarbageCollector(const platform::Place &place,
                                   size_t max_memory_size)
    : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
  garbages_.reset(new GarbageQueue());
  dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
}

CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place,
                                         size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

void CPUGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback();
}

#ifdef PADDLE_WITH_CUDA
UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
    const platform::CUDAPlace &place, size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

void UnsafeFastGPUGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback();
}

DefaultStreamGarbageCollector::DefaultStreamGarbageCollector(
    const platform::CUDAPlace &place, size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

void DefaultStreamGarbageCollector::Wait() const {
  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
      ->WaitStreamCallback();
}

void DefaultStreamGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
      ->AddStreamCallback(callback);
}

StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place,
                                               size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {
  platform::CUDADeviceGuard guard(place.device);
  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
  callback_manager_.reset(new platform::StreamCallbackManager(stream_));
}

StreamGarbageCollector::~StreamGarbageCollector() {
  auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
  platform::CUDADeviceGuard guard(place.device);
  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
}

cudaStream_t StreamGarbageCollector::stream() const { return stream_; }

void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); }

void StreamGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback_manager_->AddCallback(callback);
}
#endif
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/garbage_collector.h
...
...
@@ -14,7 +14,6 @@
#pragma once
#include <algorithm>
#include <deque>
#include <functional>
#include <memory>
...
...
@@ -24,134 +23,74 @@
namespace paddle {
namespace framework {

// T should have memory_size() and clear() method
template <typename T>
class GarbageCollector {
 public:
  GarbageCollector(const platform::Place &place, size_t max_memory_size)
      : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) {
    garbages_.reset(new std::deque<T *>());
    dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
  }
  using GarbageQueue = std::deque<std::shared_ptr<memory::Allocation>>;

  virtual ~GarbageCollector() {}
  GarbageCollector(const platform::Place &place, size_t max_memory_size);

  void Reset() {
    std::lock_guard<std::mutex> guard(mutex_);
    garbages_.reset(new std::deque<T *>());
    cur_memory_size_ = 0;
  }
  virtual ~GarbageCollector() = default;

  virtual void Wait() const {}

  template <typename Container>
  void Add(const Container &objs) { Add(objs, []() {}); }
  void Add(Container &&objs);

  template <typename Container, typename Callback>
  void Add(const Container &objs, Callback &&callback) {
    std::shared_ptr<std::deque<T *>> clear_deque;
    {
      std::lock_guard<std::mutex> guard(mutex_);
      for (auto *obj : objs) {
        garbages_->push_back(obj);
        cur_memory_size_ += obj->memory_size();
      }
      if (cur_memory_size_ >= max_memory_size_) {
        cur_memory_size_ = 0;
        clear_deque = garbages_;
        garbages_.reset(new std::deque<T *>());
      }
    }

    if (clear_deque != nullptr) {
      callback();
      ClearCallback([=]() {
        for (auto *obj : *clear_deque) obj->clear();
      });
    }
  }
  virtual void Wait() const {}
  void Add(Container &&objs, Callback &&callback);

 protected:
  virtual void ClearCallback(const std::function<void()> &callback) = 0;

  platform::DeviceContext *dev_ctx_;
  std::shared_ptr<std::deque<T *>> garbages_;
  std::unique_ptr<GarbageQueue> garbages_;
  mutable std::mutex mutex_;
  const size_t max_memory_size_;
  size_t cur_memory_size_ = 0;
  size_t cur_memory_size_{0};
};

template <typename T>
class CPUGarbageCollector : public GarbageCollector<T> {
class CPUGarbageCollector : public GarbageCollector {
 public:
  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {}
  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size);

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    callback();
  }
  void ClearCallback(const std::function<void()> &callback) override;
};

#ifdef PADDLE_WITH_CUDA
template <typename T>
class DefaultStreamGarbageCollector : public GarbageCollector<T> {
class UnsafeFastGPUGarbageCollector : public GarbageCollector {
 public:
  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
                                size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {}
  UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place,
                                size_t max_memory_size);

  cudaStream_t stream() const {
    return static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
        ->stream();
  }

 protected:
  void ClearCallback(const std::function<void()> &callback) override;
};

  void Wait() const override {
    this->dev_ctx_->Wait();
    static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
        ->WaitStreamCallback();
  }
class DefaultStreamGarbageCollector : public GarbageCollector {
 public:
  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
                                size_t max_memory_size);

  void Wait() const override;

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
        ->AddStreamCallback(callback);
  }
  void ClearCallback(const std::function<void()> &callback) override;
};

template <typename T>
class StreamGarbageCollector : public GarbageCollector<T> {
class StreamGarbageCollector : public GarbageCollector {
 public:
  StreamGarbageCollector(const platform::CUDAPlace &place,
                         size_t max_memory_size)
      : GarbageCollector<T>(place, max_memory_size) {
    PADDLE_ENFORCE(cudaSetDevice(place.device));
    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
  }
                         size_t max_memory_size);

  ~StreamGarbageCollector() {
    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
    PADDLE_ENFORCE(cudaSetDevice(place.device));
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
  }
  ~StreamGarbageCollector();

  void Wait() const override {
    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
    std::lock_guard<std::mutex> guard(this->mutex_);
    callback_manager_->Wait();
  }
  void Wait() const override;

  cudaStream_t stream() const { return stream_; }
  cudaStream_t stream() const;

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    std::lock_guard<std::mutex> guard(this->mutex_);
    callback_manager_->AddCallback(callback);
  }
  void ClearCallback(const std::function<void()> &callback) override;

 private:
  cudaStream_t stream_;
...
...
@@ -159,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector<T> {
};
#endif

template <typename Container>
void GarbageCollector::Add(Container &&objs) {
  Add(std::forward<Container>(objs), []() {});
}

template <typename Container, typename Callback>
void GarbageCollector::Add(Container &&objs, Callback &&callback) {
  GarbageQueue *garbage_queue = nullptr;
  {
    std::lock_guard<std::mutex> guard(mutex_);
    for (auto &obj : objs) {
      if (!obj) continue;
      cur_memory_size_ += obj->size();
      garbages_->push_back(std::move(obj));
    }
    if (cur_memory_size_ >= max_memory_size_) {
      cur_memory_size_ = 0;
      garbage_queue = garbages_.release();
      garbages_.reset(new GarbageQueue());
    }
  }

  if (garbage_queue) {
    callback();
    ClearCallback([garbage_queue]() { delete garbage_queue; });
  }
}

}  // namespace framework
}  // namespace paddle
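The Add path above batches deallocation: garbage accumulates under a lock until its total size crosses max_memory_size_, then the whole queue is released through ClearCallback. Below is a self-contained toy version of that policy (standard library only, ToyCollector is a made-up name); real collectors hand the batch to a device-stream callback instead of freeing it inline.

  #include <deque>
  #include <iostream>
  #include <memory>
  #include <mutex>
  #include <vector>

  class ToyCollector {
   public:
    explicit ToyCollector(size_t max_memory_size)
        : max_memory_size_(max_memory_size) {}

    void Add(std::shared_ptr<std::vector<char>> obj) {
      std::deque<std::shared_ptr<std::vector<char>>> *batch = nullptr;
      {
        std::lock_guard<std::mutex> guard(mutex_);
        cur_memory_size_ += obj->size();
        garbages_.push_back(std::move(obj));
        if (cur_memory_size_ >= max_memory_size_) {
          cur_memory_size_ = 0;
          // Detach the whole queue so it can be freed outside the lock.
          batch = new std::deque<std::shared_ptr<std::vector<char>>>(
              std::move(garbages_));
          garbages_.clear();
        }
      }
      if (batch) {
        std::cout << "freeing a batch of " << batch->size() << " buffers\n";
        delete batch;  // dropping the shared_ptrs releases the memory
      }
    }

   private:
    std::mutex mutex_;
    std::deque<std::shared_ptr<std::vector<char>>> garbages_;
    const size_t max_memory_size_;
    size_t cur_memory_size_ = 0;
  };

  int main() {
    ToyCollector gc(1 << 20);  // flush roughly every 1 MB of dead buffers
    for (int i = 0; i < 8; ++i) {
      gc.Add(std::make_shared<std::vector<char>>(512 * 1024));  // 512 KB each
    }
    return 0;
  }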
paddle/fluid/framework/ir/CMakeLists.txt
...
...
@@ -42,6 +42,8 @@ pass_library(multi_batch_merge_pass base)
pass_library(conv_bn_fuse_pass inference)
pass_library(seqconv_eltadd_relu_fuse_pass inference)
pass_library(is_test_pass base)
pass_library(conv_elementwise_add_act_fuse_pass inference)
pass_library(conv_elementwise_add2_act_fuse_pass inference)
if(WITH_MKLDNN)
  pass_library(mkldnn_placement_pass base)
  pass_library(depthwise_conv_mkldnn_pass base)
...
...
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(conv_op); \
GET_IR_NODE(conv_out); \
GET_IR_NODE(conv_filter); \
GET_IR_NODE(elementwise_add_op); \
GET_IR_NODE(elementwise_add_in_y); \
GET_IR_NODE(elementwise_add_out); \
GET_IR_NODE(elementwise_add_op_1); \
GET_IR_NODE(elementwise_add_in_y_1); \
GET_IR_NODE(elementwise_add_out_1); \
GET_IR_NODE(act_op); \
GET_IR_NODE(act_out);
// Inherient the basic infomation from `base_desc`, and modify some fields.
framework::proto::OpDesc PrepareOpDesc(
    const framework::proto::OpDesc &base_desc, const std::string &bias,
    const std::string &bias1, const std::string &activation,
    const std::string &output) {
  auto proto = base_desc;
  framework::OpDesc desc(proto, nullptr);
  desc.SetInput("Bias", {bias});
  desc.SetInput("ResidualData", {bias1});
  desc.SetAttr("activation", activation);
  desc.SetOutput("Output", {output});
  desc.SetAttr("is_test", true);
  desc.SetAttr("use_cudnn", false);

  return *desc.Proto();
}

std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string pattern_name = "conv_elementwise_add_act_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  auto *x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
      "conv2d", "Input");

  patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name);
  pattern(x);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
    GET_NODES;

    auto base_op_desc = *conv_op->Op()->Proto();
    std::string bias_name = elementwise_add_in_y->Name();
    std::string bias1_name = elementwise_add_in_y_1->Name();
    std::string act_op_type = act_op->Op()->Type();
    std::string act_op_out = act_out->Name();

    auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name,
                                      act_op_type, act_op_out);
    framework::OpDesc new_op_desc(new_op_proto, nullptr);

    // Create a new node for the fused op.
    auto new_conv_op = graph->CreateOpNode(&new_op_desc);

    // Link inputs and outputs.
    PADDLE_ENFORCE(subgraph.count(x));
    auto *conv_in_node = subgraph.at(x);

    IR_NODE_LINK_TO(conv_in_node, new_conv_op);             // Input
    IR_NODE_LINK_TO(conv_filter, new_conv_op);              // Filter
    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);     // Bias
    IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op);   // ResidualData
    IR_NODE_LINK_TO(new_conv_op, act_out);                  // Output

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(),
                         {conv_op, elementwise_add_op, elementwise_add_op_1,
                          elementwise_add_out});
  };
  gpd(graph.get(), handler);
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h"
#include <string>
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(conv_op); \
GET_IR_NODE(conv_out); \
GET_IR_NODE(conv_filter); \
GET_IR_NODE(elementwise_add_op); \
GET_IR_NODE(elementwise_add_in_y); \
GET_IR_NODE(elementwise_add_out); \
GET_IR_NODE(elementwise_add_op_1); \
GET_IR_NODE(elementwise_add_in_y_1); \
GET_IR_NODE(elementwise_add_out_1); \
GET_IR_NODE(act_op); \
GET_IR_NODE(act_out);
// Inherient the basic infomation from `base_desc`, and modify some fields.
framework::proto::OpDesc PrepareOpDesc(
    const framework::proto::OpDesc &base_desc, const std::string &bias,
    const std::string &bias1, const std::string &activation,
    const std::string &output) {
  auto proto = base_desc;
  framework::OpDesc desc(proto, nullptr);
  desc.SetInput("Bias", {bias});
  desc.SetInput("ResidualData", {bias1});
  desc.SetAttr("activation", activation);
  desc.SetOutput("Output", {output});
  desc.SetAttr("is_test", true);

  return *desc.Proto();
}

std::unique_ptr<ir::Graph> ConvElementwiseAdd2ActFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string pattern_name = "conv_elementwise_add_act_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  auto *x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input(
      "conv2d", "Input");

  patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name);
  pattern(x);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
    GET_NODES;

    auto base_op_desc = *conv_op->Op()->Proto();
    std::string bias_name = elementwise_add_in_y->Name();
    std::string bias1_name = elementwise_add_in_y_1->Name();
    std::string act_op_type = act_op->Op()->Type();
    std::string act_op_out = act_out->Name();

    auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name,
                                      act_op_type, act_op_out);
    framework::OpDesc new_op_desc(new_op_proto, nullptr);

    // Create a new node for the fused op.
    graph->CreateOpNode(&new_op_desc);

    // Link inputs and outputs.
    PADDLE_ENFORCE(subgraph.count(x));
    auto *conv_in_node = subgraph.at(x);

    IR_NODE_LINK_TO(conv_in_node, conv_op);             // Input
    IR_NODE_LINK_TO(conv_filter, conv_op);              // Filter
    IR_NODE_LINK_TO(conv_op, conv_out);                 // Output
    IR_NODE_LINK_TO(elementwise_add_in_y, conv_op);     // Bias
    IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op);   // Bias

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(),
                         {conv_op, elementwise_add_op, elementwise_add_op_1,
                          elementwise_add_out});
  };
  gpd(graph.get(), handler);
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(conv_elementwise_add2_act_fuse_pass,
              paddle::framework::ir::ConvElementwiseAdd2ActFusePass);
paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {

class ConvElementwiseAdd2ActFusePass : public FusePassBase {
 public:
  virtual ~ConvElementwiseAdd2ActFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h"
#include <string>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
namespace paddle {
namespace framework {
namespace ir {
#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
#define GET_NODES \
GET_IR_NODE(conv_op); \
GET_IR_NODE(conv_out); \
GET_IR_NODE(conv_filter); \
GET_IR_NODE(elementwise_add_op); \
GET_IR_NODE(elementwise_add_in_y); \
GET_IR_NODE(elementwise_add_out); \
GET_IR_NODE(act_op); \
GET_IR_NODE(act_out);
// Inherient the basic infomation from `base_desc`, and modify some fields.
framework::proto::OpDesc PrepareOpDesc(
    const framework::proto::OpDesc &base_desc, const std::string &bias,
    const std::string &activation, const std::string &output) {
  auto proto = base_desc;
  framework::OpDesc desc(proto, nullptr);
  desc.SetType("conv2d_fusion");
  desc.SetInput("Bias", {bias});
  desc.SetInput("ResidualData", {});
  desc.SetAttr("activation", activation);
  desc.SetOutput("Output", {output});
  desc.SetAttr("is_test", true);
  desc.SetAttr("use_cudnn", false);
  desc.Flush();
  return *desc.Proto();
}

std::unique_ptr<ir::Graph> ConvElementwiseAddActFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  const std::string pattern_name = "conv_elementwise_add_act_fuse";
  FusePassBase::Init(pattern_name, graph.get());

  GraphPatternDetector gpd;
  auto *x = gpd.mutable_pattern()
                ->NewNode("x")
                ->assert_is_op_input("conv2d", "Input")
                ->AsInput();

  patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name);
  pattern(x);

  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
                     Graph *g) {
    GET_NODES;

    auto base_op_desc = *conv_op->Op()->Proto();
    std::string bias_name = elementwise_add_in_y->Name();
    std::string act_op_type = act_op->Op()->Type();
    std::string act_op_out = act_out->Name();

    auto new_op_proto =
        PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out);
    framework::OpDesc new_op_desc(new_op_proto, nullptr);

    // Create a new node for the fused op.
    auto *new_conv_op = graph->CreateOpNode(&new_op_desc);

    // Link inputs and outputs.
    PADDLE_ENFORCE(subgraph.count(x));
    auto *conv_in_node = subgraph.at(x);

    IR_NODE_LINK_TO(conv_in_node, new_conv_op);          // Input
    IR_NODE_LINK_TO(conv_filter, new_conv_op);           // Filter
    IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op);  // Bias
    IR_NODE_LINK_TO(new_conv_op, act_out);               // Output

    // Delete the unneeded nodes.
    GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op,
                                       elementwise_add_out, act_op});
  };
  gpd(graph.get(), handler);
  return graph;
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(conv_elementwise_add_act_fuse_pass,
              paddle::framework::ir::ConvElementwiseAddActFusePass);
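A hedged usage sketch of applying the newly registered pass; the PassRegistry calls are the same ones used in parallel_executor.cc elsewhere in this commit, and the pass name comes from the REGISTER_PASS line above.

  auto fuse_pass =
      ir::PassRegistry::Instance().Get("conv_elementwise_add_act_fuse_pass");
  graph = fuse_pass->Apply(std::move(graph));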
paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
namespace paddle {
namespace framework {
namespace ir {

class ConvElementwiseAddActFusePass : public FusePassBase {
 public:
  virtual ~ConvElementwiseAddActFusePass() {}

 protected:
  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
};

}  // namespace ir
}  // namespace framework
}  // namespace paddle
paddle/fluid/framework/ir/graph.h
...
...
@@ -73,14 +73,21 @@ class Graph {
  }

  bool Has(const std::string &attr_name) const {
    return attrs_.find(attr_name) != attrs_.end();
    return attrs_.count(attr_name) > 0;
  }

  template <typename AttrType>
  AttrType &Get(const std::string &attr_name) const {
    PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.",
                   attr_name);
    try {
      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
    } catch (boost::bad_any_cast &) {
      PADDLE_THROW(
          "Invalid attribute type of %s error, expected: %s, actual: %s",
          attr_name, typeid(AttrType *).name(),
          attrs_.at(attr_name).type().name());
    }
  }

  template <typename AttrType>
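A hedged usage sketch of the attribute API shown above: Has() guards Get(), which enforces that the attribute was registered. The attribute name and type follow the reference_count_pass.cc code earlier in this commit; the surrounding variables are illustrative.

  if (graph.Has(details::kGraphVars)) {
    auto &vars = graph.Get<details::GraphVars>(details::kGraphVars);
    VLOG(10) << "graph holds vars for " << vars.size() << " scopes";
  }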
...
...
paddle/fluid/framework/ir/graph_pattern_detector.cc
...
...
@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
...
...
@@ -25,6 +26,7 @@
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace framework {
namespace ir {
...
...
@@ -104,7 +106,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
  for (auto &node : GraphTraits::DFS(graph)) {
    for (const auto &pdnode : pattern_.nodes()) {
      if (pdnode->Tell(&node)) {
        VLOG(4) << "pdnode " << pdnode->name() << " marked";
        VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name();
        pdnodes2nodes_[pdnode.get()].insert(&node);
      }
    }
...
@@ -1099,6 +1101,115 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) {
return
out_var
;
}
std
::
unordered_set
<
std
::
string
>
conv_act_set
({
"identity"
,
"sigmoid"
,
"relu"
,
"relu6"
,
"relux"
,
"tanh"
,
"band_pass"
});
PDNode
*
patterns
::
ConvElementwiseaddAct
::
operator
()(
PDNode
*
conv_in
)
{
conv_in
->
AsInput
();
auto
conv_op
=
pattern
->
NewNode
(
conv_op_repr
())
->
assert_is_op
(
"conv2d"
);
auto
conv_out
=
pattern
->
NewNode
(
conv_out_repr
())
->
assert_is_op_output
(
"conv2d"
)
->
assert_is_op_input
(
"elementwise_add"
,
"X"
)
->
AsIntermediate
();
auto
conv_filter
=
pattern
->
NewNode
(
conv_filter_repr
())
->
assert_is_op_input
(
"conv2d"
,
"Filter"
)
->
AsInput
();
auto
elementwise_add_op
=
pattern
->
NewNode
(
elementwise_add_op_repr
())
->
assert_is_op
(
"elementwise_add"
);
auto
elementwise_add_in_y
=
pattern
->
NewNode
(
elementwise_add_in_y_repr
())
->
assert_is_op_input
(
"elementwise_add"
,
"Y"
)
->
AsInput
();
auto
elementwise_add_out
=
pattern
->
NewNode
(
elementwise_add_out_repr
())
->
assert_is_op_output
(
"elementwise_add"
)
->
AsIntermediate
();
auto
act_op
=
pattern
->
NewNode
(
act_op_repr
())
->
assert_is_op
()
->
assert_more
([
&
](
Node
*
node
)
{
auto
op_type
=
node
->
Name
();
return
conv_act_set
.
count
(
op_type
);
});
auto
act_out
=
pattern
->
NewNode
(
act_out_repr
())
->
assert_is_var
()
// is activation op's output.
->
assert_more
([
&
](
Node
*
node
)
{
for
(
auto
*
in_op
:
node
->
inputs
)
{
if
(
conv_act_set
.
count
(
in_op
->
Name
()))
{
return
true
;
}
}
return
false
;
})
->
AsOutput
();
conv_op
->
LinksFrom
({
conv_in
,
conv_filter
});
conv_out
->
LinksFrom
({
conv_op
});
elementwise_add_op
->
LinksFrom
({
conv_out
,
elementwise_add_in_y
})
.
LinksTo
({
elementwise_add_out
});
act_op
->
LinksFrom
({
elementwise_add_out
}).
LinksTo
({
act_out
});
return
act_out
;
}
PDNode
*
patterns
::
ConvElementwiseadd2Act
::
operator
()(
PDNode
*
conv_in
)
{
auto
conv_op
=
pattern
->
NewNode
(
conv_op_repr
())
->
assert_is_op
(
"conv2d"
);
auto
conv_filter
=
pattern
->
NewNode
(
conv_filter_repr
())
->
assert_is_op_input
(
"conv2d"
,
"Filter"
)
->
AsInput
();
auto
conv_out
=
pattern
->
NewNode
(
conv_out_repr
())
->
assert_is_op_output
(
"conv2d"
)
->
assert_is_op_input
(
"elementwise_add"
,
"X"
)
->
AsIntermediate
();
auto
elementwise_add_op
=
pattern
->
NewNode
(
elementwise_add_op_repr
())
->
assert_is_op
(
"elementwise_add"
);
auto
elementwise_add_in_y
=
pattern
->
NewNode
(
elementwise_add_in_y_repr
())
->
assert_is_op_input
(
"elementwise_add"
,
"Y"
)
->
AsInput
();
auto
elementwise_add_out
=
pattern
->
NewNode
(
elementwise_add_out_repr
())
->
assert_is_op_output
(
"elementwise_add"
)
->
assert_is_op_input
(
"elementwise_add"
,
"X"
)
->
AsIntermediate
();
auto
elementwise_add_op_1
=
pattern
->
NewNode
(
elementwise_add_op_1_repr
())
->
assert_is_op
(
"elementwise_add"
);
auto
elementwise_add_in_y_1
=
pattern
->
NewNode
(
elementwise_add_in_y_1_repr
())
->
assert_is_op_input
(
"elementwise_add"
,
"Y"
)
->
AsInput
();
auto
elementwise_add_out_1
=
pattern
->
NewNode
(
elementwise_add_out_1_repr
())
->
assert_is_op_output
(
"elementwise_add"
)
->
AsIntermediate
();
auto
act_op
=
pattern
->
NewNode
(
act_op_repr
())
->
assert_is_op
()
->
assert_more
([
&
](
Node
*
node
)
{
auto
op_type
=
node
->
Name
();
return
conv_act_set
.
count
(
op_type
);
});
auto
act_out
=
pattern
->
NewNode
(
act_out_repr
())
->
assert_is_var
()
// is activation op's output.
->
assert_more
([
&
](
Node
*
node
)
{
for
(
auto
*
in_op
:
node
->
inputs
)
{
if
(
conv_act_set
.
count
(
in_op
->
Name
()))
{
return
true
;
}
}
return
false
;
})
->
AsOutput
();
conv_op
->
LinksFrom
({
conv_in
,
conv_filter
}).
LinksTo
({
conv_out
});
elementwise_add_op
->
LinksFrom
({
conv_out
,
elementwise_add_in_y
})
.
LinksTo
({
elementwise_add_out
});
elementwise_add_op_1
->
LinksFrom
(
{
elementwise_add_out
,
elementwise_add_in_y_1
});
act_op
->
LinksFrom
({
elementwise_add_out_1
}).
LinksTo
({
act_out
});
return
act_out
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/graph_pattern_detector.h
...
...
@@ -671,6 +671,51 @@ struct ElementwiseAdd : public PatternBase {
  PATTERN_DECL_NODE(elementwise_add_y);
  PATTERN_DECL_NODE(elementwise_add_out);
};

// Conv + ElementwiseAdd + an activation
// This pattern can futher fuse the conv related ops after the conv+bn fusion.
struct ConvElementwiseaddAct : public PatternBase {
  ConvElementwiseaddAct(PDPattern *pattern, const std::string &name_scope)
      : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {}

  PDNode *operator()(PDNode *conv_in);

  PATTERN_DECL_NODE(conv_op);
  PATTERN_DECL_NODE(conv_out);
  PATTERN_DECL_NODE(conv_filter);

  PATTERN_DECL_NODE(elementwise_add_op);
  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
  PATTERN_DECL_NODE(elementwise_add_out);

  PATTERN_DECL_NODE(act_op);
  PATTERN_DECL_NODE(act_out);
};

// Conv + ElementwiseAdd + ElementwiseAdd + Activation
struct ConvElementwiseadd2Act : public PatternBase {
  ConvElementwiseadd2Act(PDPattern *pattern, const std::string &name_scope)
      : PatternBase(pattern, name_scope,
                    "conv_elementwiseadd2_elementwiseadd_act") {}

  PDNode *operator()(PDNode *conv_in);

  PATTERN_DECL_NODE(conv_op);
  PATTERN_DECL_NODE(conv_filter);
  PATTERN_DECL_NODE(conv_out);

  PATTERN_DECL_NODE(elementwise_add_op);
  PATTERN_DECL_NODE(elementwise_add_in_y);  // input
  PATTERN_DECL_NODE(elementwise_add_out);

  PATTERN_DECL_NODE(elementwise_add_op_1);
  PATTERN_DECL_NODE(elementwise_add_in_y_1);  // input
  PATTERN_DECL_NODE(elementwise_add_out_1);

  PATTERN_DECL_NODE(act_op);
  PATTERN_DECL_NODE(act_out);
};

}  // namespace patterns
// Link two ir::Nodes from each other.
...
...
paddle/fluid/framework/ir/pass.h
...
...
@@ -51,11 +51,18 @@ class Pass {
  AttrType &Get(const std::string &attr_name) const {
    PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(),
                   "%s attr not registered for pass.", attr_name);
    try {
      return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
    } catch (boost::bad_any_cast &) {
      PADDLE_THROW(
          "Invalid attribute type of %s error, expected: %s, actual: %s",
          attr_name, typeid(AttrType *).name(),
          attrs_.at(attr_name).type().name());
    }
  }

  bool Has(const std::string &attr_name) const {
    return attrs_.find(attr_name) != attrs_.end();
    return attrs_.count(attr_name) > 0;
  }

  void Erase(const std::string &attr_name) {
...
...
paddle/fluid/framework/op_kernel_type_test.cc
...
...
@@ -34,7 +34,8 @@ TEST(OpKernelType, ToString) {
  OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW,
                               LibraryType::kCUDNN);
  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2),
            "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_"
            "data_type[::paddle::platform::float16]:data_layout[NCHW]:place["
            "CUDAPlace(0)]:library_"
            "type[CUDNN]");
}
...
...
paddle/fluid/framework/operator.cc
...
...
@@ -879,6 +879,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
      t = &(var->Get<SelectedRows>().value());
    }
    if (t != nullptr) {
      PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s",
                     ipt_name, DebugString());
      int tmp = static_cast<int>(t->type());
      PADDLE_ENFORCE(tmp == data_type || data_type == -1,
...
...
paddle/fluid/framework/parallel_executor.cc
...
...
@@ -26,6 +26,7 @@ limitations under the License. */
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/reference_count_pass_helper.h"
#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -72,6 +73,26 @@ class ParallelExecutorPrivate {
}
}
}
std
::
unique_ptr
<
ir
::
Graph
>
PrepareGCAndRefCnts
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
,
size_t
max_memory_size
);
inline
bool
HasGarbageCollectors
()
const
{
return
!
gcs_
.
empty
();
}
void
ResetRuntimeReferenceCount
(
const
std
::
vector
<
std
::
string
>
&
fetch_tensors
,
const
std
::
string
&
fetched_var_name
)
{
for
(
size_t
i
=
0
;
i
<
runtime_ref_cnts_
.
size
();
++
i
)
{
for
(
auto
&
pair
:
global_ref_cnts_
[
i
])
{
runtime_ref_cnts_
[
i
][
pair
.
first
]
=
pair
.
second
;
}
for
(
auto
&
fetch_name
:
fetch_tensors
)
{
runtime_ref_cnts_
[
i
].
erase
(
fetch_name
);
}
runtime_ref_cnts_
[
i
].
erase
(
fetched_var_name
);
}
}
std
::
vector
<
platform
::
Place
>
places_
;
std
::
vector
<
Scope
*>
local_scopes_
;
Scope
*
global_scope_
;
// not owned
...
...
@@ -83,8 +104,76 @@ class ParallelExecutorPrivate {
bool
own_local_scope_
;
bool
use_cuda_
;
bool
use_all_reduce_
;
// global_ref_cnts_ is only initialized when ParallelExecutor constructs, and
// then keeps unchanged
// Before each iteration, runtime_ref_cnts_ is reset to global_ref_cnts_
std
::
vector
<
details
::
ReferenceCountMap
>
global_ref_cnts_
;
std
::
vector
<
details
::
AtomicReferenceCountMap
>
runtime_ref_cnts_
;
details
::
GarbageCollectorMap
gcs_
;
};
std
::
unique_ptr
<
ir
::
Graph
>
ParallelExecutorPrivate
::
PrepareGCAndRefCnts
(
std
::
unique_ptr
<
ir
::
Graph
>
graph
,
size_t
max_memory_size
)
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
auto
&
place
=
places_
[
i
];
if
(
gcs_
.
count
(
place
)
>
0
)
{
continue
;
}
std
::
unique_ptr
<
GarbageCollector
>
gc
;
#ifdef PADDLE_WITH_CUDA
if
(
platform
::
is_gpu_place
(
place
))
{
if
(
IsFastEagerDeletionModeEnabled
())
{
gc
.
reset
(
new
UnsafeFastGPUGarbageCollector
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place
),
max_memory_size
));
}
else
{
gc
.
reset
(
new
StreamGarbageCollector
(
boost
::
get
<
platform
::
CUDAPlace
>
(
place
),
max_memory_size
));
}
VLOG
(
10
)
<<
"Created "
<<
i
<<
"-th GarbageCollector at "
<<
place
;
}
else
{
#endif
if
(
platform
::
is_cpu_place
(
place
))
{
gc
.
reset
(
new
CPUGarbageCollector
(
boost
::
get
<
platform
::
CPUPlace
>
(
place
),
max_memory_size
));
VLOG
(
10
)
<<
"Created GarbageCollector at "
<<
place
;
}
else
{
PADDLE_THROW
(
"Unsupported place for garbage collection"
);
}
#ifdef PADDLE_WITH_CUDA
}
#endif
gcs_
.
emplace
(
place
,
std
::
move
(
gc
));
}
if
(
!
gcs_
.
empty
())
{
std
::
vector
<
details
::
LastLiveOpsOfVars
>
last_live_ops_of_vars
;
auto
ref_cnt_pass
=
ir
::
PassRegistry
::
Instance
().
Get
(
"reference_count_pass"
);
ref_cnt_pass
->
SetNotOwned
(
details
::
kGlobalReferenceCount
,
&
global_ref_cnts_
);
ref_cnt_pass
->
SetNotOwned
(
details
::
kLastLiveOpsOfVars
,
&
last_live_ops_of_vars
);
graph
=
ref_cnt_pass
->
Apply
(
std
::
move
(
graph
));
VLOG
(
10
)
<<
"ReferenceCountPass Applied"
;
auto
eager_deletion_pass
=
ir
::
PassRegistry
::
Instance
().
Get
(
"eager_deletion_pass"
);
eager_deletion_pass
->
SetNotOwned
(
details
::
kRuntimeReferenceCount
,
&
runtime_ref_cnts_
);
eager_deletion_pass
->
SetNotOwned
(
details
::
kGarbageCollector
,
&
gcs_
);
eager_deletion_pass
->
SetNotOwned
(
details
::
kLastLiveOpsOfVars
,
&
last_live_ops_of_vars
);
eager_deletion_pass
->
SetNotOwned
(
details
::
kAllPlaces
,
&
places_
);
graph
=
eager_deletion_pass
->
Apply
(
std
::
move
(
graph
));
VLOG
(
10
)
<<
"EagerDeletionPass Applied"
;
}
return
graph
;
}
std
::
vector
<
Scope
*>
&
ParallelExecutor
::
GetLocalScopes
()
{
return
member_
->
local_scopes_
;
}
...
...
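`PrepareGCAndRefCnts` above builds one garbage collector per place and then relies on `ResetRuntimeReferenceCount` to re-arm an atomic per-variable counter before every iteration, excluding fetched variables so they are never eagerly freed. The sketch below reproduces just that counting scheme in standard C++; all type and function names here are hypothetical stand-ins, not Paddle's `GarbageCollector`/`ReferenceCountMap` API.

// Illustrative sketch of the per-iteration reference-count reset used above.
// All names here are hypothetical; only the counting idea follows the diff.
#include <atomic>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

using ReferenceCountMap = std::unordered_map<std::string, size_t>;
using AtomicReferenceCountMap =
    std::unordered_map<std::string, std::atomic<size_t>>;

// Copy the static counts into the runtime map, dropping fetched variables so
// they are never eagerly deleted while the caller still needs them.
void ResetRuntimeRefCnts(const ReferenceCountMap& global,
                         const std::vector<std::string>& fetch_vars,
                         AtomicReferenceCountMap* runtime) {
  for (const auto& pair : global) (*runtime)[pair.first] = pair.second;
  for (const auto& name : fetch_vars) runtime->erase(name);
}

// Called once per op that consumed `var`; when the count hits zero the
// variable's memory could be handed to a garbage collector.
bool OnVarUsed(const std::string& var, AtomicReferenceCountMap* runtime) {
  auto it = runtime->find(var);
  if (it == runtime->end()) return false;  // skipped (e.g. fetched) variable
  return it->second.fetch_sub(1) == 1;     // true -> last use, safe to free
}

int main() {
  ReferenceCountMap global{{"x", 2}, {"loss", 1}};
  AtomicReferenceCountMap runtime;
  ResetRuntimeRefCnts(global, {"loss"}, &runtime);  // "loss" is fetched, keep it

  std::cout << OnVarUsed("x", &runtime) << "\n";     // 0: one consumer left
  std::cout << OnVarUsed("x", &runtime) << "\n";     // 1: last use, free "x"
  std::cout << OnVarUsed("loss", &runtime) << "\n";  // 0: fetched, never freed here
}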
@@ -151,36 +240,18 @@ ParallelExecutor::ParallelExecutor(
  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
      main_program, member_->places_, loss_var_name, params,
      member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
-  auto max_memory_size = GetEagerDeletionThreshold();
-  if (max_memory_size >= 0) {
-    for (auto &place : member_->places_) {
-      if (!platform::is_gpu_place(place)) continue;
-      auto gpu_place = boost::get<platform::CUDAPlace>(place);
-      if (gcs_[gpu_place.device] == nullptr) {
-        ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap());
-        cur_ref_cnts_[gpu_place.device].reset(
-            new details::AtomicReferenceCountMap());
-        gcs_[gpu_place.device].reset(
-            new StreamGarbageCollector<Tensor>(gpu_place, max_memory_size));
-      }
-    }
-    if (!gcs_.empty()) {
-      auto ref_cnt_pass =
-          ir::PassRegistry::Instance().Get("reference_count_pass");
-      ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
-      ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
-      graph = ref_cnt_pass->Apply(std::move(graph));
-      graph->SetNotOwned("garbage_collector", &gcs_);
-    }
-  }
 #else
  std::unique_ptr<ir::Graph> graph =
      build_strategy.Apply(main_program, member_->places_, loss_var_name,
                           params, member_->local_scopes_, member_->use_cuda_);
 #endif
+  auto max_memory_size = GetEagerDeletionThreshold();
+  if (max_memory_size >= 0) {
+    graph = member_->PrepareGCAndRefCnts(std::move(graph),
+                                         static_cast<size_t>(max_memory_size));
+  }

  // Step 3. Create vars in each scope. Passes may also create new vars.
  // skip control vars and empty vars
  std::vector<details::VariableInfo> var_infos;
...
@@ -300,18 +371,9 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
 #endif
  platform::RecordBlock b(0);
-#ifdef PADDLE_WITH_CUDA
-  if (!gcs_.empty()) {
-    ResetReferenceCount();
-    for (auto &pair : cur_ref_cnts_) {
-      auto &name_map = *(pair.second);
-      for (auto &fetch_name : fetch_tensors) {
-        name_map.erase(fetch_name);
-      }
-      name_map.erase(fetched_var_name);
-    }
+  if (member_->HasGarbageCollectors()) {
+    member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name);
  }
-#endif
  auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
      fetch_data;
...
@@ -355,13 +417,11 @@ ParallelExecutor::~ParallelExecutor() {
  for (auto &p : member_->places_) {
    platform::DeviceContextPool::Instance().Get(p)->Wait();
  }
-  // member_ must be destructed before gcs_ since the destructor of
-  // ReferenceCountOpHandle use raw pointers of gcs_ inside.
-  member_.reset();
+  delete member_;
 }

}  // namespace framework
}  // namespace paddle

#ifdef PADDLE_WITH_CUDA
USE_PASS(reference_count_pass);
#endif
+USE_PASS(eager_deletion_pass);
paddle/fluid/framework/parallel_executor.h
@@ -14,7 +14,6 @@ limitations under the License. */
 #pragma once

-#include <atomic>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
...
@@ -29,10 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"

-#ifdef PADDLE_WITH_CUDA
-#include "paddle/fluid/framework/details/reference_count_pass.h"
-#endif
-
 namespace paddle {
 namespace framework {
...
@@ -75,24 +70,7 @@ class ParallelExecutor {
 private:
  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;

-  std::unique_ptr<ParallelExecutorPrivate> member_;
-
-#ifdef PADDLE_WITH_CUDA
-  // ref_cnts_ is only initialized when ParallelExecutor constructs, and then
-  // keeps unchanged
-  // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_
-  details::DeviceReferenceCountMap ref_cnts_;
-  details::AtomicDeviceReferenceCountMap cur_ref_cnts_;
-  details::DeviceGarbageCollectorMap gcs_;
-
-  void ResetReferenceCount() {
-    for (auto &pair1 : ref_cnts_) {
-      for (auto &pair2 : *(pair1.second)) {
-        (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second;
-      }
-    }
-  }
-#endif
+  ParallelExecutorPrivate *member_;
 };

}  // namespace framework
...
paddle/fluid/framework/scope.cc
@@ -38,6 +38,10 @@ DEFINE_double(
    "Memory size threshold (GB) when the garbage collector clear tensors."
    "Disabled when this value is less than 0");

+DEFINE_bool(fast_eager_deletion_mode, false,
+            "Fast eager deletion mode. If enabled, memory would release "
+            "immediately without waiting GPU kernel ends.");
+
 // When in inference scenario, the scopes will not be written by two threads in
 // a mean time, but a scope may be read by multiple threads concurrently, and
 // the mutex will cause serious performance issue.
...
@@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() {
                 (static_cast<int64_t>(1) << 30));
 }

+bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; }
+
 Scope::~Scope() { DropKids(); }

 Scope &Scope::NewScope() const {
...
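The threshold flag above is given in GB and converted to bytes with a `<< 30` shift inside `GetEagerDeletionThreshold`, with a negative value disabling collection. The sketch below shows just that conversion in isolation; the parameter stands in for the real gflags value and is not Paddle's flag definition.

// Minimal sketch of the GB -> bytes conversion implied by the scope.cc hunk.
// `eager_delete_tensor_gb` stands in for the real flag value; -1 disables GC.
#include <cstdint>
#include <iostream>

int64_t EagerDeletionThresholdBytes(double eager_delete_tensor_gb) {
  if (eager_delete_tensor_gb < 0) return -1;  // collector disabled
  // 1 GB == 2^30 bytes; keep the shift in int64_t to avoid overflow.
  return static_cast<int64_t>(eager_delete_tensor_gb *
                              (static_cast<int64_t>(1) << 30));
}

int main() {
  std::cout << EagerDeletionThresholdBytes(-1.0) << "\n";  // -1 (disabled)
  std::cout << EagerDeletionThresholdBytes(0.0) << "\n";   // 0 (collect eagerly)
  std::cout << EagerDeletionThresholdBytes(1.5) << "\n";   // 1610612736
}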
paddle/fluid/framework/scope.h
@@ -27,6 +27,7 @@ namespace paddle {
 namespace framework {

 int64_t GetEagerDeletionThreshold();
+bool IsFastEagerDeletionModeEnabled();

 class Scope;
...
paddle/fluid/framework/tensor.h
@@ -14,15 +14,14 @@ limitations under the License. */
 #pragma once

-#include <paddle/fluid/framework/framework.pb.h>
 #include <cstdint>
 #include <cstring>
 #include <memory>
 #include <typeindex>
 #include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
...
@@ -159,6 +158,10 @@ class Tensor {
  const std::shared_ptr<memory::Allocation>& Holder() const { return holder_; }
  size_t offset() const { return offset_; }

+  std::shared_ptr<memory::Allocation> MoveMemoryHolder() {
+    return std::move(holder_);
+  }
+
 private:
  /*! holds the memory block if allocated. */
  std::shared_ptr<memory::Allocation> holder_;
...
paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -55,7 +55,12 @@ TEST(AnalysisPredictor, analysis_off) {
 }

 TEST(AnalysisPredictor, analysis_on) {
-  AnalysisConfig config(false);
+#ifdef PADDLE_WITH_CUDA
+  AnalysisConfig config(true);
+  config.fraction_of_gpu_memory = 0.15;
+#else
+  AnalysisConfig config;
+#endif
  config.model_dir = FLAGS_dirname;
  config.enable_ir_optim = true;
...
paddle/fluid/inference/api/api_impl_tester.cc
@@ -39,7 +39,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
  if (t->type() == framework::proto::VarType::INT64) {
    pt.data.Reset(t->data<void>(), t->numel() * sizeof(int64_t));
    pt.dtype = PaddleDType::INT64;
-  } else if (t->type() == framework::proto::VarType::INT32) {
+  } else if (t->type() == framework::proto::VarType::FP32) {
    pt.data.Reset(t->data<void>(), t->numel() * sizeof(float));
    pt.dtype = PaddleDType::FLOAT32;
  } else {
...
paddle/fluid/inference/api/paddle_pass_builder.h
@@ -118,7 +118,10 @@ class GpuPassStrategy : public PassStrategy {
 public:
  GpuPassStrategy() : PassStrategy({}) {
    passes_.assign({
-        "infer_clean_graph_pass", "conv_bn_fuse_pass",
+        "infer_clean_graph_pass",                  //
+        "conv_bn_fuse_pass",                       //
+        "conv_elementwise_add_act_fuse_pass",      //
+        "conv_elementwise_add2_act_fuse_pass",     //
    });
  }
...
paddle/fluid/inference/io.cc
@@ -79,7 +79,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
  for (auto* var : global_block.AllVars()) {
    if (IsPersistable(var)) {
-      VLOG(3) << "persistable variable's name: " << var->Name();
+      VLOG(4) << "persistable variable's name: " << var->Name();
      framework::VarDesc* new_var = load_block->Var(var->Name());
      new_var->SetShape(var->GetShape());
...
paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -78,6 +78,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) {
  std::vector<PaddleTensor> outputs;
  if (use_analysis || use_tensorrt) {
    contrib::AnalysisConfig config(true);
+    config.pass_builder()->TurnOnDebug();
    SetConfig<contrib::AnalysisConfig>(&config, model_dir, true, use_tensorrt,
                                       FLAGS_batch_size);
    TestPrediction(reinterpret_cast<PaddlePredictor::Config*>(&config),
...
@@ -141,9 +142,31 @@ TEST(TensorRT_resnext50, profile) {
  profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
 }

+TEST(resnext50, compare_analysis_native) {
+  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  compare(model_dir, false /*use tensorrt*/);
+}
+
 TEST(TensorRT_mobilenet, analysis) {
  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
-  compare(model_dir, /* use_tensorrt */ false);
+  compare(model_dir, false /* use_tensorrt */);
+}
+
+TEST(AnalysisPredictor, use_gpu) {
+  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
+  AnalysisConfig config(true);
+  config.model_dir = model_dir;
+  config.fraction_of_gpu_memory = 0.15;
+  config.pass_builder()->TurnOnDebug();
+
+  std::vector<std::vector<PaddleTensor>> inputs_all;
+  auto predictor = CreatePaddlePredictor(config);
+  SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "");
+
+  std::vector<PaddleTensor> outputs;
+  for (auto& input : inputs_all) {
+    ASSERT_TRUE(predictor->Run(input, &outputs));
+  }
 }

 }  // namespace inference
...
paddle/fluid/operators/controlflow/CMakeLists.txt
 include(operators)
-register_operators()
+register_operators(DEPS naive_executor)

 file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
paddle/fluid/operators/controlflow/while_op.cc
@@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes";
 static constexpr char kX[] = "X";
 static constexpr char kXGRAD[] = "X@GRAD";
 static constexpr char kOutputs[] = "Out";
+static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars";
+
+namespace {  // NOLINT
+static std::string GetSkipEagerDeletionVarsDebugString(
+    const std::vector<std::string> &vars) {
+  std::string str = "Skip " + std::to_string(vars.size()) +
+                    " var(s) in eager deletion mode: ";
+  for (auto &var : vars) {
+    str.append(var);
+    str.push_back(' ');
+  }
+  return str;
+}
+}  // NOLINT
+
 class WhileOp : public framework::OperatorBase {
  public:
...
@@ -59,7 +73,10 @@ class WhileOp : public framework::OperatorBase {
                   "Condition of while op must in CPU memory.");

    bool is_test = Attr<bool>("is_test");
-    auto ctx = executor.Prepare(*program, block->ID());
+
+    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
+    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
+
+    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);
    while (cond.data<bool>()[0]) {
      auto &current_scope = scope.NewScope();
      step_scopes->push_back(&current_scope);
...
@@ -96,6 +113,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(bool, default false) Set to true for inference only, false "
                  "for training. Some layers may run faster when this is true.")
        .SetDefault(false);
+    AddAttr<std::vector<std::string>>(kSkipEagerDeletionVars,
+                                      "Vars that would skip eager deletion."
+                                      "Users should not set this manually.")
+        .SetDefault(std::vector<std::string>());
    AddComment(R"DOC(
)DOC");
  }
...
@@ -119,7 +140,10 @@ class WhileGradOp : public framework::OperatorBase {
    framework::Executor executor(dev_place);
    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
    auto *program = block->Program();
-    auto ctx = executor.Prepare(*program, block->ID());
+
+    auto &skip_vars = Attr<std::vector<std::string>>(kSkipEagerDeletionVars);
+    VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars);
+
+    auto ctx = executor.Prepare(*program, block->ID(), skip_vars);

    auto *step_scopes =
        scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
...
@@ -341,6 +365,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
    // while operator could be renamed.
    while_grad->SetAttr("original_output_grad", output_grads_list);

+    while_grad->SetAttr(kSkipEagerDeletionVars, std::vector<std::string>());
+
    return std::unique_ptr<framework::OpDesc>(while_grad);
  }
 };
...
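The while_op changes above thread a `skip_eager_deletion_vars` list into `Executor::Prepare` so that forward intermediates needed by the backward while step are never eagerly freed. The toy sketch below illustrates that rationale with a plain map standing in for a scope; nothing here is Paddle code, and the variable names are invented for the example.

// Toy illustration (not Paddle code) of why some variables are excluded from
// eager deletion inside a while loop: the backward step still reads forward
// intermediates, so freeing them right after the forward op would lose data.
#include <iostream>
#include <set>
#include <string>
#include <unordered_map>

using Scope = std::unordered_map<std::string, double>;

void MaybeErase(Scope* scope, const std::string& name,
                const std::set<std::string>& skip_vars) {
  if (skip_vars.count(name) == 0) scope->erase(name);  // eager deletion
}

int main() {
  std::set<std::string> skip_vars{"hidden"};  // kept alive for the backward pass
  Scope scope;

  // forward step of one loop iteration
  scope["x"] = 2.0;
  scope["hidden"] = scope["x"] * 3.0;
  MaybeErase(&scope, "x", skip_vars);       // freed: x is no longer needed
  MaybeErase(&scope, "hidden", skip_vars);  // skipped: backward still reads it

  // backward step of the same iteration can still read the intermediate
  std::cout << "backward reads hidden = " << scope.at("hidden") << "\n";  // 6
}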
paddle/fluid/operators/conv_op.cc
@@ -44,7 +44,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");

  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
-                 "Conv intput should be 4-D or 5-D tensor.");
+                 "Conv intput should be 4-D or 5-D tensor, get %u",
+                 in_dims.size());
+
  PADDLE_ENFORCE_EQ(
      in_dims.size(), filter_dims.size(),
      "Conv input dimension and filter dimension should be the same.");
...
paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -300,9 +300,11 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
    }

    CudnnRNNCache *cudnn_rnn_cache = nullptr;
    if (cache_var->IsInitialized()) {
+      // const_cast is usually bad.
      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                            ->GetMutable<CudnnRNNCache>();
    } else {
+      // const_cast is usually bad.
      cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
                            ->GetMutable<CudnnRNNCache>();
      std::random_device rnd;
...
paddle/fluid/operators/distributed/CMakeLists.txt
@@ -12,7 +12,7 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @O
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")

 if(WITH_GRPC)
-  grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
+  grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
      request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc
      PROTO send_recv.proto
      DEPS lod_tensor selected_rows_functor memory)
...
@@ -20,36 +20,43 @@ if(WITH_GRPC)
  set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

  cc_test(grpc_serde_test SRCS grpc_serde_test.cc
-    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL)

  cc_test(rpc_server_test SRCS rpc_server_test.cc
-    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
+    DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
  cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
  if(WITH_GPU)
    cc_test(collective_server_test SRCS collective_server_test.cc
-      DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+      DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
      selected_rows_functor scope math_function SERIAL)
  endif()
-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
 else()
-  set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
-    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+  set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
+    brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc
+    collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})

-  brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
-    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
+  brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
+    brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
+    collective_client.cc collective_server.cc
    PROTO send_recv.proto
    DEPS lod_tensor selected_rows memory)

-  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory)
+  cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)

-  set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
+  set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor
+    proto_desc lookup_sparse_table_op snappystream snappy zlib)

-  cc_test(brpc_server_test SRCS rpc_server_test.cc
+  cc_test(rpc_server_test SRCS rpc_server_test.cc
    DEPS ${brpc_test_depends} SERIAL)

  cc_test(brpc_serde_test SRCS brpc_serde_test.cc
    DEPS ${brpc_test_depends} SERIAL)
+
+  if(WITH_GPU)
+    cc_test(collective_server_test SRCS collective_server_test.cc
+      DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL)
+  endif()
 endif()
paddle/fluid/operators/distributed/brpc_client.cc
(The hunks below interleave the old and new versions of changed lines, as rendered in the web diff.)
@@ -14,135 +14,316 @@
#include "paddle/fluid/operators/distributed/brpc_client.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace operators {
namespace distributed {

DEFINE_int32(brpc_channel_num, 24,
             "Number of channels to send requests connected to one server");
DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds");
DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)");

BRPCClient::~BRPCClient() { Wait(); }

void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response) {
void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response,
                        VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
                        ChannelContextPtr ch_ctx, BRPCClient* cls) {
  // std::unique_ptr makes sure cntl/response will be deleted before returning.
  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
  std::unique_ptr<sendrecv::VoidMessage> response_guard(response);

  // this channel can be used by other now.
  ch_ptr->Push(ch_ctx);

  if (cntl->Failed()) {
    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
    LOG(FATAL) << "Fail to send SendVar: " << var_h->name()
               << ", error text: " << cntl->ErrorText();
    var_h->Finish(false);
    cls->DecreaseReqCount();
    return;
  }
  LOG(INFO) << "Received response from " << cntl->remote_side()
            << " latency=" << cntl->latency_us() << "us";
  var_h->Finish(true);
  cls->DecreaseReqCount();

  VLOG(4) << "HandleSendResponse from: " << cntl->remote_side()
          << ", varname: " << var_h->name()
          << ", latency: " << cntl->latency_us() << "us";
  VLOG(4) << "Finish HandleSendResponse";
}

bool BRPCClient::AsyncSendVar(const std::string& ep,
VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
                              const platform::DeviceContext& ctx,
                              const framework::Scope& scope,
                              const std::string& var_name, int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
  const framework::Scope* p_scope = &scope;
  const auto ch_ptr = GetChannel(ep_val);
  const std::string method = "SendRPC";
  VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));

  framework::AsyncIO(
      [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] {
  framework::AsyncIO([=] {
    auto ch_ctx = ch_ptr->Pop();
    brpc::Controller* cntl = new brpc::Controller();
    sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
    cntl->set_timeout_ms(time_out);

    google::protobuf::Closure* done =
        brpc::NewCallback(&HandleSendResponse, cntl, response);

    auto* var = p_scope->FindVar(var_name_val);
    sendrecv::VariableMessage request;
    distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request,
                                  &cntl->request_attachment(), "", false,
                                  trainer_id_);

    google::protobuf::Closure* done = brpc::NewCallback(
        &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

    platform::RecordRPCEvent record_event(method, p_ctx);

    ch_ctx->stub->SendVariable(cntl, &request, response, done);

    if (UNLIKELY(platform::IsProfileEnabled())) {
      var_h->Wait();
    }
  });
  req_count_++;

  return true;
  return var_h;
}

void HandleFetchBarrierResponse(brpc::Controller* cntl,
                                sendrecv::VariableMessage* response,
                                VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
                                ChannelContextPtr ch_ctx, BRPCClient* cls) {
  // std::unique_ptr makes sure cntl/response will be deleted before returning.
  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
  std::unique_ptr<sendrecv::VariableMessage> response_guard(response);

  // this channel can be used other now.
  ch_ptr->Push(ch_ctx);

  if (cntl->Failed()) {
    LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name()
               << ", error text: " << cntl->ErrorText();
    var_h->Finish(false);
    cls->DecreaseReqCount();
    return;
  }

  var_h->Finish(true);
  cls->DecreaseReqCount();

  VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side()
          << ", varname: " << var_h->name()
          << ", latency: " << cntl->latency_us() << "us";
  VLOG(4) << "Finish HandleFetchBarrierResponse";
}

void HandleGetResponse(brpc::Controller* cntl,
                       sendrecv::VariableMessage* response) {
                       sendrecv::VariableMessage* response, VarHandlePtr var_h,
                       ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx,
                       BRPCClient* cls) {
  // std::unique_ptr makes sure cntl/response will be deleted before returning.
  std::unique_ptr<brpc::Controller> cntl_guard(cntl);
  std::unique_ptr<sendrecv::VariableMessage> response_guard(response);

  // this channel can be used other now.
  ch_ptr->Push(ch_ctx);

  if (cntl->Failed()) {
    LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText();
    LOG(FATAL) << "Fail to GetVar: " << var_h->name()
               << ", error text: " << cntl->ErrorText();
    cls->DecreaseReqCount();
    var_h->Finish(false);
    return;
  }
  LOG(INFO) << "Received response from " << cntl->remote_side()
            << " latency=" << cntl->latency_us() << "us";

  // framework::Variable* outvar = nullptr;
  // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar);
  VLOG(4) << "HandleGetResponse from: " << cntl->remote_side()
          << ", varname: " << var_h->name()
          << ", latency: " << cntl->latency_us() << "us";

  framework::Variable* outvar = nullptr;
  int trainer_id;
  distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(),
                                    *var_h->ctx(), var_h->scope(), &outvar,
                                    &trainer_id);
  VLOG(4) << "Finish HandleGetResponse";
  cls->DecreaseReqCount();
  var_h->Finish(true);
}

bool BRPCClient::AsyncGetVar(const std::string& ep,
VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
                             const platform::DeviceContext& ctx,
                             const framework::Scope& scope,
                             const std::string& var_name, int64_t time_out) {
                             const std::string& var_name,
                             const std::string& method_name,
                             int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
  const auto ch_ptr = GetChannel(ep_val);
  const std::string method = "GetRPC";
  VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));

  framework::AsyncIO([=] {
    auto ch_ctx = ch_ptr->Pop();

    brpc::Controller* cntl = new brpc::Controller();
    sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
    cntl->set_timeout_ms(time_out);

    sendrecv::VariableMessage req;
    req.set_varname(var_name_val);
    req.set_trainer_id(trainer_id_);

    google::protobuf::Closure* done = brpc::NewCallback(
        &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

    platform::RecordRPCEvent record_event(method, p_ctx);

  framework::AsyncIO(
      [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {});

    if (method_name == "GetMonomerVariable") {
      ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
    } else {
      ch_ctx->stub->GetVariable(cntl, &req, response, done);
    }

    if (UNLIKELY(platform::IsProfileEnabled())) {
      var_h->Wait();
    }
  });

  req_count_++;

  return true;
  return var_h;
}

VarHandlePtr BRPCClient::AsyncGetMonomerVariable(
    const std::string& ep, const platform::DeviceContext& ctx,
    const framework::Scope& scope, const std::string& var_name,
    int64_t time_out) {
  return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out);
}

VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
                                                const std::string& var_name,
                                                int64_t time_out) {
  return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out);
}

bool BRPCClient::AsyncPrefetchVar(const std::string& ep,
VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
                                     const platform::DeviceContext& ctx,
                                     const framework::Scope& scope,
                                     const std::string& var_name,
                                     int64_t time_out) {
  return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out);
}

VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
                                          const platform::DeviceContext& ctx,
                                          const framework::Scope& scope,
                                          const std::string& in_var_name,
                                          const std::string& out_var_name,
                                          const std::string& table_name,
                                          int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string in_var_name_val = in_var_name;
  const std::string out_var_name_val = out_var_name;
  const std::string table_name_val = table_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
  const auto ch_ptr = GetChannel(ep_val);
  const std::string method = "PrefetchRPC";
  VarHandlePtr var_h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope));

  framework::AsyncIO([=] {
    auto ch_ctx = ch_ptr->Pop();

    brpc::Controller* cntl = new brpc::Controller();
    sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
    cntl->set_timeout_ms(time_out);

  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
                      time_out, ch, this] {});

    auto* var = p_scope->FindVar(in_var_name_val);
    sendrecv::VariableMessage req;
    distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req,
                                  &cntl->request_attachment(), out_var_name_val,
                                  false, 0, table_name_val);

    platform::RecordRPCEvent record_event(method, p_ctx);

    google::protobuf::Closure* done = brpc::NewCallback(
        &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

    ch_ctx->stub->PrefetchVariable(cntl, &req, response, done);

    if (UNLIKELY(platform::IsProfileEnabled())) {
      var_h->Wait();
    }
  });

  req_count_++;
  return true;
  return var_h;
}

void BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep,
                                               int64_t time_out) {
  req_count_++;
  return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE,
                          time_out);
}

void BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
                                               int64_t time_out) {
  auto ch_ptr = GetChannel(ep);
  auto ch_ctx = ch_ptr->Pop();

  brpc::Controller* cntl = new brpc::Controller();
  sendrecv::VariableMessage* response = new sendrecv::VariableMessage();
  cntl->set_timeout_ms(time_out);

  sendrecv::VariableMessage req;
  req.set_varname(FETCH_BARRIER_MESSAGE);

  const std::string method = "FetchBarrierRPC";
  // var handle
  VarHandlePtr var_h(
      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));

  platform::RecordRPCEvent record_event(method, nullptr);

  google::protobuf::Closure* done = brpc::NewCallback(
      &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

  ch_ctx->stub->GetVariable(cntl, &req, response, done);

  req_count_++;

  if (UNLIKELY(platform::IsProfileEnabled())) {
    var_h->Wait();
  }

  return var_h;
}

void BRPCClient::Wait() {
bool BRPCClient::Wait() {
  VLOG(9) << "begin to brpcclient wait";
  {
    std::unique_lock<std::mutex> lk(sync_mutex_);
    sync_cond_.wait(lk, [this] { return req_count_ == 0; });
  }
  VLOG(9) << "end to brpcclient wait";
  return true;
}

ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
  VLOG(4) << "begin to GetChannel:" << ep;
  {
    std::lock_guard<std::mutex> guard(chan_mutex_);
    auto it = channels_.find(ep);
    if (it != channels_.end()) {
      VLOG(4) << "end to GetChannel:" << ep;
      return it->second;
    }
  }
...
@@ -150,12 +331,20 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
  ChannelQueuePtr q(new framework::BlockingQueue<ChannelContextPtr>());

  brpc::ChannelOptions options;
#ifdef PADDLE_WITH_BRPC_RDMA
  options.use_rdma = true;
#endif
  options.protocol = "baidu_std";
  options.connection_type = "pooled";
  options.connect_timeout_ms = 100;
  // don't use pooled type. the server can't afford that.
  options.connection_type = "single";
  options.connect_timeout_ms = 1000;
  options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/;
  options.max_retry = FLAGS_max_retry;

  for (int i = 0; i < FLAGS_brpc_channel_num; ++i) {
  VLOG(1) << "create " << brpc_channel_num_per_server_
          << " brpc channels to pserver:" << ep;

  for (int i = 0; i < brpc_channel_num_per_server_; ++i) {
    std::shared_ptr<ChannelContext> c(new ChannelContext());
    if (c->channel.Init(ep.c_str(), &options) != 0) {
      LOG(FATAL) << "Fail to initialize channel";
...
@@ -172,9 +361,75 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
    channels_[ep] = q;
  }
  VLOG(4) << "end to GetChannel:" << ep;
  return q;
}

VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep,
                                           int64_t time_out) {
  return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out);
}

void BRPCClient::SendComplete() {
  for (auto& kv : channels_) {
    AsyncSendComplete(kv.first);
  }
}

VarHandlePtr BRPCClient::AsyncSendVarMessage(
    const std::string& ep, const std::string& method_name,
    const sendrecv::VariableMessage& req, int64_t time_out) {
  auto ch_ptr = GetChannel(ep);
  auto ch_ctx = ch_ptr->Pop();

  brpc::Controller* cntl = new brpc::Controller();
  sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
  cntl->set_timeout_ms(time_out);

  platform::RecordRPCEvent record_event(method_name, nullptr);

  VarHandlePtr var_h(
      new VarHandle(ep, method_name, req.varname(), nullptr, nullptr));

  google::protobuf::Closure* done = brpc::NewCallback(
      &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);

  if (method_name == "CheckPointNotifyRPC") {
    ch_ctx->stub->CheckpointNotify(cntl, &req, response, done);
  } else if (method_name == "GetMonomerBarrier") {
    ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done);
  } else {
    ch_ctx->stub->SendVariable(cntl, &req, response, done);
  }
  req_count_++;

  if (UNLIKELY(platform::IsProfileEnabled())) {
    var_h->Wait();
  }

  return var_h;
}

VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep,
                                          const std::string& method_name,
                                          const std::string& message,
                                          int64_t time_out) {
  sendrecv::VariableMessage req;
  req.set_varname(message);

  return AsyncSendVarMessage(ep, method_name, req, time_out);
}

VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep,
                                               const std::string& dir,
                                               int64_t time_out) {
  sendrecv::VariableMessage req;
  req.set_varname(CHECKPOINT_SAVE_MESSAGE);
  req.set_out_varname(dir);

  return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out);
}

}  // namespace distributed
}  // namespace operators
}  // namespace paddle
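The client above hands every RPC a pooled channel context, a completion callback, and a `VarHandlePtr` whose `Finish()`/`Wait()` pair, together with an atomic `req_count_`, lets `Wait()` block until all outstanding requests drain. The sketch below reproduces that bookkeeping pattern with plain threads instead of brpc; everything here (class, function names) is a hypothetical stand-in for illustration, not the Paddle or brpc API.

// Sketch of the async-request bookkeeping pattern in BRPCClient: a pending
// counter incremented per request, decremented in the completion callback,
// and a Wait() that blocks until the counter drains. brpc is replaced by
// std::thread; all names are illustrative.
#include <atomic>
#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

class MiniClient {
 public:
  // Fire an asynchronous "request"; on_done(true) plays the role of
  // VarHandle::Finish in the real client.
  void AsyncSend(std::function<void(bool)> on_done) {
    ++req_count_;
    workers_.emplace_back([this, on_done] {
      on_done(true);       // pretend the RPC succeeded
      DecreaseReqCount();  // same role as BRPCClient::DecreaseReqCount
    });
  }

  bool Wait() {
    std::unique_lock<std::mutex> lk(sync_mutex_);
    sync_cond_.wait(lk, [this] { return req_count_ == 0; });
    return true;
  }

  ~MiniClient() {
    for (auto& t : workers_) t.join();
  }

 private:
  void DecreaseReqCount() {
    if (--req_count_ <= 0) {
      std::lock_guard<std::mutex> lk(sync_mutex_);  // avoid a lost wakeup
      sync_cond_.notify_all();
    }
  }

  std::vector<std::thread> workers_;
  std::atomic<int64_t> req_count_{0};
  std::mutex sync_mutex_;
  std::condition_variable sync_cond_;
};

int main() {
  MiniClient client;
  std::atomic<int> ok{0};
  for (int i = 0; i < 8; ++i) {
    client.AsyncSend([&ok](bool success) { ok += success ? 1 : 0; });
  }
  client.Wait();
  std::cout << ok << " of 8 requests finished\n";
}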
paddle/fluid/operators/distributed/brpc_client.h
(As above, old and new versions of changed declarations appear next to each other as rendered.)
@@ -31,6 +31,8 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
...
@@ -53,33 +55,94 @@ class BRPCClient : public RPCClient {
  BRPCClient() {}
  virtual ~BRPCClient();

  bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
                    const framework::Scope& scope, const std::string& var_name,
  VarHandlePtr AsyncSendVar(const std::string& ep,
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
                            int64_t time_out = FLAGS_rpc_deadline) override;

  VarHandlePtr AsyncGetVar(const std::string& ep,
                           const platform::DeviceContext& ctx,
                           const framework::Scope& scope,
                           const std::string& var_name,
                           int64_t time_out = FLAGS_rpc_deadline) override;

  VarHandlePtr AsyncGetMonomerBarrier(
      const std::string& ep, const std::string& var_name,
      int64_t time_out = FLAGS_rpc_deadline) override;

  bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx,
  VarHandlePtr AsyncGetMonomerVariable(
      const std::string& ep, const platform::DeviceContext& ctx,
      const framework::Scope& scope, const std::string& var_name,
      int64_t time_out = FLAGS_rpc_deadline) override;

  bool AsyncPrefetchVar(const std::string& ep,
  VarHandlePtr AsyncPrefetchVar(const std::string& ep,
                                const platform::DeviceContext& ctx,
                                const framework::Scope& scope,
                                const std::string& in_var_name,
                                const std::string& out_var_name,
                                const std::string& table_name = "",
                                int64_t time_out = FLAGS_rpc_deadline) override;

  void AsyncSendBatchBarrier(const std::string& ep,
                             int64_t time_out = FLAGS_rpc_deadline) override;
  VarHandlePtr AsyncSendBatchBarrier(
      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;

  VarHandlePtr AsyncSendFetchBarrier(
      const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override;

  void AsyncSendFetchBarrier(const std::string& ep,
  VarHandlePtr AsyncCheckpointNotify(
      const std::string& ep, const std::string& dir,
      int64_t time_out = FLAGS_rpc_deadline) override;

  void Wait() override;
  bool Wait() override;

  void SendComplete() override;

 private:
  VarHandlePtr _AsyncGetVar(const std::string& ep,
                            const platform::DeviceContext& ctx,
                            const framework::Scope& scope,
                            const std::string& var_name,
                            const std::string& method_name,
                            int64_t time_out = FLAGS_rpc_deadline);

  void Proceed();
  ChannelQueuePtr GetChannel(const std::string& ep);

  VarHandlePtr AsyncSendComplete(const std::string& ep,
                                 int64_t time_out = FLAGS_rpc_deadline);

  VarHandlePtr AsyncSendMessage(const std::string& ep,
                                const std::string& method_name,
                                const std::string& message, int64_t time_out);

  VarHandlePtr AsyncSendVarMessage(const std::string& ep,
                                   const std::string& method_name,
                                   const sendrecv::VariableMessage& req,
                                   int64_t time_out);

  friend void HandleSendResponse(brpc::Controller* cntl,
                                 sendrecv::VoidMessage* response,
                                 VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
                                 ChannelContextPtr ch_ctx, BRPCClient* cls);

  friend void HandleGetResponse(brpc::Controller* cntl,
                                sendrecv::VariableMessage* response,
                                VarHandlePtr var_h, ChannelQueuePtr ch_ptr,
                                ChannelContextPtr ch_ctx, BRPCClient* cls);

  friend void HandleFetchBarrierResponse(brpc::Controller* cntl,
                                         sendrecv::VariableMessage* response,
                                         VarHandlePtr var_h,
                                         ChannelQueuePtr ch_ptr,
                                         ChannelContextPtr ch_ctx,
                                         BRPCClient* cls);
  void DecreaseReqCount() {
    if (--req_count_ <= 0) {
      sync_cond_.notify_all();
    }
  }

 private:
  std::unordered_map<std::string, ChannelQueuePtr> channels_;
...
@@ -88,6 +151,8 @@ class BRPCClient : public RPCClient {
  std::condition_variable sync_cond_;
  std::atomic<int64_t> req_count_{0};

  static constexpr int brpc_channel_num_per_server_ = 4;

  // mutex for GetChannel thread safety
  std::mutex chan_mutex_;
  DISABLE_COPY_AND_ASSIGN(BRPCClient);
...
paddle/fluid/operators/distributed/brpc_rdma_pool.cc
0 → 100644 (new file)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_BRPC_RDMA

#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
#include "brpc/channel.h"
#include "brpc/rdma/rdma_helper.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace operators {
namespace distributed {

RdmaMemPool& RdmaMemPool::Instance() {
  static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool();
  return *g_rdma_mem_pool;
}

void* RdmaMemPool::Find(const std::string& varname, int64_t size) {
  pthread_rwlock_rdlock(&access_);
  auto it = pool_.find(varname);
  if (it == pool_.end()) {
    pthread_rwlock_unlock(&access_);
    return nullptr;
  }

  auto info = it->second;
  if (info.data_size != size) {
    pthread_rwlock_unlock(&access_);
    PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size,
                   info.data_size);
    return nullptr;
  }

  pthread_rwlock_unlock(&access_);
  return info.data;
}

void RdmaMemPool::Register(const std::string& varname, void* data,
                           int64_t data_size) {
  void* old = Find(varname, data_size);
  if (old != nullptr) {
    if (data != old) {
      PADDLE_ENFORCE(false, "var:%s data:%ld != %ld", varname, data, old);
    }
    VLOG(7) << "Find on rdma:" << varname << " data:" << data
            << " data_size:" << data_size;
    return;
  }

  VarInfo info;
  info.data = data;
  info.data_size = data_size;

  pthread_rwlock_wrlock(&access_);
  pool_[varname] = info;
  pthread_rwlock_unlock(&access_);

  if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) {
    LOG(FATAL) << "register " << varname << " data:" << data
               << " data_size:" << data_size << " error";
  }

  VLOG(4) << "register on rdma:" << varname << " data:" << data
          << " data_size:" << data_size;
}

}  // namespace distributed
}  // namespace operators
}  // namespace paddle

#endif
paddle/fluid/operators/distributed/brpc_rdma_pool.h
0 → 100644 (new file)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#ifdef PADDLE_WITH_BRPC_RDMA

#include <pthread.h>  // NOLINT
#include <string>
#include <unordered_map>

namespace paddle {
namespace operators {
namespace distributed {

/*
 * This class is used to avoid duplicated registion of brpc::rdma.
 */
class RdmaMemPool {
 public:
  static RdmaMemPool& Instance();

  RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {}

  virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); }

  void Register(const std::string& varname, void* data, int64_t size);
  void* Find(const std::string& varname, int64_t size);

 private:
  struct VarInfo {
    void* data;
    int64_t data_size;

    VarInfo() : data(nullptr), data_size(0) {}
  };

 private:
  std::unordered_map<std::string, VarInfo> pool_;
  pthread_rwlock_t access_;
};

}  // namespace distributed
}  // namespace operators
}  // namespace paddle

#endif
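RdmaMemPool above guards a name-to-buffer table with a pthread rwlock so concurrent senders can look up already-registered buffers cheaply while only writers serialize. The sketch below expresses the same read-mostly cache with C++17 `std::shared_mutex`; it is an illustration of the locking pattern under that assumption, not the Paddle class, and the RDMA registration call is reduced to a comment.

// Sketch of a read-mostly registration cache in the spirit of RdmaMemPool,
// using std::shared_mutex instead of pthread rwlocks. Illustrative only.
#include <cstdint>
#include <iostream>
#include <mutex>
#include <shared_mutex>
#include <string>
#include <unordered_map>

class RegistrationCache {
 public:
  // Returns the registered buffer for `name` if the recorded size matches,
  // otherwise nullptr. Readers only take the shared lock.
  void* Find(const std::string& name, int64_t size) const {
    std::shared_lock<std::shared_mutex> lock(mu_);
    auto it = cache_.find(name);
    if (it == cache_.end() || it->second.size != size) return nullptr;
    return it->second.data;
  }

  // Records a buffer once; re-registering the same pointer is a no-op.
  void Register(const std::string& name, void* data, int64_t size) {
    if (Find(name, size) == data) return;
    std::unique_lock<std::shared_mutex> lock(mu_);
    cache_[name] = Entry{data, size};
    // A real pool would also register `data` with the transport here
    // (e.g. an RDMA memory-registration call).
  }

 private:
  struct Entry {
    void* data = nullptr;
    int64_t size = 0;
  };
  mutable std::shared_mutex mu_;
  std::unordered_map<std::string, Entry> cache_;
};

int main() {
  RegistrationCache cache;
  static char buffer[1024];
  cache.Register("w@GRAD", buffer, sizeof(buffer));
  std::cout << (cache.Find("w@GRAD", sizeof(buffer)) == buffer) << "\n";  // 1
  std::cout << (cache.Find("w@GRAD", 512) == nullptr) << "\n";            // 1 (size mismatch)
}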
paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc
0 → 100644 (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
#endif
#include <sys/time.h>
#include <thread>  // NOLINT

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/platform/profiler.h"

namespace paddle {
namespace operators {
namespace distributed {

class IOBufWriter {
 public:
  static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) {
    iobuf->append(reinterpret_cast<char*>(&k), 4);
    iobuf->append(reinterpret_cast<char*>(&vlen), 8);
    iobuf->append(v, vlen);
  }

  static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v,
                                int64_t vlen, bool in_cuda_pinned,
                                void (*destroy)(void*), void* user_data) {
    VLOG(7) << "AppendTCPZeroCopy "
            << " k:" << k
            << " data:" << static_cast<void*>(const_cast<char*>(v))
            << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned;

    iobuf->append(reinterpret_cast<char*>(&k), 4);
    iobuf->append(reinterpret_cast<char*>(&vlen), 8);

    // FIXME(gongwb): use append_zerocopy
    /*
    if (in_cuda_pinned) {
      iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory);
    } else {
      iobuf->append_zerocopy(v, vlen, nullptr);
    }
    */
    iobuf->append(v, vlen);
    destroy(user_data);
  }

#ifdef PADDLE_WITH_BRPC_RDMA
  static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf,
                                 int k, const char* v, int64_t vlen,
                                 bool in_cuda_pinned, void (*destroy)(void*),
                                 void* user_data) {
    VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k
            << " data:" << static_cast<void*>(const_cast<char*>(v))
            << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned;

    iobuf->append(reinterpret_cast<char*>(&k), 4);
    iobuf->append(reinterpret_cast<char*>(&vlen), 8);

    RdmaMemPool::Instance().Register(
        varname, static_cast<void*>(const_cast<char*>(v)), vlen);

    // FIXME(gongwb): use append_zerocopy
    // iobuf->append_zerocopy(v, vlen, nullptr);
    iobuf->append(v, vlen);
    destroy(user_data);
    return;
  }
#endif

  static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf,
                             int k, const char* v, int64_t vlen,
                             bool in_cuda_pinned, void (*destroy)(void*),
                             void* user_data) {
#ifdef PADDLE_WITH_BRPC_RDMA
    IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned,
                                    destroy, user_data);
#else
    IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy,
                                   user_data);
#endif
  }
};

void SerializeToIOBuf(const std::string& name, framework::Variable* var,
                      const platform::DeviceContext& ctx, VarMsg* request,
                      butil::IOBuf* iobuf, const std::string& out_varname,
                      bool var_is_not_stable, int trainer_id,
                      const std::string& table_name) {
  std::unique_ptr<TensorPayload> payload;

  request->set_varname(name);
  request->set_trainer_id(trainer_id);
  // Note: normally the profiler is enabled in 1 trainer, hence only
  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
  // servers the trainer's profiling state so that PS can follow the
  // trainer.
  if (platform::ShouldSendProfileState()) {
    if (platform::IsProfileEnabled()) {
      request->set_profile(platform::kEnableProfiler);
    } else {
      request->set_profile(platform::kDisableProfiler);
    }
  }
  if (!out_varname.empty()) {
    request->set_out_varname(out_varname);
  }
  if (!table_name.empty()) {
    request->set_table_name(table_name);
  }
  if (var->IsType<framework::LoDTensor>()) {
    request->set_type(::sendrecv::LOD_TENSOR);
    payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request)));
  } else if (var->IsType<framework::SelectedRows>()) {
    request->set_type(::sendrecv::SELECTED_ROWS);
    payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request)));
#ifdef PADDLE_WITH_CUDA
  } else if (var->IsType<ncclUniqueId>()) {
    request->set_type(::sendrecv::NCCL_ID);
    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
    // TODO(gongwb): use append_zero to avoid data copy.
    IOBufWriter::Append(iobuf,
                        sendrecv::VariableMessage::kSerializedFieldNumber,
                        uid.internal, NCCL_UNIQUE_ID_BYTES);
    return;
#endif
  } else {
    PADDLE_THROW("Serialize does not support type: %s",
                 typeid(var->Type()).name());
  }

  PADDLE_ENFORCE_NOT_NULL(payload);

  // FIXME(gongwb): it seems that can use zero copy.
  if (var_is_not_stable) {
    IOBufWriter::Append(
        iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
        static_cast<const char*>(payload->ptr()), payload->memory_size());
  } else {
    if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
      IOBufWriter::AppendZeroCopy(
          name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
          static_cast<const char*>(payload->ptr()), payload->memory_size(),
          true, SerializeDestroyCallback, static_cast<void*>(payload.get()));
      payload.release();
#endif
    } else {
      IOBufWriter::AppendZeroCopy(
          name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber,
          static_cast<const char*>(payload->ptr()), payload->memory_size(),
          false, SerializeDestroyCallback, static_cast<void*>(payload.get()));
      payload.release();
    }
  }

  if (var->IsType<framework::SelectedRows>()) {
    auto* slr = var->GetMutable<framework::SelectedRows>();
    size_t rows_memory_size =
        slr->rows().size() * framework::SizeOfType(typeid(int64_t));

    IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber,
                        reinterpret_cast<const char*>(slr->rows().data()),
                        static_cast<int64_t>(rows_memory_size));
  }
}

void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta,
                          const butil::IOBuf& iobuf,
                          const platform::DeviceContext& ctx,
                          const framework::Scope* scope,
                          framework::Variable** var, int* trainer_id) {
  operators::distributed::BRPCVariableResponse resp(scope, &ctx);
  PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!");
  *var = resp.GetVar();
  *trainer_id = resp.GetTrainerId();
}

}  // namespace distributed
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h
0 → 100644 (new file)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <sys/time.h>
#include <iostream>
#include <string>
#include <vector>

#include "brpc/channel.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"

namespace paddle {
namespace operators {
namespace distributed {

void SerializeToIOBuf(const std::string& name, framework::Variable* var,
                      const platform::DeviceContext& ctx, VarMsg* request,
                      butil::IOBuf* iobuf, const std::string& out_varname,
                      bool var_is_not_stable, const int trainer_id = 0,
                      const std::string& table_name = std::string());

void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf,
                          const platform::DeviceContext& ctx,
                          const framework::Scope* scope,
                          framework::Variable** var, int* trainer_id);

}  // namespace distributed
}  // namespace operators
}  // namespace paddle
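`IOBufWriter::Append` above frames each field as a 4-byte field id, an 8-byte length, and then the raw payload bytes, in host byte order. The sketch below reproduces that framing with a plain byte vector so the layout is easy to verify in isolation; the field id value and the helpers are illustrative stand-ins, and a `std::vector<char>` replaces the brpc `butil::IOBuf`.

// Sketch of the (field id, length, payload) framing used by IOBufWriter::Append:
// 4 bytes of field id, 8 bytes of payload length, then the payload itself.
// Host byte order, like the original; std::vector<char> stands in for butil::IOBuf.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

void AppendField(std::vector<char>* buf, int32_t field_id, const char* data,
                 int64_t len) {
  buf->insert(buf->end(), reinterpret_cast<char*>(&field_id),
              reinterpret_cast<char*>(&field_id) + 4);
  buf->insert(buf->end(), reinterpret_cast<char*>(&len),
              reinterpret_cast<char*>(&len) + 8);
  buf->insert(buf->end(), data, data + len);
}

// Reads one frame starting at `offset`; returns the offset of the next frame.
size_t ReadField(const std::vector<char>& buf, size_t offset, int32_t* field_id,
                 std::string* payload) {
  std::memcpy(field_id, buf.data() + offset, 4);
  int64_t len = 0;
  std::memcpy(&len, buf.data() + offset + 4, 8);
  payload->assign(buf.data() + offset + 12, static_cast<size_t>(len));
  return offset + 12 + static_cast<size_t>(len);
}

int main() {
  std::vector<char> iobuf;
  const std::string tensor_bytes = "0123456789abcdef";  // pretend tensor payload
  AppendField(&iobuf, /*field_id=*/7, tensor_bytes.data(),
              static_cast<int64_t>(tensor_bytes.size()));

  int32_t field_id = 0;
  std::string payload;
  ReadField(iobuf, 0, &field_id, &payload);
  std::cout << "field " << field_id << ", " << payload.size()
            << " bytes: " << payload << "\n";
}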
paddle/fluid/operators/distributed/brpc_serde_test.cc
0 → 100644
浏览文件 @
740e1626
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread> // NOLINT
#include "brpc/channel.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace math = paddle::operators::math;
namespace memory = paddle::memory;

void RunSerdeTestSelectedRows(platform::Place place) {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& ctx = *pool.Get(place);

  butil::IOBuf iobuf;
  sendrecv::VariableMessage msg;
  int tensor_numel = 564 * 128;

  // serialize var to IOBuf
  {
    framework::Variable var;
    auto* slr = var.GetMutable<framework::SelectedRows>();
    slr->set_height(1000);
    auto* tensor = slr->mutable_value();
    auto* rows = slr->mutable_rows();
    tensor->Resize(framework::make_ddim({564, 128}));
    tensor->mutable_data<float>(place);
    math::set_constant(ctx, tensor, 32.7);
    for (int i = 0; i < 564; ++i) rows->push_back(i);

    operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf,
                                             "", false);
  }

  // deserialize
  {
    framework::Scope scope;
    scope.Var("myvar");
    operators::distributed::BRPCVariableResponse resp(&scope, &ctx);
    EXPECT_EQ(resp.Parse(iobuf, msg), 0);

    framework::Variable* var2 = resp.GetVar();

    auto* slr2 = var2->GetMutable<framework::SelectedRows>();
    auto* tensor2 = slr2->mutable_value();
    auto* rows2 = slr2->mutable_rows();
    float* tensor_data2 = nullptr;
    framework::Tensor tmp_tensor;

    if (platform::is_gpu_place(ctx.GetPlace())) {
      platform::CPUPlace cpu;
      framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
      tensor_data2 = tmp_tensor.data<float>();
    } else {
      tensor_data2 = const_cast<float*>(tensor2->data<float>());
    }
    const int64_t* rows_data2 = rows2->data();

    for (int i = 0; i < tensor_numel; ++i) {
      EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
    }
    for (size_t i = 0; i < rows2->size(); ++i) {
      EXPECT_EQ(rows_data2[i], static_cast<int64_t>(i));
    }
    EXPECT_EQ(slr2->height(), 1000);
  }
}

void RunTestLodTensor(platform::Place place) {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto& ctx = *pool.Get(place);

  // serialize var to ByteBuffer
  butil::IOBuf iobuf;
  sendrecv::VariableMessage msg;
  int tensor_numel = 512 * 8 * 4 * 2;
  {
    framework::Variable var;
    auto* tensor = var.GetMutable<framework::LoDTensor>();
    tensor->Resize(framework::make_ddim({512, 8, 4, 2}));
    framework::LoD lod;
    lod.push_back(framework::Vector<size_t>({1, 3, 8}));
    tensor->set_lod(lod);
    tensor->mutable_data<float>(place);
    math::set_constant(ctx, tensor, 31.9);

    operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf,
                                             "", false);
  }

  // check sendrecv::VariableMessage meta data
  {
    EXPECT_EQ(msg.varname(), "myvar");
    EXPECT_EQ(msg.type(), 0);
    EXPECT_EQ(msg.dims()[0], 512);
    EXPECT_EQ(msg.dims()[1], 8);
    EXPECT_EQ(msg.dims()[2], 4);
    EXPECT_EQ(msg.dims()[3], 2);
    EXPECT_EQ(msg.lod_level(), 1);
    EXPECT_EQ(msg.lod(0).lod_data(0), 1);
    EXPECT_EQ(msg.lod(0).lod_data(1), 3);
    EXPECT_EQ(msg.lod(0).lod_data(2), 8);
  }

  // deserialize
  {
    framework::Scope scope;
    scope.Var("myvar");
    operators::distributed::BRPCVariableResponse resp(&scope, &ctx);
    EXPECT_EQ(resp.Parse(iobuf, msg), 0);

    framework::Variable* var2 = resp.GetVar();
    auto tensor2 = var2->Get<framework::LoDTensor>();
    float* tensor_data2 = nullptr;
    framework::Tensor tmp_tensor;

    if (platform::is_gpu_place(ctx.GetPlace())) {
      platform::CPUPlace cpu;
      framework::TensorCopy(tensor2, cpu, &tmp_tensor);
      tensor_data2 = tmp_tensor.data<float>();
    } else {
      tensor_data2 = const_cast<float*>(tensor2.data<float>());
    }

    for (int i = 0; i < tensor_numel; ++i)
      EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
  }
}

TEST(LodTensor, Run) {
  platform::CPUPlace place;
  RunTestLodTensor(place);
#ifdef PADDLE_WITH_CUDA
  platform::CUDAPlace gpu(0);
  RunTestLodTensor(gpu);
#endif
}

TEST(SelectedRows, Run) {
  platform::CPUPlace place;
  RunSerdeTestSelectedRows(place);
#ifdef PADDLE_WITH_CUDA
  platform::CUDAPlace gpu;
  RunSerdeTestSelectedRows(gpu);
#endif
}
paddle/fluid/operators/distributed/brpc_server.cc
...
...
@@ -13,84 +13,287 @@
// limitations under the License.
#include "paddle/fluid/operators/distributed/brpc_server.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
namespace sendrecv {

typedef std::unordered_map<std::string,
                           paddle::operators::distributed::RequestHandler*>
namespace distributed = paddle::operators::distributed;

typedef std::unordered_map<std::string, distributed::RequestHandler*>
    HandlerMap;

class BRPCServiceImpl : public SendRecvService {
 public:
  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map)
      : request_send_h_(nullptr),
        request_get_h_(nullptr),
        request_prefetch_h_(nullptr) {
    auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
  explicit BRPCServiceImpl(const HandlerMap& rpc_call_map,
                           distributed::RPCServer* rpc_server)
      : rpc_server_(rpc_server) {
    VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size();
    auto it = rpc_call_map.find(distributed::kRequestSend);
    if (it != rpc_call_map.end()) {
      request_send_h_ = it->second;
      send_threads_.reset(new paddle::framework::ThreadPool(
          rpc_server_->GetThreadNum(distributed::kRequestSend)));
    }

    it = rpc_call_map.find(paddle::operators::distributed::kRequestSend);
    it = rpc_call_map.find(distributed::kRequestGet);
    if (it != rpc_call_map.end()) {
      request_get_h_ = it->second;
      get_threads_.reset(new paddle::framework::ThreadPool(
          rpc_server_->GetThreadNum(distributed::kRequestGet)));
    }

    it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch);
    it = rpc_call_map.find(distributed::kRequestPrefetch);
    if (it != rpc_call_map.end()) {
      request_prefetch_h_ = it->second;
      prefetch_threads_.reset(new paddle::framework::ThreadPool(
          rpc_server_->GetThreadNum(distributed::kRequestPrefetch)));
    }

    it = rpc_call_map.find(distributed::kRequestCheckpoint);
    if (it != rpc_call_map.end()) {
      request_checkpoint_h_ = it->second;
      checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool(
          rpc_server_->GetThreadNum(distributed::kRequestPrefetch)));
    }

  virtual ~BRPCServiceImpl() {}
    it = rpc_call_map.find(distributed::kRequestGetMonomerVariable);
    if (it != rpc_call_map.end()) {
      request_get_monomer_handler_h_ = it->second;
    }

    it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier);
    if (it != rpc_call_map.end()) {
      request_get_monomer_barrier_handler_h_ = it->second;
    }
  }

  virtual ~BRPCServiceImpl() {}

  void SendVariable(google::protobuf::RpcController* cntl_butil,
                    const VariableMessage* request, VoidMessage* response,
                    google::protobuf::Closure* done) override {
    send_threads_->Run(
        [=] { _SendVariable(cntl_butil, request, response, done); });
  }

  void _SendVariable(google::protobuf::RpcController* cntl_butil,
                     const VariableMessage* request, VoidMessage* response,
                     google::protobuf::Closure* done) {
    PADDLE_ENFORCE(request_send_h_ != nullptr,
                   "RequestSend handler should be registed first!");
    brpc::ClosureGuard done_guard(done);
    paddle::framework::Scope* local_scope = request_send_h_->scope();
    paddle::framework::Variable* outvar = nullptr;
    paddle::framework::Variable* invar = nullptr;
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    std::string varname = request->varname();
    VLOG(3) << "RequestSend var_name:" << varname
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    if (!request_send_h_->sync_mode()) {
      local_scope = &request_send_h_->scope()->NewScope();
      invar = local_scope->Var(varname);
    } else {
      invar = local_scope->FindVar(varname);
    }
    distributed::BRPCVariableResponse resp(request_send_h_->scope(),
                                           request_send_h_->dev_ctx(),
                                           !request_send_h_->sync_mode());
    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
                   "parse iobuf to tensor error!");

    request_send_h_->Handle(varname, local_scope, invar, &outvar);
    auto scope = resp.GetMutableLocalScope();
    auto invar = resp.GetVar();
    int trainer_id = request->trainer_id();
    paddle::framework::Variable* outvar = nullptr;

    if (!request_send_h_->sync_mode()) {
      request_send_h_->scope()->DeleteScope(local_scope);
    }
    request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id);
  }

  void GetVariable(google::protobuf::RpcController* cntl_butil,
                   const VariableMessage* request, VariableMessage* response,
                   google::protobuf::Closure* done) override {
    get_threads_->Run(
        [=] { _GetVariable(cntl_butil, request, response, done); });
  }

  void _GetVariable(google::protobuf::RpcController* cntl_butil,
                    const VariableMessage* request, VariableMessage* response,
                    google::protobuf::Closure* done) {
    PADDLE_ENFORCE(request_get_h_ != nullptr,
                   "RequestGet handler should be registed first!");
  }

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);
    std::string varname = request->varname();
    VLOG(3) << "RequestGet varname:" << varname
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    auto scope = request_get_h_->scope();
    auto invar = scope->FindVar(varname);
    int trainer_id = request->trainer_id();
    paddle::framework::Variable* outvar = nullptr;

    request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id);

    if (outvar) {
      distributed::SerializeToIOBuf(varname, outvar,
                                    *request_get_h_->dev_ctx(), response,
                                    &cntl->response_attachment(), "", false);
    }
  }

  void PrefetchVariable(google::protobuf::RpcController* cntl_butil,
                        const VariableMessage* request,
                        VariableMessage* response,
                        google::protobuf::Closure* done) override {
    prefetch_threads_->Run(
        [=] { _PrefetchVariable(cntl_butil, request, response, done); });
  }

  void _PrefetchVariable(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request,
                         VariableMessage* response,
                         google::protobuf::Closure* done) {
    PADDLE_ENFORCE(request_prefetch_h_ != nullptr,
                   "kRequestPrefetch handler should be registed first!");

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    // prefetch process...
    std::string in_var_name = request->varname();
    std::string out_var_name = request->out_varname();
    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
            << ", out_var_name: " << out_var_name
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    distributed::BRPCVariableResponse resp(
        request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true);

    PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0,
                   "parse iobuf to tensor error!");

    auto scope = resp.GetMutableLocalScope();
    auto invar = scope->FindVar(in_var_name);
    std::string table_name = request->table_name();

    int trainer_id = request->trainer_id();

    paddle::framework::Variable* outvar = scope->Var(out_var_name);
    request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
                                out_var_name, table_name);

    distributed::SerializeToIOBuf(out_var_name, outvar,
                                  *request_prefetch_h_->dev_ctx(), response,
                                  &cntl->response_attachment(), "", true);
  }

  void CheckpointNotify(google::protobuf::RpcController* cntl_butil,
                        const VariableMessage* request, VoidMessage* response,
                        google::protobuf::Closure* done) override {
    checkpoint_notify_threads_->Run(
        [=] { _CheckpointNotify(cntl_butil, request, response, done); });
  }

  void _CheckpointNotify(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request, VoidMessage* response,
                         google::protobuf::Closure* done) {
    PADDLE_ENFORCE(
        request_checkpoint_h_ != nullptr,
        "kRequestCheckpointNotify handler should be registed first!");

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(),
                                           request_checkpoint_h_->dev_ctx());

    auto scope = resp.GetMutableLocalScope();

    std::string checkpoint_notify = request->varname();
    std::string checkpoint_dir = request->out_varname();
    int trainer_id = request->trainer_id();

    VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify
            << ", dir: " << checkpoint_dir
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr,
                                  trainer_id, checkpoint_dir);
  }

  void GetMonomerVariable(google::protobuf::RpcController* cntl_butil,
                          const VariableMessage* request,
                          VariableMessage* response,
                          google::protobuf::Closure* done) override {
    PADDLE_ENFORCE(
        request_get_monomer_handler_h_ != nullptr,
        "kRequestGetMonomerVariable handler should be registed first!");

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    // proc request.
    std::string varname = request->varname();
    VLOG(3) << "GetMonomerVariable " << varname
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    rpc_server_->WaitVarCond(varname);
    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);

    auto scope = h.scope_;
    auto invar = scope->FindVar(varname);
    paddle::framework::Variable* outvar = nullptr;

    request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar,
                                           request->trainer_id());

    if (outvar) {
      distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response,
                                    &cntl->response_attachment(), "", false);
    }
  }

  void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil,
                         const VariableMessage* request, VoidMessage* response,
                         google::protobuf::Closure* done) override {
    PADDLE_ENFORCE(
        request_get_monomer_barrier_handler_h_ != nullptr,
        "RequestGetMonomerBarrier handler should be registed first!");

    brpc::ClosureGuard done_guard(done);
    brpc::Controller* cntl = static_cast<brpc::Controller*>(cntl_butil);

    std::string varname = request->varname();
    VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname
            << ", trainer_id:" << request->trainer_id()
            << ", from:" << cntl->remote_side();

    rpc_server_->WaitVarCond(varname);
    distributed::MonomerHandle h = rpc_server_->GetMonomer(varname);

    paddle::framework::Scope* scope = nullptr;
    paddle::framework::Variable* invar = nullptr;
    paddle::framework::Variable* outvar = nullptr;

    request_get_monomer_barrier_handler_h_->Handle(
        varname, scope, invar, &outvar, request->trainer_id());
  }

 private:
  paddle::operators::distributed::RequestHandler* request_send_h_;
  paddle::operators::distributed::RequestHandler* request_get_h_;
  paddle::operators::distributed::RequestHandler* request_prefetch_h_;
  distributed::RequestHandler* request_send_h_{nullptr};
  distributed::RequestHandler* request_get_h_{nullptr};
  distributed::RequestHandler* request_prefetch_h_{nullptr};
  distributed::RequestHandler* request_checkpoint_h_{nullptr};
  distributed::RequestHandler* request_get_monomer_handler_h_{nullptr};
  distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr};

  distributed::RPCServer* rpc_server_{nullptr};

  // FIXME(gongwb): brpc should support process one rpce use one threadpool.
  std::unique_ptr<paddle::framework::ThreadPool> send_threads_;
  std::unique_ptr<paddle::framework::ThreadPool> get_threads_;
  std::unique_ptr<paddle::framework::ThreadPool> prefetch_threads_;
  std::unique_ptr<paddle::framework::ThreadPool> checkpoint_notify_threads_;
};
}  // namespace sendrecv
...
...
@@ -100,7 +303,7 @@ namespace distributed {
void AsyncBRPCServer::StartServer() {
  // Instance of your service.
  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_);
  sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this);
// Add the service into server. Notice the second parameter, because the
// service is put on stack, we don't want server to delete it, otherwise
...
...
@@ -111,6 +314,9 @@ void AsyncBRPCServer::StartServer() {
  }

  brpc::ServerOptions options;
#ifdef PADDLE_WITH_BRPC_RDMA
  options.use_rdma = true;
#endif
  options.idle_timeout_sec = idle_timeout_s_;
  options.max_concurrency = max_concurrency_;
  if (server_.Start(bind_address_.c_str(), &options) != 0) {
...
...
paddle/fluid/operators/distributed/brpc_variable_response.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#include "paddle/fluid/operators/distributed/brpc_variable_response.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle {
namespace operators {
namespace distributed {

namespace pb = ::google::protobuf;
using vr = ::sendrecv::VariableMessage;

int BRPCVariableResponse::Parse(Source* source) {
  pb::io::ZeroCopyInputStream* input_stream = source->contents();
  pb::io::CodedInputStream input(input_stream);
  input.SetTotalBytesLimit(INT_MAX, INT_MAX);

  while (1) {
    unsigned int tag = 0;
    if (!input.ReadLittleEndian32(&tag)) {
      break;
    }

    uint64_t num_bytes = 0;
    if (!input.ReadLittleEndian64(&num_bytes)) {
      break;
    }

    int field = static_cast<int>(tag);
    int ret = field == 0 ? -1 : field;
    switch (field) {
      case vr::kSerializedFieldNumber: {
        if (!ProcSerializedField(field, &input, num_bytes)) {
          return ret;
        }
        break;
      }
      case vr::kRowsFieldNumber: {
        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
                        meta_.type() == sendrecv::LOD_TENSOR) &&
                           meta_.varname() != "",
                       "meta info should be got first!");

        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
          return ret;
        }
        break;
      }
      default: {
        PADDLE_ENFORCE(false, "not surpported %u fieldnumber", field);
        return ret;
      }
    }
  }

  return 0;
}
}  // namespace distributed
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/distributed/brpc_variable_response.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "brpc/channel.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
namespace paddle {
namespace operators {
namespace distributed {

class BRPCSourceWrapper : public Source {
 public:
  explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {}
  ::google::protobuf::io::ZeroCopyInputStream* contents() override {
    return &source_;
  }

 private:
  butil::IOBufAsZeroCopyInputStream source_;
};

class BRPCVariableResponse : public VariableResponse {
 public:
  BRPCVariableResponse(const framework::Scope* scope,
                       const platform::DeviceContext* dev_ctx,
                       bool create_scope = false)
      : VariableResponse(scope, dev_ctx, create_scope) {}

  virtual ~BRPCVariableResponse() {}

  // parse attachment from iobuf
  int Parse(Source* source) override;
  int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) {
    BRPCSourceWrapper wrapper(iobuf);
    return VariableResponse::Parse(&wrapper, meta);
  }
};

};  // namespace distributed
};  // namespace operators
};  // namespace paddle
paddle/fluid/operators/distributed/grpc_client.cc
...
...
@@ -293,8 +293,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
  const auto ch = GetChannel(ep);
  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
  const std::string method = "SendMonomerFetchBarrierRPC";
  VarHandlePtr h(
      new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
  VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr));
  s->Prepare(h, time_out);

  VLOG(30) << s->GetVarHandlePtr()->String() << " begin";
...
...
paddle/fluid/operators/distributed/grpc_serde.cc
...
...
@@ -32,13 +32,6 @@ namespace paddle {
namespace operators {
namespace distributed {

static void SerializeDestroyCallback(void* payload) {
  if (payload != nullptr) {
    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
    delete shared_payload;
  }
}

void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
                           ::grpc::ByteBuffer* msg,
                           const std::string& out_name,
...
...
paddle/fluid/operators/distributed/rpc_server.h
...
...
@@ -75,6 +75,10 @@ class RPCServer {
  void RegisterRPC(const std::string& rpc_name, RequestHandler* handler,
                   int thread_num = 5);

  int GetThreadNum(const std::string& rpc_name) {
    return rpc_thread_num_[rpc_name];
  }
// Wait util all the clients have reached the barrier for one
// rpc method. This function should be called in the
// RequestHandler if you want to run the server/client in a
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.cc
...
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <thread> // NOLINT
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/port.h"
...
...
@@ -45,7 +46,6 @@ static TensorPayload GetCommunicationAllocationFromTensor(
  memory::Copy(cuda_pinned, result->ptr(),
               boost::get<platform::CUDAPlace>(tensor.place()),
               tensor.data<void>(), copy_size, gpu_dev_ctx.stream());
  ctx.Wait();
  return TensorPayload(result);
#else
...
...
paddle/fluid/operators/distributed/sendrecvop_utils.h
...
...
@@ -50,6 +50,13 @@ class TensorPayload final {
  size_t memory_size_;
};

inline void SerializeDestroyCallback(void* payload) {
  if (payload != nullptr) {
    auto* shared_payload = reinterpret_cast<TensorPayload*>(payload);
    delete shared_payload;
  }
}

TensorPayload GetTensorPayload(framework::Variable* var,
                               const platform::DeviceContext& ctx,
                               VarMsg* request);
...
...
paddle/fluid/operators/distributed_ops/CMakeLists.txt
...
...
@@ -2,9 +2,9 @@ include(operators)
set(DISTRIBUTE_DEPS "")
if(WITH_GRPC)
  set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
  set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
else()
  set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
  set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
  if(WITH_BRPC_RDMA)
    find_library(IBVERBS_LIBRARY NAMES ibverbs)
    ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
...
...
paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
...
...
@@ -26,10 +26,11 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h"
DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send");
DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get");
DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch");
DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
DEFINE_int32(rpc_get_thread_num, 12, "number of threads for rpc get");
DEFINE_int32(rpc_prefetch_thread_num, 12, "number of threads for rpc prefetch");

namespace paddle {
namespace operators {
...
...
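For context on how the thread-count flags above are consumed: listen_and_serv registers each RPC handler together with a thread number, and the brpc service (see brpc_server.cc earlier in this commit) later sizes its per-request thread pools via RPCServer::GetThreadNum. The sketch below is illustrative only; `rpc_service_` and the handler members are assumed to exist as in ListenAndServOp, and the exact call sites are not copied from this diff.

```cpp
// Hedged sketch: wiring the flags into per-RPC thread pools.
rpc_service_->RegisterRPC(distributed::kRequestSend,
                          request_send_handler_.get(),
                          FLAGS_rpc_send_thread_num);
rpc_service_->RegisterRPC(distributed::kRequestGet,
                          request_get_handler_.get(),
                          FLAGS_rpc_get_thread_num);
// Later, BRPCServiceImpl asks the server how many threads were requested:
// send_threads_.reset(new paddle::framework::ThreadPool(
//     rpc_server_->GetThreadNum(distributed::kRequestSend)));
```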
paddle/fluid/operators/distributed_ops/send_op.cc
...
...
@@ -58,7 +58,9 @@ class SendOp : public framework::OperatorBase {
    }
    if (sync_send) {
      for (size_t i = 0; i < rets.size(); i++) {
        VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i];
        PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
        VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i];
      }
    }
  }
...
...
paddle/fluid/operators/math/pooling.cc
...
...
@@ -31,7 +31,7 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_process,
                  bool exclusive, framework::Tensor* output) {
                  bool exclusive, bool adaptive, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
...
...
@@ -51,16 +51,28 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
    const T* input_data = input.data<T>();
    T* output_data = output->mutable_data<T>(context.GetPlace());

    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = std::min(hstart + ksize_height, input_height);
          if (adaptive) {
            hstart = AdaptStartIndex(ph, input_height, output_height);
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = std::min(hstart + ksize_height, input_height);
            hstart = std::max(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            int wstart = pw * stride_width - padding_width;
            int wend = std::min(wstart + ksize_width, input_width);
            if (adaptive) {
              wstart = AdaptStartIndex(pw, input_width, output_width);
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = std::min(wstart + ksize_width, input_width);
              wstart = std::max(wstart, 0);
            }

            T ele = pool_process.initial();
            for (int h = hstart; h < hend; ++h) {
...
...
@@ -68,7 +80,8 @@ class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                pool_process.compute(input_data[h * input_width + w], &ele);
              }
            }
            int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
            int pool_size = (exclusive || adaptive)
                                ? (hend - hstart) * (wend - wstart)
                                : ksize_height * ksize_width;
            pool_process.finalize(static_cast<T>(pool_size), &ele);
            output_data[ph * output_width + pw] = ele;
...
...
@@ -94,7 +107,7 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_grad_process,
                  bool exclusive, framework::Tensor* input_grad) {
                  bool exclusive, bool adaptive, framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
...
...
@@ -115,17 +128,30 @@ class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = std::min(hstart + ksize_height, input_height);
          if (adaptive) {
            hstart = AdaptStartIndex(ph, input_height, output_height);
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = std::min(hstart + ksize_height, input_height);
            hstart = std::max(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            int wstart = pw * stride_width - padding_width;
            int wend = std::min(wstart + ksize_width, input_width);
            if (adaptive) {
              wstart = AdaptStartIndex(pw, input_width, output_width);
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = std::min(wstart + ksize_width, input_width);
              wstart = std::max(wstart, 0);
              int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
            }
            int pool_size = (exclusive || adaptive)
                                ? (hend - hstart) * (wend - wstart)
                                : ksize_height * ksize_width;
            float scale = 1.0 / pool_size;
            for (int h = hstart; h < hend; ++h) {
...
...
@@ -251,7 +277,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_process,
                  bool exclusive, framework::Tensor* output) {
                  bool exclusive, bool adaptive, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
...
...
@@ -276,20 +302,38 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
    const T* input_data = input.data<T>();
    T* output_data = output->mutable_data<T>(context.GetPlace());

    int dstart, dend;
    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int pd = 0; pd < output_depth; ++pd) {
          int dstart = pd * stride_depth - padding_depth;
          int dend = std::min(dstart + ksize_depth, input_depth);
          if (adaptive) {
            dstart = AdaptStartIndex(pd, input_depth, output_depth);
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = std::min(dstart + ksize_depth, input_depth);
            dstart = std::max(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            int hstart = ph * stride_height - padding_height;
            int hend = std::min(hstart + ksize_height, input_height);
            if (adaptive) {
              hstart = AdaptStartIndex(ph, input_height, output_height);
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = std::min(hstart + ksize_height, input_height);
              hstart = std::max(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              int wstart = pw * stride_width - padding_width;
              int wend = std::min(wstart + ksize_width, input_width);
              if (adaptive) {
                wstart = AdaptStartIndex(pw, input_width, output_width);
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = std::min(wstart + ksize_width, input_width);
                wstart = std::max(wstart, 0);
              }

              int output_idx = (pd * output_height + ph) * output_width + pw;
              T ele = pool_process.initial();
              for (int d = dstart; d < dend; ++d) {
...
...
@@ -302,7 +346,7 @@ class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
              }
            }
            int pool_size = exclusive
            int pool_size = (exclusive || adaptive)
                                ? (dend - dstart) * (hend - hstart) *
                                      (wend - wstart)
                                : ksize_depth * ksize_height * ksize_width;
            pool_process.finalize(static_cast<T>(pool_size), &ele);
...
...
@@ -330,7 +374,7 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
                  const framework::Tensor& output,
                  const framework::Tensor& output_grad,
                  const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_grad_process,
                  bool exclusive, framework::Tensor* input_grad) {
                  bool exclusive, bool adaptive, framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
...
...
@@ -356,24 +400,41 @@ class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
    const T* output_grad_data = output_grad.data<T>();
    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());

    int dstart, dend;
    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int pd = 0; pd < output_depth; ++pd) {
          int dstart = pd * stride_depth - padding_depth;
          int dend = std::min(dstart + ksize_depth, input_depth);
          if (adaptive) {
            dstart = AdaptStartIndex(pd, input_depth, output_depth);
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = std::min(dstart + ksize_depth, input_depth);
            dstart = std::max(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            int hstart = ph * stride_height - padding_height;
            int hend = std::min(hstart + ksize_height, input_height);
            if (adaptive) {
              hstart = AdaptStartIndex(ph, input_height, output_height);
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = std::min(hstart + ksize_height, input_height);
              hstart = std::max(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              int wstart = pw * stride_width - padding_width;
              int wend = std::min(wstart + ksize_width, input_width);
              if (adaptive) {
                wstart = AdaptStartIndex(pw, input_width, output_width);
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = std::min(wstart + ksize_width, input_width);
                wstart = std::max(wstart, 0);
              }
              int pool_size = exclusive
              int pool_size = (exclusive || adaptive)
                                  ? (dend - dstart) * (hend - hstart) *
                                        (wend - wstart)
                                  : ksize_depth * ksize_height * ksize_width;
              float scale = 1.0 / pool_size;
...
...
@@ -517,8 +578,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, framework::Tensor* output,
                  framework::Tensor* mask) {
                  const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_height = input.dims()[2];
    const int input_width = input.dims()[3];
...
...
@@ -538,16 +599,28 @@ class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
    T1* output_data = output->mutable_data<T1>(context.GetPlace());
    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());

    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int ph = 0; ph < output_height; ++ph) {
          int hstart = ph * stride_height - padding_height;
          int hend = std::min(hstart + ksize_height, input_height);
          if (adaptive) {
            hstart = AdaptStartIndex(ph, input_height, output_height);
            hend = AdaptEndIndex(ph, input_height, output_height);
          } else {
            hstart = ph * stride_height - padding_height;
            hend = std::min(hstart + ksize_height, input_height);
            hstart = std::max(hstart, 0);
          }
          for (int pw = 0; pw < output_width; ++pw) {
            int wstart = pw * stride_width - padding_width;
            int wend = std::min(wstart + ksize_width, input_width);
            if (adaptive) {
              wstart = AdaptStartIndex(pw, input_width, output_width);
              wend = AdaptEndIndex(pw, input_width, output_width);
            } else {
              wstart = pw * stride_width - padding_width;
              wend = std::min(wstart + ksize_width, input_width);
              wstart = std::max(wstart, 0);
            }

            T1 ele = static_cast<T1>(-FLT_MAX);
            int index = -1;
...
...
@@ -584,7 +657,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_height = input_grad->dims()[2];
...
...
@@ -637,8 +710,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, framework::Tensor* output,
                  framework::Tensor* mask) {
                  const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_depth = input.dims()[2];
    const int input_height = input.dims()[3];
...
...
@@ -663,20 +736,38 @@ class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
    T1* output_data = output->mutable_data<T1>(context.GetPlace());
    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());

    int dstart, dend;
    int hstart, hend;
    int wstart, wend;
    for (int i = 0; i < batch_size; i++) {
      for (int c = 0; c < output_channels; ++c) {
        for (int pd = 0; pd < output_depth; ++pd) {
          int dstart = pd * stride_depth - padding_depth;
          int dend = std::min(dstart + ksize_depth, input_depth);
          if (adaptive) {
            dstart = AdaptStartIndex(pd, input_depth, output_depth);
            dend = AdaptEndIndex(pd, input_depth, output_depth);
          } else {
            dstart = pd * stride_depth - padding_depth;
            dend = std::min(dstart + ksize_depth, input_depth);
            dstart = std::max(dstart, 0);
          }
          for (int ph = 0; ph < output_height; ++ph) {
            int hstart = ph * stride_height - padding_height;
            int hend = std::min(hstart + ksize_height, input_height);
            if (adaptive) {
              hstart = AdaptStartIndex(ph, input_height, output_height);
              hend = AdaptEndIndex(ph, input_height, output_height);
            } else {
              hstart = ph * stride_height - padding_height;
              hend = std::min(hstart + ksize_height, input_height);
              hstart = std::max(hstart, 0);
            }
            for (int pw = 0; pw < output_width; ++pw) {
              int wstart = pw * stride_width - padding_width;
              int wend = std::min(wstart + ksize_width, input_width);
              if (adaptive) {
                wstart = AdaptStartIndex(pw, input_width, output_width);
                wend = AdaptEndIndex(pw, input_width, output_width);
              } else {
                wstart = pw * stride_width - padding_width;
                wend = std::min(wstart + ksize_width, input_width);
                wstart = std::max(wstart, 0);
              }
              int output_idx = (pd * output_height + ph) * output_width + pw;
              T1 ele = static_cast<T1>(-FLT_MAX);
...
...
@@ -718,7 +809,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings,
                  const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_depth = input_grad->dims()[2];
...
...
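The adaptive branches added throughout this file rely on AdaptStartIndex / AdaptEndIndex to map each output bin to an input window. The helpers below are a sketch written from the floor/ceil convention these functions are generally assumed to follow (they live in pooling.h in the real code); they are reproduced here only to make the window mapping explicit, and `<cmath>` is assumed to be available.

```cpp
// Sketch of the adaptive-pooling window mapping assumed by the code above:
// output bin `ph` over an axis of `input_size` elements split into
// `output_size` roughly equal windows.
inline int AdaptStartIndex(int ph, int input_size, int output_size) {
  return static_cast<int>(
      floor(static_cast<double>(ph * input_size) / output_size));
}

inline int AdaptEndIndex(int ph, int input_size, int output_size) {
  return static_cast<int>(
      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
}
```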
paddle/fluid/operators/math/pooling.cu
...
...
@@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
const
int
ksize_width
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
bool
exclusive
,
T
*
output_data
)
{
bool
exclusive
,
bool
adaptive
,
T
*
output_data
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
pw
=
index
%
output_width
;
...
...
@@ -37,13 +37,23 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
    int c = (index / output_width / output_height) % channels;
    int batch_idx = index / output_width / output_height / channels;

    int hstart = ph * stride_height - padding_height;
    int hend = min(hstart + ksize_height, input_height);
    int hstart, hend;
    int wstart, wend;
    if (adaptive) {
      hstart = AdaptStartIndex(ph, input_height, output_height);
      hend = AdaptEndIndex(ph, input_height, output_height);

      wstart = AdaptStartIndex(pw, input_width, output_width);
      wend = AdaptEndIndex(pw, input_width, output_width);
    } else {
      hstart = ph * stride_height - padding_height;
      hend = min(hstart + ksize_height, input_height);
      hstart = max(hstart, 0);

    int wstart = pw * stride_width - padding_width;
    int wend = min(wstart + ksize_width, input_width);
      wstart = pw * stride_width - padding_width;
      wend = min(wstart + ksize_width, input_width);
      wstart = max(wstart, 0);
    }
    input_data += (batch_idx * channels + c) * input_height * input_width;
    T ele = pool_process.initial();
...
...
@@ -52,7 +62,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data,
        pool_process.compute(input_data[h * input_width + w], &ele);
      }
    }
    int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
    int pool_size = (exclusive || adaptive)
                        ? (hend - hstart) * (wend - wstart)
                        : ksize_height * ksize_width;
    pool_process.finalize(static_cast<T>(pool_size), &ele);
    output_data[index] = ele;
...
...
@@ -66,22 +76,33 @@ __global__ void KernelPool2DGrad(
    const int input_width, const int output_height, const int output_width,
    const int ksize_height, const int ksize_width, const int stride_height,
    const int stride_width, const int padding_height, const int padding_width,
    PoolProcess pool_process, bool exclusive, T* input_grad) {
    PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int offsetW = index % input_width + padding_width;
    int offsetH = (index / input_width) % input_height + padding_height;
    int w_offset = index % input_width + padding_width;
    int h_offset = (index / input_width) % input_height + padding_height;
    int offsetC = (index / input_width / input_height) % channels;
    int batch_idx = index / input_width / input_height / channels;

    int phstart = (offsetH < ksize_height)
    int phstart, phend;
    int pwstart, pwend;
    if (adaptive) {
      phstart = h_offset * output_height / input_height;
      phend =
          min((h_offset + 1) * output_height / input_height + 1, output_height);
      pwstart = w_offset * output_width / input_width;
      pwend =
          min((w_offset + 1) * output_width / input_width + 1, output_width);
    } else {
      phstart = (h_offset < ksize_height)
                    ? 0
                    : (offsetH - ksize_height) / stride_height + 1;
    int pwstart = (offsetW < ksize_width)
                    : (h_offset - ksize_height) / stride_height + 1;
      pwstart = (w_offset < ksize_width)
                    ? 0
                    : (offsetW - ksize_width) / stride_width + 1;
    int phend = min(offsetH / stride_height + 1, output_height);
    int pwend = min(offsetW / stride_width + 1, output_width);
                    : (w_offset - ksize_width) / stride_width + 1;
      phend = min(h_offset / stride_height + 1, output_height);
      pwend = min(w_offset / stride_width + 1, output_width);
    }
    T gradient = 0;
    T input = input_data[index];
    int output_idx =
...
...
@@ -90,14 +111,22 @@ __global__ void KernelPool2DGrad(
    output_grad += output_idx;
    for (int ph = phstart; ph < phend; ++ph) {
      for (int pw = pwstart; pw < pwend; ++pw) {
        int pool_size;
        if (adaptive) {
          pool_size = static_cast<int>(
                          ceil(static_cast<double>(input_height) / ksize_height)) *
                      static_cast<int>(
                          ceil(static_cast<double>(input_width) / ksize_width));
        } else {
          int hstart = ph * stride_height - padding_height;
          int wstart = pw * stride_width - padding_width;
          int hend = min(hstart + ksize_height, input_height);
          int wend = min(wstart + ksize_width, input_width);
          hstart = max(hstart, 0);
          wstart = max(wstart, 0);
        int pool_size = exclusive ? (hend - hstart) * (wend - wstart)
          pool_size = exclusive ? (hend - hstart) * (wend - wstart)
                                : ksize_height * ksize_width;
        }
        int output_sub_idx = ph * output_width + pw;
        pool_process.compute(input, output_data[output_sub_idx],
                             output_grad[output_sub_idx],
...
...
@@ -181,7 +210,7 @@ void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
  KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(
      nthreads, input, input_channels, input_height, input_width, output_height,
      output_width, ksize_height, ksize_width, stride_height, stride_width,
      padding_height, padding_width, pool_compute, exclusive, output);
      padding_height, padding_width, pool_compute, exclusive, false, output);
}
/*
...
...
@@ -196,7 +225,7 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
framework
::
Tensor
&
input
,
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
framework
::
Tensor
*
output
)
{
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
output
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_height
=
input
.
dims
()[
2
];
...
...
@@ -223,7 +252,7 @@ class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
nthreads
,
input_data
,
input_channels
,
input_height
,
input_width
,
output_height
,
output_width
,
ksize_height
,
ksize_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
pool_process
,
exclusive
,
output_data
);
adaptive
,
output_data
);
}
};
...
...
@@ -242,7 +271,8 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
const
std
::
vector
<
int
>&
ksize
,
const
std
::
vector
<
int
>&
strides
,
const
std
::
vector
<
int
>&
paddings
,
PoolProcess
pool_process
,
bool
exclusive
,
framework
::
Tensor
*
input_grad
)
{
bool
exclusive
,
bool
adaptive
,
framework
::
Tensor
*
input_grad
)
{
const
int
batch_size
=
input
.
dims
()[
0
];
const
int
input_channels
=
input
.
dims
()[
1
];
const
int
input_height
=
input
.
dims
()[
2
];
...
...
@@ -270,7 +300,7 @@ class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
nthreads
,
input_data
,
output_data
,
output_grad_data
,
input_channels
,
input_height
,
input_width
,
output_height
,
output_width
,
ksize_height
,
ksize_width
,
stride_height
,
stride_width
,
padding_height
,
padding_width
,
pool_process
,
exclusive
,
input_grad_data
);
pool_process
,
exclusive
,
adaptive
,
input_grad_data
);
}
};
...
...
@@ -359,7 +389,7 @@ __global__ void KernelPool3D(
const
int
ksize_depth
,
const
int
ksize_height
,
const
int
ksize_width
,
const
int
stride_depth
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_depth
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
bool
exclusive
,
T
*
output_data
)
{
PoolProcess
pool_process
,
bool
exclusive
,
bool
adaptive
,
T
*
output_data
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
pw
=
index
%
output_width
;
...
...
@@ -368,15 +398,30 @@ __global__ void KernelPool3D(
int
c
=
(
index
/
output_width
/
output_height
/
output_depth
)
%
channels
;
int
batch_idx
=
index
/
output_width
/
output_height
/
output_depth
/
channels
;
int
dstart
=
pd
*
stride_depth
-
padding_depth
;
int
hstart
=
ph
*
stride_height
-
padding_height
;
int
wstart
=
pw
*
stride_width
-
padding_width
;
int
dend
=
min
(
dstart
+
ksize_depth
,
input_depth
);
int
hend
=
min
(
hstart
+
ksize_height
,
input_height
);
int
wend
=
min
(
wstart
+
ksize_width
,
input_width
);
int
dstart
,
dend
;
int
hstart
,
hend
;
int
wstart
,
wend
;
if
(
adaptive
)
{
dstart
=
AdaptStartIndex
(
pd
,
input_depth
,
output_depth
);
dend
=
AdaptEndIndex
(
pd
,
input_depth
,
output_depth
);
hstart
=
AdaptStartIndex
(
ph
,
input_height
,
output_height
);
hend
=
AdaptEndIndex
(
ph
,
input_height
,
output_height
);
wstart
=
AdaptStartIndex
(
pw
,
input_width
,
output_width
);
wend
=
AdaptEndIndex
(
pw
,
input_width
,
output_width
);
}
else
{
dstart
=
pd
*
stride_depth
-
padding_depth
;
hstart
=
ph
*
stride_height
-
padding_height
;
wstart
=
pw
*
stride_width
-
padding_width
;
dend
=
min
(
dstart
+
ksize_depth
,
input_depth
);
hend
=
min
(
hstart
+
ksize_height
,
input_height
);
wend
=
min
(
wstart
+
ksize_width
,
input_width
);
dstart
=
max
(
dstart
,
0
);
hstart
=
max
(
hstart
,
0
);
wstart
=
max
(
wstart
,
0
);
}
T
ele
=
pool_process
.
initial
();
input_data
+=
(
batch_idx
*
channels
+
c
)
*
input_depth
*
input_height
*
input_width
;
...
...
@@ -388,7 +433,7 @@ __global__ void KernelPool3D(
}
}
}
int
pool_size
=
exclusive
int
pool_size
=
(
exclusive
||
adaptive
)
?
(
dend
-
dstart
)
*
(
hend
-
hstart
)
*
(
wend
-
wstart
)
:
ksize_depth
*
ksize_height
*
ksize_width
;
pool_process
.
finalize
(
static_cast
<
T
>
(
pool_size
),
&
ele
);
...
...
@@ -405,28 +450,43 @@ __global__ void KernelPool3DGrad(
const
int
ksize_height
,
const
int
ksize_width
,
const
int
stride_depth
,
const
int
stride_height
,
const
int
stride_width
,
const
int
padding_depth
,
const
int
padding_height
,
const
int
padding_width
,
PoolProcess
pool_process
,
bool
exclusive
,
T
*
input_grad
)
{
bool
exclusive
,
bool
adaptive
,
T
*
input_grad
)
{
for
(
int
index
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
index
<
nthreads
;
index
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
offsetW
=
index
%
input_width
+
padding_width
;
int
offsetH
=
(
index
/
input_width
)
%
input_height
+
padding_height
;
int
offsetD
=
int
w_offset
=
index
%
input_width
+
padding_width
;
int
h_offset
=
(
index
/
input_width
)
%
input_height
+
padding_height
;
int
d_offset
=
(
index
/
input_width
/
input_height
)
%
input_depth
+
padding_depth
;
int
offsetC
=
(
index
/
input_width
/
input_height
/
input_depth
)
%
channels
;
int
batch_idx
=
index
/
input_width
/
input_height
/
input_depth
/
channels
;
int
pdstart
=
(
offsetD
<
ksize_depth
)
int
pdstart
,
pdend
;
int
phstart
,
phend
;
int
pwstart
,
pwend
;
if
(
adaptive
)
{
pdstart
=
d_offset
*
output_depth
/
input_depth
;
pdend
=
min
((
d_offset
+
1
)
*
output_depth
/
input_depth
+
1
,
output_depth
);
phstart
=
h_offset
*
output_height
/
input_height
;
phend
=
min
((
h_offset
+
1
)
*
output_height
/
input_height
+
1
,
output_height
);
pwstart
=
w_offset
*
output_width
/
input_width
;
pwend
=
min
((
w_offset
+
1
)
*
output_width
/
input_width
+
1
,
output_width
);
}
else
{
pdstart
=
(
d_offset
<
ksize_depth
)
?
0
:
(
offsetD
-
ksize_depth
)
/
stride_depth
+
1
;
int
phstart
=
(
offsetH
<
ksize_height
)
:
(
d_offset
-
ksize_depth
)
/
stride_depth
+
1
;
phstart
=
(
h_offset
<
ksize_height
)
?
0
:
(
offsetH
-
ksize_height
)
/
stride_height
+
1
;
int
pwstart
=
(
offsetW
<
ksize_width
)
:
(
h_offset
-
ksize_height
)
/
stride_height
+
1
;
pwstart
=
(
w_offset
<
ksize_width
)
?
0
:
(
offsetW
-
ksize_width
)
/
stride_width
+
1
;
int
pdend
=
min
((
offsetD
)
/
stride_depth
+
1
,
output_depth
);
int
phend
=
min
((
offsetH
)
/
stride_height
+
1
,
output_height
);
int
pwend
=
min
((
offsetW
)
/
stride_width
+
1
,
output_width
);
:
(
w_offset
-
ksize_width
)
/
stride_width
+
1
;
pdend
=
min
((
d_offset
)
/
stride_depth
+
1
,
output_depth
);
phend
=
min
((
h_offset
)
/
stride_height
+
1
,
output_height
);
pwend
=
min
((
w_offset
)
/
stride_width
+
1
,
          output_width);
    }
    T gradient = 0;
    T input = input_data[index];
...
...
@@ -439,6 +499,16 @@ __global__ void KernelPool3DGrad(
    for (int ph = phstart; ph < phend; ++ph) {
      for (int pw = pwstart; pw < pwend; ++pw) {
+       // figure out the pooling size
+       int pool_size;
+       if (adaptive) {
+         pool_size =
+             static_cast<int>(ceil(static_cast<double>(input_depth) / ksize_depth)) *
+             static_cast<int>(ceil(static_cast<double>(input_height) / ksize_height)) *
+             static_cast<int>(ceil(static_cast<double>(input_width) / ksize_width));
+       } else {
          int dstart = pd * stride_depth - padding_depth;
          int hstart = ph * stride_height - padding_height;
          int wstart = pw * stride_width - padding_width;
...
...
@@ -448,9 +518,10 @@ __global__ void KernelPool3DGrad(
          dstart = max(dstart, 0);
          hstart = max(hstart, 0);
          wstart = max(wstart, 0);
-         int pool_size =
+         pool_size =
              exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
                        : ksize_depth * ksize_height * ksize_width;
+       }
        int output_sub_idx = (pd * output_height + ph) * output_width + pw;
        pool_process.compute(input, output_data[output_sub_idx],
                             output_grad[output_sub_idx],
...
...
@@ -525,7 +596,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_process,
-                 bool exclusive, framework::Tensor* output) {
+                 bool exclusive, bool adaptive, framework::Tensor* output) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
...
...
@@ -559,7 +630,7 @@ class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
        input_width, output_depth, output_height, output_width, ksize_depth,
        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
        padding_depth, padding_height, padding_width, pool_process, exclusive,
-       output_data);
+       adaptive, output_data);
  }
};
...
...
@@ -578,7 +649,8 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
                  const std::vector<int>& ksize,
                  const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_process,
-                 bool exclusive, framework::Tensor* input_grad) {
+                 bool exclusive, bool adaptive,
+                 framework::Tensor* input_grad) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
...
...
@@ -614,7 +686,7 @@ class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
        input_depth, input_height, input_width, output_depth, output_height,
        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
        stride_height, stride_width, padding_depth, padding_height,
-       padding_width, pool_process, exclusive, input_grad_data);
+       padding_width, pool_process, exclusive, adaptive, input_grad_data);
  }
};
...
...
@@ -703,7 +775,7 @@ __global__ void KernelMaxPool2dWithIdx(
    const int input_height, const int input_width, const int output_height,
    const int output_width, const int ksize_height, const int ksize_width,
    const int stride_height, const int stride_width, const int padding_height,
-   const int padding_width, T1* output_data, T2* mask_data) {
+   const int padding_width, bool adaptive, T1* output_data, T2* mask_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
...
...
@@ -711,13 +783,23 @@ __global__ void KernelMaxPool2dWithIdx(
    int c = (index / output_width / output_height) % channels;
    int batch_idx = index / output_width / output_height / channels;

-   int hstart = ph * stride_height - padding_height;
-   int hend = min(hstart + ksize_height, input_height);
+   int hstart, hend;
+   int wstart, wend;
+   if (adaptive) {
+     hstart = AdaptStartIndex(ph, input_height, output_height);
+     hend = AdaptEndIndex(ph, input_height, output_height);
+     wstart = AdaptStartIndex(pw, input_width, output_width);
+     wend = AdaptEndIndex(pw, input_width, output_width);
+   } else {
+     hstart = ph * stride_height - padding_height;
+     hend = min(hstart + ksize_height, input_height);
      hstart = max(hstart, 0);

-     int wstart = pw * stride_width - padding_width;
-     int wend = min(wstart + ksize_width, input_width);
+     wstart = pw * stride_width - padding_width;
+     wend = min(wstart + ksize_width, input_width);
      wstart = max(wstart, 0);
+   }

    input_data += (batch_idx * channels + c) * input_height * input_width;
    T1 ele = -FLT_MAX;
...
...
@@ -742,36 +824,47 @@ __global__ void KernelMaxPool2DWithIdxGrad(
    const int channels, const int input_height, const int input_width,
    const int output_height, const int output_width, const int ksize_height,
    const int ksize_width, const int stride_height, const int stride_width,
-   const int padding_height, const int padding_width, T1* input_grad) {
+   const int padding_height, const int padding_width, bool adaptive,
+   T1* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int w_offset = index % input_width;
    int h_offset = (index / input_width) % input_height;
-   int c_offset = (index / input_width / input_height) % channels;
+   int offsetC = (index / input_width / input_height) % channels;
    int batch_idx = index / input_width / input_height / channels;

-   int ph_start = (h_offset + padding_height < ksize_height) ? 0 : (h_offset + padding_height - ksize_height) / stride_height + 1;
-   int pw_start = (w_offset + padding_width < ksize_width) ? 0 : (w_offset + padding_width - ksize_width) / stride_width + 1;
-   int ph_end = min((h_offset + padding_height) / stride_height + 1, output_height);
-   int pw_end = min((w_offset + padding_width) / stride_width + 1, output_width);
+   int phstart, phend;
+   int pwstart, pwend;
+   if (adaptive) {
+     phstart = h_offset * output_height / input_height;
+     phend = min((h_offset + 1) * output_height / input_height + 1, output_height);
+     pwstart = w_offset * output_width / input_width;
+     pwend = min((w_offset + 1) * output_width / input_width + 1, output_width);
+   } else {
+     phstart = (h_offset + padding_height < ksize_height) ? 0 : (h_offset + padding_height - ksize_height) / stride_height + 1;
+     pwstart = (w_offset + padding_width < ksize_width) ? 0 : (w_offset + padding_width - ksize_width) / stride_width + 1;
+     phend = min((h_offset + padding_height) / stride_height + 1, output_height);
+     pwend = min((w_offset + padding_width) / stride_width + 1, output_width);
+   }

    T1 gradient = 0;
    int input_current_featuremap_idx = h_offset * input_width + w_offset;
    int output_idx =
-       (batch_idx * channels + c_offset) * output_height * output_width;
+       (batch_idx * channels + offsetC) * output_height * output_width;

    mask_data += output_idx;
    output_grad += output_idx;
-   for (int ph = ph_start; ph < ph_end; ++ph) {
-     for (int pw = pw_start; pw < pw_end; ++pw) {
+   for (int ph = phstart; ph < phend; ++ph) {
+     for (int pw = pwstart; pw < pwend; ++pw) {
        if (mask_data[ph * output_width + pw] == input_current_featuremap_idx)
          gradient += output_grad[ph * output_width + pw];
      }
...
...
@@ -791,8 +884,8 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings, framework::Tensor* output,
-                 framework::Tensor* mask) {
+                 const std::vector<int>& paddings, bool adaptive,
+                 framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_height = input.dims()[2];
...
...
@@ -819,7 +912,8 @@ class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
    KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
        nthreads, input_data, input_channels, input_height, input_width,
        output_height, output_width, ksize_height, ksize_width, stride_height,
-       stride_width, padding_height, padding_width, output_data, mask_data);
+       stride_width, padding_height, padding_width, adaptive, output_data,
+       mask_data);
  }
};
...
...
@@ -835,7 +929,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings,
+                 const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_channels = input_grad->dims()[1];
...
...
@@ -862,7 +956,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
    KernelMaxPool2DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
        nthreads, output_grad_data, mask_data, input_channels, input_height,
        input_width, output_height, output_width, ksize_height, ksize_width,
-       stride_height, stride_width, padding_height, padding_width,
+       stride_height, stride_width, padding_height, padding_width, adaptive,
        input_grad_data);
  }
};
...
...
@@ -884,7 +978,7 @@ __global__ void KernelMaxPool3DWithIdx(
    const int ksize_depth, const int ksize_height, const int ksize_width,
    const int stride_depth, const int stride_height, const int stride_width,
    const int padding_depth, const int padding_height, const int padding_width,
-   T1* output_data, T2* mask_data) {
+   bool adaptive, T1* output_data, T2* mask_data) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int pw = index % output_width;
...
...
@@ -894,15 +988,29 @@ __global__ void KernelMaxPool3DWithIdx(
    int batch_idx = index / output_width / output_height / output_depth / channels;

-   int dstart = pd * stride_depth - padding_depth;
-   int hstart = ph * stride_height - padding_height;
-   int wstart = pw * stride_width - padding_width;
-   int dend = min(dstart + ksize_depth, input_depth);
-   int hend = min(hstart + ksize_height, input_height);
-   int wend = min(wstart + ksize_width, input_width);
+   int dstart, dend;
+   int hstart, hend;
+   int wstart, wend;
+   if (adaptive) {
+     dstart = AdaptStartIndex(pd, input_depth, output_depth);
+     dend = AdaptEndIndex(pd, input_depth, output_depth);
+     hstart = AdaptStartIndex(ph, input_height, output_height);
+     hend = AdaptEndIndex(ph, input_height, output_height);
+     wstart = AdaptStartIndex(pw, input_width, output_width);
+     wend = AdaptEndIndex(pw, input_width, output_width);
+   } else {
+     dstart = pd * stride_depth - padding_depth;
+     hstart = ph * stride_height - padding_height;
+     wstart = pw * stride_width - padding_width;
+     dend = min(dstart + ksize_depth, input_depth);
+     hend = min(hstart + ksize_height, input_height);
+     wend = min(wstart + ksize_width, input_width);
      dstart = max(dstart, 0);
      hstart = max(hstart, 0);
      wstart = max(wstart, 0);
+   }

    T1 ele = -FLT_MAX;
    int max_index = -1;
...
...
@@ -932,46 +1040,58 @@ __global__ void KernelMaxPool3DWithIdxGrad(
    const int output_width, const int ksize_depth, const int ksize_height,
    const int ksize_width, const int stride_depth, const int stride_height,
    const int stride_width, const int padding_depth, const int padding_height,
-   const int padding_width, T1* input_grad) {
+   const int padding_width, bool adaptive, T1* input_grad) {
  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
       index += blockDim.x * gridDim.x) {
    int w_offset = index % input_width;
    int h_offset = (index / input_width) % input_height;
    int d_offset = (index / input_width / input_height) % input_depth;
-   int c_offset = (index / input_width / input_height / input_depth) % channels;
+   int offsetC = (index / input_width / input_height / input_depth) % channels;
    int batch_idx = index / input_width / input_height / input_depth / channels;

-   int pd_start = (d_offset + padding_depth < ksize_depth) ? 0 : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
-   int ph_start = (h_offset + padding_height < ksize_height) ? 0 : (h_offset + padding_height - ksize_height) / stride_height + 1;
-   int pw_start = (w_offset + padding_width < ksize_width) ? 0 : (w_offset + padding_width - ksize_width) / stride_width + 1;
-   int pd_end = min((d_offset + padding_depth) / stride_depth + 1, output_depth);
-   int ph_end = min((h_offset + padding_height) / stride_height + 1, output_height);
-   int pw_end = min((w_offset + padding_width) / stride_width + 1, output_width);
+   int pdstart, pdend;
+   int phstart, phend;
+   int pwstart, pwend;
+   if (adaptive) {
+     pdstart = d_offset * output_depth / input_depth;
+     pdend = min((d_offset + 1) * output_depth / input_depth + 1, output_depth);
+     phstart = h_offset * output_height / input_height;
+     phend = min((h_offset + 1) * output_height / input_height + 1, output_height);
+     pwstart = w_offset * output_width / input_width;
+     pwend = min((w_offset + 1) * output_width / input_width + 1, output_width);
+   } else {
+     pdstart = (d_offset + padding_depth < ksize_depth) ? 0 : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
+     phstart = (h_offset + padding_height < ksize_height) ? 0 : (h_offset + padding_height - ksize_height) / stride_height + 1;
+     pwstart = (w_offset + padding_width < ksize_width) ? 0 : (w_offset + padding_width - ksize_width) / stride_width + 1;
+     pdend = min((d_offset + padding_depth) / stride_depth + 1, output_depth);
+     phend = min((h_offset + padding_height) / stride_height + 1, output_height);
+     pwend = min((w_offset + padding_width) / stride_width + 1, output_width);
+   }

    T1 gradient = 0;
    int input_current_feature_map_idx =
        (d_offset * input_height + h_offset) * input_width + w_offset;
-   int output_idx = (batch_idx * channels + c_offset) * output_depth *
+   int output_idx = (batch_idx * channels + offsetC) * output_depth *
                     output_height * output_width;
    mask += output_idx;
    output_grad += output_idx;
-   for (int pd = pd_start; pd < pd_end; ++pd) {
-     for (int ph = ph_start; ph < ph_end; ++ph) {
-       for (int pw = pw_start; pw < pw_end; ++pw) {
+   for (int pd = pdstart; pd < pdend; ++pd) {
+     for (int ph = phstart; ph < phend; ++ph) {
+       for (int pw = pwstart; pw < pwend; ++pw) {
          if (mask[(pd * output_height + ph) * output_width + pw] ==
              input_current_feature_map_idx)
            gradient +=
...
...
@@ -994,8 +1114,8 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings, framework::Tensor* output,
-                 framework::Tensor* mask) {
+                 const std::vector<int>& paddings, bool adaptive,
+                 framework::Tensor* output, framework::Tensor* mask) {
    const int batch_size = input.dims()[0];
    const int input_channels = input.dims()[1];
    const int input_depth = input.dims()[2];
...
...
@@ -1029,7 +1149,8 @@ class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
        nthreads, input_data, input_channels, input_depth, input_height,
        input_width, output_depth, output_height, output_width, ksize_depth,
        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
-       padding_depth, padding_height, padding_width, output_data, mask_data);
+       padding_depth, padding_height, padding_width, adaptive, output_data,
+       mask_data);
  }
};
...
...
@@ -1045,7 +1166,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings,
+                 const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad) {
    const int batch_size = input_grad->dims()[0];
    const int input_channels = input_grad->dims()[1];
...
...
@@ -1079,7 +1200,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
        nthreads, output_grad_data, mask_data, input_channels, input_depth,
        input_height, input_width, output_depth, output_height, output_width,
        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
-       stride_width, padding_depth, padding_height, padding_width,
+       stride_width, padding_depth, padding_height, padding_width, adaptive,
        input_grad_data);
  }
};
...
...
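The KernelPool3DGrad change above is easiest to read in isolation: with adaptive pooling the gradient of every output cell is averaged over a fixed count derived from the input/kernel ratio, while the non-adaptive path keeps the existing exclusive/inclusive window count. A minimal host-side sketch of just that divisor choice follows (the standalone helper and its name are illustrative, not part of the diff; only the arithmetic mirrors the kernel):

    #include <cmath>

    // Illustrative only: how the divisor used to average the pooled gradient is chosen.
    int PoolSizeForGrad(bool adaptive, bool exclusive,
                        int input_d, int input_h, int input_w,
                        int ksize_d, int ksize_h, int ksize_w,
                        int dstart, int dend, int hstart, int hend, int wstart, int wend) {
      if (adaptive) {
        // Adaptive path: one fixed count per output cell, ceil(input/ksize) in each dim.
        return static_cast<int>(std::ceil(static_cast<double>(input_d) / ksize_d)) *
               static_cast<int>(std::ceil(static_cast<double>(input_h) / ksize_h)) *
               static_cast<int>(std::ceil(static_cast<double>(input_w) / ksize_w));
      }
      // Non-adaptive path: clipped window volume (exclusive) or full kernel volume.
      return exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart)
                       : ksize_d * ksize_h * ksize_w;
    }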
paddle/fluid/operators/math/pooling.h
View file @ 740e1626
...
...
@@ -68,6 +68,18 @@ class AvgPoolGrad {
  }
};

+/* used for adaptive pool to calculate start and end index of each divided grid
+ */
+HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) {
+  return static_cast<int>(
+      floor(static_cast<double>(ph * input_size) / output_size));
+}
+
+HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) {
+  return static_cast<int>(
+      ceil(static_cast<double>((ph + 1) * input_size) / output_size));
+}
+
/*
 * \brief Getting pooling results, and calculating gradient.
 *
...
...
@@ -102,7 +114,7 @@ class Pool2dFunctor {
                  const std::vector<int>& ksize, const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                 bool exclusive, framework::Tensor* output);
+                 bool exclusive, bool adaptive, framework::Tensor* output);
};

template <typename DeviceContext, typename PoolProcess, typename T>
...
...
@@ -114,7 +126,7 @@ class Pool2dGradFunctor {
                  const std::vector<int>& ksize, const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                 bool exclusive, framework::Tensor* input_grad);
+                 bool exclusive, bool adaptive, framework::Tensor* input_grad);
};

template <typename DeviceContext, class T>
...
...
@@ -136,7 +148,7 @@ class Pool3dFunctor {
                  const std::vector<int>& ksize, const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                 bool exclusive, framework::Tensor* output);
+                 bool exclusive, bool adaptive, framework::Tensor* output);
};

template <typename DeviceContext, typename PoolProcess, typename T>
...
...
@@ -148,7 +160,7 @@ class Pool3dGradFunctor {
                  const std::vector<int>& ksize, const std::vector<int>& strides,
                  const std::vector<int>& paddings, PoolProcess pool_compute,
-                 bool exclusive, framework::Tensor* input_grad);
+                 bool exclusive, bool adaptive, framework::Tensor* input_grad);
};

template <typename DeviceContext, class T>
...
...
@@ -176,8 +188,8 @@ class MaxPool2dWithIndexFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const std::vector<int>& ksize, const std::vector<int>& strides,
-                 const std::vector<int>& paddings, framework::Tensor* output,
-                 framework::Tensor* mask);
+                 const std::vector<int>& paddings, bool adaptive,
+                 framework::Tensor* output, framework::Tensor* mask);
};

template <typename DeviceContext, typename T1, typename T2>
...
...
@@ -187,7 +199,7 @@ class MaxPool2dWithIndexGradFunctor {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings,
+                 const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad);
};
...
...
@@ -197,8 +209,8 @@ class MaxPool3dWithIndexFunctor {
  void operator()(const DeviceContext& context, const framework::Tensor& input,
                  const std::vector<int>& ksize, const std::vector<int>& strides,
-                 const std::vector<int>& paddings, framework::Tensor* output,
-                 framework::Tensor* mask);
+                 const std::vector<int>& paddings, bool adaptive,
+                 framework::Tensor* output, framework::Tensor* mask);
};

template <typename DeviceContext, typename T1, typename T2>
...
...
@@ -208,7 +220,7 @@ class MaxPool3dWithIndexGradFunctor {
                  const framework::Tensor& output_grad,
                  const framework::Tensor& mask, const std::vector<int>& ksize,
                  const std::vector<int>& strides,
-                 const std::vector<int>& paddings,
+                 const std::vector<int>& paddings, bool adaptive,
                  framework::Tensor* input_grad);
};
...
...
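To see what the new AdaptStartIndex/AdaptEndIndex helpers do, the standalone demo below (hypothetical test code, not part of the diff; the formulas are copied from the header) prints the input range each output bin covers. For input_size = 7 and output_size = 3 the bins are [0, 3), [2, 5) and [4, 7): adjacent bins may overlap, but together they always cover the whole input, which is what lets adaptive pooling produce an output of exactly ksize cells per dimension.

    #include <cmath>
    #include <cstdio>

    // Same formulas as AdaptStartIndex/AdaptEndIndex in pooling.h, duplicated for a host-only demo.
    int AdaptStart(int ph, int input_size, int output_size) {
      return static_cast<int>(std::floor(static_cast<double>(ph * input_size) / output_size));
    }
    int AdaptEnd(int ph, int input_size, int output_size) {
      return static_cast<int>(std::ceil(static_cast<double>((ph + 1) * input_size) / output_size));
    }

    int main() {
      const int input_size = 7, output_size = 3;
      for (int ph = 0; ph < output_size; ++ph) {
        // Prints: bin 0 -> [0, 3), bin 1 -> [2, 5), bin 2 -> [4, 7)
        std::printf("bin %d -> [%d, %d)\n", ph, AdaptStart(ph, input_size, output_size),
                    AdaptEnd(ph, input_size, output_size));
      }
      return 0;
    }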
paddle/fluid/operators/pool_op.cc
View file @ 740e1626
...
...
@@ -52,6 +52,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
  bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
+ bool adaptive = ctx->Attrs().Get<bool>("adaptive");

  PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                 "Pooling intput should be 4-D or 5-D tensor.");
...
...
@@ -72,9 +73,13 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const {
                 "Paddings size and pooling size should be the same.");

  std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+ if (adaptive) {
+   output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+ } else {
    for (size_t i = 0; i < ksize.size(); ++i) {
-     output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                           paddings[i], strides[i], ceil_mode));
+     output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
+                                           paddings[i], strides[i], ceil_mode));
    }
+ }
  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
  ctx->ShareLoD("X", "Out");
...
...
@@ -185,6 +190,14 @@ void Pool2dOpMaker::Make() {
      "averaging calculating, otherwise, include the zero-padding. Note, it "
      "is only used when pooling_type is avg. The defalut is True.")
      .SetDefault(true);
+ AddAttr<bool>(
+     "adaptive",
+     "(bool, default False) When true, will perform adaptive pooling instead, "
+     "output shape in H and W dimensions will be same as ksize, input data "
+     "will be divided into grids specify by ksize averagely and perform "
+     "pooling in each grid area to get output pooling value.")
+     .SetDefault(false);
+
  AddAttr<bool>(
      "use_cudnn",
      "(bool, default false) Only used in cudnn kernel, need install cudnn")
...
...
@@ -263,6 +276,14 @@ Example:
       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
       $$

+  For adaptive = true:
+       $$
+       hstart = floor(i * H_{in} / H_{out})
+       hend = ceil((i + 1) * H_{in} / H_{out})
+       wstart = floor(j * W_{in} / W_{out})
+       wend = ceil((j + 1) * W_{in} / W_{out})
+       Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)}
+       $$

)DOC");
}
...
...
@@ -324,6 +345,13 @@ void Pool3dOpMaker::Make() {
      "averaging calculating, otherwise, include the zero-padding. Note, it "
      "is only used when pooling_type is avg. The defalut is True.")
      .SetDefault(true);
+ AddAttr<bool>(
+     "adaptive",
+     "(bool, default False) When true, will perform adaptive pooling instead, "
+     "output shape in H and W dimensions will be same as ksize, input data "
+     "will be divided into grids specify by ksize averagely and perform "
+     "pooling in each grid area to get output pooling value.")
+     .SetDefault(false);

  AddAttr<bool>(
      "use_cudnn",
...
...
@@ -375,6 +403,37 @@ Example:
       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
       $$

+  For exclusive = true:
+       $$
+       dstart = i * strides[0] - paddings[0]
+       dend = dstart + ksize[0]
+       hstart = j * strides[1] - paddings[1]
+       hend = hstart + ksize[1]
+       wstart = k * strides[2] - paddings[2]
+       wend = wstart + ksize[2]
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]}
+       $$
+
+  For exclusive = false:
+       $$
+       dstart = max(0, i * strides[0] - paddings[0])
+       dend = min(D, dstart + ksize[0])
+       hstart = max(0, j * strides[1] - paddings[1])
+       hend = min(H, hstart + ksize[1])
+       wstart = max(0, k * strides[2] - paddings[2])
+       wend = min(W, wstart + ksize[2])
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+       $$
+
+  For adaptive = true:
+       $$
+       dstart = floor(i * D_{in} / D_{out})
+       dend = ceil((i + 1) * D_{in} / D_{out})
+       hstart = floor(j * H_{in} / H_{out})
+       hend = ceil((j + 1) * H_{in} / H_{out})
+       wstart = floor(k * W_{in} / W_{out})
+       wend = ceil((k + 1) * W_{in} / W_{out})
+       Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)}
+       $$

)DOC");
}
...
...
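The InferShape change above boils down to a simple rule: with adaptive = true the spatial output dims are exactly ksize, otherwise each dim goes through the usual stride/padding formula. A self-contained sketch of that rule follows; PoolOutputSizeSketch is re-derived here from the documented formula (ceil_mode and global_pooling are left out for brevity) and is an assumption, not a call into Paddle:

    #include <cstdint>
    #include <vector>

    // Non-adaptive output size for one spatial dimension, following the documented
    // (in - ksize + 2 * padding) / stride + 1 formula (ceil_mode ignored here).
    int PoolOutputSizeSketch(int input_size, int ksize, int padding, int stride) {
      return (input_size - ksize + 2 * padding) / stride + 1;
    }

    std::vector<int64_t> InferPoolShape(const std::vector<int64_t>& in_dims,  // NCHW or NCDHW
                                        const std::vector<int>& ksize,
                                        const std::vector<int>& paddings,
                                        const std::vector<int>& strides, bool adaptive) {
      std::vector<int64_t> out({in_dims[0], in_dims[1]});
      if (adaptive) {
        // Adaptive: spatial output dims are simply ksize.
        out.insert(out.end(), ksize.begin(), ksize.end());
      } else {
        for (size_t i = 0; i < ksize.size(); ++i) {
          out.push_back(PoolOutputSizeSketch(static_cast<int>(in_dims[i + 2]), ksize[i],
                                             paddings[i], strides[i]));
        }
      }
      return out;
    }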
paddle/fluid/operators/pool_op.h
View file @ 740e1626
...
...
@@ -70,6 +70,7 @@ class PoolKernel : public framework::OpKernel<T> {
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    bool exclusive = context.Attr<bool>("exclusive");
+   bool adaptive = context.Attr<bool>("adaptive");
    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
...
...
@@ -85,7 +86,7 @@ class PoolKernel : public framework::OpKernel<T> {
              pool2d_forward;
          paddle::operators::math::MaxPool<T> pool_process;
          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                        true, out);
+                        true, false, out);

        } else if (pooling_type == "avg") {
          paddle::operators::math::Pool2dFunctor<
...
...
@@ -93,7 +94,7 @@ class PoolKernel : public framework::OpKernel<T> {
              pool2d_forward;
          paddle::operators::math::AvgPool<T> pool_process;
          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                        exclusive, out);
+                        exclusive, adaptive, out);
        }
      } break;
      case 3: {
...
...
@@ -103,14 +104,14 @@ class PoolKernel : public framework::OpKernel<T> {
              pool3d_forward;
          paddle::operators::math::MaxPool<T> pool_process;
          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                        true, out);
+                        true, false, out);

        } else if (pooling_type == "avg") {
          paddle::operators::math::Pool3dFunctor<
              DeviceContext, paddle::operators::math::AvgPool<T>, T>
              pool3d_forward;
          paddle::operators::math::AvgPool<T> pool_process;
          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
-                        exclusive, out);
+                        exclusive, adaptive, out);
        }
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
...
...
@@ -133,6 +134,7 @@ class PoolGradKernel : public framework::OpKernel<T> {
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
    bool exclusive = context.Attr<bool>("exclusive");
+   bool adaptive = context.Attr<bool>("adaptive");
    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
...
...
@@ -159,7 +161,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
              pool2d_backward;
          paddle::operators::math::AvgPoolGrad<T> pool_process;
          pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                         paddings, pool_process, exclusive, in_x_grad);
+                         paddings, pool_process, exclusive, adaptive,
+                         in_x_grad);
        }
      } break;
      case 3: {
...
...
@@ -174,7 +177,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
              pool3d_backward;
          paddle::operators::math::AvgPoolGrad<T> pool_process;
          pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
-                         paddings, pool_process, exclusive, in_x_grad);
+                         paddings, pool_process, exclusive, adaptive,
+                         in_x_grad);
        }
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
...
...
paddle/fluid/operators/pool_with_index_op.cc
View file @ 740e1626
...
...
@@ -40,6 +40,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+   bool adaptive = ctx->Attrs().Get<bool>("adaptive");

    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
                   "Pooling intput should be 4-D or 5-D tensor.");
...
...
@@ -60,10 +61,14 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
                      "Paddings size and pooling size should be the same.");

    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+   if (adaptive) {
+     output_shape.insert(output_shape.end(), ksize.begin(), ksize.end());
+   } else {
      for (size_t i = 0; i < ksize.size(); ++i) {
        output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i],
                                                 paddings[i], strides[i]));
      }
+   }
    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
    ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
  }
...
...
@@ -131,6 +136,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
        "(bool, default:false) Whether to use the global pooling. "
        "If global_pooling = true, ksize and paddings will be ignored.")
        .SetDefault(false);
+   AddAttr<bool>(
+       "adaptive",
+       "(bool, default False) When true, will perform adaptive pooling "
+       "instead, "
+       "output shape in H and W dimensions will be same as ksize, input data "
+       "will be divided into grids specify by ksize averagely and perform "
+       "pooling in each grid area to get output pooling value.")
+       .SetDefault(false);
    AddAttr<std::vector<int>>("strides",
                              "(vector<int>, default {1, 1}), strides(height, "
                              "width) of pooling operator.")
...
...
@@ -168,6 +181,12 @@ Example:
       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
       $$

+  For adaptive = true:
+       $$
+       H_{out} = ksize[0]  W_{out} = ksize[1]
+       $$

)DOC");
  }
};
...
...
@@ -207,6 +226,14 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
        "(bool, default false) Whether to use the global pooling. "
        "If global_pooling = true, ksize and paddings will be ignored.")
        .SetDefault(false);
+   AddAttr<bool>(
+       "adaptive",
+       "(bool, default False) When true, will perform adaptive pooling "
+       "instead, "
+       "output shape in H and W dimensions will be same as ksize, input data "
+       "will be divided into grids specify by ksize averagely and perform "
+       "pooling in each grid area to get output pooling value.")
+       .SetDefault(false);
    AddAttr<std::vector<int>>("strides",
                              "(vector<int>, default {1,1,1}), strides(depth, "
                              "height, width) of pooling operator.")
...
...
@@ -245,6 +272,11 @@ Example:
       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
       $$

+  For adaptive = true:
+       $$
+       D_{out} = ksize[0]  H_{out} = ksize[1]  W_{out} = ksize[2]
+       $$

)DOC");
  }
};
...
...
paddle/fluid/operators/pool_with_index_op.h
View file @ 740e1626
...
...
@@ -36,6 +36,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+   bool adaptive = context.Attr<bool>("adaptive");

    auto& dev_ctx = context.template device_context<DeviceContext>();
    if (context.Attr<bool>("global_pooling")) {
...
...
@@ -50,13 +51,15 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
        paddle::operators::math::MaxPool2dWithIndexFunctor<DeviceContext, T1, T2>
            pool2d_forward;
-       pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
+       pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
+                      mask);
      } break;
      case 3: {
        paddle::operators::math::MaxPool3dWithIndexFunctor<DeviceContext, T1, T2>
            pool3d_forward;
-       pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
+       pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out,
+                      mask);
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
    }
...
...
@@ -75,6 +78,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+   bool adaptive = context.Attr<bool>("adaptive");
    if (context.Attr<bool>("global_pooling")) {
      for (size_t i = 0; i < ksize.size(); ++i) {
        paddings[i] = 0;
...
...
@@ -93,14 +97,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
                                                               T1, T2>
            pool2d_backward;
        pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                       paddings, in_x_grad);
+                       paddings, adaptive, in_x_grad);
      } break;
      case 3: {
        paddle::operators::math::MaxPool3dWithIndexGradFunctor<DeviceContext,
                                                               T1, T2>
            pool3d_backward;
        pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
-                       paddings, in_x_grad);
+                       paddings, adaptive, in_x_grad);
      } break;
      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
    }
...
...
paddle/fluid/operators/psroi_pool_op.cc
0 → 100644
View file @ 740e1626

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/psroi_pool_op.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor), "
             "the input of PSROIPoolOp. "
             "The format of input tensor is NCHW. Where N is the batch size, "
             "C is the number of input channels, "
             "H is the height of the input feature map, and "
             "W is the width.");
    AddInput("ROIs",
             "(LoDTensor), "
             "ROIs (Regions of Interest) to pool over. "
             "should be a 2-D LoDTensor of shape (num_rois, 4) "
             "given as [(x1, y1, x2, y2), ...]. "
             "where (x1, y1) is the top left coordinates, and "
             "(x2, y2) is the bottom right coordinates. "
             "The roi batch index can be calculated from LoD.");
    AddOutput("Out",
              "(Tensor), "
              "the output of PSROIPoolOp is a 4-D Tensor with shape "
              "(num_rois, output_channels, pooled_h, pooled_w).");
    AddAttr<int>("output_channels",
                 "(int), "
                 "the number of channels of the output feature map. "
                 "For a task of C classes of objects, output_channels should be "
                 "(C + 1) for classification only.");
    AddAttr<float>("spatial_scale",
                   "(float, default 1.0), "
                   "Multiplicative spatial scale factor "
                   "to translate ROI coords from their input scale "
                   "to the scale used when pooling.")
        .SetDefault(1.0);
    AddAttr<int>("pooled_height",
                 "(int, default 1), "
                 "the pooled output height.")
        .SetDefault(1);
    AddAttr<int>("pooled_width",
                 "(int, default 1), "
                 "the pooled output width.")
        .SetDefault(1);
    AddComment(R"Doc(
**PSROIPool Operator**

Position sensitive region of interest pooling (also known as PSROIPooling) is to perform
position-sensitive average pooling on regions of interest specified by input, takes as
input N position-sensitive score maps and a list of num_rois regions of interest.

PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details.
    )Doc");
  }
};

class PSROIPoolOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of PSROIPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
                   "Input(ROIs) of PSROIPoolOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of PSROIPoolOp should not be null.");
    auto input_dims = ctx->GetInputDim("X");
    auto rois_dims = ctx->GetInputDim("ROIs");

    PADDLE_ENFORCE(input_dims.size() == 4,
                   "The format of input tensor is NCHW");
    PADDLE_ENFORCE(rois_dims.size() == 2,
                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
                   "given as [(x1, y1, x2, y2), ...]");
    PADDLE_ENFORCE(rois_dims[1] == 4,
                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) "
                   "given as [(x1, y1, x2, y2), ...]");

    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
    int output_channels = ctx->Attrs().Get<int>("output_channels");
    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");

    PADDLE_ENFORCE(
        input_dims[1] == output_channels * pooled_height * pooled_width,
        "the channel of X(%d) should be equal to the product of "
        "output_channels(%d), pooled_height(%d) and pooled_width(%d)",
        input_dims[1], output_channels, pooled_height, pooled_width);

    PADDLE_ENFORCE_GT(pooled_height, 0,
                      "The pooled output height must be greater than 0");
    PADDLE_ENFORCE_GT(pooled_width, 0,
                      "The pooled output width must be greater than 0");
    PADDLE_ENFORCE_GT(output_channels, 1,
                      "The pooled output channels must greater than 1");
    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
                      "The spatial scale must greater than 0.");

    auto out_dims = input_dims;
    out_dims[0] = rois_dims[0];
    out_dims[1] = output_channels;  // input_dims[1] / (pooled_height * pooled_width);
    out_dims[2] = pooled_height;
    out_dims[3] = pooled_width;
    ctx->SetOutputDim("Out", out_dims);
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
                                   ctx.device_context());
  }
};

class PSROIPoolGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "The gradient of Out should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "The gradient of X should not be null.");
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(ctx.Input<framework::Tensor>("X")->type(),
                                   ctx.device_context());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp);
REGISTER_OP_CPU_KERNEL(
    psroi_pool,
    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::CPUPSROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
    psroi_pool_grad,
    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::CPUPSROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
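The shape checks in PSROIPoolOp::InferShape encode the core PSROIPooling constraint: the input must carry one score map per (output channel, bin) pair, i.e. C_in = output_channels * pooled_height * pooled_width, and the output is (num_rois, output_channels, pooled_height, pooled_width). A small host-side sketch of that bookkeeping (hypothetical helpers, not part of the operator):

    #include <array>
    #include <stdexcept>

    // Maps an output position (c, ph, pw) to the input score-map channel it reads;
    // same index formula used by the CPU and GPU kernels below.
    int PSROIInputChannel(int c, int ph, int pw, int pooled_h, int pooled_w) {
      return (c * pooled_h + ph) * pooled_w + pw;
    }

    // Output shape of psroi_pool, with the same channel-count check as InferShape.
    std::array<int, 4> PSROIOutputShape(int in_channels, int num_rois, int output_channels,
                                        int pooled_h, int pooled_w) {
      if (in_channels != output_channels * pooled_h * pooled_w) {
        throw std::invalid_argument("C_in must equal output_channels * pooled_h * pooled_w");
      }
      return {num_rois, output_channels, pooled_h, pooled_w};
    }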
paddle/fluid/operators/psroi_pool_op.cu
0 → 100644
View file @ 740e1626

/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/psroi_pool_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaximumNumBlocks = 4096;

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaximumNumBlocks);
}

template <typename T>
__global__ void GPUPSROIPoolForward(
    const int nthreads, const T* input_data, const T* input_rois,
    const float spatial_scale, const int input_channels, const int height,
    const int width, const int output_channels, const int pooled_height,
    const int pooled_width, const int* rois_batch_id_data, T* output_data) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (size_t i = index; i < nthreads; i += offset) {
    // The output is in order (n, c, ph, pw)
    int pw = i % pooled_width;
    int ph = (i / pooled_width) % pooled_height;
    int c = (i / pooled_width / pooled_height) % output_channels;
    int n = i / pooled_width / pooled_height / output_channels;

    // set roi_batch_id
    int roi_batch_id = rois_batch_id_data[n];

    // [start, end) interval for spatial sampling
    const T* offset_input_rois = input_rois + n * 4;
    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
    T roi_end_w = static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;

    // Force too small ROIs to be 1x1
    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);

    // Compute w and h at input feature map
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);

    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);

    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
    const T* offset_input_data =
        input_data +
        (roi_batch_id * input_channels + input_channel) * height * width;
    T outsum = 0;

    for (int ih = hstart; ih < hend; ++ih) {
      for (int iw = wstart; iw < wend; ++iw) {
        int input_index = ih * width + iw;
        outsum += offset_input_data[input_index];
      }
    }

    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
    output_data[i] = is_empty ? 0. : outsum / bin_area;
  }
}

template <typename T>
__global__ void GPUPSROIPoolBackward(
    const int nthreads, const T* input_rois, const T* output_grad_data,
    const float spatial_scale, const int input_channels, const int height,
    const int width, const int output_channels, const int pooled_height,
    const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int offset = blockDim.x * gridDim.x;
  for (int i = index; i < nthreads; i += offset) {
    // The output is in order (n, c, ph, pw)
    int pw = i % pooled_width;
    int ph = (i / pooled_width) % pooled_height;
    int c = (i / pooled_width / pooled_height) % output_channels;
    int n = i / pooled_width / pooled_height / output_channels;

    // set roi_batch_id
    int roi_batch_id = rois_batch_id_data[n];
    int input_channel = (c * pooled_height + ph) * pooled_width + pw;
    int input_offset =
        (roi_batch_id * input_channels + input_channel) * height * width;
    T* offset_input_grad_data = input_grad_data + input_offset;

    // [start, end) interval for spatial sampling
    const T* offset_input_rois = input_rois + n * 4;
    T roi_start_w = static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
    T roi_start_h = static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
    T roi_end_w = static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
    T roi_end_h = static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;

    // Force too small ROIs to be 1x1
    T roi_height = max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
    T roi_width = max(roi_end_w - roi_start_w, (T)0.1);

    // Compute w and h at input feature map
    T bin_size_h = roi_height / static_cast<T>(pooled_height);
    T bin_size_w = roi_width / static_cast<T>(pooled_width);

    int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
    int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
    int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
    int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);

    // Add roi offsets and clip to input boundaries
    hstart = min(max(hstart, 0), height);
    hend = min(max(hend, 0), height);
    wstart = min(max(wstart, 0), width);
    wend = min(max(wend, 0), width);
    bool is_empty = (hend <= hstart) || (wend <= wstart);

    // Accumulate diff_val into input data
    T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
    T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
    for (int ih = hstart; ih < hend; ++ih) {
      for (int iw = wstart; iw < wend; ++iw) {
        int input_index = ih * width + iw;
        platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val);
      }
    }
  }
}

template <typename Place, typename T>
class GPUPSROIPoolOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<Tensor>("X");
    auto* rois = ctx.Input<LoDTensor>("ROIs");
    auto* out = ctx.Output<Tensor>("Out");

    auto pooled_height = ctx.Attr<int>("pooled_height");
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto output_channels = ctx.Attr<int>("output_channels");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");

    auto in_dims = in->dims();
    int batch_size = in_dims[0];
    int input_channels = in_dims[1];
    int height = in_dims[2];
    int width = in_dims[3];

    PADDLE_ENFORCE_EQ(input_channels,
                      output_channels * pooled_height * pooled_width,
                      "the channels of input X should equal the product of "
                      "output_channels x pooled_height x pooled_width");

    int rois_num = rois->dims()[0];
    if (rois_num == 0) return;

    auto rois_lod = rois->lod().back();
    int rois_batch_size = rois_lod.size() - 1;
    PADDLE_ENFORCE_EQ(
        rois_batch_size, batch_size,
        "The rois_batch_size and input(X) batch_size must be the same.");
    int rois_num_with_lod = rois_lod[rois_batch_size];
    PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod,
                      "The rois_num from input and lod must be the same.");

    // set rois batch id
    framework::Tensor rois_batch_id_list;
    rois_batch_id_list.Resize({rois_num});
    int* rois_batch_id_data =
        rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
    for (int n = 0; n < rois_batch_size; ++n) {
      for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
        rois_batch_id_data[i] = n;
      }
    }

    framework::Tensor rois_batch_id_list_gpu;
    framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
                          ctx.device_context(), &rois_batch_id_list_gpu);

    int output_size = out->numel();
    int blocks = NumBlocks(output_size);
    int threads = kNumCUDAThreads;

    // call cuda kernel function
    GPUPSROIPoolForward<T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
        output_size, in->data<T>(), rois->data<T>(), spatial_scale,
        input_channels, height, width, output_channels, pooled_height,
        pooled_width, rois_batch_id_list_gpu.data<int>(),
        out->mutable_data<T>(ctx.GetPlace()));
  }
};

template <typename Place, typename T>
class GPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<Tensor>("X");
    auto* rois = ctx.Input<LoDTensor>("ROIs");
    auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));

    auto pooled_height = ctx.Attr<int>("pooled_height");
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto output_channels = ctx.Attr<int>("output_channels");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");

    int rois_num = rois->dims()[0];
    int input_channels = in->dims()[1];
    int height = in->dims()[2];
    int width = in->dims()[3];

    if (input_grad) {
      // set roi batch id
      framework::Tensor rois_batch_id_list;
      rois_batch_id_list.Resize({rois_num});
      int* rois_batch_id_data =
          rois_batch_id_list.mutable_data<int>(platform::CPUPlace());
      auto rois_lod = rois->lod().back();
      int rois_batch_size = rois_lod.size() - 1;
      for (int n = 0; n < rois_batch_size; ++n) {
        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
          rois_batch_id_data[i] = n;
        }
      }

      framework::Tensor rois_batch_id_list_gpu;
      framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(),
                            ctx.device_context(), &rois_batch_id_list_gpu);

      input_grad->mutable_data<T>(ctx.GetPlace());
      math::SetConstant<Place, T> set_zero;
      set_zero(ctx.cuda_device_context(), input_grad, static_cast<T>(0));

      int output_grad_size = output_grad->numel();
      int blocks = NumBlocks(output_grad_size);
      int threads = kNumCUDAThreads;

      if (output_grad_size > 0) {
        GPUPSROIPoolBackward<T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
            output_grad_size, rois->data<T>(), output_grad->data<T>(),
            spatial_scale, input_channels, height, width, output_channels,
            pooled_height, pooled_width, rois_batch_id_list_gpu.data<int>(),
            input_grad->mutable_data<T>(ctx.GetPlace()));
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    psroi_pool,
    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::GPUPSROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
    psroi_pool_grad,
    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
    ops::GPUPSROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
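Both CUDA kernels above and the CPU kernels in psroi_pool_op.h share the same per-bin geometry: the ROI is scaled by spatial_scale, split into pooled_height x pooled_width bins, each bin is clipped to the feature map, and an empty bin contributes 0. The host-side sketch below reproduces just that boundary math for one bin (illustrative only; float stands in for the template type T):

    #include <algorithm>
    #include <cmath>

    struct Bin { int hstart, hend, wstart, wend; bool empty; };

    // Bin [hstart, hend) x [wstart, wend) for output cell (ph, pw) of one ROI,
    // following the same rounding, +1 end offset, 0.1 minimum size and clipping
    // used by GPUPSROIPoolForward/Backward.
    Bin PSROIBin(const float roi[4], float spatial_scale, int height, int width,
                 int pooled_h, int pooled_w, int ph, int pw) {
      float roi_start_w = std::round(roi[0]) * spatial_scale;
      float roi_start_h = std::round(roi[1]) * spatial_scale;
      float roi_end_w = (std::round(roi[2]) + 1.f) * spatial_scale;
      float roi_end_h = (std::round(roi[3]) + 1.f) * spatial_scale;
      float roi_height = std::max(roi_end_h - roi_start_h, 0.1f);  // force >= 0.1
      float roi_width = std::max(roi_end_w - roi_start_w, 0.1f);
      float bin_h = roi_height / pooled_h, bin_w = roi_width / pooled_w;
      Bin b;
      b.hstart = std::min(std::max(static_cast<int>(std::floor(bin_h * ph + roi_start_h)), 0), height);
      b.hend = std::min(std::max(static_cast<int>(std::ceil(bin_h * (ph + 1) + roi_start_h)), 0), height);
      b.wstart = std::min(std::max(static_cast<int>(std::floor(bin_w * pw + roi_start_w)), 0), width);
      b.wend = std::min(std::max(static_cast<int>(std::ceil(bin_w * (pw + 1) + roi_start_w)), 0), width);
      b.empty = (b.hend <= b.hstart) || (b.wend <= b.wstart);
      return b;
    }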
paddle/fluid/operators/psroi_pool_op.h
0 → 100644
浏览文件 @
740e1626
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
DeviceContext
,
typename
T
>
class
CPUPSROIPoolOpKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
auto
*
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
*
rois
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"ROIs"
);
auto
*
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
auto
pooled_height
=
ctx
.
Attr
<
int
>
(
"pooled_height"
);
auto
pooled_width
=
ctx
.
Attr
<
int
>
(
"pooled_width"
);
auto
spatial_scale
=
ctx
.
Attr
<
float
>
(
"spatial_scale"
);
auto
output_channels
=
ctx
.
Attr
<
int
>
(
"output_channels"
);
auto
in_dims
=
in
->
dims
();
int
batch_size
=
in_dims
[
0
];
int
input_channels
=
in_dims
[
1
];
int
height
=
in_dims
[
2
];
int
width
=
in_dims
[
3
];
int
rois_num
=
rois
->
dims
()[
0
];
auto
in_stride
=
framework
::
stride
(
in_dims
);
auto
roi_stride
=
framework
::
stride
(
rois
->
dims
());
auto
out_stride
=
framework
::
stride
(
out
->
dims
());
const
T
*
input_data
=
in
->
data
<
T
>
();
framework
::
Tensor
rois_batch_id_list
;
rois_batch_id_list
.
Resize
({
rois_num
});
int
*
rois_batch_id_data
=
rois_batch_id_list
.
mutable_data
<
int
>
(
ctx
.
GetPlace
());
auto
rois_lod
=
rois
->
lod
().
back
();
int
rois_batch_size
=
rois_lod
.
size
()
-
1
;
PADDLE_ENFORCE_EQ
(
rois_batch_size
,
batch_size
,
"the rois_batch_size and input(X) batch_size should be the same."
);
int
rois_num_with_lod
=
rois_lod
[
rois_batch_size
];
PADDLE_ENFORCE_EQ
(
rois_num_with_lod
,
rois_num
,
"the rois_num from input and lod must be the same"
);
PADDLE_ENFORCE_EQ
(
input_channels
,
output_channels
*
pooled_height
*
pooled_width
,
"the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width"
);
// calculate batch id index for each roi according to LoD
for
(
int
n
=
0
;
n
<
rois_batch_size
;
++
n
)
{
for
(
size_t
i
=
rois_lod
[
n
];
i
<
rois_lod
[
n
+
1
];
++
i
)
{
rois_batch_id_data
[
i
]
=
n
;
}
}
T
*
output_data
=
out
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
const
T
*
input_rois
=
rois
->
data
<
T
>
();
// calculate psroipooling, parallel processing can be implemented per ROI
for
(
int
n
=
0
;
n
<
rois_num
;
++
n
)
{
// set roi batch id
int
roi_batch_id
=
rois_batch_id_data
[
n
];
// [start, end) interval for spatial sampling
const
T
*
offset_input_rois
=
input_rois
+
n
*
4
;
T
roi_start_w
=
static_cast
<
T
>
(
round
(
offset_input_rois
[
0
]))
*
spatial_scale
;
T
roi_start_h
=
static_cast
<
T
>
(
round
(
offset_input_rois
[
1
]))
*
spatial_scale
;
T
roi_end_w
=
static_cast
<
T
>
(
round
(
offset_input_rois
[
2
])
+
1.
)
*
spatial_scale
;
T
roi_end_h
=
static_cast
<
T
>
(
round
(
offset_input_rois
[
3
])
+
1.
)
*
spatial_scale
;
// Force too small rois to be 1 x 1
T
roi_height
=
std
::
max
(
roi_end_h
-
roi_start_h
,
(
T
)
0.1
);
// avoid 0
T
roi_width
=
std
::
max
(
roi_end_w
-
roi_start_w
,
(
T
)
0.1
);
// Compute bin size w and h at input feature map
T
bin_size_h
=
roi_height
/
static_cast
<
T
>
(
pooled_height
);
T
bin_size_w
=
roi_width
/
static_cast
<
T
>
(
pooled_width
);
// calculate each pixel of the output feature map.
int
out_roi_offset
=
n
*
out_stride
[
0
];
for
(
int
c
=
0
;
c
<
output_channels
;
++
c
)
{
// per category
int
out_plane_offset
=
out_roi_offset
+
c
*
out_stride
[
1
];
for
(
int
ph
=
0
;
ph
<
pooled_height
;
++
ph
)
{
int
out_row_offset
=
out_plane_offset
+
ph
*
out_stride
[
2
];
for
(
int
pw
=
0
;
pw
<
pooled_width
;
++
pw
)
{
// calculate w and h at input feature map
int
hstart
=
floor
(
static_cast
<
T
>
(
ph
)
*
bin_size_h
+
roi_start_h
);
int
wstart
=
floor
(
static_cast
<
T
>
(
pw
)
*
bin_size_w
+
roi_start_w
);
int
hend
=
ceil
(
static_cast
<
T
>
(
ph
+
1
)
*
bin_size_h
+
roi_start_h
);
int
wend
=
ceil
(
static_cast
<
T
>
(
pw
+
1
)
*
bin_size_w
+
roi_start_w
);
// Add roi offsets and clip to input boundaries
hstart
=
std
::
min
(
std
::
max
(
hstart
,
0
),
height
);
wstart
=
std
::
min
(
std
::
max
(
wstart
,
0
),
width
);
hend
=
std
::
min
(
std
::
max
(
hend
,
0
),
height
);
wend
=
std
::
min
(
std
::
max
(
wend
,
0
),
width
);
int
output_index
=
out_row_offset
+
pw
;
int
input_channel
=
(
c
*
pooled_height
+
ph
)
*
pooled_width
+
pw
;
int
input_plane_offset
=
roi_batch_id
*
in_stride
[
0
]
+
input_channel
*
in_stride
[
1
];
const
T
*
offset_input_data
=
input_data
+
input_plane_offset
;
T
out_sum
=
0.
;
bool
is_empty
=
(
hend
<=
hstart
)
||
(
wend
<=
wstart
);
for
(
int
ih
=
hstart
;
ih
<
hend
;
++
ih
)
{
for
(
int
iw
=
wstart
;
iw
<
wend
;
++
iw
)
{
int
input_index
=
ih
*
in_stride
[
2
]
+
iw
;
out_sum
+=
offset_input_data
[
input_index
];
}
}
T
bin_area
=
(
hend
-
hstart
)
*
(
wend
-
                                               wstart);
            output_data[output_index] = is_empty ? 0. : out_sum / bin_area;
          }
        }
      }
    }
    return;
  }
};

template <typename DeviceContext, typename T>
class CPUPSROIPoolGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
    auto* output_grad =
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* input_grad =
        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    auto pooled_height = ctx.Attr<int>("pooled_height");
    auto pooled_width = ctx.Attr<int>("pooled_width");
    auto output_channels = ctx.Attr<int>("output_channels");
    auto spatial_scale = ctx.Attr<float>("spatial_scale");

    if (input_grad) {
      auto in_dims = in->dims();
      int input_channels = in_dims[1];
      int height = in_dims[2];
      int width = in_dims[3];
      int rois_num = rois->dims()[0];

      // set roi batch id
      framework::Tensor rois_batch_id_list;
      rois_batch_id_list.Resize({rois_num});
      int* rois_batch_id_data =
          rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
      auto rois_lod = rois->lod().back();
      int rois_batch_size = rois_lod.size() - 1;
      // calculate batch id index for each roi according to LoD
      for (int n = 0; n < rois_batch_size; ++n) {
        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
          rois_batch_id_data[i] = n;
        }
      }

      const T* input_rois = rois->data<T>();
      const T* output_grad_data = output_grad->data<T>();
      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());

      // set gradient of X to be 0. before backpropagate.
      math::SetConstant<DeviceContext, T> set_zero;
      set_zero(ctx.template device_context<DeviceContext>(), input_grad,
               static_cast<T>(0));

      // backpropagate gradient per output pixel
      int output_grad_size = output_grad->numel();
      for (int i = 0; i < output_grad_size; ++i) {
        // The output is in order (n, c, ph, pw)
        int pw = i % pooled_width;
        int ph = (i / pooled_width) % pooled_height;
        int c = (i / pooled_width / pooled_height) % output_channels;
        int n = i / pooled_width / pooled_height / output_channels;

        // set roi_batch_id
        int roi_batch_id = rois_batch_id_data[n];
        int input_channel = (c * pooled_height + ph) * pooled_width + pw;
        int input_offset =
            (roi_batch_id * input_channels + input_channel) * height * width;
        T* offset_input_grad_data = input_grad_data + input_offset;

        // [start, end) interval for spatial sampling
        const T* offset_input_rois = input_rois + n * 4;
        T roi_start_w =
            static_cast<T>(round(offset_input_rois[0])) * spatial_scale;
        T roi_start_h =
            static_cast<T>(round(offset_input_rois[1])) * spatial_scale;
        T roi_end_w =
            static_cast<T>(round(offset_input_rois[2]) + 1.) * spatial_scale;
        T roi_end_h =
            static_cast<T>(round(offset_input_rois[3]) + 1.) * spatial_scale;

        // Force too small ROIs to be 1x1
        T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1);  // avoid 0
        T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1);

        // Compute w and h at input feature map
        T bin_size_h = roi_height / static_cast<T>(pooled_height);
        T bin_size_w = roi_width / static_cast<T>(pooled_width);

        int hstart = floor(bin_size_h * static_cast<T>(ph) + roi_start_h);
        int wstart = floor(bin_size_w * static_cast<T>(pw) + roi_start_w);
        int hend = ceil(bin_size_h * static_cast<T>(ph + 1) + roi_start_h);
        int wend = ceil(bin_size_w * static_cast<T>(pw + 1) + roi_start_w);

        // Add roi offsets and clip to input boundaries
        hstart = std::min(std::max(hstart, 0), height);
        hend = std::min(std::max(hend, 0), height);
        wstart = std::min(std::max(wstart, 0), width);
        wend = std::min(std::max(wend, 0), width);
        bool is_empty = (hend <= hstart) || (wend <= wstart);

        // Accumulate diff_val into input data
        T bin_area = static_cast<T>((hend - hstart) * (wend - wstart));
        T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area;
        for (int ih = hstart; ih < hend; ++ih) {
          for (int iw = wstart; iw < wend; ++iw) {
            int input_index = ih * width + iw;
            offset_input_grad_data[input_index] += diff_val;
          }
        }
      }
    }
    return;
  }
};

}  // namespace operators
}  // namespace paddle
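For reference, the accumulation performed by the loop above can be stated compactly (same symbols as the kernel; this is only a restatement of the code, not new behaviour): every input location (ih, iw) inside a non-empty pooling bin of output element (n, c, ph, pw) receives an equal share of that output gradient,

\[
\frac{\partial L}{\partial x_{ih,\,iw}} \mathrel{+}= \frac{1}{(h_{\text{end}}-h_{\text{start}})\,(w_{\text{end}}-w_{\text{start}})}\;\frac{\partial L}{\partial y_{n,c,ph,pw}},
\]

while empty bins (hend <= hstart or wend <= wstart) contribute nothing, which matches diff_val = 0.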
paddle/fluid/operators/reader/ctr_reader.h
@@ -16,6 +16,7 @@
 #include <sys/time.h>
 #include <algorithm>
 #include <chrono>  // NOLINT
+#include <cstdlib>
 #include <fstream>
@@ -55,8 +56,7 @@ class CTRReader : public framework::FileReader {
     PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!");
     PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null");
     PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty");
-    thread_num_ =
-        file_list_.size() > thread_num ? thread_num : file_list_.size();
+    thread_num_ = std::min<size_t>(file_list_.size(), thread_num);
     queue_ = queue;
     SplitFiles();
     for (size_t i = 0; i < thread_num_; ++i) {
@@ -95,10 +95,10 @@ class CTRReader : public framework::FileReader {
     queue_->ReOpen();
     VLOG(3) << "reopen success";
     VLOG(3) << "thread_num " << thread_num_;
-    for (int thread_id = 0; thread_id < thread_num_; thread_id++) {
-      read_threads_.emplace_back(new std::thread(
-          std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_,
-                    thread_id, &read_thread_status_, queue_)));
+    for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) {
+      read_threads_.emplace_back(new std::thread(std::bind(
+          &ReadThread, file_groups_[thread_id], slots_, batch_size_,
+          static_cast<int>(thread_id), &read_thread_status_, queue_)));
     }
     monitor_thread_.reset(new std::thread(
         std::bind(&MonitorThread, &read_thread_status_, queue_)));
paddle/fluid/operators/spp_op.h
@@ -56,13 +56,13 @@ class SppKernel : public framework::OpKernel<T> {
       math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
       math::MaxPool<T> max_process;
       pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                   kernel_size, strides, paddings, max_process, true,
+                   kernel_size, strides, paddings, max_process, true, false,
                    &out_level);
     } else if (pooling_type == "avg") {
       math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
       math::AvgPool<T> avg_process;
       pool_forward(context.template device_context<DeviceContext>(), *in_x,
-                   kernel_size, strides, paddings, avg_process, true,
+                   kernel_size, strides, paddings, avg_process, true, false,
                    &out_level);
     }
     // flatten pooling output shape
@@ -156,7 +156,7 @@ class SppGradKernel : public framework::OpKernel<T> {
       math::AvgPoolGrad<T> avg_process;
       pool_backward(context.template device_context<DeviceContext>(), *in_x,
                     *&out_level, *&outgrad_level, kernel_size, strides,
-                    paddings, avg_process, true, in_x_grad);
+                    paddings, avg_process, true, false, in_x_grad);
     }
   }
 }
paddle/fluid/platform/CMakeLists.txt
@@ -56,9 +56,16 @@ ELSE()
   set(MKLDNN_CTX_DEPS)
 ENDIF()
 
+nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce)
+
+IF(WITH_GPU)
+  set(STREAM_CALLBACK_DEPS stream_callback_manager)
+ELSE()
+  set(STREAM_CALLBACK_DEPS)
+ENDIF()
+
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc
+cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS}
     place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
paddle/fluid/platform/device_context.cc
@@ -3,6 +3,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
paddle/fluid/platform/device_context.h
@@ -222,14 +222,10 @@ class CUDADeviceContext : public DeviceContext {
   template <typename Callback>
   void AddStreamCallback(Callback&& callback) const {
-    std::lock_guard<std::mutex> guard(callback_mtx_);
     callback_manager_->AddCallback(callback);
   }
 
-  void WaitStreamCallback() const {
-    std::lock_guard<std::mutex> guard(callback_mtx_);
-    callback_manager_->Wait();
-  }
+  void WaitStreamCallback() const { callback_manager_->Wait(); }
 
 #if CUDA_VERSION >= 9000
   /*! \brief CublasCall may need to change cublas's config,
@@ -260,9 +256,7 @@ class CUDADeviceContext : public DeviceContext {
   mutable std::mutex mtx_;
 
-  // This lock is only used by callback
-  // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
-  mutable std::mutex callback_mtx_;
+  // StreamCallbackManager is thread-safe
   std::unique_ptr<StreamCallbackManager> callback_manager_;
 
   mutable std::mutex cublas_mtx_;
paddle/fluid/platform/stream_callback_manager.cc (new file, 0 → 100644)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/platform/stream_callback_manager.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace platform {

#if CUDA_VERSION >= 10000
static void CUDART_CB StreamCallbackFunc(void *user_data)
#else
static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
                                         cudaError_t status, void *user_data)
#endif
{
  std::unique_ptr<std::function<void()>> func(
      reinterpret_cast<std::function<void()> *>(user_data));
  (*func)();
}

StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream)
    : stream_(stream), thread_pool_(1) {}

void StreamCallbackManager::AddCallback(std::function<void()> callback) const {
  auto *callback_func = new std::function<void()>(std::move(callback));
  auto *func = new std::function<void()>([this, callback_func] {
    std::lock_guard<std::mutex> lock(mtx_);
    last_future_ = thread_pool_.enqueue([callback_func] {
      std::unique_ptr<std::function<void()>> releaser(callback_func);
      (*callback_func)();
    });
  });
#if CUDA_VERSION >= 10000
  PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func));
#else
  PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0));
#endif
}

void StreamCallbackManager::Wait() const {
  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
  {
    std::lock_guard<std::mutex> lock(mtx_);
    if (last_future_.valid()) {
      last_future_.wait();
    }
  }
}

}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/stream_callback_manager.h
@@ -18,67 +18,32 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <functional>
 #include <future>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace platform {
 
-class StreamCallbackManager;
-
-struct StreamCallbackContext {
-  template <typename Callback>
-  inline StreamCallbackContext(const StreamCallbackManager *manager,
-                               Callback &&callback)
-      : manager_(manager), callback_(callback) {}
-
-  const StreamCallbackManager *manager_;  // do not own
-  std::function<void()> callback_;
-};
-
+// NOTE(zjl): clean StreamCallbackManager to make compilation faster
+// Make StreamCallbackManager thread-safe
 class StreamCallbackManager {
  public:
-  explicit inline StreamCallbackManager(cudaStream_t stream = nullptr)
-      : stream_(stream), thread_pool_(new ThreadPool(1)) {}
+  explicit StreamCallbackManager(const cudaStream_t stream);
 
   ~StreamCallbackManager() = default;
 
-  template <typename Callback>
-  inline void AddCallback(Callback &&callback) const {
-    auto *stream_callback_context =
-        new StreamCallbackContext(this, std::forward<Callback>(callback));
-#if CUDA_VERSION >= 10000
-    PADDLE_ENFORCE(cudaLaunchHostFunc(
-        stream_, StreamCallbackManager::StreamCallbackFunc,
-        stream_callback_context));  // NOLINT
-#else
-    PADDLE_ENFORCE(cudaStreamAddCallback(
-        stream_, StreamCallbackManager::StreamCallbackFunc,
-        stream_callback_context, 0));  // NOLINT
-#endif
-  }
+  void AddCallback(std::function<void()> callback) const;
 
-  void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
+  void Wait() const;
 
  private:
   const cudaStream_t stream_;
-  mutable std::unique_ptr<ThreadPool> thread_pool_;
-
-  // cudaStreamCallback cannot call CUDA API inside, so we have to use
-  // thread_pool here
-#if CUDA_VERSION >= 10000
-  static void CUDART_CB StreamCallbackFunc(void *user_data)
-#else
-  static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
-                                           cudaError_t status, void *user_data)
-#endif
-  {
-    auto *callback_context_ptr =
-        reinterpret_cast<StreamCallbackContext *>(user_data);
-    callback_context_ptr->manager_->thread_pool_->enqueue([=]() {
-      std::unique_ptr<StreamCallbackContext> callback_context(
-          callback_context_ptr);
-      callback_context->callback_();
-    });
-  }
+  mutable ::ThreadPool thread_pool_;
+  mutable std::mutex mtx_;
+  mutable std::future<void> last_future_;
 };
 
 }  // namespace platform
paddle/fluid/pybind/pybind.cc
@@ -81,6 +81,14 @@ bool IsCompiledWithCUDA() {
 #endif
 }
 
+bool IsCompiledWithBrpc() {
+#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA)
+  return true;
+#else
+  return false;
+#endif
+}
+
 bool IsCompiledWithDIST() {
 #ifdef PADDLE_WITH_DISTRIBUTE
   return true;
@@ -631,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle.
         [](bool init_p2p) { framework::InitDevices(init_p2p); });
 
   m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+  m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
   m.def("is_compiled_with_dist", IsCompiledWithDIST);
 #ifdef PADDLE_WITH_CUDA
   m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool {
paddle/fluid/pybind/tensor_py.h
@@ -162,7 +162,7 @@ void PyCPUTensorSetFromArray(
     paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray(
     paddle::platform::CPUPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (int i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -200,7 +200,7 @@ void PyCUDATensorSetFromArray(
     paddle::platform::CUDAPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -221,7 +221,7 @@ inline void PyCUDATensorSetFromArray(
     paddle::platform::CUDAPlace place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -240,7 +240,7 @@ void PyCUDAPinnedTensorSetFromArray(
     const paddle::platform::CUDAPinnedPlace &place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
@@ -260,7 +260,7 @@ inline void PyCUDAPinnedTensorSetFromArray(
     const paddle::platform::CUDAPinnedPlace &place) {
   std::vector<int64_t> dims;
   dims.reserve(array.ndim());
-  for (size_t i = 0; i < array.ndim(); ++i) {
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
     dims.push_back(static_cast<int>(array.shape()[i]));
   }
paddle/scripts/paddle_build.sh
@@ -517,6 +517,18 @@ function assert_api_spec_approvals() {
           fi
       fi
     done
+
+    HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
+    if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
+        APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433`
+        echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
+        if [ "${APPROVALS}" == "FALSE" ]; then
+            echo "You must have at least 2 approvals for the const_cast"
+            exit 1
+        fi
+    fi
 }
python/paddle/fluid/__init__.py
@@ -126,9 +126,9 @@ def __bootstrap__():
         'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn',
         'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
         'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
-        'eager_delete_tensor_gb', 'allocator_strategy',
-        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
-        'pe_profile_fname'
+        'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
+        'allocator_strategy', 'reader_queue_speed_test_mode',
+        'print_sub_graph_dir', 'pe_profile_fname'
     ]
     if 'Darwin' not in sysstr:
         read_env_flags.append('use_pinned_memory')
@@ -152,6 +152,7 @@ def __bootstrap__():
             'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
             'cudnn_exhaustive_search', 'selected_gpus'
         ]
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
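The newly registered flags ('eager_delete_tensor_gb', 'fast_eager_deletion_mode') are read from the environment in __bootstrap__, which is what the eager-deletion unit tests added later in this commit rely on. A minimal sketch of how a script would opt in; the value for 'fast_eager_deletion_mode' below is an assumption, while the 'eager_delete_tensor_gb' value mirrors the new tests:

# Sketch: set eager-deletion flags before paddle.fluid is imported,
# following the pattern used by the new test_eager_deletion_* tests.
import os
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'      # delete tensors as soon as possible
os.environ['FLAGS_fast_eager_deletion_mode'] = 'true'   # assumed boolean value

import paddle.fluid as fluid  # flags are consumed by fluid.__bootstrap__()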
python/paddle/fluid/layers/nn.py
@@ -52,6 +52,8 @@ __all__ = [
     'softmax',
     'pool2d',
     'pool3d',
+    'adaptive_pool2d',
+    'adaptive_pool3d',
     'batch_norm',
     'beam_search_decode',
     'conv2d_transpose',
@@ -173,6 +175,7 @@ __all__ = [
     'merge_selected_rows',
     'get_tensor_from_selected_rows',
     'lstm',
+    'psroi_pool',
 ]
 
 kIgnoreIndex = -100
@@ -2499,6 +2502,204 @@ def pool3d(input,
     return pool_out


@templatedoc(op_type="pool2d")
def adaptive_pool2d(input,
                    pool_size,
                    pool_type="max",
                    require_index=False,
                    name=None):
    """
    ${comment}

    Args:
        input (Variable): The input tensor of pooling operator. The format of
                          input tensor is NCHW, where N is batch size, C is
                          the number of channels, H is the height of the
                          feature, and W is the width of the feature.
        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain two integers, (pool_size_Height, pool_size_Width).
        pool_type: ${pooling_type_comment}
        require_index (bool): If true, the index of the max pooling point is returned along
            with the outputs. It cannot be set in average pooling type.
        name (str|None): A name for this layer (optional). If set None, the
            layer will be named automatically.

    Returns:
        Variable: The pooling result.

    Raises:
        ValueError: 'pool_type' is not 'max' nor 'avg'.
        ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
        ValueError: 'pool_size' should be a list or tuple with length as 2.

    Examples:
        .. code-block:: python

          # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n],
          # output shape is [N, C, m, n], adaptive pool divides H and W dimensions
          # of input data into m * n grids evenly and performs pooling in each
          # grid to get the output.
          # adaptive average pool performs calculations as follows:
          #
          #     for i in range(m):
          #         for j in range(n):
          #             hstart = floor(i * H / m)
          #             hend = ceil((i + 1) * H / m)
          #             wstart = floor(j * W / n)
          #             wend = ceil((j + 1) * W / n)
          #             output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend])
          #
          data = fluid.layers.data(
              name='data', shape=[3, 32, 32], dtype='float32')
          pool_out = fluid.layers.adaptive_pool2d(
                            input=data,
                            pool_size=[3, 3],
                            pool_type='avg')
    """
    if pool_type not in ["max", "avg"]:
        raise ValueError(
            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
            str(pool_type))

    if pool_type == "avg" and require_index:
        raise ValueError(
            "invalid setting 'require_index' true when 'pool_type' is 'avg'.")

    def _is_list_or_tuple_(data):
        return (isinstance(data, list) or isinstance(data, tuple))

    if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2:
        raise ValueError(
            "'pool_size' should be a list or tuple with length as 2.")

    if pool_type == "max":
        l_type = 'max_pool2d_with_index'
    else:
        l_type = "pool2d"

    helper = LayerHelper(l_type, **locals())
    dtype = helper.input_dtype()
    pool_out = helper.create_variable_for_type_inference(dtype)

    outputs = {"Out": pool_out}
    if pool_type == "max":
        mask = helper.create_variable_for_type_inference(dtype)
        outputs["Mask"] = mask

    helper.append_op(
        type=l_type,
        inputs={"X": input},
        outputs=outputs,
        attrs={
            "pooling_type": pool_type,
            "ksize": pool_size,
            "adaptive": True,
        })

    return (pool_out, mask) if require_index else pool_out
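For intuition, a standalone NumPy-only sketch of the adaptive grid described in the docstring above (plain arithmetic on a toy array, not the Paddle API): each of the m x n output cells averages a floor/ceil-bounded window of the input.

# NumPy sketch of adaptive average pooling; values chosen arbitrarily.
import numpy as np

def adaptive_avg_pool2d_naive(x, m, n):
    N, C, H, W = x.shape
    out = np.zeros((N, C, m, n))
    for i in range(m):
        hstart, hend = int(np.floor(i * H / m)), int(np.ceil((i + 1) * H / m))
        for j in range(n):
            wstart, wend = int(np.floor(j * W / n)), int(np.ceil((j + 1) * W / n))
            # average over the (possibly overlapping) window for this cell
            out[:, :, i, j] = x[:, :, hstart:hend, wstart:wend].mean(axis=(2, 3))
    return out

x = np.random.rand(1, 3, 32, 32)
print(adaptive_avg_pool2d_naive(x, 3, 3).shape)  # (1, 3, 3, 3)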
@templatedoc(op_type="pool3d")
def adaptive_pool3d(input,
                    pool_size,
                    pool_type="max",
                    require_index=False,
                    name=None):
    """
    ${comment}

    Args:
        input (Variable): The input tensor of pooling operator. The format of
                          input tensor is NCDHW, where N is batch size, C is
                          the number of channels, D is the depth, H is the
                          height, and W is the width of the feature.
        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
            it must contain three integers, (Depth, Height, Width).
        pool_type: ${pooling_type_comment}
        require_index (bool): If true, the index of the max pooling point is returned along
            with the outputs. It cannot be set in average pooling type.
        name (str|None): A name for this layer (optional). If set None, the
            layer will be named automatically.

    Returns:
        Variable: The pooling result.

    Raises:
        ValueError: 'pool_type' is not 'max' nor 'avg'.
        ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'.
        ValueError: 'pool_size' should be a list or tuple with length as 3.

    Examples:
        .. code-block:: python

          # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n],
          # output shape is [N, C, l, m, n], adaptive pool divides D, H and W dimensions
          # of input data into l * m * n grids evenly and performs pooling in each
          # grid to get the output.
          # adaptive average pool performs calculations as follows:
          #
          #     for i in range(l):
          #         for j in range(m):
          #             for k in range(n):
          #                 dstart = floor(i * D / l)
          #                 dend = ceil((i + 1) * D / l)
          #                 hstart = floor(j * H / m)
          #                 hend = ceil((j + 1) * H / m)
          #                 wstart = floor(k * W / n)
          #                 wend = ceil((k + 1) * W / n)
          #                 output[:, :, i, j, k] =
          #                     avg(input[:, :, dstart:dend, hstart: hend, wstart: wend])
          #
          data = fluid.layers.data(
              name='data', shape=[3, 32, 32, 32], dtype='float32')
          pool_out = fluid.layers.adaptive_pool3d(
                            input=data,
                            pool_size=[3, 3, 3],
                            pool_type='avg')
    """
    if pool_type not in ["max", "avg"]:
        raise ValueError(
            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.",
            str(pool_type))

    if pool_type == "avg" and require_index:
        raise ValueError(
            "invalid setting 'require_index' true when 'pool_type' is 'avg'.")

    def _is_list_or_tuple_(data):
        return (isinstance(data, list) or isinstance(data, tuple))

    if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3:
        raise ValueError(
            "'pool_size' should be a list or tuple with length as 3.")

    if pool_type == "max":
        l_type = 'max_pool3d_with_index'
    else:
        l_type = "pool3d"

    helper = LayerHelper(l_type, **locals())
    dtype = helper.input_dtype()
    pool_out = helper.create_variable_for_type_inference(dtype)

    outputs = {"Out": pool_out}
    if pool_type == "max":
        mask = helper.create_variable_for_type_inference(dtype)
        outputs["Mask"] = mask

    helper.append_op(
        type=l_type,
        inputs={"X": input},
        outputs=outputs,
        attrs={
            "pooling_type": pool_type,
            "ksize": pool_size,
            "adaptive": True,
        })

    return (pool_out, mask) if require_index else pool_out


def batch_norm(input,
               act=None,
               is_test=False,
@@ -9122,3 +9323,57 @@ def get_tensor_from_selected_rows(x, name=None):
         outputs={'Out': out},
         attrs={})
     return out


@templatedoc()
def psroi_pool(input,
               rois,
               output_channels,
               spatial_scale,
               pooled_height,
               pooled_width,
               name=None):
    """
    ${comment}

    Args:
        input (Variable): ${x_comment}
        rois (Variable): ROIs (Regions of Interest) to pool over.
        output_channels (integer): ${output_channels_comment}
        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
        pooled_height (integer): ${pooled_height_comment} Default: 1
        pooled_width (integer): ${pooled_width_comment} Default: 1
        name (str, default None): The name of this layer.

    Returns:
        Variable: ${out_comment}.

    Examples:
        .. code-block:: python

            pool_out = fluid.layers.psroi_pool(input=x, rois=rois,
                                               output_channels=490,
                                               spatial_scale=1.0,
                                               pooled_height=7,
                                               pooled_width=7)
    """
    helper = LayerHelper('psroi_pool', **locals())
    # check attrs
    if not isinstance(output_channels, int):
        raise TypeError("output_channels must be int type")
    if not isinstance(spatial_scale, float):
        raise TypeError("spatial_scale must be float type")
    if not isinstance(pooled_height, int):
        raise TypeError("pooled_height must be int type")
    if not isinstance(pooled_width, int):
        raise TypeError("pooled_width must be int type")
    dtype = helper.input_dtype()
    out = helper.create_variable_for_type_inference(dtype)
    helper.append_op(
        type='psroi_pool',
        inputs={'X': input,
                'ROIs': rois},
        outputs={'Out': out},
        attrs={
            'output_channels': output_channels,
            'spatial_scale': spatial_scale,
            'pooled_height': pooled_height,
            'pooled_width': pooled_width
        })
    return out
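A short usage sketch of the new layer, mirroring the unit test added below in test_layers.py (the shapes and attribute values are the ones that test uses, shown here with keyword arguments for clarity):

# Sketch: build a program that applies psroi_pool, following test_layers.py.
import paddle.fluid as fluid

x = fluid.layers.data(name="x", shape=[245, 30, 30], dtype="float32")
rois = fluid.layers.data(name="rois", shape=[4], dtype="float32", lod_level=1)
out = fluid.layers.psroi_pool(x, rois,
                              output_channels=5,
                              spatial_scale=0.25,
                              pooled_height=7,
                              pooled_width=7)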
python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -111,3 +111,7 @@ py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo
 if(NOT APPLE)
     py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
 endif()
+
+if(WITH_NGRAPH)
+    add_subdirectory(ngraph)
+endif()
python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt (new file, 0 → 100644)
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")

foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS FLAGS_use_ngraph=true)
endforeach(TEST_OP)
python/paddle/fluid/tests/unittests/ngraph/__init__.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.
# (Standard license header only; this package marker file contains no code.)
python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -227,6 +227,7 @@ class TestDistBase(unittest.TestCase):
     def setUp(self):
         self._trainers = 2
         self._pservers = 2
+        self._port_set = set()
         self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % (
             self._find_free_port(), self._find_free_port())
         self._python_interp = sys.executable
@@ -242,10 +243,18 @@ class TestDistBase(unittest.TestCase):
         self._after_setup_config()
 
     def _find_free_port(self):
-        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
-            s.bind(('', 0))
-            return s.getsockname()[1]
+        def __free_port():
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as s:
+                s.bind(('', 0))
+                return s.getsockname()[1]
+
+        while True:
+            port = __free_port()
+            if port not in self._port_set:
+                self._port_set.add(port)
+                return port
 
     def start_pserver(self, model_file, check_error_log, required_envs):
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
         ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"
python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

import os
os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0'
os.environ['CPU_NUM'] = '2'

import six
import unittest

import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid


def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
    if use_cuda and not core.is_compiled_with_cuda():
        print('Skip use_cuda=True because Paddle is not compiled with cuda')
        return

    word_dict = paddle.dataset.imdb.word_dict()
    train_reader = paddle.batch(
        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)

    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    cost = network(data, label, len(word_dict))
    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
    optimizer.minimize(cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    reader = feeder.decorate_reader(
        train_reader, multi_devices=use_parallel_executor)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if use_parallel_executor:
        train_exe = fluid.ParallelExecutor(
            use_cuda=use_cuda, loss_name=cost.name)
        fetch_list = [cost.name]
    else:
        train_exe = exe
        fetch_list = [cost]

    for pass_id in six.moves.xrange(pass_num):
        batch_id = 0
        for data in reader():
            train_exe.run(feed=data,
                          fetch_list=fetch_list if batch_id % 4 == 0 else [])
            batch_id += 1
            if batch_id > 16:
                break


class TestBase(unittest.TestCase):
    def setUp(self):
        self.net = None

    def test_network(self):
        if self.net is None:
            return

        for use_cuda in [True, False]:
            for use_parallel_executor in [False, True]:
                print('network: {}, use_cuda: {}, use_parallel_executor: {}'.
                      format(self.net.__name__, use_cuda,
                             use_parallel_executor))
                with fluid.program_guard(fluid.Program(), fluid.Program()):
                    with fluid.scope_guard(core.Scope()):
                        train(self.net, use_cuda, use_parallel_executor)
python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

import unittest
from test_eager_deletion_dynamic_rnn_base import TestBase
import paddle.fluid as fluid


def gru_net(data,
            label,
            dict_dim,
            emb_dim=128,
            hid_dim=128,
            hid_dim2=96,
            class_dim=2,
            emb_lr=400.0):
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
    gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
    gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
    gru_max_tanh = fluid.layers.tanh(gru_max)
    fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    return avg_cost


class GRUTest(TestBase):
    def setUp(self):
        self.net = gru_net


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

from test_eager_deletion_dynamic_rnn_base import TestBase
import paddle.fluid as fluid
import unittest


def lstm_net(data,
             label,
             dict_dim,
             emb_dim=128,
             hid_dim=128,
             hid_dim2=96,
             class_dim=2,
             emb_lr=30.0):
    emb = fluid.layers.embedding(
        input=data,
        size=[dict_dim, emb_dim],
        param_attr=fluid.ParamAttr(learning_rate=emb_lr))
    fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4)
    lstm_h, c = fluid.layers.dynamic_lstm(
        input=fc0, size=hid_dim * 4, is_reverse=False)
    lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
    lstm_max_tanh = fluid.layers.tanh(lstm_max)
    fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
    prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)
    return avg_cost


class LSTMTest(TestBase):
    def setUp(self):
        self.net = lstm_net


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

import os
import unittest
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"

from test_parallel_executor_mnist import TestMNIST


class EagerDeletionTestMNIST(TestMNIST):
    pass


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

import os
import unittest
os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"

from test_parallel_executor_transformer import TestTransformer


class EagerDeletionTestTransformer(TestTransformer):
    pass


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_layers.py
@@ -233,6 +233,29 @@ class TestBook(unittest.TestCase):
                     pool_stride=[1, 2],
                     pool_padding=(2, 1)))
 
+    def test_adaptive_pool2d(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 224, 224], dtype='float32')
+            self.assertIsNotNone(
+                layers.adaptive_pool2d(x, [3, 3], pool_type='avg'))
+            pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)
+
+    def test_adaptive_pool3d(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(
+                name='x', shape=[3, 244, 224, 224], dtype='float32')
+            self.assertIsNotNone(
+                layers.adaptive_pool3d(x, [3, 3, 3], pool_type='avg'))
+            pool, mask = layers.adaptive_pool3d(
+                x, [3, 3, 3], require_index=True)
+            self.assertIsNotNone(pool)
+            self.assertIsNotNone(mask)
+
     def test_lstm_unit(self):
         program = Program()
         with program_guard(program):
@@ -511,6 +534,16 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(output)
             print(str(program))
 
+    def test_psroi_pool(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name="x", shape=[245, 30, 30], dtype="float32")
+            rois = layers.data(
+                name="rois", shape=[4], dtype="float32", lod_level=1)
+            output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7)
+            self.assertIsNotNone(output)
+            print(str(program))
+
     def test_roi_align(self):
         program = Program()
         with program_guard(program):
python/paddle/fluid/tests/unittests/test_pool2d_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
+from __future__ import division
 
 import unittest
 import numpy as np
@@ -21,16 +22,28 @@ import paddle.fluid.core as core
 from op_test import OpTest
 
 
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
max_pool2D_forward_naive
(
x
,
ksize
,
strides
,
paddings
,
global_pool
=
0
,
ceil_mode
=
False
,
exclusive
=
True
):
exclusive
=
True
,
adaptive
=
False
):
N
,
C
,
H
,
W
=
x
.
shape
if
global_pool
==
1
:
ksize
=
[
H
,
W
]
if
adaptive
:
H_out
,
W_out
=
ksize
else
:
H_out
=
(
H
-
ksize
[
0
]
+
2
*
paddings
[
0
]
+
strides
[
0
]
-
1
)
//
strides
[
0
]
+
1
if
ceil_mode
else
(
H
-
ksize
[
0
]
+
2
*
paddings
[
0
])
//
strides
[
0
]
+
1
...
...
@@ -40,6 +53,12 @@ def max_pool2D_forward_naive(x,
out
=
np
.
zeros
((
N
,
C
,
H_out
,
W_out
))
for
i
in
range
(
H_out
):
for
j
in
range
(
W_out
):
if
adaptive
:
r_start
=
adaptive_start_index
(
i
,
H
,
ksize
[
0
])
r_end
=
adaptive_end_index
(
i
,
H
,
ksize
[
0
])
c_start
=
adaptive_start_index
(
j
,
W
,
ksize
[
1
])
c_end
=
adaptive_end_index
(
j
,
W
,
ksize
[
1
])
else
:
r_start
=
np
.
max
((
i
*
strides
[
0
]
-
paddings
[
0
],
0
))
r_end
=
np
.
min
((
i
*
strides
[
0
]
+
ksize
[
0
]
-
paddings
[
0
],
H
))
c_start
=
np
.
max
((
j
*
strides
[
1
]
-
paddings
[
1
],
0
))
...
...
@@ -56,10 +75,14 @@ def avg_pool2D_forward_naive(x,
paddings
,
global_pool
=
0
,
ceil_mode
=
False
,
exclusive
=
True
):
exclusive
=
True
,
adaptive
=
False
):
N
,
C
,
H
,
W
=
x
.
shape
if
global_pool
==
1
:
ksize
=
[
H
,
W
]
if
adaptive
:
H_out
,
W_out
=
ksize
else
:
H_out
=
(
H
-
ksize
[
0
]
+
2
*
paddings
[
0
]
+
strides
[
0
]
-
1
)
//
strides
[
0
]
+
1
if
ceil_mode
else
(
H
-
ksize
[
0
]
+
2
*
paddings
[
0
])
//
strides
[
0
]
+
1
...
...
@@ -69,14 +92,20 @@ def avg_pool2D_forward_naive(x,
out
=
np
.
zeros
((
N
,
C
,
H_out
,
W_out
))
for
i
in
range
(
H_out
):
for
j
in
range
(
W_out
):
if
adaptive
:
r_start
=
adaptive_start_index
(
i
,
H
,
ksize
[
0
])
r_end
=
adaptive_end_index
(
i
,
H
,
ksize
[
0
])
c_start
=
adaptive_start_index
(
j
,
W
,
ksize
[
1
])
c_end
=
adaptive_end_index
(
j
,
W
,
ksize
[
1
])
else
:
r_start
=
np
.
max
((
i
*
strides
[
0
]
-
paddings
[
0
],
0
))
r_end
=
np
.
min
((
i
*
strides
[
0
]
+
ksize
[
0
]
-
paddings
[
0
],
H
))
c_start
=
np
.
max
((
j
*
strides
[
1
]
-
paddings
[
1
],
0
))
c_end
=
np
.
min
((
j
*
strides
[
1
]
+
ksize
[
1
]
-
paddings
[
1
],
W
))
x_masked
=
x
[:,
:,
r_start
:
r_end
,
c_start
:
c_end
]
field_size
=
((
r_end
-
r_start
)
*
(
c_end
-
c_start
))
if
exclusive
\
else
(
ksize
[
0
]
*
ksize
[
1
])
field_size
=
((
r_end
-
r_start
)
*
(
c_end
-
c_start
))
\
if
(
exclusive
or
adaptive
)
else
(
ksize
[
0
]
*
ksize
[
1
])
out
[:,
:,
i
,
j
]
=
np
.
sum
(
x_masked
,
axis
=
(
2
,
3
))
/
field_size
return
out
...
...
@@ -93,12 +122,13 @@ class TestPool2D_Op(OpTest):
self
.
init_pool_type
()
self
.
init_ceil_mode
()
self
.
init_exclusive
()
self
.
init_adaptive
()
if
self
.
global_pool
:
self
.
paddings
=
[
0
for
_
in
range
(
len
(
self
.
paddings
))]
input
=
np
.
random
.
random
(
self
.
shape
).
astype
(
self
.
dtype
)
output
=
self
.
pool2D_forward_naive
(
input
,
self
.
ksize
,
self
.
strides
,
self
.
paddings
,
self
.
global_pool
,
self
.
ceil_mode
,
self
.
exclusive
).
astype
(
self
.
dtype
)
self
.
ceil_mode
,
self
.
exclusive
,
self
.
adaptive
).
astype
(
self
.
dtype
)
self
.
inputs
=
{
'X'
:
OpTest
.
np_dtype_to_fluid_dtype
(
input
)}
self
.
attrs
=
{
...
...
@@ -112,7 +142,8 @@ class TestPool2D_Op(OpTest):
'ceil_mode'
:
self
.
ceil_mode
,
'data_format'
:
'AnyLayout'
,
# TODO(dzhwinter) : should be fix latter
'exclusive'
:
self
.
exclusive
'exclusive'
:
self
.
exclusive
,
'adaptive'
:
self
.
adaptive
}
self
.
outputs
=
{
'Out'
:
output
}
...
...
@@ -159,6 +190,9 @@ class TestPool2D_Op(OpTest):
def
init_exclusive
(
self
):
self
.
exclusive
=
True
def
init_adaptive
(
self
):
self
.
adaptive
=
False
class
TestCase1
(
TestPool2D_Op
):
def
init_test_case
(
self
):
...
...
@@ -315,5 +349,10 @@ class TestCUDNNAvgInclude(TestCase2):
self
.
exclusive
=
False
class
TestAvgPoolAdaptive
(
TestCase1
):
def
init_adaptive
(
self
):
self
.
adaptive
=
True
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_pool3d_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
+from __future__ import division
 
 import unittest
 import numpy as np
@@ -21,16 +22,28 @@ import paddle.fluid.core as core
 from op_test import OpTest
 
 
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
 def max_pool3D_forward_naive(x,
                              ksize,
                              strides,
                              paddings,
                              global_pool=0,
                              ceil_mode=False,
-                             exclusive=True):
+                             exclusive=True,
+                             adaptive=False):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
+    if adaptive:
+        D_out, H_out, W_out = ksize
+    else:
         D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
                  ) // strides[0] + 1 if ceil_mode else (
                      H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
@@ -42,14 +55,26 @@ def max_pool3D_forward_naive(x,
                  W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
+        if adaptive:
+            d_start = adaptive_start_index(k, D, ksize[0])
+            d_end = adaptive_end_index(k, D, ksize[0])
+        else:
             d_start = np.max((k * strides[0] - paddings[0], 0))
             d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
         for i in range(H_out):
-            h_start = np.max((i * strides[0] - paddings[0], 0))
-            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            if adaptive:
+                h_start = adaptive_start_index(i, H, ksize[1])
+                h_end = adaptive_end_index(i, H, ksize[1])
+            else:
+                h_start = np.max((i * strides[1] - paddings[1], 0))
+                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
             for j in range(W_out):
-                w_start = np.max((j * strides[1] - paddings[1], 0))
-                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                if adaptive:
+                    w_start = adaptive_start_index(j, W, ksize[2])
+                    w_end = adaptive_end_index(j, W, ksize[2])
+                else:
+                    w_start = np.max((j * strides[2] - paddings[2], 0))
+                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
                 out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
@@ -62,10 +87,14 @@ def avg_pool3D_forward_naive(x,
                              paddings,
                              global_pool=0,
                              ceil_mode=False,
-                             exclusive=True):
+                             exclusive=True,
+                             adaptive=False):
     N, C, D, H, W = x.shape
     if global_pool == 1:
         ksize = [D, H, W]
+    if adaptive:
+        D_out, H_out, W_out = ksize
+    else:
         D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
                  ) // strides[0] + 1 if ceil_mode else (
                      H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
@@ -77,18 +106,30 @@ def avg_pool3D_forward_naive(x,
                  W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
+        if adaptive:
+            d_start = adaptive_start_index(k, D, ksize[0])
+            d_end = adaptive_end_index(k, D, ksize[0])
+        else:
            d_start = np.max((k * strides[0] - paddings[0], 0))
            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
         for i in range(H_out):
-            h_start = np.max((i * strides[0] - paddings[0], 0))
-            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            if adaptive:
+                h_start = adaptive_start_index(i, H, ksize[1])
+                h_end = adaptive_end_index(i, H, ksize[1])
+            else:
+                h_start = np.max((i * strides[1] - paddings[1], 0))
+                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
             for j in range(W_out):
-                w_start = np.max((j * strides[1] - paddings[1], 0))
-                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                if adaptive:
+                    w_start = adaptive_start_index(j, W, ksize[2])
+                    w_end = adaptive_end_index(j, W, ksize[2])
+                else:
+                    w_start = np.max((j * strides[2] - paddings[2], 0))
+                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
 
-                field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
-                    if exclusive else ksize[0] * ksize[1] * ksize[2]
+                field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \
+                    if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2]
                 out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size
 
     return out
@@ -105,13 +146,14 @@ class TestPool3d_Op(OpTest):
         self.init_pool_type()
         self.init_ceil_mode()
         self.init_exclusive()
+        self.init_adaptive()
         if self.global_pool:
             self.paddings = [0 for _ in range(len(self.paddings))]
         input = np.random.random(self.shape).astype(self.dtype)
         output = self.pool3D_forward_naive(
             input, self.ksize, self.strides, self.paddings, self.global_pool,
-            self.ceil_mode, self.exclusive).astype(self.dtype)
+            self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype)
         self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
 
         self.attrs = {
@@ -124,7 +166,8 @@ class TestPool3d_Op(OpTest):
             'ceil_mode': self.ceil_mode,
             'data_format': 'AnyLayout',  # TODO(dzhwinter) : should be fix latter
-            'exclusive': self.exclusive
+            'exclusive': self.exclusive,
+            'adaptive': self.adaptive
         }
 
         self.outputs = {'Out': output}
@@ -171,6 +214,9 @@ class TestPool3d_Op(OpTest):
     def init_exclusive(self):
         self.exclusive = True
 
+    def init_adaptive(self):
+        self.adaptive = False
+
 
 class TestCase1(TestPool3d_Op):
     def init_test_case(self):
@@ -353,5 +399,10 @@ class TestCUDNNAvgInclude(TestCUDNNCase3):
         self.exclusive = False
 
 
+class TestAvgPoolAdaptive(TestCase1):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_pool_max_op.py
@@ -13,33 +13,62 @@
 # limitations under the License.
 
 from __future__ import print_function
+from __future__ import division
 
 import unittest
 import numpy as np
 from op_test import OpTest
 
 
-def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
+def adaptive_start_index(index, input_size, output_size):
+    return int(np.floor(index * input_size / output_size))
+
+
+def adaptive_end_index(index, input_size, output_size):
+    return int(np.ceil((index + 1) * input_size / output_size))
+
+
+def max_pool3D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=False,
+                             adaptive=False):
     N, C, D, H, W = x.shape
     if global_pool:
         ksize = [D, H, W]
         paddings = [0, 0, 0]
+    if adaptive:
+        D_out, H_out, W_out = ksize
+    else:
         D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
         H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
         W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
     out = np.zeros((N, C, D_out, H_out, W_out))
     mask = np.zeros((N, C, D_out, H_out, W_out))
     for k in range(D_out):
+        if adaptive:
+            d_start = adaptive_start_index(k, D, ksize[0])
+            d_end = adaptive_end_index(k, D, ksize[0])
+        else:
            d_start = np.max((k * strides[0] - paddings[0], 0))
            d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
         for i in range(H_out):
-            h_start = np.max((i * strides[0] - paddings[0], 0))
-            h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            if adaptive:
+                h_start = adaptive_start_index(i, H, ksize[1])
+                h_end = adaptive_end_index(i, H, ksize[1])
+            else:
+                h_start = np.max((i * strides[1] - paddings[1], 0))
+                h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
             for j in range(W_out):
-                w_start = np.max((j * strides[1] - paddings[1], 0))
-                w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+                if adaptive:
+                    w_start = adaptive_start_index(j, W, ksize[2])
+                    w_end = adaptive_end_index(j, W, ksize[2])
+                else:
+                    w_start = np.max((j * strides[2] - paddings[2], 0))
+                    w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
                 x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
 
                 out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
@@ -58,19 +87,33 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
     return out, mask
 
 
-def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
+def max_pool2D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings,
+                             global_pool=False,
+                             adaptive=False):
     N, C, H, W = x.shape
     if global_pool:
         ksize = [H, W]
         paddings = [0, 0]
+    if adaptive:
+        H_out, W_out = ksize
+    else:
         H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
         W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
     out = np.zeros((N, C, H_out, W_out))
     mask = np.zeros((N, C, H_out, W_out))
     for i in range(H_out):
         for j in range(W_out):
+            if adaptive:
+                r_start = adaptive_start_index(i, H, ksize[0])
+                r_end = adaptive_end_index(i, H, ksize[0])
+                c_start = adaptive_start_index(j, W, ksize[1])
+                c_end = adaptive_end_index(j, W, ksize[1])
+            else:
                 r_start = np.max((i * strides[0] - paddings[0], 0))
                 r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
                 c_start = np.max((j * strides[1] - paddings[1], 0))
@@ -95,10 +138,12 @@ class TestMaxPoolWithIndex_Op(OpTest):
     def setUp(self):
         self.init_test_case()
         self.init_global()
+        self.init_adaptive()
 
         input = np.random.random(self.shape).astype("float32")
         output, mask = self.pool_forward_naive(input, self.ksize, self.strides,
-                                               self.paddings, self.global_pool)
+                                               self.paddings, self.global_pool,
+                                               self.adaptive)
         output = output.astype("float32")
         mask = mask.astype("int32")
@@ -107,6 +152,7 @@ class TestMaxPoolWithIndex_Op(OpTest):
             'paddings': self.paddings,
             'ksize': self.ksize,
             'global_pooling': self.global_pool,
+            'adaptive': self.adaptive,
         }
 
         self.inputs = {'X': input}
@@ -129,6 +175,9 @@ class TestMaxPoolWithIndex_Op(OpTest):
     def init_global(self):
         self.global_pool = False
 
+    def init_adaptive(self):
+        self.adaptive = False
+
 
 class TestCase1(TestMaxPoolWithIndex_Op):
     def init_global(self):
@@ -190,5 +239,15 @@ class TestCase7(TestCase6):
         self.global_pool = False
 
 
+class TestCastAdaptive2d(TestCase6):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
+class TestCastAdaptive3d(TestMaxPoolWithIndex_Op):
+    def init_adaptive(self):
+        self.adaptive = True
+
+
 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/test_psroi_pool_op.py (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. Licensed under the Apache License, Version 2.0.

from __future__ import print_function

import math
import numpy as np
import unittest
from op_test import OpTest


class TestPSROIPoolOp(OpTest):
    def set_data(self):
        self.init_test_case()
        self.make_rois()
        self.calc_psroi_pool()
        self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)}
        self.attrs = {
            'output_channels': self.output_channels,
            'spatial_scale': self.spatial_scale,
            'pooled_height': self.pooled_height,
            'pooled_width': self.pooled_width
        }
        self.outputs = {'Out': self.outs}

    def init_test_case(self):
        self.batch_size = 3
        self.channels = 3 * 2 * 2
        self.height = 6
        self.width = 4

        self.x_dim = [self.batch_size, self.channels, self.height, self.width]

        self.spatial_scale = 1.0 / 4.0
        self.output_channels = 3
        self.pooled_height = 2
        self.pooled_width = 2

        self.x = np.random.random(self.x_dim).astype('float32')

    def make_rois(self):
        rois = []
        self.rois_lod = [[]]
        for bno in range(self.batch_size):
            self.rois_lod[0].append(bno + 1)
            for i in range(bno + 1):
                x1 = np.random.random_integers(
                    0, self.width // self.spatial_scale - self.pooled_width)
                y1 = np.random.random_integers(
                    0, self.height // self.spatial_scale - self.pooled_height)
                x2 = np.random.random_integers(
                    x1 + self.pooled_width, self.width // self.spatial_scale)
                y2 = np.random.random_integers(
                    y1 + self.pooled_height, self.height // self.spatial_scale)
                roi = [bno, x1, y1, x2, y2]
                rois.append(roi)
        self.rois_num = len(rois)
        self.rois = np.array(rois).astype('float32')

    def calc_psroi_pool(self):
        output_shape = (self.rois_num, self.output_channels,
                        self.pooled_height, self.pooled_width)
        out_data = np.zeros(output_shape)
        for i in range(self.rois_num):
            roi = self.rois[i]
            roi_batch_id = int(roi[0])
            roi_start_w = round(roi[1]) * self.spatial_scale
            roi_start_h = round(roi[2]) * self.spatial_scale
            roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale
            roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale

            roi_height = max(roi_end_h - roi_start_h, 0.1)
            roi_width = max(roi_end_w - roi_start_w, 0.1)

            bin_size_h = roi_height / float(self.pooled_height)
            bin_size_w = roi_width / float(self.pooled_width)

            x_i = self.x[roi_batch_id]

            for c in range(self.output_channels):
                for ph in range(self.pooled_height):
                    for pw in range(self.pooled_width):
                        hstart = int(
                            math.floor(float(ph) * bin_size_h + roi_start_h))
                        wstart = int(
                            math.floor(float(pw) * bin_size_w + roi_start_w))
                        hend = int(
                            math.ceil(float(ph + 1) * bin_size_h + roi_start_h))
                        wend = int(
                            math.ceil(float(pw + 1) * bin_size_w + roi_start_w))
                        hstart = min(max(hstart, 0), self.height)
                        hend = min(max(hend, 0), self.height)
                        wstart = min(max(wstart, 0), self.width)
                        wend = min(max(wend, 0), self.width)

                        c_in = (c * self.pooled_height + ph
                                ) * self.pooled_width + pw
                        is_empty = (hend <= hstart) or (wend <= wstart)
                        out_sum = 0.
                        for ih in range(hstart, hend):
                            for iw in range(wstart, wend):
                                out_sum += x_i[c_in, ih, iw]
                        bin_area = (hend - hstart) * (wend - wstart)
                        out_data[i, c, ph, pw] = 0. if is_empty else (
                            out_sum / float(bin_area))
        self.outs = out_data.astype('float32')

    def setUp(self):
        self.op_type = 'psroi_pool'
        self.set_data()

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['X'], 'Out')


if __name__ == '__main__':
    unittest.main()