Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_ut_for_trt

0514882b · nhzlx · 1f6c9dba · 3043f51b · 0514882b · 0514882b
79 changed file
--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -136,10 +136,6 @@ def parse_args():
        '--no_random',
        action='store_true',
        help='If set, keep the random seed and do not shuffle the data.')
-    parser.add_argument(
-        '--use_lars',
-        action='store_true',
-        help='If set, use lars for optimizers, ONLY support resnet module.')
    parser.add_argument(
        '--reduce_strategy',
        type=str,

--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -200,11 +200,6 @@ def get_model(args, is_train, main_prog, startup_prog):
            # configure optimize
            optimizer = None
            if is_train:
-                if args.use_lars:
-                    lars_decay = 1.0
-                else:
-                    lars_decay = 0.0
-
                total_images = 1281167 / trainer_count

                step = int(total_images / (args.batch_size * args.gpus) + 1)

--- a/benchmark/fluid/models/resnet_with_preprocess.py
+++ b/benchmark/fluid/models/resnet_with_preprocess.py
@@ -224,11 +224,6 @@ def get_model(args, is_train, main_prog, startup_prog):
            # configure optimize
            optimizer = None
            if is_train:
-                if args.use_lars:
-                    lars_decay = 1.0
-                else:
-                    lars_decay = 0.0
-
                total_images = 1281167 / trainer_count

                step = int(total_images / args.batch_size + 1)

--- a/benchmark/fluid/models/se_resnext.py
+++ b/benchmark/fluid/models/se_resnext.py
@@ -244,11 +244,6 @@ def get_model(args, is_train, main_prog, startup_prog):

            optimizer = None
            if is_train:
-                if args.use_lars:
-                    lars_decay = 1.0
-                else:
-                    lars_decay = 0.0
-
                total_images = 1281167 / trainer_count

                step = int(total_images / args.batch_size + 1)
@@ -262,8 +257,7 @@ def get_model(args, is_train, main_prog, startup_prog):
                    learning_rate=fluid.layers.piecewise_decay(
                        boundaries=bd, values=lr),
                    momentum=0.9,
-                    regularization=fluid.regularizer.L2Decay(1e-4),
-                    LARS_weight_decay=lars_decay)
+                    regularization=fluid.regularizer.L2Decay(1e-4))
                optimizer.minimize(avg_cost)

                if args.memory_optimize:

--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -29,7 +29,7 @@ INCLUDE(ExternalProject)
 SET(MKLML_PROJECT       "extern_mklml")
 IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
  MESSAGE(STATUS "use pre defined download url")
-  SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE)
+  SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
  SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
 ENDIF()
 MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")

--- a/doc/README.md
+++ b/doc/README.md
+# For Readers and Developers
+
+Thanks for reading PaddlePaddle documentation. 
+
+Since **September 17th, 2018**, the documentation source for **0.15.0 and develop** has been moved to the [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) and is now updated there.
+
+Please refer to the FluidDoc Repo for the latest documentation.
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -73,7 +73,6 @@ paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program',
 paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
 paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
 paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
 paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
@@ -296,6 +295,7 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
 paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
 paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
 paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
+paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,))
 paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
 paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
 paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@@ -350,25 +350,25 @@ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'fi
 paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
 paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
 paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
-paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate'], varargs=None, keywords='kwargs', defaults=None)
+paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov'], varargs=None, keywords='kwargs', defaults=(False,))
+paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon'], varargs=None, keywords='kwargs', defaults=(1e-06,))
+paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
+paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
+paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window'], varargs=None, keywords='kwargs', defaults=(10000, 10000))
+paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
 paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -148,13 +148,13 @@ if(WITH_DISTRIBUTE)
 else()
  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
 endif()
-
+ 
 if (NOT WIN32)
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS
-        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
-        graph graph_viz_pass multi_devices_graph_pass
-        multi_devices_graph_print_pass multi_devices_graph_check_pass
-        fast_threaded_ssa_graph_executor)
+  cc_library(parallel_executor SRCS parallel_executor.cc DEPS
+          threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
+          graph graph_viz_pass multi_devices_graph_pass
+          multi_devices_graph_print_pass multi_devices_graph_check_pass
+          fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
 endif() # NOT WIN32

 cc_library(prune SRCS prune.cc DEPS framework_proto)

--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -54,6 +54,8 @@ struct BuildStrategy {

  std::string debug_graphviz_path_{""};

+  bool fuse_elewise_add_act_ops_{false};
+
  bool enable_data_balance_{false};
 };


--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -20,41 +20,79 @@ namespace paddle {
 namespace framework {
 namespace details {

-template <class T>
-class COWPtr {
+// Change it to thread safe flags if needed.
+class ThreadUnsafeOwnershipFlags {
 public:
-  typedef std::shared_ptr<T> RefPtr;
+  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}

- private:
-  RefPtr m_sp;
+  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
+  ThreadUnsafeOwnershipFlags& operator=(
+      const ThreadUnsafeOwnershipFlags& other) = delete;
+  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;

-  void detach() {
-    T* tmp = m_sp.get();
-    if (!(tmp == nullptr || m_sp.unique())) {
-      m_sp = RefPtr(new T(*tmp));
+  void SetOwnership(bool flag) { flag_ = flag; }
+
+  // Invoke the callback if it is not owned.
+  template <typename Callback>
+  void AcquireOwnershipOnce(Callback acquire) {
+    if (!flag_) {
+      acquire();
+      flag_ = true;
    }
  }

- public:
-  COWPtr() : m_sp(nullptr) {}
-  explicit COWPtr(T* t) : m_sp(t) {}
-  explicit COWPtr(const RefPtr& refptr) : m_sp(refptr) {}
+ private:
+  bool flag_;
+};

-  const T& Data() const { return operator*(); }
+// Copy-On-Write pointer.
+// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
+//
+// The template parameter OwnershipFlags should provide:
+//   * a constructor that takes a bool (true if the payload is owned).
+//   * SetOwnership(bool flag).
+//   * AcquireOwnershipOnce(Callback), which invokes the callback if the
+//     payload is not owned.
+//
+// https://en.wikipedia.org/wiki/Copy-on-write
+template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
+class COWPtr {
+ public:
+  // Ctor from raw pointer.
+  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}

-  T* MutableData() { return operator->(); }
+  // Move methods. Steal ownership from origin
+  COWPtr(COWPtr&& other)
+      : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
+  COWPtr& operator=(COWPtr&& origin) = default;

-  const T& operator*() const { return *m_sp; }
-  T& operator*() {
-    detach();
-    return *m_sp;
+  // Copy methods. Not own payload
+  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
+  COWPtr& operator=(const COWPtr& other) {
+    payload_ = other.payload_;
+    ownership_.SetOwnership(false);
+    return *this;
  }
-  const T* operator->() const { return m_sp.operator->(); }
-  T* operator->() {
-    detach();
-    return m_sp.operator->();
+
+  // Access read only data.
+  const T& Data() const { return *payload_; }
+
+  // Access mutable data. If the data is not owned, the data will be copied
+  // before.
+  T* MutableData() {
+    ownership_.AcquireOwnershipOnce(
+        [this] { payload_.reset(new T(*payload_)); });
+    return payload_.get();
  }
+
+ private:
+  // Actual data pointer.
+  std::shared_ptr<T> payload_;
+
+  // Ownership flag.
+  OwnershipFlags ownership_;
 };
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -30,14 +30,6 @@ TEST(COWPtr, all) {
  ASSERT_EQ(ptr2.Data(), 10);
 }

-TEST(COWPtr, change_old) {
-  COWPtr<int> ptr(new int{0});
-  COWPtr<int> ptr2 = ptr;
-  *ptr.MutableData() = 10;
-  ASSERT_EQ(ptr2.Data(), 0);
-  ASSERT_EQ(ptr.Data(), 10);
-}
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -210,43 +210,6 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
  return recv_vars;
 }

-bool MultiDevSSAGraphBuilder::IsDistTrainOp(
-    ir::Node *node, const std::vector<std::string> &send_vars,
-    const std::vector<std::string> &recv_vars) const {
-  if (send_vars.size() == 0 || recv_vars.size() == 0) {
-    return false;
-  }
-
-  /**
-   * Check any of opvars contains `.block` and in sendvars
-   */
-  auto checker = [](const std::vector<std::string> &opvars,
-                    const std::vector<std::string> &rpc_vars) -> bool {
-    for (auto &var : opvars) {
-      // a variable name with the suffix `.block` means it's a splited
-      // variable by (DistributeTranspiler)
-      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
-      if (var.find(".block") != std::string::npos &&
-          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
-        return true;
-      }
-    }
-    return false;
-  };
-
-  std::vector<std::string> input_var_names;
-  std::vector<std::string> output_var_names;
-  for (ir::Node *input : node->inputs) {
-    input_var_names.push_back(input->Name());
-  }
-  for (ir::Node *output : node->outputs) {
-    output_var_names.push_back(output->Name());
-  }
-
-  return checker(output_var_names, send_vars) ||
-         checker(input_var_names, recv_vars);
-}
-
 size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
    const std::vector<std::string> &var_names) const {
  int64_t numel_sum = 0;
@@ -370,7 +333,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
        }
      }
      is_dist_train = true;
-    } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
+    } else if (boost::get<int>(node->Op()->GetAttr(
+                   OpProtoAndCheckerMaker::OpRoleAttrName())) ==
+               static_cast<int>(OpRole::kDist)) {
      int op_dev_id = CreateDistTrainOp(&result, node);
      if (node->Op()->Type() == "concat") {
        auto origin_param_name = node->Op()->OutputArgumentNames()[0];
@@ -736,6 +701,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
          .emplace(varname, op_dev_id);
    }
  } else {
+    LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
    PADDLE_THROW(
        "the distribute training related op should be in [split_byref, "
        "concat].");

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -51,12 +51,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;

-  /**
-   * Is this operator as the end-point operator before/after send operator.
-   */
-  bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
-                     const std::vector<std::string> &recv_vars) const;
-
  std::vector<std::string> FindDistTrainSendVars(
      const std::vector<ir::Node *> &nodes) const;


--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -37,6 +37,8 @@ pass_library(fc_lstm_fuse_pass inference)
 pass_library(fc_gru_fuse_pass inference)
 pass_library(seq_concat_fc_fuse_pass inference)

+cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
+
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")

 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)

--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Entry point of the pass: runs the two forward fusions, then the backward
+// (in-place grad) fusion, and finally drops intermediate outputs that turned
+// out to be removable. Returns the rewritten graph.
+std::unique_ptr<ir::Graph> FuseElewiseAddActPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  // Forward: both fusion directions accept the same activation types.
+  const std::unordered_set<std::string> forward_act_types{"relu", "scale"};
+  graph = FuseActElewiseAdd(std::move(graph), forward_act_types);
+  graph = FuseElewiseAddAct(std::move(graph), forward_act_types);

+  // Backward: only the in-place activation gradient is fused.
+  const std::unordered_set<std::string> backward_act_types{"relu_grad"};
+  graph = FuseElewiseAddActInplaceGrad(std::move(graph), backward_act_types);

+  // Remove the removable intermediate_out.
+  RemoveIntermediateOut(graph.get());
+  return graph;
+}
+
+// Fuses an elementwise_add whose output feeds an activation, i.e.
+// act(ele_add(x, y)), into one fused_elemwise_activation op.
+//
+// graph:     graph to rewrite; enforced to be non-null.
+// act_types: activation op types forwarded to the pattern builder.
+// Returns the graph after the handler has run on every detected subgraph.
+std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
+    std::unique_ptr<ir::Graph> graph,
+    const std::unordered_set<std::string> &act_types) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("elewise_add_act", graph.get());

+  GraphPatternDetector gpd;
+  // Anchor node: the variable consumed as input "X" of elementwise_add.
+  auto *x = gpd.mutable_pattern()
+                ->NewNode("elewise_add_act/x")
+                ->AsInput()
+                ->assert_is_op_input("elementwise_add", "X");
+  patterns::ElewiseAddAct elewise_add_act_pattern(gpd.mutable_pattern(),
+                                                  "elementwise_add");

+  elewise_add_act_pattern(x, act_types);

+  int found_elewise_add_act_count = 0;

+  // Called once per detected subgraph.
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle FuseElewiseAddAct fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
+                              elewise_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act, act, elewise_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, elewise_add_act_pattern);

+    std::string ele_x_n = subgraph.at(x)->Name();
+    std::string ele_y_n = ele_y->Name();
+    std::string ele_out_n = ele_out->Name();
+    std::string act_out_n = act_out->Name();

+    // Functor order is {act, ele_add}: the add result (ele_out) becomes the
+    // fused op's IntermediateOut and the activation result becomes its Out.
+    Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
+        g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n);

+    VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
+            << ele_add->Name() << " -> " << ele_out_n << "\n"
+            << "\t " << ele_out_n << " -> " << act->Name() << " -> "
+            << act_out_n;

+    // NOTE(review): ReLinkNodes is defined elsewhere; presumably it rewires
+    // the graph so the fused node replaces ele_add and act -- confirm there.
+    ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node);
+    found_elewise_add_act_count++;
+  };

+  gpd(graph.get(), handler);

+  // Record how many subgraphs were fused.
+  AddStatis(found_elewise_add_act_count);
+  return graph;
+}
+
+// Fuses an activation whose output feeds an elementwise_add, i.e.
+// ele_add(x, act(y)), into one fused_elemwise_activation op.
+//
+// graph:     graph to rewrite; enforced to be non-null.
+// act_types: activation op types; used both to anchor the pattern on the
+//            activation's "X" input and forwarded to the pattern builder.
+// Returns the graph after the handler has run on every detected subgraph.
+std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
+    std::unique_ptr<ir::Graph> graph,
+    const std::unordered_set<std::string> &act_types) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("act_elewise_add", graph.get());

+  GraphPatternDetector gpd;
+  // Anchor node: the variable consumed as input "X" of the activation.
+  auto *x = gpd.mutable_pattern()
+                ->NewNode("act_elewise_add/x")
+                ->AsInput()
+                ->assert_is_ops_input(act_types, "X");
+  patterns::ActElewiseAdd act_elewise_add_pattern(gpd.mutable_pattern(),
+                                                  "act_elewise_add");

+  act_elewise_add_pattern(x, act_types);

+  int found_act_elewise_add_count = 0;

+  // Called once per detected subgraph.
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    // Log under this function's own name so the trace can be told apart from
+    // the one emitted by FuseElewiseAddAct.
+    VLOG(4) << "handle FuseActElewiseAdd fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
+                              act_elewise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act, act, act_elewise_add_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, act_elewise_add_pattern);

+    std::string act_i_n = subgraph.at(x)->Name();
+    std::string act_o_n = act_out->Name();
+    std::string elewise_add_x_n = ele_x->Name();
+    std::string elewise_add_out_n = ele_out->Name();

+    // Functor order is {ele_add, act}: the activation result (act_o_n)
+    // becomes the fused op's IntermediateOut and the add result becomes Out.
+    Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
+        g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n);

+    VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
+            << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
+            << ele_add->Name() << " -> " << elewise_add_out_n;

+    // NOTE(review): ReLinkNodes is defined elsewhere; presumably it rewires
+    // the graph so the fused node replaces act and ele_add -- confirm there.
+    ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node);
+    found_act_elewise_add_count++;
+  };

+  gpd(graph.get(), handler);

+  // Record how many subgraphs were fused.
+  AddStatis(found_act_elewise_add_count);
+  return graph;
+}
+
+// Fuses the backward of act(ele_add(x,y)): an in-place activation gradient
+// followed by elementwise_add's gradient becomes a single
+// fused_elemwise_activation_grad op.
+// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
+// ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
+//
+// graph:     graph to rewrite; enforced to be non-null.
+// act_types: activation-gradient op types; used both to anchor the pattern
+//            on their "Out@GRAD" input and forwarded to the pattern builder.
+// Returns the graph after the handler has run on every detected subgraph.
+std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
+    std::unique_ptr<ir::Graph> graph,
+    const std::unordered_set<std::string> &act_types) const {
+  PADDLE_ENFORCE(graph.get());
+  FusePassBase::Init("elewise_add_act_grad", graph.get());

+  GraphPatternDetector gpd;
+  // Anchor node: the gradient variable fed to the activation gradient op as
+  // its "Out@GRAD" input.
+  auto *d_act_out = gpd.mutable_pattern()
+                        ->NewNode("elewise_add_act_grad_inplace/x")
+                        ->AsInput()
+                        ->assert_is_ops_input(act_types, GradVarName("Out"));
+  patterns::ElewiseAddActInplaceGrad elewise_add_act_grad_pattern(
+      gpd.mutable_pattern(), "elewise_add_act_grad_inplace");
+  elewise_add_act_grad_pattern(d_act_out, act_types);

+  int found_elewise_add_act_count = 0;

+  // Called once per detected subgraph.
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle FuseElewiseAddActInplaceGrad fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out,
+                              elewise_add_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad, ele_add_grad,
+                              elewise_add_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(d_ele_x, d_ele_x, elewise_add_act_grad_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(d_ele_y, d_ele_y, elewise_add_act_grad_pattern);

+    std::string d_act_out_n = subgraph.at(d_act_out)->Name();
+    std::string act_out_n = act_out->Name();
+    std::string d_itermediate_out_n = d_itermediate_out->Name();
+    std::string ele_y_n = ele_y->Name();
+    std::string d_ele_x_n = d_ele_x->Name();
+    std::string d_ele_y_n = d_ele_y->Name();

+    // The fused grad op is given only Y, Out and Out@GRAD; the X and
+    // IntermediateOut inputs are left empty.
+    OpDesc desc;
+    desc.SetType("fused_elemwise_activation_grad");
+    desc.SetInput("IntermediateOut", {});
+    desc.SetInput("X", {});
+    desc.SetInput("Y", std::vector<std::string>({ele_y_n}));
+    desc.SetInput("Out", std::vector<std::string>({act_out_n}));
+    desc.SetInput(GradVarName("Out"), std::vector<std::string>({d_act_out_n}));
+    desc.SetOutput(GradVarName("X"), std::vector<std::string>({d_ele_x_n}));
+    desc.SetOutput(GradVarName("Y"), std::vector<std::string>({d_ele_y_n}));
+    desc.SetOutput(GradVarName("IntermediateOut"),
+                   std::vector<std::string>({d_itermediate_out_n}));

+    desc.SetAttr("save_intermediate_out", false);
+    desc.SetAttr("functor_list",
+                 std::vector<std::string>(
+                     {act_grad->Op()->Type(), ele_add_grad->Op()->Type()}));

+    // Copy the attributes of both original grad ops onto the fused op,
+    // act_grad first and ele_add_grad second.
+    for (auto &n : {act_grad->Op(), ele_add_grad->Op()}) {
+      for (auto &m_ele : n->GetAttrMap()) {
+        desc.SetAttr(m_ele.first, m_ele.second);
+      }
+    }

+    auto fused_node = g->CreateOpNode(&desc);

+    // Trace the two original ops. The ele_add_grad line lists the operands
+    // actually wired into the fused desc above: Y and the intermediate
+    // gradient in, X@GRAD and Y@GRAD out.
+    VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
+            << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
+            << d_itermediate_out_n << " and " << ele_y_n << " -> "
+            << ele_add_grad->Name() << " -> " << d_ele_x_n << " and "
+            << d_ele_y_n;

+    // NOTE(review): ReLinkNodes is defined elsewhere; presumably it rewires
+    // the graph so fused_node replaces act_grad and ele_add_grad -- confirm.
+    ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node);
+    found_elewise_add_act_count++;
+  };

+  gpd(graph.get(), handler);

+  // Record how many subgraphs were fused.
+  AddStatis(found_elewise_add_act_count);
+  return graph;
+}
+
+// Creates the fused_elemwise_activation op node that stands in for the op
+// pair (op_1, op_2).
+//
+// The functor list is {type of op_1, type of op_2}. ele_x_n / ele_y_n become
+// inputs X / Y, act_out_n becomes output Out and ele_out_n becomes output
+// IntermediateOut. save_intermediate_out starts as true. The attributes of
+// op_1 and then op_2 are copied onto the fused op afterwards.
+Node *FuseElewiseAddActPass::CreateFuseElewiseAddActNode(
+    Graph *g, const Node *op_1, const Node *op_2, const std::string &ele_x_n,
+    const std::string &ele_y_n, const std::string &ele_out_n,
+    const std::string &act_out_n) const {
+  OpDesc fused_desc;
+  fused_desc.SetType("fused_elemwise_activation");

+  // Inputs and outputs.
+  fused_desc.SetInput("X", std::vector<std::string>({ele_x_n}));
+  fused_desc.SetInput("Y", std::vector<std::string>({ele_y_n}));
+  fused_desc.SetOutput("Out", std::vector<std::string>({act_out_n}));
+  fused_desc.SetOutput("IntermediateOut",
+                       std::vector<std::string>({ele_out_n}));

+  // Attributes owned by the fused op itself.
+  fused_desc.SetAttr("save_intermediate_out", true);
+  fused_desc.SetAttr("functor_list",
+                     std::vector<std::string>(
+                         {op_1->Op()->Type(), op_2->Op()->Type()}));

+  // Attributes inherited from the two original ops, in argument order.
+  const auto copy_attrs = [&fused_desc](const Node *source) {
+    for (auto &attr : source->Op()->GetAttrMap()) {
+      fused_desc.SetAttr(attr.first, attr.second);
+    }
+  };
+  copy_attrs(op_1);
+  copy_attrs(op_2);

+  return g->CreateOpNode(&fused_desc);
+}
+
+// Walks every op node of `graph` and prunes intermediate outputs of fused ops
+// that nobody consumes:
+//   - for "fused_elemwise_activation": the IntermediateOut variable node is
+//     detached and "save_intermediate_out" is switched to false;
+//   - for "fused_elemwise_activation_grad": the IntermediateOut@GRAD variable
+//     node is detached and the corresponding op output is cleared.
+// The detached variable nodes are collected and removed from the graph in one
+// GraphSafeRemoveNodes() call at the end.
+void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const {
+  std::unordered_set<const Node *> need_removed_nodes;
+  for (auto &cur_node : graph->Nodes()) {
+    // Only operator nodes are of interest here.
+    if (cur_node->IsVar()) continue;
+    if (cur_node->Name() == "fused_elemwise_activation") {
+      bool save_intermediate_out =
+          boost::get<bool>(cur_node->Op()->GetAttr("save_intermediate_out"));
+      auto intermediate_out_args = cur_node->Op()->Output("IntermediateOut");
+      // A fused forward op is expected to still carry its intermediate output
+      // when this function runs.
+      PADDLE_ENFORCE(
+          save_intermediate_out && !intermediate_out_args.empty(),
+          "The %s should save the intermediate_out in the fusing stage.",
+          cur_node->Name());
+
+      // If the intermediate_out's output is empty, it should be removed.
+      // Iterate over a copy: cur_node->outputs is reassigned inside the loop.
+      auto cur_node_outputs = cur_node->outputs;
+      for (auto &out : cur_node_outputs) {
+        if (out->Name() == intermediate_out_args[0]) {
+          if (out->outputs.size() == 0) {
+            cur_node->outputs = this->RemoveNode(out, cur_node->outputs);
+            need_removed_nodes.insert(std::move(out));
+            // Record on the op itself that the intermediate result is no
+            // longer kept.
+            cur_node->Op()->SetAttr("save_intermediate_out", false);
+          }
+        }
+      }
+    } else if (cur_node->Name() == "fused_elemwise_activation_grad") {
+      auto intermediate_out_grad_args =
+          cur_node->Op()->Output(GradVarName("IntermediateOut"));
+      PADDLE_ENFORCE(
+          !intermediate_out_grad_args.empty(),
+          "The %s should save the intermediate_out in the fusing stage.",
+          cur_node->Name());
+      // Iterate over a copy: cur_node->outputs is reassigned inside the loop.
+      auto cur_node_outputs = cur_node->outputs;
+      // If the intermediate_out_g's output is empty, it should be removed.
+      for (auto &out : cur_node_outputs) {
+        if (out->Name() == intermediate_out_grad_args[0] &&
+            out->outputs.empty()) {
+          // Clear the op's IntermediateOut@GRAD output and drop the edge to
+          // the variable node.
+          cur_node->Op()->SetOutput(GradVarName("IntermediateOut"), {});
+          cur_node->outputs = this->RemoveNode(out, cur_node->outputs);
+          need_removed_nodes.insert(std::move(out));
+        }
+      }
+    }
+  }
+  // Physically delete all variable nodes that were detached above.
+  GraphSafeRemoveNodes(graph, need_removed_nodes);
+}
+
+// Rewires the graph so that `fused_op` takes the place of the two ops `op_1`
+// and `op_2`, then removes both ops from `graph`.
+//   intermediate_out : the variable produced by op_1 and consumed by op_2;
+//                      it is kept and becomes an output of fused_op.
+//   op_1, op_2       : the ops being replaced (op_1 feeds op_2).
+//   fused_op         : the already created replacement op node.
+// Control-dependency variables that only connect op_1 to op_2 are deleted
+// together with the two ops.
+void FuseElewiseAddActPass::ReLinkNodes(Graph *graph,
+                                        const Node *intermediate_out,
+                                        Node *op_1, Node *op_2,
+                                        Node *fused_op) const {
+  // Every input of op_1 becomes an input of fused_op; in the input's consumer
+  // list op_1 is swapped for fused_op.
+  for (auto &in : op_1->inputs) {
+    fused_op->inputs.emplace_back(in);
+    in->outputs = this->ReplaceNode(op_1, fused_op, in->outputs);
+  }
+
+  std::unordered_set<const Node *> nodes2delete;
+  for (auto &out : op_1->outputs) {
+    if (out->IsCtrlVar()) {
+      // Check whether this control variable is also an input of op_2.
+      auto result_iter = std::find_if(
+          op_2->inputs.begin(), op_2->inputs.end(),
+          [&out](const Node *node) -> bool { return node == out; });
+
+      if (result_iter == op_2->inputs.end()) {
+        // Not consumed by op_2: keep it, now produced by fused_op.
+        IR_OP_VAR_LINK(fused_op, out);
+      } else {
+        // It only links op_1 to op_2, so schedule it for deletion.
+        nodes2delete.emplace(out);
+      }
+    } else {
+      // The only non-control output op_1 may have is the intermediate result.
+      PADDLE_ENFORCE(out == intermediate_out);
+      IR_OP_VAR_LINK(fused_op, out);
+    }
+  }
+
+  // Inputs of op_2 move to fused_op, except the intermediate result (now an
+  // output of fused_op) and the control variables scheduled for deletion.
+  for (auto &in : op_2->inputs) {
+    if (in == intermediate_out || nodes2delete.count(in)) {
+      continue;
+    }
+    fused_op->inputs.emplace_back(in);
+    in->outputs = this->ReplaceNode(op_2, fused_op, in->outputs);
+  }
+
+  // All outputs of op_2 are now produced by fused_op.
+  for (auto &out : op_2->outputs) {
+    IR_OP_VAR_LINK(fused_op, out);
+  }
+
+  // The two original ops are no longer needed.
+  nodes2delete.insert(std::move(op_1));
+  nodes2delete.insert(std::move(op_2));
+
+  GraphSafeRemoveNodes(graph, nodes2delete);
+}
+
+// Returns a copy of `nodes` in which every occurrence of `cur_node` has been
+// substituted by `new_node`. Enforces that at least one substitution happened.
+std::vector<Node *> FuseElewiseAddActPass::ReplaceNode(
+    Node *cur_node, Node *new_node, const std::vector<Node *> &nodes) const {
+  std::vector<Node *> replaced(nodes);
+  bool hit = false;
+  for (auto &slot : replaced) {
+    if (slot != cur_node) continue;
+    slot = new_node;
+    hit = true;
+  }
+  PADDLE_ENFORCE(hit, "Not find %s in the node list.", cur_node->Name());
+  return replaced;
+}
+
+// Returns a copy of `nodes` without any entry equal to `trg_node`; the
+// relative order of the remaining entries is preserved.
+std::vector<Node *> FuseElewiseAddActPass::RemoveNode(
+    Node *trg_node, const std::vector<Node *> &nodes) const {
+  std::vector<Node *> kept;
+  kept.reserve(nodes.size());
+  for (Node *candidate : nodes) {
+    if (candidate != trg_node) {
+      kept.push_back(candidate);
+    }
+  }
+  return kept;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(fuse_elewise_add_act_pass,
+              paddle::framework::ir::FuseElewiseAddActPass);
--- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Fuse the ElewiseAdd and activation.
+ *
+ * A graph pass that replaces an elementwise_add op and an adjacent activation
+ * op with a single fused op node. The helper methods below create the fused
+ * node, rewire the graph edges around it and prune intermediate outputs that
+ * are not consumed by any op.
+ */
+class FuseElewiseAddActPass : public FusePassBase {
+ public:
+  virtual ~FuseElewiseAddActPass() {}
+
+ protected:
+  // Entry point of the pass; takes ownership of `graph` and returns the
+  // transformed graph.
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  // Fuses the forward pattern for the activation types listed in `act_types`.
+  // NOTE(review): presumably the act(ele_add(x, y)) form -- confirm against
+  // the implementation in the .cc file.
+  std::unique_ptr<ir::Graph> FuseElewiseAddAct(
+      std::unique_ptr<ir::Graph> graph,
+      const std::unordered_set<std::string> &act_types) const;
+
+  // Fuses the forward pattern for the activation types listed in `act_types`.
+  // NOTE(review): presumably the ele_add(x, act(y)) form -- confirm against
+  // the implementation in the .cc file.
+  std::unique_ptr<ir::Graph> FuseActElewiseAdd(
+      std::unique_ptr<ir::Graph> graph,
+      const std::unordered_set<std::string> &act_types) const;
+
+  // Fuses an activation grad op followed by an elementwise_add grad op into
+  // one fused grad op, for the activation types listed in `act_types`.
+  std::unique_ptr<ir::Graph> FuseElewiseAddActInplaceGrad(
+      std::unique_ptr<ir::Graph> graph,
+      const std::unordered_set<std::string> &act_types) const;
+
+  /**
+   * Remove the removable intermediate_out.
+   *   - If the intermediate_out is only used by the backward op, but the
+   *     backward op doesn't use intermediate_out.
+   *   - If the intermediate_out_grad is not used by any op.
+   */
+  void RemoveIntermediateOut(Graph *graph) const;
+
+  // Returns a copy of `nodes` with every occurrence of `cur_node` replaced by
+  // `new_node`; enforces that `cur_node` was present.
+  std::vector<Node *> ReplaceNode(Node *cur_node, Node *new_node,
+                                  const std::vector<Node *> &nodes) const;
+
+  // Returns a copy of `nodes` with every occurrence of `trg_node` removed.
+  std::vector<Node *> RemoveNode(Node *trg_node,
+                                 const std::vector<Node *> &nodes) const;
+
+  // Makes `fused_op` take over the inputs and outputs of `op_1` and `op_2`
+  // (keeping `intermediate_out` as an output of `fused_op`) and removes the
+  // two original ops from `graph`.
+  void ReLinkNodes(Graph *graph, const Node *intermediate_out, Node *op_1,
+                   Node *op_2, Node *fused_op) const;
+
+  // Creates the "fused_elemwise_activation" op node in graph `g` from the two
+  // source ops, using the given variable names for X, Y, IntermediateOut
+  // (`ele_out_n`) and Out (`act_out_n`).
+  Node *CreateFuseElewiseAddActNode(Graph *g, const Node *op_1,
+                                    const Node *op_2,
+                                    const std::string &ele_x_n,
+                                    const std::string &ele_y_n,
+                                    const std::string &ele_out_n,
+                                    const std::string &act_out_n) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -95,6 +95,7 @@ struct PDNode {
  PDNode* assert_is_op();
  PDNode* assert_is_op(const std::string& op_type);
  PDNode* assert_is_var();
+  PDNode* assert_is_not_ctrl_var();
  PDNode* assert_var_not_persistable();
  PDNode* assert_is_persistable_var();
  PDNode* assert_is_op_output(const std::string& op_type);
@@ -113,6 +114,20 @@ struct PDNode {
  PDNode* assert_op_has_n_outputs(const std::string& op_type, size_t n);
  PDNode* assert_more(teller_t&& teller);

+  PDNode* assert_is_ops_output(const std::unordered_set<std::string>& op_types);
+  PDNode* assert_is_ops(const std::unordered_set<std::string>& op_types);
+  PDNode* assert_is_ops_output(const std::unordered_set<std::string>& op_types,
+                               const std::string& argument);
+  PDNode* assert_is_ops_nth_input(
+      const std::unordered_set<std::string>& op_types,
+      const std::string& argument, int nth);
+  PDNode* assert_is_ops_input(const std::unordered_set<std::string>& op_types);
+  PDNode* assert_is_ops_input(const std::unordered_set<std::string>& op_types,
+                              const std::string& argument);
+  PDNode* assert_is_ops_nth_output(
+      const std::unordered_set<std::string>& op_types,
+      const std::string& argument, int nth);
+
 private:
  PDNode(PDPattern* pattern, const std::string& name = "",
         Type type = Type::kVar)
@@ -447,6 +462,68 @@ struct GRU : public PatternBase {
  PATTERN_DECL_NODE(Hidden);
 };

+// The following patterns are used to fuse elewise_add and act
+// formula: act(ele_add(x, y))
+// op: elementwise_add + act
+// named nodes: elementwise_add, act
+//              ele_x, ele_y, elewise_add_out, act_out
+struct ElewiseAddAct : public PatternBase {
+  ElewiseAddAct(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "elewise_add_act") {}
+
+  PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(ele_add);
+  PATTERN_DECL_NODE(act);
+  // declare variable node's name
+  PATTERN_DECL_NODE(elewise_add_out);
+  PATTERN_DECL_NODE(ele_y);
+  PATTERN_DECL_NODE(act_out);
+};
+
+// formula: ele_add(x, act(y))
+// op: elementwise_add + act
+// named nodes: elementwise_add, act
+//              act_in, act_out, ele_x, elewise_add_out
+struct ActElewiseAdd : public PatternBase {
+  ActElewiseAdd(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "act_elewise_add") {}
+
+  PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(act);
+  PATTERN_DECL_NODE(ele_add);
+  // declare variable node's name
+  PATTERN_DECL_NODE(act_out);
+  PATTERN_DECL_NODE(ele_x);
+  PATTERN_DECL_NODE(elewise_add_out);
+};
+
+// the backward of act(ele_add(x, y))
+// the act is inplace.
+// op: elementwise_add_grad + act_grad
+// named nodes: elementwise_add_grad, act_grad
+//              act_out, act_out_g, ele_y, d_itermediate_out, d_ele_x, d_ele_y
+struct ElewiseAddActInplaceGrad : public PatternBase {
+  ElewiseAddActInplaceGrad(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "elewise_add_act_grad1") {}
+
+  // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
+  // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
+  PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(act_grad);
+  PATTERN_DECL_NODE(ele_add_grad);
+  // declare variable node's name
+  PATTERN_DECL_NODE(act_out);
+  PATTERN_DECL_NODE(d_itermediate_out);
+  PATTERN_DECL_NODE(d_ele_x);
+  PATTERN_DECL_NODE(d_ele_y);
+  PATTERN_DECL_NODE(ele_y);
+};
 }  // namespace patterns

 // Link two ir::Nodes from each other.
@@ -454,6 +531,12 @@ struct GRU : public PatternBase {
  a->outputs.push_back(b);    \
  b->inputs.push_back(a);

+// Set the out_var as the output of the op
+#define IR_OP_VAR_LINK(op, out_var) \
+  op->outputs.push_back(out_var);   \
+  out_var->inputs.clear();          \
+  out_var->inputs.push_back(op);
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -48,6 +48,10 @@ class Node {

  bool IsOp() const { return type_ == Type::kOperation; }
  bool IsVar() const { return type_ == Type::kVariable; }
+  bool IsCtrlVar() const {
+    return type_ == Type::kVariable &&
+           Name().find(ir::Node::kControlDepVarName) != std::string::npos;
+  }

  std::vector<Node*> inputs;
  std::vector<Node*> outputs;

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -17,12 +17,10 @@
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
-#include <utility>
 #include <vector>
-#include "paddle/fluid/framework/details/cow_ptr.h"
+
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/memory/memcpy.h"

 #include "glog/logging.h"

@@ -30,401 +28,206 @@ namespace paddle {
 namespace framework {

 #if defined(PADDLE_WITH_CUDA)
-namespace details {
-struct CUDABuffer {
-  void *data_{nullptr};
-  size_t size_{0};
-  platform::CUDAPlace place_;
-
-  CUDABuffer() {}
-  CUDABuffer(platform::Place place, size_t size)
-      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
-    data_ = memory::Alloc(place_, size);
-  }
-
-  ~CUDABuffer() { ClearMemory(); }
-
-  CUDABuffer(const CUDABuffer &o) = delete;
-  CUDABuffer &operator=(const CUDABuffer &o) = delete;
-
-  void Resize(platform::Place place, size_t size) {
-    ClearMemory();
-    place_ = boost::get<platform::CUDAPlace>(place);
-    data_ = memory::Alloc(place_, size);
-    size_ = size;
-  }
-
-  void Swap(CUDABuffer &o) {
-    std::swap(data_, o.data_);
-    std::swap(place_, o.place_);
-    std::swap(size_, o.size_);
-  }
-
- private:
-  void ClearMemory() const {
-    if (data_) {
-      memory::Free(place_, data_);
-    }
-  }
-};
-}  // namespace details
-
 // Vector<T> implements the std::vector interface, and can get Data or
 // MutableData from any place. The data will be synced implicitly inside.
 template <typename T>
 class Vector {
 public:
  using value_type = T;
-  using iterator = typename std::vector<T>::iterator;
-  using const_iterator = typename std::vector<T>::const_iterator;
-
- private:
-  // The actual class to implement vector logic
-  class VectorData {
-   public:
-    VectorData() : flag_(kDataInCPU) {}
-    VectorData(size_t count, const T &value)
-        : cpu_(count, value), flag_(kDataInCPU) {}
-    VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
-    template <typename U>
-    explicit VectorData(const std::vector<U> &dat)
-        : cpu_(dat), flag_(kDataInCPU) {}
-
-    VectorData(const VectorData &o) {
-      o.ImmutableCPU();
-      cpu_ = o.cpu_;
-      flag_ = kDataInCPU;
-    }
-
-    VectorData &operator=(const VectorData &o) {
-      o.ImmutableCPU();
-      cpu_ = o.cpu_;
-      flag_ = kDataInCPU;
-      details::CUDABuffer null;
-      gpu_.Swap(null);
-      return *this;
-    }
-
-    T &operator[](size_t i) {
-      MutableCPU();
-      return cpu_[i];
-    }
-
-    const T &operator[](size_t i) const {
-      ImmutableCPU();
-      return cpu_[i];
-    }
-
-    size_t size() const { return cpu_.size(); }
-
-    iterator begin() {
-      MutableCPU();
-      return cpu_.begin();
-    }
-
-    iterator end() {
-      MutableCPU();
-      return cpu_.end();
-    }
-
-    T &front() {
-      MutableCPU();
-      return cpu_.front();
-    }
-
-    T &back() {
-      MutableCPU();
-      return cpu_.back();
-    }
-
-    const_iterator begin() const {
-      ImmutableCPU();
-      return cpu_.begin();
-    }
-
-    const_iterator end() const {
-      ImmutableCPU();
-      return cpu_.end();
-    }
-
-    const T &back() const {
-      ImmutableCPU();
-      return cpu_.back();
-    }
-
-    T *data() { return &(*this)[0]; }
-
-    const T *data() const { return &(*this)[0]; }
-
-    const T &front() const {
-      ImmutableCPU();
-      return cpu_.front();
-    }
-
-    // assign this from iterator.
-    // NOTE: the iterator must support `end-begin`
-    template <typename Iter>
-    void assign(Iter begin, Iter end) {
-      MutableCPU();
-      cpu_.assign(begin, end);
-    }
-
-    // push_back. If the previous capacity is not enough, the memory will
-    // double.
-    void push_back(T elem) {
-      MutableCPU();
-      cpu_.push_back(elem);
-    }
-
-    // extend a vector by iterator.
-    // NOTE: the iterator must support end-begin
-    template <typename It>
-    void Extend(It begin, It end) {
-      MutableCPU();
-      auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
-      std::copy(begin, end, out_it);
-    }
-
-    // resize the vector
-    void resize(size_t size) {
-      MutableCPU();
-      cpu_.resize(size);
-    }
-
-    // get cuda ptr. immutable
-    const T *CUDAData(platform::Place place) const {
-      PADDLE_ENFORCE(platform::is_gpu_place(place),
-                     "CUDA Data must on CUDA place");
-      ImmutableCUDA(place);
-      return reinterpret_cast<T *>(gpu_.data_);
-    }
-
-    // get cuda ptr. mutable
-    T *CUDAMutableData(platform::Place place) {
-      const T *ptr = CUDAData(place);
-      flag_ = kDirty | kDataInCUDA;
-      return const_cast<T *>(ptr);
-    }
-
-    // clear
-    void clear() {
-      cpu_.clear();
-      flag_ = kDirty | kDataInCPU;
-    }
-
-    size_t capacity() const { return cpu_.capacity(); }
-
-    // reserve data
-    void reserve(size_t size) { cpu_.reserve(size); }
-
-    // implicit cast operator. Vector can be cast to std::vector implicitly.
-    operator std::vector<T>() const {
-      ImmutableCPU();
-      return cpu_;
-    }
-
-    bool operator==(const VectorData &other) const {
-      ImmutableCPU();
-      other.ImmutableCPU();
-      return cpu_ == other.cpu_;
-    }
-
-   private:
-    enum DataFlag {
-      kDataInCPU = 0x01,
-      kDataInCUDA = 0x02,
-      // kDirty means the data has been changed in one device.
-      kDirty = 0x10
-    };
-
-    void CopyToCPU() const {
-      // COPY GPU Data To CPU
-      void *src = gpu_.data_;
-      void *dst = cpu_.data();
-      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
-                   nullptr);
-    }
-
-    void MutableCPU() {
-      if (IsInCUDA() && IsDirty()) {
-        CopyToCPU();
-      }
-      flag_ = kDirty | kDataInCPU;
-    }
-
-    void ImmutableCUDA(platform::Place place) const {
-      if (IsDirty()) {
-        if (IsInCPU()) {
-          CopyCPUDataToCUDA(place);
-          UnsetFlag(kDirty);
-          SetFlag(kDataInCUDA);
-        } else if (IsInCUDA() &&
-                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-          CopyCUDADataToAnotherPlace(place);
-          // Still dirty
-        } else {
-          // Dirty && DataInCUDA && Device is same
-          // Do nothing
-        }
-      } else {
-        if (!IsInCUDA()) {
-          // Even data is not dirty. However, data is not in CUDA. Copy data.
-          CopyCPUDataToCUDA(place);
-          SetFlag(kDataInCUDA);
-        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-          CopyCUDADataToAnotherPlace(place);
-        } else {
-          // Not Dirty && DataInCUDA && Device is same
-          // Do nothing.
-        }
-      }
-    }
-    void CopyCUDADataToAnotherPlace(const platform::Place &place) const {
-      details::CUDABuffer tmp(place, gpu_.size_);
-      const void *src = gpu_.data_;
-      void *dst = tmp.data_;
-
-      memory::Copy(tmp.place_, dst, gpu_.place_, src, gpu_.size_, nullptr);
-      gpu_.Swap(tmp);
-    }
-    void CopyCPUDataToCUDA(const platform::Place &place) const {
-      void *src = cpu_.data();
-      gpu_.Resize(place, cpu_.size() * sizeof(T));
-      void *dst = gpu_.data_;
-      auto stream = static_cast<platform::CUDADeviceContext *>(
-                        platform::DeviceContextPool::Instance().Get(place))
-                        ->stream();
-      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
-                   stream);
-    }
-
-    void ImmutableCPU() const {
-      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
-                                      // CPU has no data.
-        CopyToCPU();
-        UnsetFlag(kDirty);
-      }
-      SetFlag(kDataInCPU);
-    }
-
-    void UnsetFlag(int flag) const { flag_ &= ~flag; }
-    void SetFlag(int flag) const { flag_ |= flag; }
-
-    bool IsDirty() const { return flag_ & kDirty; }
-
-    bool IsInCUDA() const { return flag_ & kDataInCUDA; }

-    bool IsInCPU() const { return flag_ & kDataInCPU; }
-
-    mutable std::vector<T> cpu_;
-    mutable details::CUDABuffer gpu_;
-    mutable int flag_;
-  };
-
- public:
  // Default ctor. Create empty Vector
-  Vector() : m_(new VectorData()) {}
+  Vector() { InitEmpty(); }

  // Fill vector with value. The vector size is `count`.
-  explicit Vector(size_t count, const T &value = T())
-      : m_(new VectorData(count, value)) {}
+  explicit Vector(size_t count, const T &value = T()) {
+    InitEmpty();
+    if (count != 0) {
+      resize(count);
+      T *ptr = begin();
+      for (size_t i = 0; i < count; ++i) {
+        ptr[i] = value;
+      }
+    }
+  }

  // Ctor with init_list
-  Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
+  Vector(std::initializer_list<T> init) {
+    if (init.size() == 0) {
+      InitEmpty();
+    } else {
+      InitByIter(init.size(), init.begin(), init.end());
+    }
+  }

  // implicit cast from std::vector.
  template <typename U>
-  Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) {  // NOLINT
+  Vector(const std::vector<U> &dat) {  // NOLINT
+    if (dat.size() == 0) {
+      InitEmpty();
+    } else {
+      InitByIter(dat.size(), dat.begin(), dat.end());
+    }
  }

  // Copy ctor
-  Vector(const Vector<T> &other) { m_ = other.m_; }
+  Vector(const Vector<T> &other) { this->operator=(other); }

  // Copy operator
  Vector<T> &operator=(const Vector<T> &other) {
-    m_ = other.m_;
+    if (other.size() != 0) {
+      this->InitByIter(other.size(), other.begin(), other.end());
+    } else {
+      InitEmpty();
+    }
    return *this;
  }

  // Move ctor
-  Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
+  Vector(Vector<T> &&other) {
+    this->size_ = other.size_;
+    this->flag_ = other.flag_;
+    if (other.cuda_vec_.memory_size()) {
+      this->cuda_vec_.ShareDataWith(other.cuda_vec_);
+    }
+    if (other.cpu_vec_.memory_size()) {
+      this->cpu_vec_.ShareDataWith(other.cpu_vec_);
+    }
+  }

  // CPU data access method. Mutable.
-  T &operator[](size_t i) { return (*m_)[i]; }
+  T &operator[](size_t i) {
+    MutableCPU();
+    return const_cast<T *>(cpu_vec_.data<T>())[i];
+  }

  // CPU data access method. Immutable.
-  const T &operator[](size_t i) const { return (*m_)[i]; }
+  const T &operator[](size_t i) const {
+    ImmutableCPU();
+    return cpu_vec_.data<T>()[i];
+  }

  // std::vector iterator methods. Based on CPU data access method
-  size_t size() const { return m_->size(); }
+  size_t size() const { return size_; }

-  iterator begin() { return m_->begin(); }
+  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }

-  iterator end() { return m_->end(); }
+  T *end() {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+  }

-  T &front() { return m_->front(); }
+  T &front() { return *begin(); }

-  T &back() { return m_->back(); }
+  T &back() {
+    auto it = end();
+    --it;
+    return *it;
+  }

-  const_iterator begin() const { return m_->begin(); }
+  const T *begin() const {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
+  }

-  const_iterator end() const { return m_->end(); }
+  const T *end() const {
+    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+  }

-  const_iterator cbegin() const { return begin(); }
+  const T *cbegin() const { return begin(); }

-  const_iterator cend() const { return end(); }
+  const T *cend() const { return end(); }

-  const T &back() const { return m_->back(); }
+  const T &back() const {
+    auto it = end();
+    --it;
+    return *it;
+  }

-  T *data() { return m_->data(); }
+  T *data() { return begin(); }

-  const T *data() const { return m_->data(); }
+  const T *data() const { return begin(); }

-  const T &front() const { return m_->front(); }
+  const T &front() const { return *begin(); }
  // end of std::vector iterator methods

  // assign this from iterator.
  // NOTE: the iterator must support `end-begin`
  template <typename Iter>
  void assign(Iter begin, Iter end) {
-    m_->assign(begin, end);
+    InitByIter(end - begin, begin, end);
  }

  // push_back. If the previous capacity is not enough, the memory will
  // double.
-  void push_back(T elem) { m_->push_back(elem); }
+  void push_back(T elem) {
+    if (size_ + 1 > capacity()) {
+      reserve((size_ + 1) << 1);
+    }
+    *end() = elem;
+    ++size_;
+  }

  // extend a vector by iterator.
  // NOTE: the iterator must support end-begin
  template <typename It>
  void Extend(It begin, It end) {
-    m_->Extend(begin, end);
+    size_t pre_size = size_;
+    resize(pre_size + (end - begin));
+    T *ptr = this->begin() + pre_size;
+    for (; begin < end; ++begin, ++ptr) {
+      *ptr = *begin;
+    }
  }

  // resize the vector
  void resize(size_t size) {
-    if (m_.Data().size() != size) {
-      m_->resize(size);
+    if (size + 1 <= capacity()) {
+      size_ = size;
+    } else {
+      MutableCPU();
+      Tensor cpu_tensor;
+      platform::Place cpu = platform::CPUPlace();
+      T *ptr = cpu_tensor.mutable_data<T>(
+          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
+      const T *old_ptr =
+          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
+      if (old_ptr != nullptr) {
+        std::copy(old_ptr, old_ptr + size_, ptr);
+      }
+      size_ = size;
+      cpu_vec_.ShareDataWith(cpu_tensor);
    }
  }

  // get cuda ptr. immutable
  const T *CUDAData(platform::Place place) const {
-    return m_.Data().CUDAData(place);
+    PADDLE_ENFORCE(platform::is_gpu_place(place),
+                   "CUDA Data must on CUDA place");
+    ImmutableCUDA(place);
+    return cuda_vec_.data<T>();
  }

  // get cuda ptr. mutable
  T *CUDAMutableData(platform::Place place) {
-    return m_->CUDAMutableData(place);
+    const T *ptr = CUDAData(place);
+    flag_ = kDirty | kDataInCUDA;
+    return const_cast<T *>(ptr);
  }

  // clear
-  void clear() { m_->clear(); }
+  void clear() {
+    size_ = 0;
+    flag_ = kDirty | kDataInCPU;
+  }

-  size_t capacity() const { return m_->capacity(); }
+  size_t capacity() const {
+    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
+  }

  // reserve data
-  void reserve(size_t size) { m_->reserve(size); }
+  void reserve(size_t size) {
+    size_t pre_size = size_;
+    resize(size);
+    resize(pre_size);
+  }

  // the unify method to access CPU or CUDA data. immutable.
  const T *Data(platform::Place place) const {
@@ -445,7 +248,12 @@ class Vector {
  }

  // implicit cast operator. Vector can be cast to std::vector implicitly.
-  operator std::vector<T>() const { return *m_; }
+  operator std::vector<T>() const {
+    std::vector<T> result;
+    result.resize(size());
+    std::copy(begin(), end(), result.begin());
+    return result;
+  }

  bool operator==(const Vector<T> &other) const {
    if (size() != other.size()) return false;
@@ -459,11 +267,118 @@ class Vector {
    return true;
  }

-  const void *Handle() const { return &m_.Data(); }
-
 private:
-  // Vector is an COW object.
-  details::COWPtr<VectorData> m_;
+  void InitEmpty() {
+    size_ = 0;
+    flag_ = kDataInCPU;
+  }
+
+  template <typename Iter>
+  void InitByIter(size_t size, Iter begin, Iter end) {
+    platform::Place cpu = platform::CPUPlace();
+    T *ptr = this->cpu_vec_.template mutable_data<T>(
+        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
+    for (size_t i = 0; i < size; ++i) {
+      *ptr++ = *begin++;
+    }
+    flag_ = kDataInCPU | kDirty;
+    size_ = size;
+  }
+
+  enum DataFlag {
+    kDataInCPU = 0x01,
+    kDataInCUDA = 0x02,
+    // kDirty means the data has been changed in one device.
+    kDirty = 0x10
+  };
+
+  void CopyToCPU() const {
+    // COPY GPU Data To CPU
+    TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
+    WaitPlace(cuda_vec_.place());
+  }
+
+  void MutableCPU() {
+    if (IsInCUDA() && IsDirty()) {
+      CopyToCPU();
+    }
+    flag_ = kDirty | kDataInCPU;
+  }
+
+  void ImmutableCUDA(platform::Place place) const {
+    if (IsDirty()) {
+      if (IsInCPU()) {
+        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+                   &cuda_vec_);
+        WaitPlace(place);
+        UnsetFlag(kDirty);
+        SetFlag(kDataInCUDA);
+      } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
+        framework::Tensor tmp;
+        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+        WaitPlace(cuda_vec_.place());
+        cuda_vec_.ShareDataWith(tmp);
+        // Still dirty
+      } else {
+        // Dirty && DataInCUDA && Device is same
+        // Do nothing
+      }
+    } else {
+      if (!IsInCUDA()) {
+        // Even data is not dirty. However, data is not in CUDA. Copy data.
+        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+                   &cuda_vec_);
+        WaitPlace(place);
+        SetFlag(kDataInCUDA);
+      } else if (!(place == cuda_vec_.place())) {
+        framework::Tensor tmp;
+        WaitPlace(cuda_vec_.place());
+        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+        WaitPlace(cuda_vec_.place());
+        WaitPlace(place);
+        cuda_vec_.ShareDataWith(tmp);
+      } else {
+        // Not Dirty && DataInCUDA && Device is same
+        // Do nothing.
+      }
+    }
+  }
+
+  void ImmutableCPU() const {
+    if (IsDirty() &&
+        !IsInCPU()) {  // If data has been changed in CUDA, or CPU has no data.
+      CopyToCPU();
+      UnsetFlag(kDirty);
+    }
+    SetFlag(kDataInCPU);
+  }
+
+  void UnsetFlag(int flag) const { flag_ &= ~flag; }
+  void SetFlag(int flag) const { flag_ |= flag; }
+
+  bool IsDirty() const { return flag_ & kDirty; }
+
+  bool IsInCUDA() const { return flag_ & kDataInCUDA; }
+
+  bool IsInCPU() const { return flag_ & kDataInCPU; }
+
+  static void WaitPlace(const platform::Place place) {
+    if (platform::is_gpu_place(place)) {
+      platform::DeviceContextPool::Instance()
+          .Get(boost::get<platform::CUDAPlace>(place))
+          ->Wait();
+    }
+  }
+
+  static T &EmptyDummy() {
+    static T dummy = T();
+    return dummy;
+  }
+
+  mutable int flag_;
+  mutable Tensor cpu_vec_;
+  mutable Tensor cuda_vec_;
+  size_t size_;
 };

 #else  // PADDLE_WITH_CUDA

--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -120,6 +120,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
          {static_cast<int>(OpRole::kForward),
           static_cast<int>(OpRole::kBackward),
           static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
+           static_cast<int>(OpRole::kDist), static_cast<int>(OpRole::kLRSched),
           static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
           static_cast<int>(OpRole::kLoss) |
               static_cast<int>(OpRole::kBackward),

--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -26,7 +26,13 @@ enum class OpRole {
  kForward = 0x0000,
  kBackward = 0x0001,
  kOptimize = 0x0002,
+  // RPC role is for send/recv related op
  kRPC = 0x0003,
+  // Dist role is for split_byref/split_selected_rows/concat
+  // used for distributed training.
+  kDist = 0x0004,
+  // Tag all learning rate scheduler operators.
+  kLRSched = 0x0005,

  kLoss = 0x0100,
  // The default value of op's role. This should be only used for unittests and

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -57,6 +57,21 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
    graph = viz_pass->Apply(std::move(graph));
  }

+  // Apply op fusion.
+  if (strategy.fuse_elewise_add_act_ops_) {
+    auto fuse_elewise_add_act_pass =
+        ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
+    graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
+    // Apply a graph viz pass to record a graph.
+    if (!strategy.debug_graphviz_path_.empty()) {
+      auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
+      const std::string graph_path = string::Sprintf(
+          "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
+      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+      graph = viz_pass->Apply(std::move(graph));
+    }
+  }
+
  // Convert graph to run on multi-devices.
  auto multi_devices_pass =
      ir::PassRegistry::Instance().Get("multi_devices_pass");
@@ -359,6 +374,7 @@ ParallelExecutor::~ParallelExecutor() {
 }  // namespace framework
 }  // namespace paddle

+USE_PASS(fuse_elewise_add_act_pass);
 USE_PASS(graph_viz_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);

--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -103,108 +103,74 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  input_slots->assign({input_tensor});
 }

-const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
-                                25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
-                                44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
-                                14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
-
-void TestLACPrediction(const std::string &model_path,
-                       const std::string &data_file, const int batch_size,
-                       const int repeat, bool use_analysis = false) {
-  AnalysisConfig cfg;
-  cfg.model_dir = model_path;
-  cfg.use_gpu = false;
-  cfg.device = 0;
-  cfg.specify_input_name = true;
-  cfg.enable_ir_optim = true;
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg for the LAC tests.
+  cfg->model_dir = FLAGS_infer_model;  // Model is given as one directory.
+  cfg->use_gpu = false;  // Run on CPU.
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}

-  std::vector<PaddleTensor> input_slots, outputs_slots;
-  DataRecord data(data_file, batch_size);
-  GetOneBatch(&input_slots, &data, batch_size);
-  std::unique_ptr<PaddlePredictor> predictor;
-  if (use_analysis) {
-    predictor =
-        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
-  } else {
-    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-  }
-  for (int i = 0; i < FLAGS_burning; i++) {
-    predictor->Run(input_slots, &outputs_slots);
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.batched_datas.size() : 1;
+  LOG(INFO) << "number of samples: " << epoch;
+  for (int bid = 0; bid < epoch; ++bid) {
+    GetOneBatch(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
  }
-  Timer timer;
-  if (FLAGS_test_all_data) {
-    LOG(INFO) << "test all data";
-    std::vector<std::vector<PaddleTensor>> input_slots_all;
-    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
-      GetOneBatch(&input_slots, &data, batch_size);
-      input_slots_all.emplace_back(input_slots);
-    }
-    LOG(INFO) << "total number of samples: " << data.datasets.size();
-    TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
-    return;
-  }
-  timer.tic();
-  for (int i = 0; i < repeat; i++) {
-    predictor->Run(input_slots, &outputs_slots);
-  }
-  PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat);
+}

-  // check result
-  EXPECT_EQ(outputs_slots.size(), 1UL);
-  auto &out = outputs_slots[0];
-  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                [](int a, int b) { return a * b; });
-  size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
-  PADDLE_ENFORCE_GT(size, 0);
-  EXPECT_GE(size, batch1_size);
-  int64_t *pdata = static_cast<int64_t *>(out.data.data());
-  for (size_t i = 0; i < batch1_size; ++i) {
-    EXPECT_EQ(pdata[i], lac_ref_data[i]);
-  }
+// Easy for profiling independently.
+TEST(Analyzer_LAC, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-  if (use_analysis) {
-    // run once for comparion as reference
-    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-    std::vector<PaddleTensor> ref_outputs_slots;
-    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    CompareResult(ref_outputs_slots, outputs_slots);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    // the first inference result
+    const int64_t lac_ref_data[] = {
+        24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
+        44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
+        15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    size_t size = GetSize(outputs[0]);
+    size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
+    PADDLE_ENFORCE_GE(size, batch1_size);
+    int64_t *pdata = static_cast<int64_t *>(outputs[0].data.data());
+    for (size_t i = 0; i < batch1_size; ++i) {
+      EXPECT_EQ(pdata[i], lac_ref_data[i]);
    }
-    LOG(INFO) << "has num ops: " << num_ops;
-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
-    EXPECT_EQ(num_ops, 11);
  }
 }

-TEST(Analyzer_LAC, native) {
-  LOG(INFO) << "LAC with native";
-  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat);
+// Check the fuse statistics: expected fuse counts and the final op count.
+TEST(Analyzer_LAC, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  int num_ops;  // Passed by pointer to GetFuseStatis; presumably set there.
+  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));  // Guards the .at() calls below.
+  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
+  EXPECT_EQ(num_ops, 11);
 }

-TEST(Analyzer_LAC, analysis) {
-  LOG(INFO) << "LAC with analysis";
-  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat, true);
+// Compare the results obtained with NativeConfig and with AnalysisConfig.
+TEST(Analyzer_LAC, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);  // Appends one input vector per batch.
+  CompareNativeAndAnalysis(cfg, input_slots_all);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -95,97 +95,73 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }

-// the first inference result
-const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
-                                       48, 39, 38, 16, 25};
-
-void TestChineseNERPrediction(bool use_analysis) {
-  AnalysisConfig cfg;
-  cfg.prog_file = FLAGS_infer_model + "/__model__";
-  cfg.param_file = FLAGS_infer_model + "/param";
-  cfg.use_gpu = false;
-  cfg.device = 0;
-  cfg.specify_input_name = true;
-  cfg.enable_ir_optim = true;
-
-  std::vector<PaddleTensor> input_slots, outputs;
-  std::unique_ptr<PaddlePredictor> predictor;
-  Timer timer;
-  if (use_analysis) {
-    predictor =
-        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
-  } else {
-    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-  }
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg for the Chinese NER tests.
+  cfg->prog_file = FLAGS_infer_model + "/__model__";  // Under the model dir.
+  cfg->param_file = FLAGS_infer_model + "/param";  // Under the model dir.
+  cfg->use_gpu = false;  // Run on CPU.
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}

-  if (FLAGS_test_all_data) {
-    LOG(INFO) << "test all data";
-    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-    std::vector<std::vector<PaddleTensor>> input_slots_all;
-    for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
-      PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-      input_slots_all.emplace_back(input_slots);
-    }
-    LOG(INFO) << "total number of samples: " << data.num_samples;
-    TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
-    return;
-  }
-  // Prepare inputs.
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-
-  timer.tic();
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    predictor->Run(input_slots, &outputs);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
  }
-  PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, timer.toc() / FLAGS_repeat);
+}

-  PADDLE_ENFORCE(outputs.size(), 1UL);
-  auto &out = outputs[0];
-  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                [](int a, int b) { return a * b; });
-  PADDLE_ENFORCE_GT(size, 0);
-  int64_t *result = static_cast<int64_t *>(out.data.data());
-  for (size_t i = 0; i < std::min(11UL, size); i++) {
-    PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
-  }
+// Easy for profiling independently.
+TEST(Analyzer_Chinese_ner, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-  if (use_analysis) {
-    // run once for comparion as reference
-    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-    std::vector<PaddleTensor> ref_outputs_slots;
-    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    CompareResult(ref_outputs_slots, outputs);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    // the first inference result
+    const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
+                                           48, 39, 38, 16, 25};
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(size, 0);
+    int64_t *result = static_cast<int64_t *>(outputs[0].data.data());
+    for (size_t i = 0; i < std::min(11UL, size); i++) {
+      EXPECT_EQ(result[i], chinese_ner_result_data[i]);
    }
-    LOG(INFO) << "has num ops: " << num_ops;
-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
-    EXPECT_EQ(num_ops, 14);
  }
 }

-TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); }
+// Check the fuse status
+TEST(Analyzer_Chinese_ner, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);

-TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); }
+  int num_ops;
+  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
+  EXPECT_EQ(num_ops, 14);
+}
+
+// Compare the results obtained with NativeConfig and with AnalysisConfig.
+TEST(Analyzer_Chinese_ner, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);  // Appends one input vector per batch.
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}

 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -25,6 +25,7 @@ struct DataRecord {
  std::vector<size_t> lod1, lod2, lod3;
  std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
      rnn_minute_datas;
+  size_t num_samples;  // total number of samples
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;
@@ -97,6 +98,7 @@ struct DataRecord {
      week_data_all.push_back(std::move(week_data));
      minute_data_all.push_back(std::move(minute_data));
    }
+    num_samples = num_lines;
  }
 };
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
@@ -147,89 +149,72 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }

-// Test with a really complicate model.
-void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
-  AnalysisConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  config.enable_ir_optim = activate_ir;
-  PADDLE_ENFORCE(config.ir_mode ==
-                 AnalysisConfig::IrPassMode::kExclude);  // default
-  config.ir_passes.clear();  // Do not exclude any pass.
-
-  int batch_size = FLAGS_batch_size;
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg for the rnn1 tests.
+  cfg->prog_file = FLAGS_infer_model + "/__model__";  // Under the model dir.
+  cfg->param_file = FLAGS_infer_model + "/param";  // Under the model dir.
+  cfg->use_gpu = false;  // Run on CPU.
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+  cfg->ir_passes.clear();  // Do not exclude any pass.
+}

-  auto base_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
  std::vector<PaddleTensor> input_slots;
-  DataRecord data(FLAGS_infer_data, batch_size);
-  // Prepare inputs.
-  PrepareInputs(&input_slots, &data, batch_size);
-  std::vector<PaddleTensor> outputs, base_outputs;
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
+  }
+}

-  base_predictor->Run(input_slots, &base_outputs);
+// Easy for profiling independently.
+TEST(Analyzer_rnn1, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  input_slots_all.emplace_back(input_slots);
-  if (num_threads == 1) {
-    TestOneThreadPrediction(config, input_slots_all, &outputs);
-    CompareResult(outputs, base_outputs);
-  } else {
-    // only return the output of first thread
-    TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
-  }
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+}

-  if (use_analysis && activate_ir) {
-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
+// Check the fuse status
+TEST(Analyzer_rnn1, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);

-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
-    }
-    LOG(INFO) << "has num ops: " << num_ops;
+  int num_ops;
+  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
+  EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
+  EXPECT_EQ(num_ops,
+            13);  // After graph optimization, only 13 operators exist.
+}

-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
-    EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
-    EXPECT_EQ(num_ops,
-              13);  // After graph optimization, only 13 operators exists.
-  }
+// Compare the results obtained with NativeConfig and with AnalysisConfig.
+TEST(Analyzer_rnn1, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);  // Appends one input vector per batch.
+  CompareNativeAndAnalysis(cfg, input_slots_all);
 }

-// Inference with analysis and IR, easy for profiling independently.
-TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
+// Test Multi-Thread.
+TEST(Analyzer_rnn1, multi_thread) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-// Other unit-tests of RNN1, test different options of use_analysis,
-// activate_ir and multi-threads.
-TEST(Analyzer, RNN_tests) {
-  int num_threads[2] = {1, 4};
-  for (auto i : num_threads) {
-    // Directly infer with the original model.
-    TestRNN1Prediction(false, false, i);
-    // Inference with the original model with the analysis turned on, the
-    // analysis module will transform the program to a data flow graph.
-    TestRNN1Prediction(true, false, i);
-    // Inference with analysis and IR. The IR module will fuse some large
-    // kernels.
-    TestRNN1Prediction(true, true, i);
-  }
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
 }

 }  // namespace inference

--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
@@ -12,24 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-
-#include <google/protobuf/text_format.h>
-#include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@@ -41,6 +24,7 @@ struct DataRecord {
  std::vector<size_t> lod;
  std::vector<std::vector<float>> rnn_link_data;
  std::vector<float> result_data;
+  size_t num_samples;  // total number of samples
  size_t batch_iter{0};
  size_t batch_size{1};
  DataRecord() = default;
@@ -100,6 +84,7 @@ struct DataRecord {
        result_data.insert(result_data.end(), tmp.begin(), tmp.end());
      }
    }
+    num_samples = num_lines / 2;
  }
 };
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
@@ -118,64 +103,58 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  input_slots->assign({feed_tensor});
 }

-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<float> &base_result) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_result[i], 1e-3);
-    }
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg for the rnn2 tests.
+  cfg->prog_file = FLAGS_infer_model + "/__model__";  // Under the model dir.
+  cfg->param_file = FLAGS_infer_model + "/param";  // Under the model dir.
+  cfg->use_gpu = false;  // Run on CPU.
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;  // Reused; copied into *inputs below.
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {  // epoch is a batch count here.
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);  // Integer division drops any remainder.
   }
 }
-// Test with a really complicate model.
-void TestRNN2Prediction() {
-  AnalysisConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  config.enable_ir_optim = true;
-  PADDLE_ENFORCE(config.ir_mode ==
-                 AnalysisConfig::IrPassMode::kExclude);  // default

-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
+// Easy for profiling independently.
+TEST(Analyzer_rnn2, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-  auto base_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
-  std::vector<PaddleTensor> input_slots;
-  DataRecord data(FLAGS_infer_data, batch_size);
-  PrepareInputs(&input_slots, &data, batch_size);
-  std::vector<PaddleTensor> outputs, base_outputs;
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

-  Timer timer1;
-  timer1.tic();
-  for (int i = 0; i < num_times; i++) {
-    base_predictor->Run(input_slots, &base_outputs);
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    // the first inference result
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *result = static_cast<float *>(outputs[0].data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(result[i], data.result_data[i], 1e-3);
+    }
  }
-  PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times);
+}

-  Timer timer2;
-  timer2.tic();
-  for (int i = 0; i < num_times; i++) {
-    predictor->Run(input_slots, &outputs);
-  }
-  PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times);
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_rnn2, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);

-  CompareResult(base_outputs, data.result_data);
-  CompareResult(outputs, data.result_data);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
 }

-TEST(Analyzer, rnn2) { TestRNN2Prediction(); }
-
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -46,54 +46,63 @@ struct DataReader {
  std::unique_ptr<std::ifstream> file;
 };

-void Main(int batch_size) {
-  // shape --
-  // Create Predictor --
-  AnalysisConfig config;
-  config.model_dir = FLAGS_infer_model;
-  config.use_gpu = false;
-  config.enable_ir_optim = true;
+void SetConfig(AnalysisConfig *cfg) {  // Fills *cfg for these tests.
+  cfg->model_dir = FLAGS_infer_model;  // Model is given as one directory.
+  cfg->use_gpu = false;  // Run on CPU.
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}

-  std::vector<PaddleTensor> input_slots, output_slots;
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  std::vector<PaddleTensor> input_slots;
  DataReader reader(FLAGS_infer_data);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-
-  if (FLAGS_test_all_data) {
-    LOG(INFO) << "test all data";
-    int num_batches = 0;
-    while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
-      input_slots_all.emplace_back(input_slots);
-      ++num_batches;
-    }
-    LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
-    TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
-    return;
+  int num_batches = 0;
+  while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
+    (*inputs).emplace_back(input_slots);
+    ++num_batches;
+    if (!FLAGS_test_all_data) return;
  }
+  LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
+}

-  // one batch starts
-  // data --
-  reader.NextBatch(&input_slots, FLAGS_batch_size);
-  input_slots_all.emplace_back(input_slots);
-  TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
+// Easy for profiling independently.
+TEST(Analyzer_Text_Classification, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;

-  // Get output
-  LOG(INFO) << "get outputs " << output_slots.size();
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);

-  for (auto &output : output_slots) {
-    LOG(INFO) << "output.shape: " << to_string(output.shape);
-    // no lod ?
-    CHECK_EQ(output.lod.size(), 0UL);
-    LOG(INFO) << "output.dtype: " << output.dtype;
-    std::stringstream ss;
-    for (int i = 0; i < 5; i++) {
-      ss << static_cast<float *>(output.data.data())[i] << " ";
+  if (FLAGS_num_threads == 1) {
+    // Get output
+    LOG(INFO) << "get outputs " << outputs.size();
+    for (auto &output : outputs) {
+      LOG(INFO) << "output.shape: " << to_string(output.shape);
+      // no lod ?
+      CHECK_EQ(output.lod.size(), 0UL);
+      LOG(INFO) << "output.dtype: " << output.dtype;
+      std::stringstream ss;
+      for (int i = 0; i < 5; i++) {
+        ss << static_cast<float *>(output.data.data())[i] << " ";
+      }
+      LOG(INFO) << "output.data summary: " << ss.str();
+      // one batch ends
    }
-    LOG(INFO) << "output.data summary: " << ss.str();
-    // one batch ends
  }
 }

-TEST(text_classification, basic) { Main(FLAGS_batch_size); }
+// Compare the results obtained with NativeConfig and with AnalysisConfig.
+TEST(Analyzer_Text_Classification, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);  // Appends one input vector per batch.
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}

 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
@@ -49,84 +49,83 @@ Record ProcessALine(const std::string &line) {
  return record;
 }

-/*
- * Use the native and analysis fluid engine to inference the demo.
- * ocr, mobilenet and se_resnext50
- */
-void TestVisualPrediction(bool use_mkldnn) {
-  std::unique_ptr<PaddlePredictor> predictor;
-  AnalysisConfig cfg;
-  cfg.param_file = FLAGS_infer_model + "/__params__";
-  cfg.prog_file = FLAGS_infer_model + "/__model__";
-  cfg.use_gpu = false;
-  cfg._use_mkldnn = use_mkldnn;
-  cfg.device = 0;
-  cfg.enable_ir_optim = true;
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->param_file = FLAGS_infer_model + "/__params__";
+  cfg->prog_file = FLAGS_infer_model + "/__model__";
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->enable_ir_optim = true;
+  cfg->specify_input_name = true;
  // TODO(TJ): fix fusion gru
-  cfg.ir_passes.push_back("fc_gru_fuse_pass");
+  cfg->ir_passes.push_back("fc_gru_fuse_pass");
 #ifdef PADDLE_WITH_MKLDNN
+  cfg->_use_mkldnn = true;
  // disable mkldnn fuse since it should have some bugs
-  cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
+  cfg->ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
 #endif
-  predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+}

-  // Only have single batch of data.
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
  std::string line;
  std::ifstream file(FLAGS_infer_data);
  std::getline(file, line);
  auto record = ProcessALine(line);
-  file.close();

-  // Inference.
  PaddleTensor input;
  input.shape = record.shape;
-  input.data =
-      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
  input.dtype = PaddleDType::FLOAT32;
+  size_t input_size = record.data.size() * sizeof(float);
+  input.data.Resize(input_size);
+  memcpy(input.data.data(), record.data.data(), input_size);
+  std::vector<PaddleTensor> input_slots;
+  input_slots.assign({input});
+  (*inputs).emplace_back(input_slots);
+}

-  std::vector<PaddleTensor> outputs_slots;
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    predictor->Run({input}, &outputs_slots);
-  }
-  PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0,
-            timer.toc() / FLAGS_repeat);
-
-  VLOG(3) << "output.size " << outputs_slots.size();
-
-  // run native as reference
-  auto ref_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-  std::vector<PaddleTensor> ref_outputs_slots;
-  ref_predictor->Run({input}, &ref_outputs_slots);
-  CompareResult(outputs_slots, ref_outputs_slots);
-  // print what are fused
-  AnalysisPredictor *analysis_predictor =
-      dynamic_cast<AnalysisPredictor *>(predictor.get());
-  auto &fuse_statis = analysis_predictor->analysis_argument()
-                          .Get<std::unordered_map<std::string, int>>(
-                              framework::ir::kFuseStatisAttr);
-  for (auto &item : fuse_statis) {
-    LOG(INFO) << "fused " << item.first << " " << item.second;
-  }
-  int num_ops = 0;
-  for (auto &node :
-       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-    if (node->IsFunction()) {
-      ++num_ops;
+// Easy for profiling independently.
+//  ocr, mobilenet and se_resnext50
+TEST(Analyzer_vis, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    const float ocr_result_data[] = {
+        5.273636460856323538e-08, 3.296741795111302054e-07,
+        1.873261190610264748e-08, 3.403730275408634043e-08,
+        3.383312474625199684e-08};
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *result = static_cast<float *>(outputs[0].data.data());
+    for (size_t i = 0; i < std::min(5UL, size); i++) {
+      EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3);
    }
  }
-  LOG(INFO) << "has num ops: " << num_ops;
 }

-TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_vis, analysis_mkldnn) {
-  TestVisualPrediction(/*use_mkldnn*/ true);
+// Check the fuse status
+TEST(Analyzer_vis, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  int num_ops;
+  GetFuseStatis(cfg, &num_ops);
+}
+
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_vis, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
 }
-#endif

 }  // namespace analysis
 }  // namespace inference

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -15,6 +15,7 @@
 #pragma once

 #include <gtest/gtest.h>
+#include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
@@ -28,17 +29,18 @@
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
 DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(burning, 0, "Burning before repeat.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+DEFINE_bool(use_analysis, true,
+            "Running the inference program in analysis mode.");

 namespace paddle {
 namespace inference {

 void CompareResult(const std::vector<PaddleTensor> &outputs,
                   const std::vector<PaddleTensor> &ref_outputs) {
-  EXPECT_GT(outputs.size(), 0);
+  EXPECT_GT(outputs.size(), 0UL);
  EXPECT_EQ(outputs.size(), ref_outputs.size());
  for (size_t i = 0; i < outputs.size(); i++) {
    auto &out = outputs[i];
@@ -72,14 +74,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
  }
 }

+std::unique_ptr<PaddlePredictor> GetPrediction(AnalysisConfig config,
+                                               bool use_analysis = true) {
+  if (use_analysis) {
+    return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+        config);
+  } else {
+    return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+        config);
+  }
+}
+
+size_t GetSize(const PaddleTensor &out) {
+  return std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                         [](int a, int b) { return a * b; });
+}
+
+std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
+                                                   int *num_ops) {
+  auto predictor = GetPrediction(config);
+  AnalysisPredictor *analysis_predictor =
+      dynamic_cast<AnalysisPredictor *>(predictor.get());
+  auto &fuse_statis = analysis_predictor->analysis_argument()
+                          .Get<std::unordered_map<std::string, int>>(
+                              framework::ir::kFuseStatisAttr);
+  for (auto &item : fuse_statis) {
+    LOG(INFO) << "fused " << item.first << " " << item.second;
+  }
+  int num = 0;
+  for (auto &node :
+       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+    if (node->IsFunction()) {
+      ++num;
+    }
+  }
+  *num_ops = num;
+  return fuse_statis;
+}
+
 void TestOneThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
-    std::vector<PaddleTensor> *outputs) {
+    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
+  auto predictor = GetPrediction(config, use_analysis);
  Timer timer;
  timer.tic();
  for (int i = 0; i < num_times; i++) {
@@ -93,7 +131,8 @@ void TestOneThreadPrediction(

 void TestMultiThreadPrediction(
    AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
-    std::vector<PaddleTensor> *outputs, int num_threads) {
+    std::vector<PaddleTensor> *outputs, int num_threads,
+    bool use_analysis = true) {
  int batch_size = FLAGS_batch_size;
  int num_times = FLAGS_repeat;
  std::vector<std::thread> threads;
@@ -101,9 +140,7 @@ void TestMultiThreadPrediction(
  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
  // because AttentionLSTM's hard code nodeid will be damanged.
  for (int tid = 0; tid < num_threads; ++tid) {
-    predictors.emplace_back(
-        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-            config));
+    predictors.emplace_back(GetPrediction(config, use_analysis));
  }
  for (int tid = 0; tid < num_threads; ++tid) {
    threads.emplace_back([&, tid]() {
@@ -129,13 +166,25 @@ void TestMultiThreadPrediction(

 void TestPrediction(AnalysisConfig config,
                    const std::vector<std::vector<PaddleTensor>> inputs,
-                    std::vector<PaddleTensor> *outputs, int num_threads) {
+                    std::vector<PaddleTensor> *outputs, int num_threads,
+                    bool use_analysis = FLAGS_use_analysis) {
+  LOG(INFO) << "use_analysis: " << use_analysis;
  if (num_threads == 1) {
-    TestOneThreadPrediction(config, inputs, outputs);
+    TestOneThreadPrediction(config, inputs, outputs, use_analysis);
  } else {
-    TestMultiThreadPrediction(config, inputs, outputs, num_threads);
+    TestMultiThreadPrediction(config, inputs, outputs, num_threads,
+                              use_analysis);
  }
 }

+void CompareNativeAndAnalysis(
+    AnalysisConfig config,
+    const std::vector<std::vector<PaddleTensor>> inputs) {
+  std::vector<PaddleTensor> native_outputs, analysis_outputs;
+  TestOneThreadPrediction(config, inputs, &native_outputs, false);
+  TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
+  CompareResult(analysis_outputs, native_outputs);
+}
+
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/operators/adam_op.h
+++ b/paddle/fluid/operators/adam_op.h
@@ -174,12 +174,13 @@ struct SparseAdamFunctor {

  const int64_t* rows_;
  int64_t row_numel_;
+  int64_t row_count_;

  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
                    const T* beta2_pow, const T* mom1, T* mom1_out,
                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
                    const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel)
+                    int64_t row_numel, int64_t row_count)
      : beta1_(beta1),
        beta2_(beta2),
        epsilon_(epsilon),
@@ -194,28 +195,47 @@ struct SparseAdamFunctor {
        param_(param),
        param_out_(param_out),
        rows_(rows),
-        row_numel_(row_numel) {}
+        row_numel_(row_numel),
+        row_count_(row_count) {}
+
+  inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
+    int64_t beg = 0, end = row_count_ - 1;
+    while (beg <= end) {
+      auto mid = ((beg + end) >> 1);
+      if (rows_[mid] == row)
+        return mid;
+      else if (rows_[mid] < row)
+        beg = mid + 1;
+      else
+        end = mid - 1;
+    }
+    return -1;
+  }

  inline HOSTDEVICE void operator()(size_t i) const {
+    int64_t row = i / row_numel_;
+    auto row_idx = BinarySearchInRows(row);
+    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+
+    // The following code is the same as dense
+    T mom1 = moment1_[i];
+    T mom2 = moment2_[i];
+    T lr = *lr_;
    T beta1_pow = *beta1_pow_;
    T beta2_pow = *beta2_pow_;
-    for (int64_t j = 0; j < row_numel_; ++j) {
-      T g = grad_[i * row_numel_ + j];
-      T mom1 = moment1_[rows_[i] * row_numel_ + j];
-      T mom2 = moment2_[rows_[i] * row_numel_ + j];
-      T lr = *lr_;
-      T p = param_[rows_[i] * row_numel_ + j];
-
-      lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-
-      mom1 = beta1_ * mom1 + (1 - beta1_) * g;
-      mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-      p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
-
-      moment1_out_[rows_[i] * row_numel_ + j] = mom1;
-      moment2_out_[rows_[i] * row_numel_ + j] = mom2;
-      param_out_[rows_[i] * row_numel_ + j] = p;
-    }  // for col id
+    T p = param_[i];
+
+    // Calculation
+    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+
+    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
+    // Write back to global memory
+    moment1_out_[i] = mom1;
+    moment2_out_[i] = mom2;
+    param_out_[i] = p;
  }
 };

@@ -287,9 +307,14 @@ class AdamOpKernel : public framework::OpKernel<T> {
        return;
      }
      // merge duplicated rows if any.
+      // The rows of grad_merge have been sorted inside MergeAdd functor
      scatter::MergeAdd<DeviceContext, T> merge_func;
-      auto grad_merge =
-          merge_func(ctx.template device_context<DeviceContext>(), grad);
+      auto& grad_merge = *(ctx.scope()
+                               .NewScope()
+                               .Var("sparse_adam_grad_merge")
+                               ->GetMutable<framework::SelectedRows>());
+      merge_func(ctx.template device_context<DeviceContext>(), grad,
+                 &grad_merge);
      auto& grad_tensor = grad_merge.value();
      const T* grad_data = grad_tensor.template data<T>();
      int64_t* rows = nullptr;
@@ -314,10 +339,11 @@ class AdamOpKernel : public framework::OpKernel<T> {
          mom2.template data<T>(),
          mom2_out.template mutable_data<T>(ctx.GetPlace()),
          lr.template data<T>(), grad_data, param.template data<T>(),
-          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel);
+          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
+          grad_merge.rows().size());
      platform::ForRange<DeviceContext> for_range(
          static_cast<const DeviceContext&>(ctx.device_context()),
-          grad_merge.rows().size());
+          param.numel());
      for_range(functor);
    } else {
      PADDLE_THROW("Variable type not supported by adam_op");

--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/transform.h"

 namespace paddle {
@@ -61,14 +62,32 @@ class ClipKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& context) const override {
    auto max = context.Attr<T>("max");
    auto min = context.Attr<T>("min");
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    const T* x_data = x->data<T>();
-    int64_t numel = x->numel();
-    Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x_data,
-          x_data + numel, out_data, ClipFunctor<T>(min, max));
+    auto* x_var = context.InputVar("X");
+    if (x_var->IsType<framework::LoDTensor>()) {
+      auto* x = context.Input<framework::LoDTensor>("X");
+      auto* out = context.Output<framework::LoDTensor>("Out");
+      T* out_data = out->mutable_data<T>(context.GetPlace());
+      const T* x_data = x->data<T>();
+      int64_t numel = x->numel();
+      Transform<DeviceContext> trans;
+      trans(context.template device_context<DeviceContext>(), x_data,
+            x_data + numel, out_data, ClipFunctor<T>(min, max));
+    } else if (x_var->IsType<framework::SelectedRows>()) {
+      auto* x = context.Input<framework::SelectedRows>("X");
+      auto* out = context.Output<framework::SelectedRows>("Out");
+      PADDLE_ENFORCE_NE(x, out,
+                        "Inplace clip is not allowed when x is SelectedRows");
+      math::scatter::MergeAdd<DeviceContext, T> merge_func;
+      merge_func(context.template device_context<DeviceContext>(), *x, out);
+      auto* out_tensor = out->mutable_value();
+      auto* out_data = out_tensor->data<T>();
+      int64_t numel = out_tensor->numel();
+      Transform<DeviceContext> trans;
+      trans(context.template device_context<DeviceContext>(), out_data,
+            out_data + numel, out_data, ClipFunctor<T>(min, max));
+    } else {
+      PADDLE_THROW("ClipOp only supports LoDTensor and SelectedRows");
+    }
  }
 };

@@ -78,10 +97,12 @@ class ClipGradKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& context) const override {
    auto max = context.Attr<T>("max");
    auto min = context.Attr<T>("min");
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_out =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* d_x =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
    if (d_x != nullptr) {
-      auto* x = context.Input<Tensor>("X");
+      auto* x = context.Input<framework::LoDTensor>("X");
      int64_t numel = d_out->numel();
      auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
      const T* d_out_data = d_out->data<T>();

--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -31,5 +31,6 @@ polygon_box_transform_op.cu)
 detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
 detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
+detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
 #Export local libraries to parent
 set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+// Short aliases for the framework tensor types.
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+// NOTE(review): the CPU kernels in this file index 8 values per RoI (four
+// x/y corner pairs) directly; confirm where kROISize is meant to be used.
+static constexpr int kROISize = 4;
+
+// Tolerant "a >= b": true when a is greater than b, or when the two values
+// differ by less than 1e-4.
+template <typename T>
+bool GT_E(T a, T b) {
+  if (a > b) {
+    return true;
+  }
+  return fabs(a - b) < 1e-4;
+}
+
+// Tolerant "a <= b": true when a is less than b, or when the two values
+// differ by less than 1e-4.
+template <typename T>
+bool LT_E(T a, T b) {
+  if (a < b) {
+    return true;
+  }
+  return fabs(a - b) < 1e-4;
+}
+
+// Strict "a > b" with tolerance: true only when a exceeds b by more than 1e-4.
+template <typename T>
+bool GT(T a, T b) {
+  return 1e-4 < (a - b);
+}
+
+/*
+ * Check whether the point (x, y) lies inside, or on the boundary of, the
+ * quadrilateral whose corners are (roi_x[i], roi_y[i]) for i = 0..3.
+ *
+ * Edge i joins corner i to corner (i + 1) % 4. Every comparison is made with
+ * a 1e-4 tolerance, either directly through fabs() or through GT_E / LT_E /
+ * GT.
+ *
+ * Returns true when the point is on an edge or inside the quadrilateral.
+ */
+template <typename T>
+bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
+  // Pass 1: report points that lie on one of the four edges.
+  for (int i = 0; i < 4; i++) {
+    T xs = roi_x[i];
+    T ys = roi_y[i];
+    T xe = roi_x[(i + 1) % 4];
+    T ye = roi_y[(i + 1) % 4];
+    if (fabs(ys - ye) < 1e-4) {
+      // Horizontal edge: the point is on it when y matches the edge's y and
+      // x falls between the two end points.
+      if (fabs(y - ys) < 1e-4 && fabs(y - ye) < 1e-4 &&
+          GT_E<T>(x, std::min(xs, xe)) && LT_E<T>(x, std::max(xs, xe))) {
+        return true;
+      }
+    } else {
+      // Non-horizontal edge: x coordinate at which the edge's supporting line
+      // reaches height y. The point is on the edge when that coordinate
+      // equals x and y is within the edge's vertical extent.
+      T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
+      if (fabs(intersec_x - x) < 1e-4 && GT_E<T>(y, std::min(ys, ye)) &&
+          LT_E<T>(y, std::max(ys, ye))) {
+        return true;
+      }
+    }
+  }
+
+  // Pass 2: count the edges crossed by the horizontal ray that starts at
+  // (x, y) and runs towards larger x. An odd count means the point is inside.
+  int n_cross = 0;
+  for (int i = 0; i < 4; i++) {
+    T xs = roi_x[i];
+    T ys = roi_y[i];
+    T xe = roi_x[(i + 1) % 4];
+    T ye = roi_y[(i + 1) % 4];
+    // Horizontal edges never contribute a crossing.
+    if (fabs(ys - ye) < 1e-4) {
+      continue;
+    }
+    // Skip edges whose vertical extent does not contain y. The lower end is
+    // excluded and the upper end is included (presumably so that a corner
+    // shared by two edges is not counted twice -- confirm).
+    if (LT_E<T>(y, std::min(ys, ye)) || GT<T>(y, std::max(ys, ye))) {
+      continue;
+    }
+    T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
+    // The point sits exactly on this edge.
+    if (fabs(intersec_x - x) < 1e-4) {
+      return true;
+    }
+    // Only crossings to the right of the point are counted.
+    if (GT<T>(intersec_x, x)) {
+      n_cross++;
+    }
+  }
+  return (n_cross % 2 == 1);
+}
+
+/**
+ * Get the matrix of perspective transform.
+ *
+ * (x0, y0) .. (x3, y3) are the RoI corners roi_x[i], roi_y[i]; w and h are
+ * the normalized width and height computed inside the function.
+ *
+ * dx1 = x1 - x2
+ * dx2 = x3 - x2
+ * dx3 = x0 - x1 + x2 - x3
+ * dy1 = y1 - y2
+ * dy2 = y3 - y2
+ * dy3 = y0 - y1 + y2 - y3
+ *
+ * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1)
+ * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1)
+ * a13 = x0
+ * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1)
+ * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1)
+ * a23 = y0
+ * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
+ * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
+ * a33 = 1
+ *
+ * The result is stored row-major in `matrix`, which must hold 9 elements:
+ * matrix[0..2] = a11, a12, a13; matrix[3..5] = a21, a22, a23;
+ * matrix[6..8] = a31, a32, a33.
+ *
+ * NOTE(review): the divisions by estimated_height, (w - 1), (h - 1) and
+ * (dx1 * dy2 - dx2 * dy1) are unguarded; confirm that callers never pass a
+ * degenerate RoI or transformed_height == 1.
+ */
+template <typename T>
+void get_transform_matrix(const int transformed_width,
+                          const int transformed_height, T roi_x[], T roi_y[],
+                          T matrix[]) {
+  T x0 = roi_x[0];
+  T x1 = roi_x[1];
+  T x2 = roi_x[2];
+  T x3 = roi_x[3];
+  T y0 = roi_y[0];
+  T y1 = roi_y[1];
+  T y2 = roi_y[2];
+  T y3 = roi_y[3];
+
+  // Estimate the height and width of RoI
+  // len1..len4 are the lengths of edges 0-1, 1-2, 2-3 and 3-0; opposite edges
+  // are averaged.
+  T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
+  T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2));
+  T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3));
+  T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0));
+  T estimated_height = (len2 + len4) / 2.0;
+  T estimated_width = (len1 + len3) / 2.0;
+
+  // Get the normalized height and normalized width
+  // The height is the full output height; the width follows the RoI's
+  // estimated aspect ratio and is capped at transformed_width.
+  int normalized_height = transformed_height;
+  int normalized_width =
+      std::round(estimated_width * (normalized_height - 1) / estimated_height) +
+      1;
+  normalized_width = std::min(normalized_width, transformed_width);
+
+  T dx1 = x1 - x2;
+  T dx2 = x3 - x2;
+  T dx3 = x0 - x1 + x2 - x3;
+  T dy1 = y1 - y2;
+  T dy2 = y3 - y2;
+  T dy3 = y0 - y1 + y2 - y3;
+
+  // Third row (a31, a32, a33) first: the other two rows depend on it.
+  matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) /
+              (normalized_width - 1);
+  matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) /
+              (normalized_height - 1);
+  matrix[8] = 1;
+
+  // Second row (a21, a22, a23): produces the source y coordinate.
+  matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) /
+              (normalized_width - 1);
+  matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) /
+              (normalized_height - 1);
+  matrix[5] = y0;
+
+  // First row (a11, a12, a13): produces the source x coordinate.
+  matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) /
+              (normalized_width - 1);
+  matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) /
+              (normalized_height - 1);
+  matrix[2] = x0;
+}
+
+/**
+ * Map an output pixel back to its source position in the input feature map.
+ *
+ * (u, v, w)^T = matrix * (out_w, out_h, 1)^T
+ *
+ * in_w = u / w
+ * in_h = v / w
+ *
+ * `matrix` is a row-major 3x3 matrix (9 elements). The source x coordinate is
+ * written to *in_w and the source y coordinate to *in_h.
+ */
+template <typename T>
+void get_source_coords(T matrix[], int out_w, int out_h, T* in_w, T* in_h) {
+  // Homogeneous coordinates of the source point.
+  const T numer_x = matrix[0] * out_w + matrix[1] * out_h + matrix[2];
+  const T numer_y = matrix[3] * out_w + matrix[4] * out_h + matrix[5];
+  const T denom = matrix[6] * out_w + matrix[7] * out_h + matrix[8];
+
+  // Perspective divide.
+  *in_w = numer_x / denom;
+  *in_h = numer_y / denom;
+}
+
+/**
+ * Perform bilinear interpolation in the input feature map.
+ *
+ * in_data is indexed as a dense [*, channels, height, width] array; the plane
+ * of image in_n and channel in_c is sampled at the fractional position
+ * (in_w, in_h), where in_w is the x (column) and in_h the y (row) coordinate.
+ * Note that `width` precedes `height` in the parameter list.
+ *
+ * The interpolated value is written to val[0]. It is 0 when the position lies
+ * more than half a pixel (plus the 1e-4 tolerance of GT) outside the map.
+ */
+template <typename T>
+void bilinear_interpolate(const T* in_data, const int channels, const int width,
+                          const int height, int in_n, int in_c, T in_w, T in_h,
+                          T* val) {
+  // Deal with cases that source coords are out of feature map boundary
+  if (GT<T>(-0.5, in_w) || GT<T>(in_w, width - 0.5) || GT<T>(-0.5, in_h) ||
+      GT<T>(in_h, height - 0.5)) {
+    // empty
+    val[0] = 0.0;
+    return;
+  }
+
+  // Slightly negative coordinates (within the half-pixel margin) snap to 0.
+  if (GT<T>(0, in_w)) {
+    in_w = 0;
+  }
+  if (GT<T>(0, in_h)) {
+    in_h = 0;
+  }
+
+  int in_w_floor = floor(in_w);
+  int in_h_floor = floor(in_h);
+  int in_w_ceil;
+  int in_h_ceil;
+
+  // At the last column/row both neighbours collapse onto the border pixel
+  // and the coordinate is snapped to it, so no out-of-range element is read.
+  if (GT_E<T>(in_w_floor, width - 1)) {
+    in_w_ceil = in_w_floor = width - 1;
+    in_w = static_cast<T>(in_w_floor);
+  } else {
+    in_w_ceil = in_w_floor + 1;
+  }
+
+  if (GT_E<T>(in_h_floor, height - 1)) {
+    in_h_ceil = in_h_floor = height - 1;
+    in_h = static_cast<T>(in_h_floor);
+  } else {
+    in_h_ceil = in_h_floor + 1;
+  }
+  // w_floor / h_floor are the fractional offsets from the floor pixel;
+  // w_ceil / h_ceil are their complements.
+  T w_floor = in_w - in_w_floor;
+  T h_floor = in_h - in_h_floor;
+  T w_ceil = 1 - w_floor;
+  T h_ceil = 1 - h_floor;
+  // Start of the (in_n, in_c) plane.
+  const T* data = in_data + (in_n * channels + in_c) * height * width;
+  // Do bilinear interpolation
+  // v1..v4: the four neighbours (floor/floor, ceil-row/floor-col,
+  // ceil/ceil, floor-row/ceil-col); w1..w4: their matching weights.
+  T v1 = data[in_h_floor * width + in_w_floor];
+  T v2 = data[in_h_ceil * width + in_w_floor];
+  T v3 = data[in_h_ceil * width + in_w_ceil];
+  T v4 = data[in_h_floor * width + in_w_ceil];
+  T w1 = w_ceil * h_ceil;
+  T w2 = w_ceil * h_floor;
+  T w3 = w_floor * h_floor;
+  T w4 = w_floor * h_ceil;
+  val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+}
+
+/**
+ * CPU forward kernel of the ROI perspective transform op.
+ *
+ * Inputs:
+ *   X    - feature map; dims 1, 2 and 3 are read as channels, height, width.
+ *   ROIs - LoDTensor with 8 values per row, the (x, y) pairs of the four RoI
+ *          corners. The last LoD level assigns RoI rows to images.
+ * Attributes: transformed_height, transformed_width, spatial_scale (the
+ *   corner coordinates are multiplied by spatial_scale before use).
+ * Output:
+ *   Out  - written densely as
+ *          [rois_num, channels, transformed_height, transformed_width].
+ *
+ * Each output pixel is mapped back into the input through the RoI's
+ * perspective matrix. Pixels whose source point is outside the RoI
+ * quadrilateral or outside the feature map are set to 0; the others are
+ * bilinearly interpolated.
+ */
+template <typename T>
+class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+
+    auto transformed_height = ctx.Attr<int>("transformed_height");
+    auto transformed_width = ctx.Attr<int>("transformed_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    int channels = in_dims[1];
+    int in_height = in_dims[2];
+    int in_width = in_dims[3];
+    int rois_num = rois->dims()[0];
+
+    const T* input_data = in->data<T>();
+
+    // roi2image_data[j] is the index of the image that RoI row j belongs to,
+    // derived from the offsets of the last LoD level.
+    framework::Tensor roi2image;
+    roi2image.Resize({rois_num});
+    int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
+    auto lod = rois->lod().back();
+    // `i + 1 < lod.size()` rather than `i < lod.size() - 1`: the latter wraps
+    // around to a huge unsigned value when the LoD level is empty.
+    for (size_t i = 0; i + 1 < lod.size(); ++i) {
+      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
+        roi2image_data[j] = static_cast<int>(i);
+      }
+    }
+
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    const T* rois_data = rois->data<T>();
+
+    for (int n = 0; n < rois_num; ++n) {
+      // Corner coordinates of RoI n, scaled into feature-map coordinates.
+      const T* n_rois = rois_data + n * 8;
+      T roi_x[4];
+      T roi_y[4];
+      for (int k = 0; k < 4; ++k) {
+        roi_x[k] = n_rois[2 * k] * spatial_scale;
+        roi_y[k] = n_rois[2 * k + 1] * spatial_scale;
+      }
+      int image_id = roi2image_data[n];
+      // Get transform matrix
+      T transform_matrix[9];
+      get_transform_matrix<T>(transformed_width, transformed_height, roi_x,
+                              roi_y, transform_matrix);
+
+      for (int c = 0; c < channels; ++c) {
+        for (int out_h = 0; out_h < transformed_height; ++out_h) {
+          for (int out_w = 0; out_w < transformed_width; ++out_w) {
+            int out_index =
+                n * channels * transformed_height * transformed_width +
+                c * transformed_height * transformed_width +
+                out_h * transformed_width + out_w;
+            T in_w, in_h;
+            get_source_coords<T>(transform_matrix, out_w, out_h, &in_w, &in_h);
+            if (in_quad<T>(in_w, in_h, roi_x, roi_y)) {
+              if (GT<T>(-0.5, in_w) ||
+                  GT<T>(in_w, static_cast<T>(in_width - 0.5)) ||
+                  GT<T>(-0.5, in_h) ||
+                  GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
+                // Source point is inside the RoI but off the feature map.
+                output_data[out_index] = 0.0;
+              } else {
+                bilinear_interpolate(input_data, channels, in_width, in_height,
+                                     image_id, c, in_w, in_h,
+                                     output_data + out_index);
+              }
+            } else {
+              // Source point is outside the RoI quadrilateral.
+              output_data[out_index] = 0.0;
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/**
+ * Return the bilinear weight with which the input pixel (w, h) contributes
+ * to a sample taken at the fractional position (xs, ys) of a feature map of
+ * size width x height.
+ *
+ * The position is clamped in the same way as in the forward interpolation.
+ * The result is 0 when (xs, ys) is outside the map, or when (w, h) is not
+ * one of the (up to four) neighbouring pixels of the sample.
+ */
+template <typename T>
+T get_feature_gradient(T xs, T ys, int w, int h, const int width,
+                       const int height) {
+  // Samples more than half a pixel outside the map contribute nothing.
+  if (GT<T>(-0.5, xs) || GT<T>(xs, width - 0.5) || GT<T>(-0.5, ys) ||
+      GT<T>(ys, height - 0.5)) {
+    return 0;
+  }
+
+  // Slightly negative coordinates snap to 0.
+  if (GT<T>(0, xs)) {
+    xs = 0;
+  }
+  if (GT<T>(0, ys)) {
+    ys = 0;
+  }
+
+  int xs_floor = floor(xs);
+  int ys_floor = floor(ys);
+  int xs_ceil;
+  int ys_ceil;
+
+  // At the last column/row both neighbours collapse onto the border pixel.
+  if (GT_E(xs_floor, width - 1)) {
+    xs_ceil = xs_floor = width - 1;
+    xs = static_cast<T>(xs_floor);
+  } else {
+    xs_ceil = xs_floor + 1;
+  }
+
+  if (GT_E(ys_floor, height - 1)) {
+    ys_ceil = ys_floor = height - 1;
+    ys = static_cast<T>(ys_floor);
+  } else {
+    ys_ceil = ys_floor + 1;
+  }
+
+  // Pick the weight according to which neighbour (w, h) is; it stays 0 when
+  // (w, h) is none of them.
+  T weight = 0;
+  if (w == xs_floor) {
+    if (h == ys_floor) {
+      weight = (w + 1 - xs) * (h + 1 - ys);
+    } else if (h == ys_ceil) {
+      weight = (w + 1 - xs) * (ys + 1 - h);
+    }
+  } else if (w == xs_ceil) {
+    if (h == ys_floor) {
+      weight = (xs + 1 - w) * (h + 1 - ys);
+    } else if (h == ys_ceil) {
+      weight = (xs + 1 - w) * (ys + 1 - h);
+    }
+  }
+  return weight;
+}
+
+/**
+ * CPU backward kernel of the ROI perspective transform op.
+ *
+ * Inputs:
+ *   X        - feature map; dims 0..3 are read as batch, channels, height,
+ *              width.
+ *   ROIs     - LoDTensor with 8 values per row, the (x, y) pairs of the four
+ *              RoI corners. The last LoD level gives, for image n, the RoI
+ *              row range [lod[n], lod[n + 1]).
+ *   Out@GRAD - gradient of the output, indexed as
+ *              [roi, channel, transformed_height, transformed_width].
+ * Output:
+ *   X@GRAD   - gradient of X; every element is written.
+ *
+ * For each input pixel the kernel gathers, over all RoIs of the same image
+ * and all of their output pixels, the output gradient multiplied by the
+ * bilinear weight with which that input pixel was sampled.
+ *
+ * NOTE(review): lod[n + 1] is read for every n < batch_size, so the LoD level
+ * must hold at least batch_size + 1 offsets; confirm this is enforced
+ * upstream.
+ */
+template <typename T>
+class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    auto transformed_height = ctx.Attr<int>("transformed_height");
+    auto transformed_width = ctx.Attr<int>("transformed_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int channels = in_dims[1];
+    int in_height = in_dims[2];
+    int in_width = in_dims[3];
+
+    T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
+    const T* out_grad_data = out_grad->data<T>();
+    const T* rois_data = rois->data<T>();
+
+    // Offsets of the last LoD level. The RoI-to-image lookup table built by
+    // the forward kernel is not needed here, because the loops below walk the
+    // RoI ranges directly.
+    auto lod = rois->lod().back();
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < channels; ++c) {
+        for (int in_h = 0; in_h < in_height; ++in_h) {
+          for (int in_w = 0; in_w < in_width; ++in_w) {
+            T gradient = 0.0;
+            // RoI rows that belong to image n, converted once to int so the
+            // loop below does not compare signed with unsigned values.
+            const int roi_begin = static_cast<int>(lod[n]);
+            const int roi_end = static_cast<int>(lod[n + 1]);
+            for (int roi_idx = roi_begin; roi_idx < roi_end; ++roi_idx) {
+              // Corner coordinates scaled into feature-map coordinates.
+              const T* roi_coords = rois_data + roi_idx * 8;
+              T roi_x[4];
+              T roi_y[4];
+              for (int k = 0; k < 4; ++k) {
+                roi_x[k] = roi_coords[2 * k] * spatial_scale;
+                roi_y[k] = roi_coords[2 * k + 1] * spatial_scale;
+              }
+
+              // Get transform matrix
+              T matrix[9];
+              get_transform_matrix<T>(transformed_width, transformed_height,
+                                      roi_x, roi_y, matrix);
+              const T* out_grad_ptr = out_grad_data +
+                                      (roi_idx * channels + c) *
+                                          transformed_height *
+                                          transformed_width;
+              for (int out_h = 0; out_h < transformed_height; ++out_h) {
+                for (int out_w = 0; out_w < transformed_width; ++out_w) {
+                  T src_w;
+                  T src_h;
+                  get_source_coords<T>(matrix, out_w, out_h, &src_w, &src_h);
+                  if (in_quad<T>(src_w, src_h, roi_x, roi_y)) {
+                    // Output pixels sampled off the feature map were set to
+                    // 0 in the forward pass and carry no gradient.
+                    if (GT<T>(-0.5, src_w) ||
+                        GT<T>(src_w, static_cast<T>(in_width - 0.5)) ||
+                        GT<T>(-0.5, src_h) ||
+                        GT<T>(src_h, static_cast<T>(in_height - 0.5))) {
+                      continue;
+                    }
+                    T weight = get_feature_gradient<T>(src_w, src_h, in_w, in_h,
+                                                       in_width, in_height);
+                    gradient +=
+                        out_grad_ptr[out_h * transformed_width + out_w] *
+                        weight;
+                  }
+                }
+              }
+            }
+            int out_idx = (n * channels + c) * in_height * in_width +
+                          in_h * in_width + in_w;
+            in_grad_data[out_idx] = gradient;
+          }
+        }
+      }
+    }
+  }
+};
+
+// Forward operator definition for roi_perspective_transform.
+//
+// InferShape validates that X, ROIs and Out are present, that X is 4-D and
+// ROIs is (num_rois, 8), and that the attributes are positive. It then sets
+// the shape of Out to
+// (num_rois, channels, transformed_height, transformed_width).
+class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ROIPerspectiveTransformOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("ROIs"),
+        "Input(ROIs) of ROIPerspectiveTransformOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of ROIPerspectiveTransformOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+    auto rois_dims = ctx->GetInputDim("ROIs");
+
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of input tensor is NCHW.");
+    // NOTE: adjacent string literals are concatenated verbatim, so the
+    // separating space has to be written explicitly.
+    PADDLE_ENFORCE(rois_dims.size() == 2,
+                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 8) "
+                   "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...].");
+    PADDLE_ENFORCE(rois_dims[1] == 8,
+                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 8) "
+                   "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...].");
+
+    int transformed_height = ctx->Attrs().Get<int>("transformed_height");
+    int transformed_width = ctx->Attrs().Get<int>("transformed_width");
+    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+
+    PADDLE_ENFORCE_GT(transformed_height, 0,
+                      "The transformed output height must be greater than 0");
+    PADDLE_ENFORCE_GT(transformed_width, 0,
+                      "The transformed output width must be greater than 0");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must be greater than 0");
+    std::vector<int64_t> out_dims_v({rois_dims[0],   // num_rois
+                                     input_dims[1],  // channels
+                                     static_cast<int64_t>(transformed_height),
+                                     static_cast<int64_t>(transformed_width)});
+    auto out_dims = framework::make_ddim(out_dims_v);
+
+    ctx->SetOutputDim("Out", out_dims);
+  }
+
+ protected:
+  // The kernel is selected by the data type of X and the current device.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+// Backward operator definition for roi_perspective_transform: the gradient
+// of X takes the same dims as X itself.
+class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    const auto out_grad_name = framework::GradVarName("Out");
+    const auto x_grad_name = framework::GradVarName("X");
+
+    PADDLE_ENFORCE(ctx->HasInput(out_grad_name),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(x_grad_name),
+                   "The gradient of X should not be null.");
+
+    // Propagate the dims of X to its gradient.
+    ctx->SetOutputsDim(x_grad_name, ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  // The kernel is selected by the data type of X and the current device.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    const auto* x = ctx.Input<framework::Tensor>("X");
+    const auto data_type = framework::ToDataType(x->type());
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+// Declares the inputs, output, attributes and user-facing documentation of
+// the roi_perspective_transform operator.
+class ROIPerspectiveTransformOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor), "
+             "the input of ROIPerspectiveTransformOp. "
+             "The format of input tensor is NCHW. Where N is batch size, "
+             "C is the number of input channels, "
+             "H is the height of the feature, and "
+             "W is the width of the feature.");
+    // NOTE: adjacent string literals are concatenated verbatim, so every
+    // fragment carries its own trailing space.
+    AddInput("ROIs",
+             "(LoDTensor), "
+             "ROIs (Regions of Interest) to be transformed. "
+             "should be a 2-D LoDTensor of shape (num_rois, 8) "
+             "given as [[x1, y1, x2, y2, x3, y3, x4, y4], ...]. "
+             "(x1, y1) is the top left coordinates, and "
+             "(x2, y2) is the top right coordinates, and "
+             "(x3, y3) is the bottom right coordinates, and "
+             "(x4, y4) is the bottom left coordinates.");
+    AddOutput(
+        "Out",
+        "(Tensor), "
+        "The output of ROIPerspectiveTransformOp is a 4-D tensor with shape "
+        "(num_rois, channels, transformed_h, transformed_w).");
+    AddAttr<float>("spatial_scale",
+                   "(float, default 1.0), "
+                   "Spatial scale factor to scale ROI coords.")
+        .SetDefault(1.0);
+    AddAttr<int>("transformed_height",
+                 "(int, default 1), "
+                 "The height of transformed output.")
+        .SetDefault(1);
+    AddAttr<int>("transformed_width",
+                 "(int, default 1), "
+                 "The width of transformed output.")
+        .SetDefault(1);
+    AddComment(R"DOC(
+**ROIPerspectiveTransform Operator**
+
+Applies a perspective transform to every ROI of the input feature map.
+Each ROI is a quadrilateral described by its four corner points. The ROI
+coordinates are first multiplied by spatial_scale, then the quadrilateral is
+mapped onto an output patch of size (transformed_height, transformed_width).
+
+Output values are sampled from the input with bilinear interpolation. Output
+positions whose source coordinates fall outside the ROI or outside the input
+feature map are set to 0.
+
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Register the forward operator together with its proto maker and the
+// default grad-op desc maker, then the backward operator.
+REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp,
+                  ops::ROIPerspectiveTransformOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(roi_perspective_transform_grad,
+                  ops::ROIPerspectiveTransformGradOp);
+// CPU kernels are only registered for float.
+REGISTER_OP_CPU_KERNEL(roi_perspective_transform,
+                       ops::CPUROIPerspectiveTransformOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(roi_perspective_transform_grad,
+                       ops::CPUROIPerspectiveTransformGradOpKernel<float>);
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+// CUDA: index helpers
+// Decompose a flat row-major index over a (d1, d2, d3, d4) tensor into its
+// four coordinates; idx4_4 is the innermost (fastest varying) one. Every
+// macro argument is parenthesized so that expression arguments such as
+// `i + 1` or `h * w` expand with the intended precedence.
+#define idx4_4(index, d1, d2, d3, d4) ((index) % (d4))
+#define idx4_3(index, d1, d2, d3, d4) (((index) / (d4)) % (d3))
+#define idx4_2(index, d1, d2, d3, d4) (((index) / (d4) / (d3)) % (d2))
+#define idx4_1(index, d1, d2, d3, d4) (((index) / (d4) / (d3) / (d2)) % (d1))
+
+// Grid-stride loop: each thread starts at its global thread id and advances
+// by the total number of launched threads until `n` elements are covered.
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Tolerant "greater than or equal": true when a > b, or when a and b differ
+// by less than 1e-4.
+template <typename T>
+__device__ bool GT_E(T a, T b) {
+  if (a > b) {
+    return true;
+  }
+  return fabs(a - b) < 1e-4;
+}
+
+// Tolerant "less than or equal": true when a < b, or when a and b differ by
+// less than 1e-4.
+template <typename T>
+__device__ bool LT_E(T a, T b) {
+  if (a < b) {
+    return true;
+  }
+  return fabs(a - b) < 1e-4;
+}
+
+// Strict "greater than" with tolerance: true only when a exceeds b by more
+// than 1e-4.
+template <typename T>
+__device__ bool GT(T a, T b) {
+  const T diff = a - b;
+  return diff > 1e-4;
+}
+
+// Returns the larger of a and b (b when they compare equal).
+template <typename T>
+__device__ T max(T a, T b) {
+  if (a > b) {
+    return a;
+  }
+  return b;
+}
+
+// Returns the smaller of a and b (b when they compare equal).
+template <typename T>
+__device__ T min(T a, T b) {
+  if (a < b) {
+    return a;
+  }
+  return b;
+}
+
+/*
+* Check whether the point (x, y) lies inside, or on the boundary of, the
+* quadrilateral whose four corners are (roi_x[i], roi_y[i]), i = 0..3.
+* Consecutive corners (with wrap-around) form the edges.
+*
+* All comparisons use a tolerance of 1e-4.
+*/
+template <typename T>
+__device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
+  // Pass 1: report the point as inside if it lies on any of the four edges.
+  for (int i = 0; i < 4; i++) {
+    T start_w = roi_x[i];
+    T start_h = roi_y[i];
+    T end_w = roi_x[(i + 1) % 4];
+    T end_h = roi_y[(i + 1) % 4];
+    if (fabs(start_h - end_h) < 1e-4) {
+      // Horizontal edge: the point must share its y and fall within its
+      // x-range.
+      if (fabs(y - start_h) < 1e-4 && fabs(y - end_h) < 1e-4 &&
+          GT_E<T>(x, min<T>(start_w, end_w)) &&
+          LT_E<T>(x, max<T>(start_w, end_w))) {
+        return true;
+      }
+    } else {
+      // Non-horizontal edge: x-coordinate of the edge's line at height y.
+      T intersec_x =
+          (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w;
+      if (fabs(intersec_x - x) < 1e-4 && GT_E(y, min<T>(start_h, end_h)) &&
+          LT_E<T>(y, max<T>(start_h, end_h))) {
+        return true;
+      }
+    }
+  }
+
+  // Pass 2: count the edges crossed by a horizontal ray going from (x, y)
+  // towards +x; an odd number of crossings means the point is inside.
+  int n_cross = 0;
+  for (int i = 0; i < 4; i++) {
+    T start_w = roi_x[i];
+    T start_h = roi_y[i];
+    T end_w = roi_x[(i + 1) % 4];
+    T end_h = roi_y[(i + 1) % 4];
+    // Horizontal edges are never counted as crossings.
+    if (fabs(start_h - end_h) < 1e-4) {
+      continue;
+    }
+    // Only edges whose y-range (min, max] contains y can be crossed; the
+    // lower end is excluded and the upper end is included.
+    if (LT_E<T>(y, min<T>(start_h, end_h)) ||
+        GT<T>(y, max<T>(start_h, end_h))) {
+      continue;
+    }
+    T intersec_x =
+        (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w;
+    // The point lies on this edge.
+    if (fabs(intersec_x - x) < 1e-4) {
+      return true;
+    }
+    // The edge is crossed to the right of the point.
+    if (GT<T>(intersec_x, x)) {
+      n_cross++;
+    }
+  }
+  return (n_cross % 2 == 1);
+}
+
+/**
+ * Perform bilinear interpolation in the input feature map.
+ *
+ * Samples plane (in_n, in_c) of in_data, addressed as a contiguous
+ * (n, channels, height, width) array, at the fractional position
+ * (in_w, in_h) and writes the result to val[0]. Positions more than half a
+ * pixel outside the feature map produce 0.
+ */
+template <typename T>
+__device__ void bilinear_interpolate(const T* in_data, const int channels,
+                                     const int width, const int height,
+                                     int in_n, int in_c, T in_w, T in_h,
+                                     T* val) {
+  // Deal with cases that source coords are out of feature map boundary
+  if (GT<T>(-0.5, in_w) || GT<T>(in_w, width - 0.5) || GT<T>(-0.5, in_h) ||
+      GT<T>(in_h, height - 0.5)) {
+    val[0] = 0.0;
+    return;
+  }
+
+  // Coordinates slightly below 0 (but within the half-pixel margin) are
+  // clamped to 0.
+  if (GT<T>(0, in_w)) {
+    in_w = 0;
+  }
+  if (GT<T>(0, in_h)) {
+    in_h = 0;
+  }
+
+  int in_w_floor = floor(in_w);
+  int in_h_floor = floor(in_h);
+  int in_w_ceil;
+  int in_h_ceil;
+
+  // At the last column/row both neighbours collapse onto the border pixel
+  // and the coordinate is snapped to it, so the fractional part becomes 0.
+  if (GT_E<T>(in_w_floor, width - 1)) {
+    in_w_ceil = in_w_floor = width - 1;
+    in_w = static_cast<T>(in_w_floor);
+  } else {
+    in_w_ceil = in_w_floor + 1;
+  }
+
+  if (GT_E<T>(in_h_floor, height - 1)) {
+    in_h_ceil = in_h_floor = height - 1;
+    in_h = static_cast<T>(in_h_floor);
+  } else {
+    in_h_ceil = in_h_floor + 1;
+  }
+
+  // w_floor / h_floor are the fractional offsets from the floor pixel;
+  // w_ceil / h_ceil are their complements.
+  T w_floor = in_w - in_w_floor;
+  T h_floor = in_h - in_h_floor;
+  T w_ceil = 1 - w_floor;
+  T h_ceil = 1 - h_floor;
+  // Start of the (in_n, in_c) plane.
+  const T* data = in_data + (in_n * channels + in_c) * height * width;
+  // Do bilinear interpolation
+  // v1..v4 are the four neighbours; each is weighted by the offsets to the
+  // opposite corner.
+  T v1 = data[in_h_floor * width + in_w_floor];
+  T v2 = data[in_h_ceil * width + in_w_floor];
+  T v3 = data[in_h_ceil * width + in_w_ceil];
+  T v4 = data[in_h_floor * width + in_w_ceil];
+  T w1 = w_ceil * h_ceil;
+  T w2 = w_ceil * h_floor;
+  T w3 = w_floor * h_floor;
+  T w4 = w_floor * h_ceil;
+  val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
+}
+
+/**
+ * Map an output position to its source position in the input feature map.
+ *
+ * With the row-major 3x3 transform `matrix`:
+ *
+ *   (u, v, w)^T = matrix * (out_w, out_h, 1)^T
+ *
+ *   in_w = u / w
+ *   in_h = v / w
+ *
+ * The results are written through the in_w and in_h pointers.
+ */
+template <typename T>
+__device__ void get_source_coords(T matrix[], int out_w, int out_h, T* in_w,
+                                  T* in_h) {
+  // Homogeneous coordinates of the source point.
+  const T numer_x = matrix[0] * out_w + matrix[1] * out_h + matrix[2];
+  const T numer_y = matrix[3] * out_w + matrix[4] * out_h + matrix[5];
+  const T denom = matrix[6] * out_w + matrix[7] * out_h + matrix[8];
+
+  // Perspective divide.
+  *in_w = numer_x / denom;
+  *in_h = numer_y / denom;
+}
+
+/**
+ * Get the matrix of perspective transform.
+ *
+ * The 3x3 matrix (row-major, written to matrix[0..8]) maps a position in the
+ * normalized output patch to a position inside the quadrilateral
+ * (roi_x[i], roi_y[i]), i = 0..3. Below, w and h denote the normalized width
+ * and height of the patch.
+ *
+ * dx1 = x1 - x2
+ * dx2 = x3 - x2
+ * dx3 = x0 - x1 + x2 - x3
+ * dy1 = y1 - y2
+ * dy2 = y3 - y2
+ * dy3 = y0 - y1 + y2 - y3
+ *
+ * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1)
+ * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1)
+ * a13 = x0
+ * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1)
+ * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1)
+ * a23 = y0
+ * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
+ * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
+ * a33 = 1
+ *
+ * Because every coefficient divides by (w - 1) or (h - 1), both normalized
+ * sizes are clamped to at least 2.
+ */
+template <typename T>
+__device__ void get_transform_matrix(const int transformed_width,
+                                     const int transformed_height, T roi_x[],
+                                     T roi_y[], T matrix[]) {
+  T x0 = roi_x[0];
+  T x1 = roi_x[1];
+  T x2 = roi_x[2];
+  T x3 = roi_x[3];
+  T y0 = roi_y[0];
+  T y1 = roi_y[1];
+  T y2 = roi_y[2];
+  T y3 = roi_y[3];
+
+  // Estimate the height and width of RoI as the mean length of opposite
+  // edges.
+  T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
+  T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2));
+  T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3));
+  T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0));
+  T estimated_height = (len2 + len4) / 2.0;
+  T estimated_width = (len1 + len3) / 2.0;
+
+  // Get the normalized height and normalized width. The width keeps the
+  // aspect ratio of the RoI but never exceeds transformed_width. Both values
+  // are clamped to >= 2: a value of 1 (e.g. transformed_height == 1) would
+  // make the divisions by (normalized_* - 1) below divide by zero.
+  int normalized_height = max<int>(2, transformed_height);
+  int normalized_width =
+      round(estimated_width * (normalized_height - 1) / estimated_height) + 1;
+  normalized_width =
+      max<int>(2, min<int>(normalized_width, transformed_width));
+
+  T dx1 = x1 - x2;
+  T dx2 = x3 - x2;
+  T dx3 = x0 - x1 + x2 - x3;
+  T dy1 = y1 - y2;
+  T dy2 = y3 - y2;
+  T dy3 = y0 - y1 + y2 - y3;
+
+  // Third row (a31, a32, a33) first: the other rows depend on it.
+  matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) /
+              (normalized_width - 1);
+  matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) /
+              (normalized_height - 1);
+  matrix[8] = 1;
+
+  // Second row (a21, a22, a23): produces the source y coordinate.
+  matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) /
+              (normalized_width - 1);
+  matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) /
+              (normalized_height - 1);
+  matrix[5] = y0;
+
+  // First row (a11, a12, a13): produces the source x coordinate.
+  matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) /
+              (normalized_width - 1);
+  matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) /
+              (normalized_height - 1);
+  matrix[2] = x0;
+}
+
+// Forward kernel: computes one element of the transformed output per loop
+// iteration.
+//
+// input_data      input feature map, indexed as (n, channels, in_height,
+//                 in_width)
+// rois_data       8 values per RoI: x0, y0, x1, y1, x2, y2, x3, y3
+// roi2image_data  for every RoI, the index of the image it belongs to
+// output_data     output, indexed as (num_rois, channels,
+//                 transformed_height, transformed_width)
+//
+// The data pointers use the template type T (instead of a hard-coded float)
+// so that the kernel stays consistent with its other T-typed arguments.
+template <typename T>
+__global__ void RoiTransformKernel(const T* input_data, const T* rois_data,
+                                   const int* roi2image_data, int num_rois,
+                                   int in_height, int in_width, int channels,
+                                   int transformed_height,
+                                   int transformed_width, float spatial_scale,
+                                   T* output_data) {
+  int output_size =
+      num_rois * transformed_height * transformed_width * channels;
+
+  CUDA_1D_KERNEL_LOOP(index, output_size) {
+    // (n, c, out_h, out_w) is an element in the transformed output
+    int out_w = idx4_4(index, num_rois, channels, transformed_height,
+                       transformed_width);
+    int out_h = idx4_3(index, num_rois, channels, transformed_height,
+                       transformed_width);
+    int c = idx4_2(index, num_rois, channels, transformed_height,
+                   transformed_width);
+    int n = idx4_1(index, num_rois, channels, transformed_height,
+                   transformed_width);
+
+    // Corner points of RoI n, scaled into feature-map coordinates.
+    auto bottom_rois = rois_data + n * 8;
+    T roi_x[4];
+    T roi_y[4];
+    for (int k = 0; k < 4; ++k) {
+      roi_x[k] = bottom_rois[2 * k] * spatial_scale;
+      roi_y[k] = bottom_rois[2 * k + 1] * spatial_scale;
+    }
+
+    // Get transform matrix
+    T matrix[9];
+    get_transform_matrix<T>(transformed_width, transformed_height, roi_x, roi_y,
+                            matrix);
+
+    // Get source coords
+    T in_w;
+    T in_h;
+    get_source_coords<T>(matrix, out_w, out_h, &in_w, &in_h);
+
+    if (in_quad<T>(in_w, in_h, roi_x, roi_y)) {
+      if (GT<T>(-0.5, in_w) || GT<T>(in_w, static_cast<T>(in_width - 0.5)) ||
+          GT<T>(-0.5, in_h) || GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
+        // Skip if source coords is not in input image
+        output_data[index] = 0.0;
+      } else {
+        // Perform bilinear interpolation on the image this RoI belongs to
+        int in_n = roi2image_data[n];
+        bilinear_interpolate<T>(input_data, channels, in_width, in_height, in_n,
+                                c, in_w, in_h, output_data + index);
+      }
+
+    } else {
+      // Skip if source coords is not in quad
+      output_data[index] = 0.0;
+    }
+  }
+}
+
+// CUDA forward kernel wrapper of roi_perspective_transform.
+//
+// Builds the RoI -> image index table on the host from the last LoD level of
+// ROIs, copies it to the device and launches RoiTransformKernel with one
+// thread per output element (512 threads per block).
+template <typename T>
+class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+
+    auto transformed_height = ctx.Attr<int>("transformed_height");
+    auto transformed_width = ctx.Attr<int>("transformed_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    // X is NCHW; the batch size is not needed by the forward kernel.
+    auto in_dims = in->dims();
+    int channels = in_dims[1];
+    int in_height = in_dims[2];
+    int in_width = in_dims[3];
+    int rois_num = rois->dims()[0];
+
+    // Calling back() on an empty LoD would be undefined behaviour.
+    PADDLE_ENFORCE(!rois->lod().empty(),
+                   "Input(ROIs) of ROIPerspectiveTransformOp should carry "
+                   "LoD information.");
+
+    const T* input_data = in->data<T>();
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    const T* rois_data = rois->data<T>();
+
+    framework::Tensor roi2image;
+    framework::Tensor roi2image_dev;
+    roi2image.Resize({rois_num});
+    int* roi2image_data = roi2image.mutable_data<int>(platform::CPUPlace());
+    // RoIs [lod[i], lod[i + 1]) belong to image i. `i + 1 < size()` (rather
+    // than `i < size() - 1`) stays correct for an empty offset vector, where
+    // the unsigned subtraction would wrap around.
+    auto lod = rois->lod().back();
+    for (size_t i = 0; i + 1 < lod.size(); ++i) {
+      for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
+        roi2image_data[j] = static_cast<int>(i);
+      }
+    }
+    TensorCopySync(roi2image, ctx.GetPlace(), &roi2image_dev);
+
+    int out_size = rois_num * transformed_height * transformed_width * channels;
+    auto stream = ctx.cuda_device_context().stream();
+    int block = 512;
+    int grid = (out_size + block - 1) / block;
+
+    RoiTransformKernel<T><<<grid, block, 0, stream>>>(
+        input_data, rois_data, roi2image_dev.data<int>(), rois_num, in_height,
+        in_width, channels, transformed_height, transformed_width,
+        spatial_scale, output_data);
+  }
+};
+
+// Returns the bilinear-interpolation weight with which the input pixel
+// (w, h) contributes to a sample taken at the fractional position (xs, ys)
+// of a width x height feature map.
+//
+// The result is 0 when the sample lies more than half a pixel outside the
+// feature map, or when (w, h) is not one of the (up to) four neighbours of
+// the sample. The clamping mirrors bilinear_interpolate.
+template <typename T>
+__device__ T get_feature_gradient(T xs, T ys, int w, int h, const int width,
+                                  const int height) {
+  if (GT<T>(-0.5, xs) || GT<T>(xs, width - 0.5) || GT<T>(-0.5, ys) ||
+      GT<T>(ys, height - 0.5)) {
+    return 0;
+  }
+
+  // Coordinates slightly below 0 are clamped to 0.
+  if (GT<T>(0, xs)) {
+    xs = 0;
+  }
+  if (GT<T>(0, ys)) {
+    ys = 0;
+  }
+
+  int xs_floor = floor(xs);
+  int ys_floor = floor(ys);
+  int xs_ceil;
+  int ys_ceil;
+
+  // At the last column/row both neighbours collapse onto the border pixel.
+  if (GT_E<T>(xs_floor, width - 1)) {
+    xs_ceil = xs_floor = width - 1;
+    xs = static_cast<T>(xs_floor);
+  } else {
+    xs_ceil = xs_floor + 1;
+  }
+
+  // Explicit <T>, like the x branch above, so that both axes go through the
+  // same GT_E instantiation instead of one deduced from the int arguments.
+  if (GT_E<T>(ys_floor, height - 1)) {
+    ys_ceil = ys_floor = height - 1;
+    ys = static_cast<T>(ys_floor);
+  } else {
+    ys_ceil = ys_floor + 1;
+  }
+
+  // Pick the weight of the neighbour that coincides with (w, h), if any.
+  T weight = 0;
+  if (w == xs_floor) {
+    if (h == ys_floor) {
+      weight = (w + 1 - xs) * (h + 1 - ys);
+    } else if (h == ys_ceil) {
+      weight = (w + 1 - xs) * (ys + 1 - h);
+    }
+  } else if (w == xs_ceil) {
+    if (h == ys_floor) {
+      weight = (xs + 1 - w) * (h + 1 - ys);
+    } else if (h == ys_ceil) {
+      weight = (xs + 1 - w) * (ys + 1 - h);
+    }
+  }
+  return weight;
+}
+
+// Backward kernel: computes one element of the input gradient per loop
+// iteration by gathering contributions from every output element of every
+// RoI that belongs to the same image.
+//
+// lod holds the RoI offsets per image: RoIs [lod[n], lod[n + 1]) belong to
+// image n. rois_data stores 8 values (x0, y0, ..., x3, y3) per RoI.
+template <typename T>
+__global__ void RoiTransformGradKernel(
+    const size_t* lod, const T* rois_data, int batch_size, int num_rois,
+    int in_height, int in_width, int channels, int transformed_height,
+    int transformed_width, float spatial_scale, const T* out_grad_data,
+    T* in_grad_data) {
+  int input_size = batch_size * in_height * in_width * channels;
+
+  CUDA_1D_KERNEL_LOOP(index, input_size) {
+    // (n, c, h, w) coords in input
+    int in_w = idx4_4(index, batch_size, channels, in_height, in_width);
+    int in_h = idx4_3(index, batch_size, channels, in_height, in_width);
+    int c = idx4_2(index, batch_size, channels, in_height, in_width);
+    int n = idx4_1(index, batch_size, channels, in_height, in_width);
+
+    T gradient = 0.0;
+    // Accumulate gradient over all RoIs that interpolated this element
+    for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) {
+      // Corner points of this RoI, scaled into feature-map coordinates.
+      const T* roi_coords = rois_data + roi_idx * 8;
+      T roi_x[4];
+      T roi_y[4];
+      for (int k = 0; k < 4; ++k) {
+        roi_x[k] = roi_coords[2 * k] * spatial_scale;
+        roi_y[k] = roi_coords[2 * k + 1] * spatial_scale;
+      }
+
+      // Get transform matrix
+      T matrix[9];
+      get_transform_matrix<T>(transformed_width, transformed_height, roi_x,
+                              roi_y, matrix);
+
+      // Output gradient plane of (roi_idx, c).
+      const T* out_grad_ptr =
+          out_grad_data +
+          (roi_idx * channels + c) * transformed_height * transformed_width;
+      for (int out_h = 0; out_h < transformed_height; ++out_h) {
+        for (int out_w = 0; out_w < transformed_width; ++out_w) {
+          T src_w;
+          T src_h;
+          get_source_coords<T>(matrix, out_w, out_h, &src_w, &src_h);
+
+          // Output elements sampled outside the quad contribute nothing.
+          if (!in_quad<T>(src_w, src_h, roi_x, roi_y)) {
+            continue;
+          }
+          // Neither do elements sampled outside the input feature map.
+          const bool outside_input =
+              GT<T>(-0.5, src_w) ||
+              GT<T>(src_w, static_cast<T>(in_width - 0.5)) ||
+              GT<T>(-0.5, src_h) ||
+              GT<T>(src_h, static_cast<T>(in_height - 0.5));
+          if (outside_input) {
+            continue;
+          }
+
+          T weight = get_feature_gradient<T>(src_w, src_h, in_w, in_h,
+                                             in_width, in_height);
+          gradient +=
+              out_grad_ptr[out_h * transformed_width + out_w] * weight;
+        }
+      }
+    }
+    in_grad_data[index] = gradient;
+  }
+}
+
+// CUDA backward kernel wrapper of roi_perspective_transform.
+//
+// Launches RoiTransformGradKernel with one thread per element of the
+// gradient of X (512 threads per block).
+template <typename T>
+class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    auto transformed_height = ctx.Attr<int>("transformed_height");
+    auto transformed_width = ctx.Attr<int>("transformed_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    // X is read as (batch, channels, height, width).
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int channels = in_dims[1];
+    int in_height = in_dims[2];
+    int in_width = in_dims[3];
+    // Passed to the kernel as num_rois, which the kernel does not read.
+    int rois_num = rois->dims()[0];
+
+    T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
+    const T* out_grad_data = out_grad->data<T>();
+    const T* rois_data = rois->data<T>();
+
+    // Last LoD level of ROIs; the kernel indexes it per image as
+    // lod[n] .. lod[n + 1].
+    // NOTE(review): presumably CUDAData returns a device-side pointer to the
+    // offsets; `lod` is a local copy and the launch below is asynchronous, so
+    // confirm the pointer stays valid until the kernel has finished.
+    auto lod = rois->lod().back();
+    auto lod_data = lod.CUDAData(ctx.GetPlace());
+
+    int in_size = in->numel();
+    auto stream = ctx.cuda_device_context().stream();
+    int block = 512;
+    int grid = (in_size + block - 1) / block;
+
+    RoiTransformGradKernel<T><<<grid, block, 0, stream>>>(
+        lod_data, rois_data, batch_size, rois_num, in_height, in_width,
+        channels, transformed_height, transformed_width, spatial_scale,
+        out_grad_data, in_grad_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// CUDA kernels for the forward and backward operators; only float is
+// registered.
+REGISTER_OP_CUDA_KERNEL(roi_perspective_transform,
+                        ops::CUDAROIPerspectiveTransformOpKernel<float>);
+REGISTER_OP_CUDA_KERNEL(roi_perspective_transform_grad,
+                        ops::CUDAROIPerspectiveTransformGradOpKernel<float>);
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
    int class_num = ctx.Attr<int>("class_num");

-    auto& label_lod = in_label->lod();
-    auto& detect_lod = in_detect->lod();
+    auto label_lod = in_label->lod();
+    auto detect_lod = in_detect->lod();
    PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
                      "Only support one level sequence now.");
    PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto labels = framework::EigenTensor<T, 2>::From(input_label);
    auto detect = framework::EigenTensor<T, 2>::From(input_detect);

-    auto& label_lod = input_label.lod();
-    auto& detect_lod = input_detect.lod();
+    auto label_lod = input_label.lod();
+    auto detect_lod = input_detect.lod();

    int batch_size = label_lod[0].size() - 1;
-    auto& label_index = label_lod[0];
+    auto label_index = label_lod[0];

    for (int n = 0; n < batch_size; ++n) {
      std::map<int, std::vector<Box>> boxes;
@@ -274,6 +274,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {

    output_true_pos->set_lod(true_pos_lod);
    output_false_pos->set_lod(false_pos_lod);
+    return;
  }

  void GetInputPos(const framework::Tensor& input_pos_count,
@@ -291,7 +292,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
    auto SetData = [](const framework::LoDTensor& pos_tensor,
                      std::map<int, std::vector<std::pair<T, int>>>& pos) {
      const T* pos_data = pos_tensor.data<T>();
-      auto& pos_data_lod = pos_tensor.lod()[0];
+      auto pos_data_lod = pos_tensor.lod()[0];
      for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
        for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
          T score = pos_data[j * 2];
@@ -316,23 +317,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
      std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
    int batch_size = gt_boxes.size();
    for (int n = 0; n < batch_size; ++n) {
-      auto& image_gt_boxes = gt_boxes[n];
-      for (auto& image_gt_box : image_gt_boxes) {
+      auto image_gt_boxes = gt_boxes[n];
+      for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
        size_t count = 0;
-        auto& labeled_bboxes = image_gt_box.second;
+        auto labeled_bboxes = it->second;
        if (evaluate_difficult) {
          count = labeled_bboxes.size();
        } else {
-          for (auto& box : labeled_bboxes) {
-            if (!box.is_difficult) {
-              ++count;
-            }
-          }
+          for (size_t i = 0; i < labeled_bboxes.size(); ++i)
+            if (!(labeled_bboxes[i].is_difficult)) ++count;
        }
        if (count == 0) {
          continue;
        }
-        int label = image_gt_box.first;
+        int label = it->first;
        if (label_pos_count->find(label) == label_pos_count->end()) {
          (*label_pos_count)[label] = count;
        } else {

--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -92,9 +92,14 @@ bool VariableResponse::CopyLodTensorData(
    ::google::protobuf::io::CodedInputStream* input,
    const platform::DeviceContext& ctx, const framework::DDim& dims,
    int length) {
+  auto server_var = GetVar();
+  if (!server_var) {
+    LOG(ERROR) << "recved var should not on current server: "
+               << meta_.varname();
+    return false;
+  }
  auto* tensor = GetVar()->GetMutable<framework::LoDTensor>();
  tensor->Resize(dims);
-
  framework::LoD lod;
  for (int i = 0; i < meta_.lod_level(); ++i) {
    framework::Vector<size_t> v;
@@ -107,7 +112,6 @@ bool VariableResponse::CopyLodTensorData(

  void* tensor_data =
      tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
-
  if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
    return false;
  }

--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
--- a/paddle/fluid/operators/extract_rows_op.cc
+++ b/paddle/fluid/operators/extract_rows_op.cc
@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
    auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
    auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();

-    auto &in_rows = in.rows();
+    auto in_rows = in.rows();
    auto out_dim = framework::make_ddim(
        std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
    auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());

--- a/paddle/fluid/operators/fused_elemwise_activation_op.cc
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.cc
--- a/paddle/fluid/operators/fused_elemwise_activation_op.h
+++ b/paddle/fluid/operators/fused_elemwise_activation_op.h
--- a/paddle/fluid/operators/math/compound_functors.h
+++ b/paddle/fluid/operators/math/compound_functors.h
--- a/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/fluid/operators/math/detail/gru_cpu_kernel.h
--- a/paddle/fluid/operators/math/functors.h
+++ b/paddle/fluid/operators/math/functors.h
@@ -58,9 +58,9 @@ template <typename T>
 struct ScaleGradFunctor {
  explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {}

-  inline HOSTDEVICE T operator()(T x) { return coeff_; }
-
-  inline HOSTDEVICE T operator()(T x, T out) { return coeff_; }
+  inline HOSTDEVICE T UseX(T x) { return coeff_; }
+  inline HOSTDEVICE T UseOut(T out) { return coeff_; }
+  inline HOSTDEVICE T UseXAndOut(T x, T out) { return coeff_; }

 private:
  T coeff_;
@@ -73,9 +73,9 @@ struct ReluFunctor {

 template <typename T>
 struct ReluGradFunctor {
-  inline HOSTDEVICE T operator()(T x) { return x > 0 ? 1 : 0; }
-
-  inline HOSTDEVICE T operator()(T x, T out) { return x > 0 ? 1 : 0; }
+  inline HOSTDEVICE T UseX(T x) { return x > 0 ? 1 : 0; }
+  inline HOSTDEVICE T UseOut(T out) { return out > 0 ? 1 : 0; }
+  inline HOSTDEVICE T UseXAndOut(T x, T out) { return out > 0 ? 1 : 0; }
 };

 }  // namespace math

--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -199,6 +199,14 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
                                     const framework::SelectedRows& input) {
    framework::SelectedRows out;
+    (*this)(context, input, &out);
+    return out;
+  }
+
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output) {
+    framework::SelectedRows& out = *output;
    auto input_rows = input.rows();
    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
@@ -223,7 +231,6 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
        out_data[out_i * input_width + j] += input_data[i * input_width + j];
      }
    }
-    return out;
  }
 };


--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
@@ -60,9 +60,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
    auto out_place = context.GetPlace();
    PADDLE_ENFORCE(platform::is_gpu_place(out_place));

-    memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
-                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
-                 in1_value.numel() * sizeof(T), context.stream());
+    memory::Copy(
+        boost::get<platform::CUDAPlace>(out_place), out_data,
+        boost::get<platform::CUDAPlace>(in1_place), in1_data,
+        in1_value.numel() * sizeof(T),
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());

    auto* in2_data = in2_value.data<T>();
    memory::Copy(boost::get<platform::CUDAPlace>(out_place),
@@ -107,7 +109,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);

    auto& in1_value = input1.value();
-    framework::Vector<int64_t> in1_rows(input1.rows());
+    auto& in1_rows = input1.rows();

    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
@@ -146,7 +148,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
    auto in1_height = input1.height();
    PADDLE_ENFORCE_EQ(in1_height, input2->height());

-    auto& in1_rows = input1.rows();
+    framework::Vector<int64_t> in1_rows(input1.rows());
    auto& in2_rows = *(input2->mutable_rows());

    auto& in1_value = input1.value();
@@ -206,7 +208,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);

    auto& in1_value = input1.value();
-    framework::Vector<int64_t> in1_rows(input1.rows());
+    auto& in1_rows = input1.rows();

    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
@@ -234,7 +236,7 @@ template <typename T, int block_size>
 __global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
                               T* out, const int64_t* out_rows,
                               size_t out_rows_size, int64_t row_numel) {
-  const int ty = blockIdx.y;
+  const int ty = blockIdx.x;
  int tid = threadIdx.x;
  __shared__ size_t out_idx;

@@ -260,6 +262,14 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
  framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
                                     const framework::SelectedRows& input) {
    framework::SelectedRows out;
+    (*this)(context, input, &out);
+    return out;
+  }
+
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output) {
+    framework::SelectedRows& out = *output;
    framework::Vector<int64_t> input_rows(input.rows());
    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
@@ -281,16 +291,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> {

    const int block_size = 256;
    dim3 threads(block_size, 1);
-    dim3 grid1(1, input_rows.size());
+    dim3 grid1(input_rows.size(), 1);

-    MergeAddKernel<
-        T, 256><<<grid1, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(
+    MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
        input_data, input_rows.CUDAData(context.GetPlace()), out_data,
        out.mutable_rows()->CUDAMutableData(context.GetPlace()),
        out.rows().size(), input_width);
-    return out;
  }
 };


--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -65,6 +65,9 @@ struct MergeAdd {
  // the input SelectedRows object.
  framework::SelectedRows operator()(const DeviceContext& context,
                                     const framework::SelectedRows& input);
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output);
 };

 template <typename DeviceContext, typename T>

--- a/paddle/fluid/operators/math/selected_rows_functor_test.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -123,6 +123,7 @@ class SumKernel : public framework::OpKernel<T> {

      out_value->Resize(framework::make_ddim(in_dim));
      out_value->mutable_data<T>(context.GetPlace());
+
      // if all the input sparse vars are empty, no need to
      // merge these vars.
      if (first_dim == 0UL) {

--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -36,7 +36,9 @@ void BindConstValue(pybind11::module* m) {
      .value("Backward", framework::OpRole::kBackward)
      .value("Optimize", framework::OpRole::kOptimize)
      .value("Loss", framework::OpRole::kLoss)
-      .value("RPC", framework::OpRole::kRPC);
+      .value("RPC", framework::OpRole::kRPC)
+      .value("Dist", framework::OpRole::kDist)
+      .value("LRSched", framework::OpRole::kLRSched);

  op_proto_and_checker_maker.def(
      "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
--- a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
--- a/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py