Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into quantize_transpiler_update

9dbd3d5a · Dang Qingqing · 993cbbcd · a83a4fab · 9dbd3d5a · 9dbd3d5a
123 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,6 +69,7 @@ option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
 option(WITH_INFERENCE    "Compile fluid inference library"              ON)
+option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})


--- a/benchmark/fluid/args.py
+++ b/benchmark/fluid/args.py
@@ -140,5 +140,11 @@ def parse_args():
        '--use_lars',
        action='store_true',
        help='If set, use lars for optimizers, ONLY support resnet module.')
+    parser.add_argument(
+        '--reduce_strategy',
+        type=str,
+        choices=['reduce', 'all_reduce'],
+        default='all_reduce',
+        help='Specify the reduce strategy, can be reduce, all_reduce')
    args = parser.parse_args()
    return args
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -170,6 +170,14 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = args.cpus
    strategy.allow_op_delay = False
+    build_strategy = fluid.BuildStrategy()
+    if args.reduce_strategy == "reduce":
+        build_strategy.reduce_strategy = fluid.BuildStrategy(
+        ).ReduceStrategy.Reduce
+    else:
+        build_strategy.reduce_strategy = fluid.BuildStrategy(
+        ).ReduceStrategy.AllReduce
+
    avg_loss = train_args[0]

    if args.update_method == "pserver":
@@ -184,6 +192,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
        avg_loss.name,
        main_program=train_prog,
        exec_strategy=strategy,
+        build_strategy=build_strategy,
        num_trainers=num_trainers,
        trainer_id=trainer_id)


--- a/benchmark/fluid/models/mnist.py
+++ b/benchmark/fluid/models/mnist.py
@@ -67,11 +67,14 @@ def cnn_model(data):

 def get_model(args, is_train, main_prog, startup_prog):
    # NOTE: mnist is small, we don't implement data sharding yet.
-    filelist = [
-        os.path.join(args.data_path, f) for f in os.listdir(args.data_path)
-    ]
+    opt = None
+    data_file_handle = None
    with fluid.program_guard(main_prog, startup_prog):
        if args.use_reader_op:
+            filelist = [
+                os.path.join(args.data_path, f)
+                for f in os.listdir(args.data_path)
+            ]
            data_file_handle = fluid.layers.open_files(
                filenames=filelist,
                shapes=[[-1, 1, 28, 28], (-1, 1)],
@@ -100,7 +103,7 @@ def get_model(args, is_train, main_prog, startup_prog):
            if is_train:
                opt = fluid.optimizer.AdamOptimizer(
                    learning_rate=0.001, beta1=0.9, beta2=0.999)
-                opt.minimize()
+                opt.minimize(avg_cost)
                if args.memory_optimize:
                    fluid.memory_optimize(main_prog)


--- a/benchmark/fluid/models/resnet.py
+++ b/benchmark/fluid/models/resnet.py
@@ -207,7 +207,7 @@ def get_model(args, is_train, main_prog, startup_prog):

                total_images = 1281167 / trainer_count

-                step = int(total_images / args.batch_size + 1)
+                step = int(total_images / (args.batch_size * args.gpus) + 1)
                epochs = [30, 60, 90]
                bd = [step * e for e in epochs]
                base_lr = args.learning_rate

--- a/cmake/tensorrt.cmake
+++ b/cmake/tensorrt.cmake
@@ -16,7 +16,9 @@ find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a
    DOC "Path to TensorRT library.")

 if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY)
+  if(WITH_DSO)  # TENSORRT_FOUND is only switched ON when WITH_DSO is enabled
    set(TENSORRT_FOUND ON)
+  endif()
 else()
    set(TENSORRT_FOUND OFF)
 endif()

--- a/doc/fluid/dev/releasing_process_en.md
+++ b/doc/fluid/dev/releasing_process_en.md
 # PaddlePaddle Releasing Process

-PaddlePaddle manages its branches using "git-flow branching model", and [Semantic Versioning](http://semver.org/) as it's version number semantics.
+PaddlePaddle manages its branches using Trunk Based Development, and uses [Semantic Versioning](http://semver.org/) for its version numbers.

 Each time we release a new PaddlePaddle version, we should follow the below steps:


--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -59,7 +59,7 @@ paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], vara
 paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
 paddle.fluid.InferenceTranspiler.__init__ 
 paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
@@ -305,9 +305,9 @@ paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'neg
 paddle.fluid.layers.detection_output ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0))
 paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None))
 paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
-paddle.fluid.layers.rpn_target_assign ArgSpec(args=['loc', 'scores', 'anchor_box', 'anchor_var', 'gt_box', 'rpn_batch_size_per_im', 'fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap'], varargs=None, keywords=None, defaults=(256, 0.25, 0.7, 0.3))
+paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
 paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
-paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'gt_boxes', 'im_scales', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None))
+paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
 paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
 paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
 paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@@ -346,7 +346,7 @@ paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'con
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None))
-paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None))
 paddle.fluid.transpiler.InferenceTranspiler.__init__ 
 paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -28,10 +28,20 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)

+if(WITH_GPU)
+  cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
+          all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
+endif()
+
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
        scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle)

-cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
+if(WITH_GPU)
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
+else()
+  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
+endif()
+
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
        simple_threadpool device_context)


--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -46,7 +46,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif

 void AllReduceOpHandle::RunImpl() {
-  platform::RecordEvent r("all_reduce", nullptr);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+
  if (NoDummyInputSize() == 1) {
    return;  // No need to all reduce when GPU count = 1;
  } else {

--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -15,12 +15,15 @@
 #include "paddle/fluid/framework/details/broadcast_op_handle.h"
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"

 namespace paddle {
 namespace framework {
 namespace details {

 void BroadcastOpHandle::RunImpl() {
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+
  if (places_.size() == 1) return;

  // The input and output may have dummy vars.

--- a/paddle/fluid/framework/details/computation_op_handle.h
+++ b/paddle/fluid/framework/details/computation_op_handle.h
@@ -32,6 +32,10 @@ struct ComputationOpHandle : public OpHandleBase {

  std::string Name() const override;

+  const Scope *GetScope() const { return scope_; }
+
+  const platform::Place &GetPlace() const { return place_; }
+
 protected:
  void RunImpl() override;


--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -127,6 +127,9 @@ static const char kLocalScopes[] = "local_scopes";
 static const char kStrategy[] = "strategy";

 void MultiDevSSAGraphBuilder::Init() const {
+  all_vars_.clear();
+  balance_vars_.clear();
+
  loss_var_name_ = Get<const std::string>(kLossVarName);
  places_ = Get<const std::vector<platform::Place>>(kPlaces);
  local_scopes_ = Get<const std::vector<Scope *>>(kLocalScopes);
@@ -348,14 +351,31 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(

  size_t cur_device_id = 0;
  bool is_forwarding = true;
+  bool is_dist_train = false;

  for (ir::Node *node : sorted_ops) {
    if (boost::get<int>(
            node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
        static_cast<int>(OpRole::kRPC)) {
-      CreateRPCOp(&result, node);
+      int op_dev_id = CreateRPCOp(&result, node);
+      PADDLE_ENFORCE(op_dev_id != -1,
+                     "Can not schedule the RPC operator to the right place.");
+      if (node->Op()->Type() == "recv") {
+        auto recv_vars_attr =
+            boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
+                OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+        PADDLE_ENFORCE(recv_vars_attr.size() == 2UL);  // [parameter, gradient]
+        if (recv_vars_attr[0].find(".block") == std::string::npos) {
+          bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]);
+        }
+      }
+      is_dist_train = true;
    } else if (IsDistTrainOp(node, send_vars, recv_vars)) {
-      CreateDistTrainOp(&result, node);
+      int op_dev_id = CreateDistTrainOp(&result, node);
+      if (node->Op()->Type() == "concat") {
+        auto origin_param_name = node->Op()->OutputArgumentNames()[0];
+        bcast_var_name_set[op_dev_id].emplace(origin_param_name);
+      }
    } else if (IsScaleLossOp(node)) {
      // user can customize loss@grad if not use_default_grad_scale_
      if (strategy_.gradient_scale_ !=
@@ -414,7 +434,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
                    CreateReduceOp(&result, g_name, cur_device_id);
                    graph->Get<ShardedVarDevice>(kShardedVarDevice)
                        .emplace(g_name, cur_device_id);
+                    if (!is_dist_train) {
                      bcast_var_name_set[cur_device_id].emplace(p_name);
+                    }
                    break;
                  case BuildStrategy::ReduceStrategy::kAllReduce:
                    if (IsSparseGradient(g_name)) {
@@ -436,14 +458,19 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
      }
    }
  }
-
  bool use_gpu = false;
 #ifdef PADDLE_WITH_CUDA
  use_gpu = nccl_ctxs_ != nullptr;
 #endif

-  if (use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) {
-    // Insert BCast Ops
+  // Principles for inserting broadcast operators:
+  // 1. Broadcast optimized parameters in the Reduce strategy;
+  // 2. No need to broadcast optimized parameters in the AllReduce strategy,
+  //    because the optimization sub-graph is run on every GPU;
+  // 3. Always broadcast received parameters in distributed training.
+  if ((use_gpu &&
+       strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) ||
+      is_dist_train) {
    for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
      auto &to_bcast_set = bcast_var_name_set[dev_id];
      for (auto &bcast_name : to_bcast_set) {
@@ -675,7 +702,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result,
  return var;
 }

-void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
+int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                                               ir::Node *node) const {
  int op_dev_id = -1;
  std::vector<std::string> input_var_names;
@@ -719,6 +746,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
                 node->Op()->Type());

  CreateComputationalOp(result, node, op_dev_id);
+  return op_dev_id;
 }

 void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
@@ -737,7 +765,7 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
 }

 // Create RPC related op handles that connects its in ops and out ops.
-void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
+int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
                                         ir::Node *node) const {
  int op_dev_id = -1;
  if (node->Op()->Type() == "send") {
@@ -824,6 +852,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
      CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
    }
  }
+  return op_dev_id;
 }

 bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {

--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -40,12 +40,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
                         size_t device_id) const;
  void Init() const;

- private:
-  mutable std::string loss_var_name_;
-  mutable std::vector<platform::Place> places_;
-  mutable std::vector<Scope *> local_scopes_;
-  mutable std::unordered_set<std::string> grad_names_;
-
 #ifdef PADDLE_WITH_CUDA
  mutable platform::NCCLContextMap *nccl_ctxs_;
 #endif
@@ -54,8 +48,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass {

  bool IsScaleLossOp(ir::Node *node) const;

-  void CreateRPCOp(ir::Graph *result, ir::Node *node) const;
-  void CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
+  int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
+  int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;

  /**
   * Is this operator as the end-point operator before/after send operator.
@@ -95,13 +89,17 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
  size_t GetAppropriateDeviceID(
      const std::vector<std::string> &var_names) const;

- private:
+  void SetCommunicationContext(OpHandleBase *op_handle,
+                               const platform::Place &p) const;
+
+  mutable std::string loss_var_name_;
+  mutable std::vector<platform::Place> places_;
+  mutable std::vector<Scope *> local_scopes_;
+  mutable std::unordered_set<std::string> grad_names_;
+
  mutable BuildStrategy strategy_;
  mutable std::unordered_map<std::string, VarDesc *> all_vars_;
  mutable std::vector<int64_t> balance_vars_;
-
-  void SetCommunicationContext(OpHandleBase *op_handle,
-                               const platform::Place &p) const;
 };
 }  // namespace details
 }  // namespace framework

--- a/paddle/fluid/framework/details/reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/reduce_op_handle.cc
@@ -27,7 +27,8 @@ namespace framework {
 namespace details {

 void ReduceOpHandle::RunImpl() {
-  platform::RecordEvent r("reduce", nullptr);
+  platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
+
  if (places_.size() == 1) return;
  // the input and output may have dummy var.
  auto in_var_handles = DynamicCast<VarHandle>(inputs_);

--- a/paddle/fluid/framework/details/reference_count_op_handle.h
+++ b/paddle/fluid/framework/details/reference_count_op_handle.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <atomic>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/details/op_handle_base.h"
+#include "paddle/fluid/framework/garbage_collector.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+using ReferenceCountMap = std::unordered_map<std::string, int>;
+using AtomicReferenceCountMap =
+    std::unordered_map<std::string, std::atomic<int>>;
+using DeviceReferenceCountMap =
+    std::unordered_map<int, std::unique_ptr<ReferenceCountMap>>;
+using AtomicDeviceReferenceCountMap =
+    std::unordered_map<int, std::unique_ptr<AtomicReferenceCountMap>>;
+using DeviceGarbageCollectorMap =
+    std::unordered_map<int,
+                       std::unique_ptr<GarbageCollector<framework::Tensor>>>;
+
+// Op handle that decrements the runtime reference count of a fixed list of
+// variables each time it runs, and hands every LoDTensor whose count has been
+// used up to the garbage collector.
+class ReferenceCountOpHandle : public OpHandleBase {
+ public:
+  // node:      graph node backing this handle.
+  // scope:     scope that holds the kLocalExecScopeName variable read in
+  //            RunImpl.
+  // place:     CUDA place whose device context (and stream) is used.
+  // var_names: variables whose counts are decremented on every run.
+  // gc:        garbage collector receiving released tensors (not owned).
+  // ref_cnts:  runtime counters keyed by variable name (not owned).
+  //
+  // When gc is a StreamGarbageCollector, a CUDA event is created on the
+  // device of `place`; it is destroyed again in the destructor.
+  ReferenceCountOpHandle(ir::Node *node, const Scope *scope,
+                         const platform::CUDAPlace &place,
+                         const std::vector<std::string> &var_names,
+                         GarbageCollector<Tensor> *gc,
+                         AtomicReferenceCountMap *ref_cnts)
+      : OpHandleBase(node),
+        scope_(scope),
+        var_names_(var_names),
+        gc_(gc),
+        // Resolve the collector flavour once, so that neither the destructor
+        // nor ClearTensors has to inspect the non-owned gc_ object again.
+        stream_gc_(dynamic_cast<StreamGarbageCollector<Tensor> *>(gc)),
+        ref_cnts_(ref_cnts) {
+    dev_ctx_ = static_cast<platform::CUDADeviceContext *>(
+        platform::DeviceContextPool::Instance().Get(place));
+    if (IsStreamGarbageCollector()) {
+      PADDLE_ENFORCE(cudaSetDevice(place.device));
+      PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming));
+    }
+  }
+
+  // Destroys the CUDA event created by the constructor, if there is one.
+  ~ReferenceCountOpHandle() {
+    if (IsStreamGarbageCollector()) {
+      auto gpu_place = boost::get<platform::CUDAPlace>(dev_ctx_->GetPlace());
+      PADDLE_ENFORCE(cudaSetDevice(gpu_place.device));
+      PADDLE_ENFORCE(cudaEventDestroy(event_));
+    }
+  }
+
+  std::string Name() const override { return "reference_count"; }
+
+ protected:
+  // Decrements the counter of every tracked variable that exists as a
+  // LoDTensor in the local execution scope, then releases the tensors whose
+  // counter value before the decrement was 1 or less.
+  void RunImpl() override {
+    auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    std::vector<LoDTensor *> tensors;
+    for (auto &name : var_names_) {
+      auto it = ref_cnts_->find(name);
+      if (it == ref_cnts_->end()) continue;
+
+      auto *var = exec_scope->FindVar(name);
+      if (var == nullptr || !var->IsType<LoDTensor>()) continue;
+
+      // fetch_sub returns the value held before the decrement.
+      if (it->second.fetch_sub(1) <= 1) {
+        tensors.emplace_back(var->GetMutable<LoDTensor>());
+      }
+    }
+
+    if (!tensors.empty()) {
+      ClearTensors(tensors);
+    }
+  }
+
+ private:
+  // Passes `tensors` to the garbage collector. For a stream collector the
+  // supplied callback records event_ on the compute stream and makes the
+  // collector's stream wait on it.
+  // NOTE(review): when the collector invokes the callback is decided by
+  // GarbageCollector::Add, which is not visible here.
+  void ClearTensors(const std::vector<LoDTensor *> &tensors) {
+    if (stream_gc_ != nullptr) {
+      auto compute_stream = dev_ctx_->stream();
+      auto callback_stream = stream_gc_->stream();
+      // Copy the event handle so the lambda does not have to capture `this`.
+      auto event = event_;
+      auto callback_func = [=]() {
+        PADDLE_ENFORCE(cudaEventRecord(event, compute_stream));
+        PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event, 0));
+      };
+      gc_->Add(tensors, callback_func);
+    } else {
+      gc_->Add(tensors);
+    }
+  }
+
+  // True when the collector passed to the constructor is a
+  // StreamGarbageCollector.
+  bool IsStreamGarbageCollector() const { return stream_gc_ != nullptr; }
+
+  const Scope *scope_;
+  platform::CUDADeviceContext *dev_ctx_;
+  std::vector<std::string> var_names_;
+  GarbageCollector<Tensor> *gc_;               // not own
+  StreamGarbageCollector<Tensor> *stream_gc_;  // gc_ downcast, or nullptr
+  AtomicReferenceCountMap *ref_cnts_;          // not own
+  cudaEvent_t event_{nullptr};  // only created for stream collectors
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/reference_count_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Fills the per-device reference count maps for the variables used by GPU
+// computation ops and appends a ReferenceCountOpHandle after each such op.
+//
+// Reads three pass attributes: kGlobalReferenceCount (updated here), and
+// kCurReferenceCount and kGarbageCollector (whose per-device entries are
+// handed to the new handles). Returns the same graph with its kGraphOps list
+// replaced by one that also contains the new handles.
+std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto &ref_cnts = Get<DeviceReferenceCountMap>(kGlobalReferenceCount);
+  auto &cur_ref_cnts = Get<AtomicDeviceReferenceCountMap>(kCurReferenceCount);
+  auto &gcs = Get<DeviceGarbageCollectorMap>(kGarbageCollector);
+
+  // It is not easy to find the right reference counts of variables in graph
+  // Step 1: Find all variables in computation ops
+  // Step 2: Find all variables in non-computation ops which refers to variables
+  // in computation ops
+  std::unordered_set<std::string> names;
+
+  // Step 1 helper: for a GPU computation op, counts one reference for every
+  // eligible variable in `vars` and returns the names that were counted.
+  // Returns an empty list for any other kind of op.
+  auto get_ref_cnts_from_compute_op = [&](
+      const std::unique_ptr<OpHandleBase> &op,
+      const std::vector<VarHandleBase *> &vars) {
+    std::vector<std::string> var_names_in_op;
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    if (compute_op == nullptr ||
+        !platform::is_gpu_place(compute_op->GetPlace()))
+      return var_names_in_op;
+    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
+    for (VarHandleBase *var_handle_base : vars) {
+      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
+      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
+
+      // Only variables that live on the same GPU as the op are counted.
+      if (!platform::is_gpu_place(var_handle->place_) ||
+          boost::get<platform::CUDAPlace>(var_handle->place_) != place)
+        continue;
+
+      VarDesc *var_desc = var_handle->Node()->Var();
+      auto var_name = var_handle->Node()->Name();
+
+      // This is weird but there is really some variables without var_desc
+      // in computation_op
+      if (var_desc == nullptr) {
+        if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
+          continue;
+      } else {
+        if (var_desc->Persistable() ||
+            var_desc->Proto()->type().type() != proto::VarType::LOD_TENSOR)
+          continue;
+      }
+
+      // compute op only runs in one device
+      // Look the device up explicitly: operator[] would insert a null entry
+      // for an unregistered device and then dereference it.
+      auto dev_it = ref_cnts.find(place.device);
+      PADDLE_ENFORCE(dev_it != ref_cnts.end() && dev_it->second != nullptr,
+                     "No reference count map is registered for GPU %d.",
+                     place.device);
+      // operator[] value-initializes a missing counter to 0 before the
+      // increment, so a first reference ends up as 1.
+      ++(*dev_it->second)[var_name];
+
+      names.insert(var_name);
+      var_names_in_op.push_back(var_name);
+    }
+    return var_names_in_op;
+  };
+
+  // Step 2 helper: for a non-computation op, adds one reference for every GPU
+  // variable in `vars` that step 1 already registered on that device.
+  auto update_ref_cnts_from_non_compute_op = [&](
+      const std::unique_ptr<OpHandleBase> &op,
+      const std::vector<VarHandleBase *> &vars) {
+    if (dynamic_cast<ComputationOpHandle *>(op.get()) != nullptr) return;
+    for (VarHandleBase *var_handle_base : vars) {
+      auto *var_handle = dynamic_cast<VarHandle *>(var_handle_base);
+      if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue;
+
+      auto var_name = var_handle->Node()->Name();
+      auto var_place = var_handle->place_;
+      if (!platform::is_gpu_place(var_place)) continue;
+      auto place = boost::get<platform::CUDAPlace>(var_place);
+      if (names.count(var_name) == 0) continue;
+
+      auto dev_it = ref_cnts.find(place.device);
+      if (dev_it == ref_cnts.end()) continue;
+      auto cnt_it = dev_it->second->find(var_name);
+      if (cnt_it != dev_it->second->end()) {
+        ++cnt_it->second;
+      }
+    }
+  };
+
+  // Maps each computation op to the reference count handle created for it.
+  std::unordered_map<OpHandleBase *, ReferenceCountOpHandle *>
+      compute_ref_cnt_map;
+  auto &all_ops = graph->Get<GraphOps>(kGraphOps);
+  for (auto &op : all_ops) {
+    auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs());
+    auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs());
+    if (in_var_names.empty() && out_var_names.empty()) continue;
+    in_var_names.insert(in_var_names.end(), out_var_names.begin(),
+                        out_var_names.end());
+    // A non-empty name list is only returned for GPU computation ops, so the
+    // cast and the CUDAPlace extraction below cannot fail here.
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    auto place = boost::get<platform::CUDAPlace>(compute_op->GetPlace());
+
+    // Fail here rather than pass a null collector or counter map to the
+    // handle, which would only crash later while the graph is running.
+    auto gc_it = gcs.find(place.device);
+    PADDLE_ENFORCE(gc_it != gcs.end() && gc_it->second != nullptr,
+                   "No garbage collector is registered for GPU %d.",
+                   place.device);
+    auto cur_ref_cnt_it = cur_ref_cnts.find(place.device);
+    PADDLE_ENFORCE(cur_ref_cnt_it != cur_ref_cnts.end() &&
+                       cur_ref_cnt_it->second != nullptr,
+                   "No runtime reference count map is registered for GPU %d.",
+                   place.device);
+
+    ir::Node *ref_cnt_node =
+        graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation);
+    auto *ref_cnt_handle = new ReferenceCountOpHandle(
+        ref_cnt_node, compute_op->GetScope(), place, in_var_names,
+        gc_it->second.get(), cur_ref_cnt_it->second.get());
+    // Control dependency: the handle takes an output of the computation op
+    // as its input.
+    auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+    compute_op->AddOutput(dep_var);
+    ref_cnt_handle->AddInput(dep_var);
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+    compute_ref_cnt_map[compute_op] = ref_cnt_handle;
+  }
+
+  for (auto &op : all_ops) {
+    update_ref_cnts_from_non_compute_op(op, op->Inputs());
+    update_ref_cnts_from_non_compute_op(op, op->Outputs());
+  }
+
+  // Rebuild the op list with each new handle placed directly after its
+  // computation op; the list takes ownership of the raw handle pointers.
+  std::vector<std::unique_ptr<OpHandleBase>> new_all_ops;
+  new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size());
+  for (auto &op : all_ops) {
+    new_all_ops.emplace_back(std::move(op));
+    auto it = compute_ref_cnt_map.find(new_all_ops.back().get());
+    if (it != compute_ref_cnt_map.end()) {
+      new_all_ops.emplace_back(it->second);
+    }
+  }
+
+  all_ops.swap(new_all_ops);
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(reference_count_pass,
+              paddle::framework::details::ReferenceCountPass)
+    .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount)
+    .RequirePassAttr(paddle::framework::details::kCurReferenceCount)
+    .RequirePassAttr(paddle::framework::details::kGarbageCollector);
--- a/paddle/fluid/framework/details/reference_count_pass.h
+++ b/paddle/fluid/framework/details/reference_count_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/details/reference_count_op_handle.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Names of the pass attributes that ReferenceCountPass requires.
+constexpr char kGlobalReferenceCount[] = "reference_count";
+constexpr char kCurReferenceCount[] = "current_reference_count";
+constexpr char kGarbageCollector[] = "garbage_collector";
+
+// Graph pass whose ApplyImpl computes per-device variable reference counts
+// and adds ReferenceCountOpHandle ops to the graph.
+class ReferenceCountPass : public ir::Pass {
+ protected:
+  // Takes ownership of `graph` and returns the same graph after modifying it.
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
+++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc
@@ -51,7 +51,7 @@ void ScaleLossGradOpHandle::RunImpl() {
              ->stream();
      memory::Copy(boost::get<platform::CUDAPlace>(place_), tmp,
                   platform::CPUPlace(), &coeff_, sizeof(float), stream);
-      VLOG(1) << place_ << "RUN Scale loss grad op";
+      VLOG(10) << place_ << "RUN Scale loss grad op";
    });
 #endif
  }

--- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc
@@ -18,6 +18,9 @@
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/framework/details/reference_count_op_handle.h"
+#endif

 namespace paddle {
 namespace framework {
@@ -65,12 +68,28 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(

  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
  drop_scope_counter_ += 1;
+
+#ifdef PADDLE_WITH_CUDA
+  const std::string gc_name = "garbage_collector";
+  DeviceGarbageCollectorMap *gc =
+      Graph().Has(gc_name) ? &(Graph().Get<DeviceGarbageCollectorMap>(gc_name))
+                           : nullptr;
+#endif
+
  if (!fetch_tensors.empty() ||
      drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
    drop_scope_counter_ = 0;
    // Wait All computational streams
    for (auto p : places_) {
      platform::DeviceContextPool::Instance().Get(p)->Wait();
+#ifdef PADDLE_WITH_CUDA
+      if (gc != nullptr && platform::is_gpu_place(p)) {
+        auto gpu_place = boost::get<platform::CUDAPlace>(p);
+        auto &gc_at_place = gc->at(gpu_place.device);
+        gc_at_place->Wait();
+        gc_at_place->Reset();
+      }
+#endif
    }
    for (auto &scope : local_scopes_) {
      auto &local_scope =

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -37,7 +37,11 @@ int kProgramId = -1;

 ExecutorPrepareContext::ExecutorPrepareContext(
    const framework::ProgramDesc& prog, size_t block_id)
-    : prog_(prog), block_id_(block_id) {}
+    : prog_(prog), block_id_(block_id) {
+  if (GetEagerDeletionThreshold() >= 0) {
+    ref_cnts_ = GetNonPersistableReferenceCount<int>(prog_, block_id_);
+  }
+}

 ExecutorPrepareContext::~ExecutorPrepareContext() {
  VLOG(5) << "destroy ExecutorPrepareContext";
@@ -329,15 +333,81 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope,
    CreateVariables(ctx->prog_, local_scope, ctx->block_id_);
  }

+  int64_t max_memory_size = GetEagerDeletionThreshold();
+
+  std::unique_ptr<GarbageCollector<Tensor>> gc;
+  if (max_memory_size >= 0) {
+    ctx->ResetReferenceCount();
+#ifdef PADDLE_WITH_CUDA
+    if (platform::is_gpu_place(place_)) {
+      gc.reset(new DefaultStreamGarbageCollector<Tensor>(
+          boost::get<platform::CUDAPlace>(place_), max_memory_size));
+    } else {
+#endif
+      gc.reset(new CPUGarbageCollector<Tensor>(
+          boost::get<platform::CPUPlace>(place_), max_memory_size));
+#ifdef PADDLE_WITH_CUDA
+    }
+#endif
+  }
+
  for (auto& op : ctx->ops_) {
    op->Run(*local_scope, place_);

+    if (gc != nullptr) {
+      std::vector<std::string> erase_vars;
+      for (auto& input : op->Inputs()) {
+        for (auto& input_name : input.second) {
+          auto it = ctx->cur_ref_cnts_.find(input_name);
+          if (it == ctx->cur_ref_cnts_.end()) continue;
+          if (it->second == 1) {  // should delete it
+            erase_vars.emplace_back(input_name);
+            ctx->cur_ref_cnts_.erase(input_name);
+          } else {
+            --(it->second);
+          }
+        }
+      }
+
+      for (auto& output : op->Outputs()) {
+        for (auto& output_name : output.second) {
+          auto it = ctx->cur_ref_cnts_.find(output_name);
+          if (it == ctx->cur_ref_cnts_.end()) continue;
+          if (it->second == 1) {
+            erase_vars.emplace_back(output_name);
+            ctx->cur_ref_cnts_.erase(output_name);
+          } else {
+            --(it->second);
+          }
+        }
+      }
+
+      if (!erase_vars.empty()) {
+        std::vector<framework::LoDTensor*> erase_tensors;
+        for (auto& name : erase_vars) {
+          auto* var = local_scope->FindVar(name);
+          if (var == nullptr) continue;
+          if (var->IsType<framework::LoDTensor>()) {
+            auto* tensor = var->GetMutable<framework::LoDTensor>();
+            erase_tensors.push_back(tensor);
+          }
+        }
+        if (!erase_tensors.empty()) gc->Add(erase_tensors);
+      }
+    }
+
    if (FLAGS_benchmark) {
      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
              << memory::memory_usage(place_);
    }
  }
+
+  if (gc != nullptr) {
+    gc->Wait();
+  } else {
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+  }
+
  if (local_scope != scope) {
    scope->DeleteScope(local_scope);
  } else {

--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <map>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -27,13 +28,58 @@ namespace paddle {
 namespace framework {
 extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);

+// Returns, for block `block_id` of `prog`, a map from variable name to the
+// number of times the name appears as an op input or an op output.
+// Variables that are persistable or not of type LOD_TENSOR are left out.
+// Names used by ops but without a VarDesc in the block are counted too.
+// A name used several times by one op is counted once per use.
+// T is the counter type; the visible caller instantiates it with int.
+template <typename T>
+std::unordered_map<std::string, T> GetNonPersistableReferenceCount(
+    const ProgramDesc& prog, size_t block_id) {
+  auto& block = prog.Block(block_id);
+  std::unordered_set<std::string> ignored_vars;
+  std::unordered_map<std::string, T> ref_cnts;
+
+  // Collect the names that must not be counted.
+  for (auto var_desc : block.AllVars()) {
+    auto type = var_desc->Proto()->type().type();
+    if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) {
+      ignored_vars.insert(var_desc->Name());
+    }
+  }
+
+  // operator[] value-initializes a missing entry (zero for arithmetic T), so
+  // one increment covers both the first and the repeated uses of a name.
+  for (auto op_desc : block.AllOps()) {
+    for (auto& input : op_desc->Inputs()) {
+      for (auto& input_name : input.second) {
+        if (!ignored_vars.count(input_name)) ++ref_cnts[input_name];
+      }
+    }
+
+    // Iterate by reference so that the output names are not copied.
+    for (auto& output : op_desc->Outputs()) {
+      for (auto& output_name : output.second) {
+        if (!ignored_vars.count(output_name)) ++ref_cnts[output_name];
+      }
+    }
+  }
+  return ref_cnts;
+}
+
 struct ExecutorPrepareContext {
  ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id);
  ~ExecutorPrepareContext();

+  void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; }
+
  const framework::ProgramDesc& prog_;
  size_t block_id_;
  std::vector<std::unique_ptr<OperatorBase>> ops_;
+
+  std::unordered_map<std::string, int> ref_cnts_;
+  std::unordered_map<std::string, int> cur_ref_cnts_;
 };

 class Executor {

--- a/paddle/fluid/framework/garbage_collector.h
+++ b/paddle/fluid/framework/garbage_collector.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <deque>
+#include <functional>
+#include <memory>
+#include <mutex>  // NOLINT
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+// Base class that collects pointers to objects of type T and clears them in
+// batches once their accumulated memory_size() reaches a threshold.
+// T should have memory_size() and clear() method
+//
+// The collector stores raw pointers only: it never deletes the objects, it
+// just calls clear() on them. Where and when that clearing code runs is decided
+// by the subclass through ClearCallback().
+template <typename T>
+class GarbageCollector {
+ public:
+  // Looks up the device context of `place` in the DeviceContextPool.
+  // The threshold is clamped to at least 1, so adding only objects whose
+  // memory_size() is 0 never triggers a clear.
+  GarbageCollector(const platform::Place &place, size_t max_memory_size)
+      : max_memory_size_(std::max(max_memory_size, static_cast<size_t>(1))) {
+    garbages_.reset(new std::deque<T *>());
+    dev_ctx_ = platform::DeviceContextPool::Instance().Get(place);
+  }
+
+  virtual ~GarbageCollector() {}
+
+  // Forgets every pending pointer (without calling clear() on it) and sets the
+  // accumulated size back to 0.
+  void Reset() {
+    std::lock_guard<std::mutex> guard(mutex_);
+    garbages_.reset(new std::deque<T *>());
+    cur_memory_size_ = 0;
+  }
+
+  // Same as the two-argument Add() with a callback that does nothing.
+  template <typename Container>
+  void Add(const Container &objs) {
+    Add(objs, []() {});
+  }
+
+  // Queues every pointer in `objs`. If the accumulated size reaches the
+  // threshold, the whole pending batch is detached and scheduled for clearing:
+  // `callback` is invoked first, then ClearCallback() receives the closure that
+  // calls clear() on each object of the batch.
+  template <typename Container, typename Callback>
+  void Add(const Container &objs, Callback &&callback) {
+    std::shared_ptr<std::deque<T *>> clear_deque;
+    {
+      // Only the bookkeeping is done under the lock.
+      std::lock_guard<std::mutex> guard(mutex_);
+      for (auto *obj : objs) {
+        garbages_->push_back(obj);
+        cur_memory_size_ += obj->memory_size();
+      }
+      if (cur_memory_size_ >= max_memory_size_) {
+        cur_memory_size_ = 0;
+        clear_deque = garbages_;
+        garbages_.reset(new std::deque<T *>());
+      }
+    }
+
+    // mutex_ is released here, so ClearCallback() overrides may lock it again.
+    if (clear_deque != nullptr) {
+      callback();
+      // The closure copies the shared_ptr, which keeps the detached batch alive
+      // until the closure itself is destroyed.
+      ClearCallback([=]() {
+        for (auto *obj : *clear_deque) obj->clear();
+      });
+    }
+  }
+
+  // Does nothing in the base class; subclasses override it.
+  virtual void Wait() const {}
+
+ protected:
+  // Receives the closure that clears one detached batch.
+  virtual void ClearCallback(const std::function<void()> &callback) = 0;
+
+  platform::DeviceContext *dev_ctx_;
+  // Pending batch; replaced by a fresh deque whenever a batch is detached.
+  std::shared_ptr<std::deque<T *>> garbages_;
+  mutable std::mutex mutex_;
+  const size_t max_memory_size_;
+  // Sum of memory_size() over the objects of the pending batch.
+  size_t cur_memory_size_ = 0;
+};
+
+// GarbageCollector for a CPUPlace: the clearing closure is invoked directly
+// inside ClearCallback(), i.e. synchronously within the Add() call that
+// detached the batch. Wait() keeps the no-op of the base class.
+template <typename T>
+class CPUGarbageCollector : public GarbageCollector<T> {
+ public:
+  CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size)
+      : GarbageCollector<T>(place, max_memory_size) {}
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override {
+    callback();
+  }
+};
+
+#ifdef PADDLE_WITH_CUDA
+// GarbageCollector for a CUDAPlace that works on the stream of the device
+// context obtained from the DeviceContextPool for that place.
+// dev_ctx_ is cast to CUDADeviceContext; this relies on the pool returning a
+// CUDADeviceContext for a CUDAPlace.
+template <typename T>
+class DefaultStreamGarbageCollector : public GarbageCollector<T> {
+ public:
+  DefaultStreamGarbageCollector(const platform::CUDAPlace &place,
+                                size_t max_memory_size)
+      : GarbageCollector<T>(place, max_memory_size) {}
+
+  // Stream of the underlying CUDADeviceContext.
+  cudaStream_t stream() const {
+    return static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
+        ->stream();
+  }
+
+  // Waits on the device context first and then on its stream callbacks.
+  void Wait() const override {
+    this->dev_ctx_->Wait();
+    static_cast<const platform::CUDADeviceContext *>(this->dev_ctx_)
+        ->WaitStreamCallback();
+  }
+
+ protected:
+  // Hands the clearing closure to the device context as a stream callback.
+  // NOTE(review): presumably it runs once the work already queued on the
+  // stream has finished - confirm against CUDADeviceContext.
+  void ClearCallback(const std::function<void()> &callback) override {
+    static_cast<platform::CUDADeviceContext *>(this->dev_ctx_)
+        ->AddStreamCallback(callback);
+  }
+};
+
+// GarbageCollector for a CUDAPlace that creates and owns a CUDA stream of its
+// own, together with a StreamCallbackManager built on that stream.
+template <typename T>
+class StreamGarbageCollector : public GarbageCollector<T> {
+ public:
+  // Selects the device of `place`, creates stream_ on it and builds the
+  // callback manager for that stream.
+  StreamGarbageCollector(const platform::CUDAPlace &place,
+                         size_t max_memory_size)
+      : GarbageCollector<T>(place, max_memory_size) {
+    PADDLE_ENFORCE(cudaSetDevice(place.device));
+    PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+    callback_manager_.reset(new platform::StreamCallbackManager(stream_));
+  }
+
+  // Selects the device again, synchronizes stream_ and then destroys it.
+  ~StreamGarbageCollector() {
+    auto place = boost::get<platform::CUDAPlace>(this->dev_ctx_->GetPlace());
+    PADDLE_ENFORCE(cudaSetDevice(place.device));
+    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+    PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+  }
+
+  // Synchronizes stream_, then waits on the callback manager while holding
+  // mutex_.
+  void Wait() const override {
+    PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+    std::lock_guard<std::mutex> guard(this->mutex_);
+    callback_manager_->Wait();
+  }
+
+  // The stream owned by this collector.
+  cudaStream_t stream() const { return stream_; }
+
+ protected:
+  // Registers the clearing closure with the callback manager under mutex_.
+  // Add() calls ClearCallback() after it has released mutex_, so taking the
+  // lock here does not lock the mutex twice on the same thread.
+  void ClearCallback(const std::function<void()> &callback) override {
+    std::lock_guard<std::mutex> guard(this->mutex_);
+    callback_manager_->AddCallback(callback);
+  }
+
+ private:
+  cudaStream_t stream_;
+  std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
+};
+#endif
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -58,7 +58,7 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
    desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
    desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
    desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
-    desc.SetOutput("Out", std::vector<std::string>({conv_relu_out}));
+    desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
    desc.SetType("conv2d");
    for (auto& attr : conv->Op()->GetAttrMap()) {
      desc.SetAttr(attr.first, attr.second);

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -94,6 +94,17 @@ class Graph {
    };
  }

+  // Stores `attr` under `attr_name` without taking ownership: the deleter
+  // registered for it does nothing, so the caller must keep `*attr` alive
+  // while the graph may use it. Fails if `attr_name` is already set.
+  template <typename AttrType>
+  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph",
+                   attr_name);
+    attrs_[attr_name] = attr;
+    attr_dels_[attr_name] = []() {};
+  }
+
  const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; }

  // Create a normal variable with non-null VarDesc.

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -429,7 +429,7 @@ struct LSTM : public PatternBase {

 struct GRU : public PatternBase {
  GRU(PDPattern* pattern, const std::string& name_scope)
-      : PatternBase(pattern, name_scope, "lstm") {}
+      : PatternBase(pattern, name_scope, "gru") {}

  PDNode* operator()(PDNode* x);


--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -188,6 +188,30 @@ ParallelExecutor::ParallelExecutor(
      main_program, member_->places_, loss_var_name, params,
      member_->local_scopes_, member_->use_cuda_, build_strategy,
      member_->nccl_ctxs_.get());
+
+  auto max_memory_size = GetEagerDeletionThreshold();
+  if (max_memory_size >= 0) {
+    for (auto &place : member_->places_) {
+      if (!platform::is_gpu_place(place)) continue;
+      auto gpu_place = boost::get<platform::CUDAPlace>(place);
+      if (gcs_[gpu_place.device] == nullptr) {
+        ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap());
+        cur_ref_cnts_[gpu_place.device].reset(
+            new details::AtomicReferenceCountMap());
+        gcs_[gpu_place.device].reset(
+            new StreamGarbageCollector<Tensor>(gpu_place, max_memory_size));
+      }
+    }
+    if (!gcs_.empty()) {
+      auto ref_cnt_pass =
+          ir::PassRegistry::Instance().Get("reference_count_pass");
+      ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_);
+      ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_);
+      ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_);
+      graph = ref_cnt_pass->Apply(std::move(graph));
+      graph->SetNotOwned("garbage_collector", &gcs_);
+    }
+  }
 #else
  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
      main_program, member_->places_, loss_var_name, params,
@@ -209,30 +233,9 @@ ParallelExecutor::ParallelExecutor(

 void ParallelExecutor::BCastParamsToDevices(
    const std::unordered_set<std::string> &vars) const {
-  // the initializing bcast, all vars would be bcast from device(0),
-  // otherwise
-  // bcast from the specified device.
-  bool initializing = member_->executor_ ? false : true;
+  // the initializing bcast, all vars would be bcast from device(0).
  for (auto &var : vars) {
-    int var_dev_id = -1;
-    if (member_->executor_) {
-      auto &sharded_var_device =
-          member_->executor_->Graph().Get<details::ShardedVarDevice>(
-              details::kShardedVarDevice);
-      if (sharded_var_device.find(var) != sharded_var_device.end()) {
-        var_dev_id = sharded_var_device.at(var);
-      }
-    }
-
-    if (!initializing && var_dev_id == -1) continue;
-
-    framework::Variable *main_var = nullptr;
-    if (initializing) {
-      main_var = member_->local_scopes_[0]->FindVar(var);
-    } else {
-      main_var = member_->local_scopes_[var_dev_id]->FindVar(var);
-    }
-
+    framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
    if (main_var == nullptr || !main_var->IsType<LoDTensor>()) {
      continue;
    }
@@ -248,8 +251,7 @@ void ParallelExecutor::BCastParamsToDevices(
        auto place = member_->places_[i];
        void *buffer;

-        if ((initializing && i == 0) ||
-            (!initializing && static_cast<int>(i) == var_dev_id)) {
+        if (i == 0) {
          buffer = const_cast<void *>(main_tensor.data<void>());
        } else {
          auto local_scope = member_->local_scopes_[i];
@@ -266,29 +268,18 @@ void ParallelExecutor::BCastParamsToDevices(
        platform::NCCLGroupGuard guard;
        for (size_t i = 0; i < member_->places_.size(); ++i) {
          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
-          if (initializing) {
          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
                                       nccl_ctx.comm_, nccl_ctx.stream());
-          } else {
-            if (var_dev_id >= 0) {
-              platform::dynload::ncclBcast(buffers[i], numel, data_type,
-                                           var_dev_id, nccl_ctx.comm_,
-                                           nccl_ctx.stream());
-            }
-          }
        }
        member_->nccl_ctxs_->WaitAll();
      }
-
 #else
      PADDLE_THROW("Not compiled with CUDA");
 #endif
    } else {
      platform::CPUPlace cpu;
      for (size_t i = 0; i < member_->places_.size(); ++i) {
-        if ((initializing && i == 0) ||
-            (!initializing && static_cast<int>(i) == var_dev_id))
-          continue;
+        if (i == 0) continue;

        auto local_scope = member_->local_scopes_[i];
        auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
@@ -310,6 +301,11 @@ void ParallelExecutor::BCastParamsToDevices(
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                           const std::string &fetched_var_name) {
  platform::RecordBlock b(0);
+#ifdef PADDLE_WITH_CUDA
+  if (!gcs_.empty()) {
+    ResetReferenceCount();
+  }
+#endif
  auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
      fetch_data;
@@ -367,3 +363,6 @@ USE_PASS(graph_viz_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+#ifdef PADDLE_WITH_CUDA
+USE_PASS(reference_count_pass);
+#endif
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -15,7 +15,9 @@ limitations under the License. */
 #pragma once

 #include <paddle/fluid/framework/details/build_strategy.h>
+#include <atomic>
 #include <string>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/details/execution_strategy.h"
@@ -27,6 +29,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"

+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/framework/details/reference_count_pass.h"
+#endif
+
 namespace paddle {
 namespace framework {

@@ -66,10 +72,27 @@ class ParallelExecutor {
  void Run(const std::vector<std::string> &fetch_tensors,
           const std::string &fetched_var_name);

+ private:
  void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const;

- private:
  ParallelExecutorPrivate *member_;
+
+#ifdef PADDLE_WITH_CUDA
+  // ref_cnts_ is initialized only when the ParallelExecutor is constructed and
+  // is kept unchanged afterwards. ResetReferenceCount() copies its counts into
+  // cur_ref_cnts_; Run() does so before each iteration if gcs_ is non-empty.
+  details::DeviceReferenceCountMap ref_cnts_;
+  details::AtomicDeviceReferenceCountMap cur_ref_cnts_;
+  details::DeviceGarbageCollectorMap gcs_;
+
+  void ResetReferenceCount() {
+    for (auto &pair1 : ref_cnts_) {
+      for (auto &pair2 : *(pair1.second)) {
+        (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second;
+      }
+    }
+  }
+#endif
 };

 }  // namespace framework

--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -31,9 +31,21 @@ DEFINE_bool(
    "Delete local scope eagerly. It will reduce GPU memory usage but "
    "slow down the destruction of variables.(around 1% performance harm)");

+DEFINE_double(
+    eager_delete_tensor_gb, -1.0,
+    "Memory size threshold (GB) when the garbage collector clear tensors. "
+    "Disabled when this value is less than 0");
+
 namespace paddle {
 namespace framework {

+int64_t GetEagerDeletionThreshold() {
+  // Negative flag disables eager deletion; otherwise convert GB to bytes.
+  if (FLAGS_eager_delete_tensor_gb < 0) return -1;
+  constexpr int64_t kBytesPerGB = static_cast<int64_t>(1) << 30;
+  return static_cast<int64_t>(FLAGS_eager_delete_tensor_gb * kBytesPerGB);
+}
+
 Scope::~Scope() { DropKids(); }

 Scope& Scope::NewScope() const {

--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -26,6 +26,8 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

+int64_t GetEagerDeletionThreshold();
+
 class Scope;

 /**

--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -151,6 +151,9 @@ class Tensor {

  void set_layout(const DataLayout layout) { layout_ = layout; }

+  // Resets holder_ to nullptr; no other member of the tensor is modified.
+  void clear() { holder_ = nullptr; }
+
 private:
  /**
   * @note    Placeholder hides type T, so it doesn't appear as a template

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -17,9 +17,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 # paddle_fluid_origin exclude inference api interface
 cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)

-#if(APPLE)
-  add_subdirectory(api)
-#endif()
+add_subdirectory(api)

 # Create static library
 cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api analysis_predictor)
@@ -57,5 +55,7 @@ endif()
 if(WITH_TESTING)
  # tests/book depends the models that generated by python/paddle/fluid/tests/book
  add_subdirectory(tests/book)
+  if(WITH_INFERENCE_API_TEST)
    add_subdirectory(tests/api)
+  endif()  
 endif()
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -69,8 +69,9 @@ class DfgPassManagerImpl final : public DfgPassManager {
    if (FLAGS_IA_enable_tensorrt_subgraph_engine) {
      auto trt_teller = [&](const Node* node) {
        std::unordered_set<std::string> teller_set(
-            {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
-             "depthwise_conv2d", "batch_norm", "concat"});
+            {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
+             "depthwise_conv2d", "batch_norm", "concat", "tanh",
+             "elementwise_add", "dropout"});
        if (!node->IsFunction()) return false;

        const auto* func = static_cast<const Function*>(node);

--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -72,6 +72,9 @@ class Analyzer : public OrderedRegistry<PassManager> {
      "mul_gru_fuse_pass",         //
      "seq_concat_fc_fuse_pass",   //
      "fc_fuse_pass",              //
+#ifdef PADDLE_WITH_MKLDNN
+      "conv_relu_mkldnn_fuse_pass",  //
+#endif
  }};

  std::unordered_set<std::string> disabled_ir_passes_;

--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -440,6 +440,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
    }
    return false;
  };
+
  for (auto &node : graph) {
    for (auto *in : node->inlinks) {
      // The Value that is written by nodes inside a sub-graph shouldn't be the
@@ -459,6 +460,7 @@ ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
                        std::vector<Node *>(outputs.begin(), outputs.end()));
 }

+// Filter the Intermediate results of the subgraph node.
 void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
  std::vector<Node *> op_nodes;
  for (auto &node : GraphTraits<DataFlowGraph>(*graph).nodes_in_TS()) {
@@ -480,9 +482,11 @@ void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) {
    for (auto *out : op_nodes[i]->outlinks) {
      if (follow_up_input_names.count(out->name())) {
        filtered_subgraph_outlinks.push_back(out);
+      } else {
+        out->SetDeleted();
      }
    }
-    PADDLE_ENFORCE_GE(filtered_subgraph_outlinks.size(), 1UL);
+    // The filtered_subgraph_outlinks may be empty.
    op_nodes[i]->outlinks = filtered_subgraph_outlinks;
  }
 }

--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -106,20 +106,23 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,

  // collect inputs
  std::unordered_set<std::string> input_names;
+  std::unordered_set<std::string> input_names_with_id;
  for (auto *x : func->inlinks) {
    input_names.insert(x->name());
+    input_names_with_id.insert(x->name() + std::to_string(x->id()));
  }
  desc.SetInput(
      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));

  std::unordered_set<std::string> output_names;
+  std::unordered_set<std::string> output_names_with_id;
  for (auto *x : func->outlinks) {
    output_names.insert(x->name());
+    output_names_with_id.insert(x->name() + std::to_string(x->id()));
  }

-  std::vector<std::string> output_temp(output_names.begin(),
-                                       output_names.end());
-  desc.SetOutput("Ys", output_temp);
+  desc.SetOutput(
+      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
  desc.SetType("tensorrt_engine");

  std::unordered_map<std::string, std::string> output_name_map;
@@ -153,11 +156,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
      std::vector<std::string> replaced_names;
      for (int k = 0; k < in_var->arguments_size(); k++) {
        std::string arg_value = in_var->arguments(k);
-        if (input_names.count(arg_value)) {
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (input_names_with_id.count(arg_value_with_id)) {
          replaced_names.push_back(arg_value);
        } else {
-          replaced_names.push_back(arg_value +
-                                   std::to_string(var2id[arg_value]));
+          replaced_names.push_back(arg_value_with_id);
        }
      }
      in_var->clear_arguments();
@@ -176,11 +180,12 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
      std::vector<std::string> replaced_names;
      for (int k = 0; k < out_var->arguments_size(); k++) {
        std::string arg_value = out_var->arguments(k);
-        if (output_names.count(arg_value)) {
-          output_name_map[arg_value] =
+        std::string arg_value_with_id =
            arg_value + std::to_string(var2id[arg_value]);
+        if (output_names_with_id.count(arg_value_with_id)) {
+          output_name_map[arg_value] = arg_value_with_id;
        }
-        replaced_names.push_back(arg_value + std::to_string(var2id[arg_value]));
+        replaced_names.push_back(arg_value_with_id);
      }
      out_var->clear_arguments();
      for (size_t k = 0; k < replaced_names.size(); k++) {

--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
@@ -74,13 +74,141 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) {
  node_map.at(b)->attr(kUnionFindParent).Int32() = a_ancestor;
 }

+// Lightweight stand-in for a Node of the graph.
+// A BriefNode only points at its Node and keeps link lists of its own, so the
+// trt graph analysis can rewire those links without changing the original
+// graph.
+struct BriefNode {
+  explicit BriefNode(Node *n) : node(n) {}
+  Node *node;
+  std::vector<BriefNode *> inlinks;
+  std::vector<BriefNode *> outlinks;
+};
+
+// Contracts the edge src -> dst: merges the BriefNode `dst_id` into the
+// BriefNode `src_id`.
+//
+// Args:
+//   node_map: maps a node id to its BriefNode; looked up with at(), so both
+//             ids must be present.
+//   src_id:   id of the node that survives and absorbs the links of dst.
+//   dst_id:   id of the node that is merged away.
+//
+// Steps:
+//   1. add all inputs (except src) of dst to the inlinks of src.
+//   2. add all outputs of dst to the outlinks of src, and drop dst from them.
+//   3. in every neighbour of the merged node, replace references to dst
+//      with src.
+//   4. clear the inlinks and outlinks of dst.
+//
+// dst stays in `node_map`; after the call it simply has no links. Only
+// BriefNode links are rewired, the wrapped Node objects are not touched.
+void UnionContractedNodes(const std::unordered_map<int, BriefNode *> &node_map,
+                          int src_id, int dst_id) {
+  BriefNode *src_node = node_map.at(src_id);
+  BriefNode *dst_node = node_map.at(dst_id);
+
+  // Sets de-duplicate the links while the two link lists are merged.
+  std::unordered_set<BriefNode *> inputs(src_node->inlinks.begin(),
+                                         src_node->inlinks.end());
+  std::unordered_set<BriefNode *> outputs;
+
+  // The edge src -> dst becomes internal to the merged node, so skip it.
+  for (auto *n : src_node->outlinks) {
+    if (n != dst_node) outputs.insert(n);
+  }
+
+  // Add the inlinks and outlinks of dst node to src node.
+  for (BriefNode *node : dst_node->inlinks) {
+    if (node != src_node) {
+      inputs.insert(node);
+    }
+  }
+  for (BriefNode *node : dst_node->outlinks) {
+    outputs.insert(node);
+  }
+
+  // update the dst and src node's inlinks and outlinks.
+  // assign() refills the vectors in place, so no temporary vector has to be
+  // moved and the same code is used for every compiler.
+  src_node->inlinks.assign(inputs.begin(), inputs.end());
+  src_node->outlinks.assign(outputs.begin(), outputs.end());
+  dst_node->inlinks.clear();
+  dst_node->outlinks.clear();
+
+  // Replaces every occurrence of dst_node in `nodes` with src_node.
+  auto inlink_or_outlink_cleaner = [&](std::vector<BriefNode *> &nodes) {
+    for (auto *&n : nodes) {
+      if (n == dst_node) n = src_node;
+    }
+  };
+  // Change all the dst inputs and outputs corresponding inlink and
+  // outlink to the src node.
+  for (auto *node : src_node->inlinks) {
+    inlink_or_outlink_cleaner(node->outlinks);
+  }
+
+  for (auto *node : src_node->outlinks) {
+    inlink_or_outlink_cleaner(node->inlinks);
+  }
+}
+
+// FlexibleDFS
+// Iterative depth-first traversal starting from the nodes in `source`.
+// If reverse is true, edges are followed backwards (inlinks instead of
+// outlinks).
+// If enter is set, enter(node) is called when a node is first reached, before
+// any node reachable from it. If leave is set, leave(node) is called after the
+// traversal below node is done. Either returning false stops the whole search.
+void FlexibleDFS(const std::vector<BriefNode *> &source, bool reverse,
+                 const std::function<bool(const BriefNode *)> &enter,
+                 const std::function<bool(const BriefNode *)> &leave) {
+  // A stack entry is either a node still to be entered (leave == false) or
+  // the marker that triggers leave() for an already entered node.
+  struct FNode {
+    const BriefNode *node;
+    bool leave;
+  };
+
+  std::vector<FNode> stack;
+  for (auto &node : source) stack.push_back(FNode{node, false});
+  std::unordered_set<const BriefNode *> visited;
+  while (!stack.empty()) {
+    auto fnode = stack.back();
+    stack.pop_back();
+
+    if (fnode.leave) {
+      if (leave && !leave(fnode.node)) return;
+      continue;
+    }
+    if (!visited.insert(fnode.node).second) continue;
+
+    if (enter && !enter(fnode.node)) return;
+
+    if (leave) stack.push_back(FNode{fnode.node, true});
+    // Bind by reference: the link vector does not need to be copied.
+    const std::vector<BriefNode *> &iter_nodes =
+        reverse ? fnode.node->inlinks : fnode.node->outlinks;
+    for (const BriefNode *node : iter_nodes) {
+      if (!visited.count(node)) stack.push_back(FNode{node, false});
+    }
+  }
+}
+
 std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
+  // Run the Extract algorithm to find all subgraphs.
  std::vector<Node *> marked_nodes;
+  //  We use brief_node_map to represent the original graph in order to avoid
+  //  changing the original graph.
+  std::unordered_map<int, BriefNode *> brief_node_map;
+
  for (auto &node : GraphTraits<DataFlowGraph>(*graph_).nodes_in_TS()) {
+    brief_node_map[node.id()] = new BriefNode(&node);
    if (node.attr(kMarkerAttrName).Bool()) {
      marked_nodes.push_back(&node);
    }
  }
+
  // extract sub-graphs in the marked node set, use Union Find algorithm.
  node_map_t node_map;  // id to ptr
  for (auto *n : marked_nodes) {
@@ -88,11 +216,73 @@ std::vector<std::vector<Node *>> SubGraphSplitter::ExtractSubGraphs() {
    n->attr(kUnionFindParent).Int32() = n->id();
    node_map[n->id()] = n;
  }
-  std::unordered_set<Node *> visited;
-  for (auto *n : marked_nodes) {
-    for (auto *out : n->outlinks) {
-      if (node_map.count(out->id())) {
-        UnionFindCombine(node_map, n->id(), out->id());
+
+  // Link the brief nodes so they mirror each original node's inlinks/outlinks.
+  for (auto &itr : brief_node_map) {
+    for (Node *node : itr.second->node->inlinks) {
+      itr.second->inlinks.push_back(brief_node_map[node->id()]);
+    }
+
+    for (Node *node : itr.second->node->outlinks) {
+      itr.second->outlinks.push_back(brief_node_map[node->id()]);
+    }
+  }
+
+  for (auto &itr : brief_node_map) {
+    BriefNode *brief_node = itr.second;
+
+    if (!brief_node->node->attr(kMarkerAttrName).Bool()) {
+      VLOG(4) << brief_node->node->id() << " node not a trt candicate.";
+      continue;
+    }
+
+    //  Our algorithm must guarantee that:
+    //  1. The graph is always a directed acyclic graph (DAG).
+    //  2. If there is a path in the subgraph from X to Y (X and Y are both
+    //  nodes in the subgraph), then all paths from X to Y are in the
+    //  subgraph.
+    //
+    //  In order to achieve the above guarantee.
+    //  For adjacent nodes src -> dst.
+    //  1. Get all dst input nodes except src.
+    //  2. Reverse DFS from those input nodes
+    //  3. If there is a path from input nodes to src,
+    //  then the src and dst nodes can not be fused into one node,
+    //  otherwise it can be done.
+
+    while (true) {
+      std::unordered_set<BriefNode *> contract_nodes;
+      for (auto *out : brief_node->outlinks) {
+        // must be a TRT candidate
+        if (!out->node->attr(kMarkerAttrName).Bool()) continue;
+        // get all dst input nodes except src.
+        std::vector<BriefNode *> source_nodes;
+        for (auto *n : out->inlinks) {
+          if (n != brief_node) {
+            source_nodes.push_back(n);
+          }
+        }
+
+        // Reverse DFS from the source_nodes.
+        bool have_excess_path = false;
+        FlexibleDFS(source_nodes, true, nullptr,
+                    [&have_excess_path, brief_node](const BriefNode *n) {
+                      if (n == brief_node) {
+                        have_excess_path = true;
+                        return false;
+                      }
+                      return true;
+                    });
+        if (have_excess_path) continue;
+        contract_nodes.insert(out);
+      }
+      if (contract_nodes.empty()) break;
+
+      for (auto dst_node : contract_nodes) {
+        UnionFindCombine(node_map, brief_node->node->id(),
+                         dst_node->node->id());
+        UnionContractedNodes(brief_node_map, brief_node->node->id(),
+                             dst_node->node->id());
      }
    }
  }
@@ -128,6 +318,7 @@ void SubGraphFuse::ReplaceNodesWithSubGraphs() {
    auto io = ExtractInputAndOutputOfSubGraph(subgraph);
    block_node->inlinks = std::move(io.first);
    block_node->outlinks = std::move(io.second);
+
    for (auto *node : subgraph) {
      // TODO(Superjomn) need a unified mechanism to treat deleted node in each
      // pass.

--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
@@ -82,7 +82,7 @@ TEST(SubGraphSplitter, Fuse) {

  // At least one nodes should be deleted.
  ASSERT_EQ(dfg.nodes.size(), count0 + 1);  // added a new FunctionBlock
-  ASSERT_EQ(6, count1);
+  ASSERT_EQ(11, count1);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -69,25 +69,4 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
    endfunction()
    anakin_target(inference_anakin_api)
    anakin_target(inference_anakin_api_shared)
-    if (WITH_TESTING)
-        # TODO(luotao): ANAKIN_MODLE_URL etc will move to demo ci later.
-        set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com")
-        set(ANAKIN_RNN_MODLE_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn.anakin2.model.bin")
-        set(ANAKIN_RNN_DATA_URL "${INFERENCE_URL}/anakin_test%2Fditu_rnn_data.txt")
-        execute_process(COMMAND bash -c "mkdir -p ${ANAKIN_SOURCE_DIR}")
-        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_MODLE_URL} -N")
-        execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_RNN_DATA_URL} -N")
-        if(WITH_GPU)
-            set(anakin_test_extra_deps dynload_cuda)
-            set(ANAKIN_MODLE_URL "${INFERENCE_URL}/mobilenet_v2.anakin.bin")
-            execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-certificate ${ANAKIN_MODLE_URL} -N")
-            cc_test(api_anakin_engine_tester SRCS api_anakin_engine_tester.cc 
-                    ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin
-                    DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
-        endif()
-        cc_test(api_anakin_engine_rnn_tester SRCS api_anakin_engine_rnn_tester.cc 
-                ARGS --model=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
-                     --datapath=${ANAKIN_SOURCE_DIR}/anakin_test%2Fditu_rnn_data.txt
-                DEPS inference_anakin_api_shared ${anakin_test_extra_deps} SERIAL)
-    endif(WITH_TESTING)
 endif()
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -77,6 +77,9 @@ bool AnalysisPredictor::Init(

  OptimizeInferenceProgram();
  ctx_ = executor_->Prepare(*inference_program_, 0);
+  if (config_._use_mkldnn) {
+    executor_->EnableMKLDNN(*inference_program_);
+  }

  VLOG(5) << "to create variables";
  PADDLE_ENFORCE(scope_.get());

--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include <glog/logging.h>
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"

 namespace paddle {

@@ -64,13 +64,15 @@ PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) {

 void PaddleBuf::Resize(size_t length) {
  // Only the owned memory can be reset, the external memory can't be changed.
-  if (length_ == length) return;
+  // The buffer only grows: a request that already fits is a no-op. When a
+  // larger block is allocated the previous contents are NOT copied over.
+  if (length_ >= length) return;
  if (memory_owned_) {
    Free();
-  }
-  data_ = new char[length];
+    // Allocate with new[] so the block pairs with the delete[] in Free() and
+    // with the owning constructor, which also uses new char[length]. Mixing
+    // malloc/free with new[]/delete[] on the same buffer is undefined.
+    data_ = new char[length];
    length_ = length;
    memory_owned_ = true;
+  } else {
+    PADDLE_THROW("The memory is allocated externally, can not Resized");
+  }
 }

 void PaddleBuf::Reset(void* data, size_t length) {
@@ -82,8 +84,8 @@ void PaddleBuf::Reset(void* data, size_t length) {

 void PaddleBuf::Free() {
+  // Releases the block only when this object owns it; external memory is
+  // left untouched.
  if (memory_owned_ && data_) {
-    assert(length_ > 0);
-    delete[] static_cast<char*>(data_);
+    PADDLE_ENFORCE_GT(length_, 0);
+    // The owned allocations visible in this change (Resize and the owning
+    // constructor) use new char[...], so release with delete[], not free().
+    delete[] static_cast<char*>(data_);
    data_ = nullptr;
    length_ = 0;
  }

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -106,6 +106,9 @@ bool NativePaddlePredictor::Init(
  }

  ctx_ = executor_->Prepare(*inference_program_, 0);
+  if (config_._use_mkldnn) {
+    executor_->EnableMKLDNN(*inference_program_);
+  }
  executor_->CreateVariables(*inference_program_,
                             sub_scope_ ? sub_scope_ : scope_.get(), 0);


--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
@@ -153,11 +153,21 @@ CreatePaddlePredictor<TensorRTConfig, PaddleEngineKind::kAutoMixedTensorRT>(
 }  // namespace paddle

 USE_TRT_CONVERTER(elementwise_add_weight);
+USE_TRT_CONVERTER(elementwise_add_tensor);
+USE_TRT_CONVERTER(elementwise_sub_tensor);
+USE_TRT_CONVERTER(elementwise_div_tensor);
+USE_TRT_CONVERTER(elementwise_mul_tensor);
+USE_TRT_CONVERTER(elementwise_max_tensor);
+USE_TRT_CONVERTER(elementwise_min_tensor);
+USE_TRT_CONVERTER(elementwise_pow_tensor);
 USE_TRT_CONVERTER(mul);
 USE_TRT_CONVERTER(conv2d);
 USE_TRT_CONVERTER(relu);
+USE_TRT_CONVERTER(sigmoid);
+USE_TRT_CONVERTER(tanh);
 USE_TRT_CONVERTER(fc);
 USE_TRT_CONVERTER(pool2d);
 USE_TRT_CONVERTER(softmax);
 USE_TRT_CONVERTER(batch_norm);
 USE_TRT_CONVERTER(concat);
+USE_TRT_CONVERTER(dropout);
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -123,10 +123,16 @@ std::string DescribeTensor(const PaddleTensor &tensor) {
 }

+// Logs one summary line with batch_size, repeat, num_threads, tid and latency.
+// When epoch > 1 a second line is logged with the sample count
+// (batch_size * epoch) and latency divided by that count.
 void PrintTime(int batch_size, int repeat, int num_threads, int tid,
-               double latency) {
+               double latency, int epoch = 1) {
  LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat
            << ", threads: " << num_threads << ", thread id: " << tid
            << ", latency: " << latency << "ms ======";
+  if (epoch > 1) {
+    int samples = batch_size * epoch;
+    LOG(INFO) << "====== sample number: " << samples
+              << ", average latency of each sample: " << latency / samples
+              << "ms ======";
+  }
 }

 }  // namespace inference

--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -45,7 +45,7 @@ class PaddleBuf {
  PaddleBuf(void* data, size_t length)
      : data_(data), length_(length), memory_owned_{false} {}
  // Own memory.
-  PaddleBuf(size_t length)
+  explicit PaddleBuf(size_t length)
      : data_(new char[length]), length_(length), memory_owned_(true) {}
  // Resize to `length` bytes.
  void Resize(size_t length);
@@ -121,6 +121,8 @@ struct NativeConfig : public PaddlePredictor::Config {
  bool use_gpu{false};
  int device{0};
  float fraction_of_gpu_memory{-1.f};  // Negative to notify initialization.
+  // NOTE: Do NOT use this; it exists only for internal tests and will be
+  // removed later.
+  bool _use_mkldnn{false};
  // Specify the variable's name of each input.
  bool specify_input_name{false};


--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
 # Add TRT tests
 nv_library(tensorrt_converter
  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
-batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc
+batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc
  DEPS tensorrt_engine operator scope framework_proto op_registry)

 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@@ -24,6 +24,8 @@ nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
 nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
-
 nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
        DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)
+
+nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine dropout_op SERIAL)
--- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc
@@ -19,28 +19,64 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-class ReluOpConverter : public OpConverter {
+// Shared converter for the fluid activation ops listed in `ops`. A concrete
+// subclass only sets op_type_; the conversion adds one TensorRT activation
+// layer whose type is looked up from that name.
+class ActivationOpConverter : public OpConverter {
 public:
-  ReluOpConverter() {}
+  ActivationOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
                  const framework::Scope& scope, bool test_mode) override {
    // Here the two nullptr looks strange, that's because the
    // framework::OpDesc's constructor is strange.
    framework::OpDesc op_desc(op, nullptr);
-    LOG(INFO) << "convert a fluid relu op to tensorrt activation layer whose "
-                 "type is Relu";
+    LOG(INFO)
+        << "convert a fluid Activation op to tensorrt activation layer whose "
+           "type is "
+        << op_type_;
    const nvinfer1::ITensor* input_tensor =
        engine_->GetITensor(op_desc.Input("X")[0]);
+
+    // Map the fluid op name to the TensorRT activation type; an op_type_
+    // that is not in the table raises via PADDLE_THROW.
+    auto op_pair = ops.find(op_type_);
+    if (op_pair == ops.end()) {
+      PADDLE_THROW("Wrong activation op type!");
+    }
+
    nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER(
        engine_, Activation, *const_cast<nvinfer1::ITensor*>(input_tensor),
-        nvinfer1::ActivationType::kRELU);
+        op_pair->second);
    auto output_name = op_desc.Output("Out")[0];
+    // Name both the layer and its output tensor after the fluid output.
+    layer->setName((op_type_ + " (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.
      engine_->DeclareOutput(output_name);
    }
  }
+
+ protected:
+  // Fluid op name handled by this instance; set by the subclass constructor.
+  std::string op_type_;
+  // Lookup table from fluid op name to TensorRT activation type.
+  static const std::unordered_map<std::string, nvinfer1::ActivationType> ops;
+};
+
+// Activation ops that ActivationOpConverter can translate, keyed by the name
+// a subclass stores in op_type_.
+const std::unordered_map<std::string, nvinfer1::ActivationType>
+    ActivationOpConverter::ops = {
+        {"relu", nvinfer1::ActivationType::kRELU},
+        {"sigmoid", nvinfer1::ActivationType::kSIGMOID},
+        {"tanh", nvinfer1::ActivationType::kTANH},
+};
+
+// Converter for the "relu" op.
+class ReluOpConverter : public ActivationOpConverter {
+ public:
+  ReluOpConverter() { op_type_ = "relu"; }
+};
+
+// Converter for the "sigmoid" op.
+class SigmoidOpConverter : public ActivationOpConverter {
+ public:
+  SigmoidOpConverter() { op_type_ = "sigmoid"; }
+};
+
+// Converter for the "tanh" op.
+class TanhOpConverter : public ActivationOpConverter {
+ public:
+  TanhOpConverter() { op_type_ = "tanh"; }
 };

 }  // namespace tensorrt
@@ -48,3 +84,5 @@ class ReluOpConverter : public OpConverter {
 }  // namespace paddle

 REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter);
+REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter);
+REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc
@@ -116,6 +116,8 @@ class BatchNormOpConverter : public OpConverter {
                             scale_weights.get(), power_weights.get());

    auto output_name = op_desc.Output("Y").front();
+    layer->setName(("batch_norm (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
    engine_->weight_map[op_desc.Input("Bias").front()] =
        std::move(combile_bias_tensor);
    engine_->weight_map[op_desc.Input("Scale").front()] =

--- a/paddle/fluid/inference/tensorrt/convert/concat_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc
@@ -42,6 +42,8 @@ class ConcatOpConverter : public OpConverter {
    axis = axis - 1;  // Remove batch dim
    layer->setAxis(axis);
    auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("concat (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.

--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -78,8 +78,10 @@ class Conv2dOpConverter : public OpConverter {
    layer->setNbGroups(groups);

    auto output_name = op_desc.Output("Output").front();
+    layer->setName(("conv2d (Output: " + output_name + ")").c_str());
    engine_->weight_map[op_desc.Input("Filter").front()] =
        std::move(weight_tensor);
+    layer->getOutput(0)->setName(output_name.c_str());
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {
      engine_->DeclareOutput(output_name);

--- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * DropoutOp converter. The op is mapped to a TensorRT Scale layer with a
+ * single uniform factor, 1 - dropout_prob; no shift or power weights are
+ * used.
+ */
+class DropoutOpConverter : public OpConverter {
+ public:
+  // Adds the Scale layer for `op` to engine_ and registers its output tensor
+  // under the op's "Out" name. When test_mode is true the output is also
+  // declared as an engine output.
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4) << "convert a fluid dropout op to tensorrt dropout layer";
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    float dropout_prob = boost::get<float>(op_desc.GetAttr("dropout_prob"));
+    auto output_name = op_desc.Output("Out")[0];
+
+    // One-element CPU tensor that holds the scale factor.
+    std::unique_ptr<framework::LoDTensor> weight_tensor(
+        new framework::LoDTensor());
+    weight_tensor->Resize(framework::make_ddim({1}));
+    auto* weight_data =
+        weight_tensor->mutable_data<float>(platform::CPUPlace());
+    weight_data[0] = 1 - dropout_prob;
+
+    TensorRTEngine::Weight scale_weights{
+        nvinfer1::DataType::kFLOAT, static_cast<void*>(weight_data),
+        weight_tensor->memory_size() / sizeof(float)};
+    TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+    TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
+                                         0};
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Scale, *const_cast<nvinfer1::ITensor*>(input1),
+        nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(),
+        power_weights.get());
+
+    // Ownership of the tensor moves into the engine's weight_map; presumably
+    // this keeps the buffer referenced by scale_weights alive after return.
+    engine_->weight_map[output_name + "_dropout"] = std::move(weight_tensor);
+    layer->setName(("dropout (Output: " + output_name + ")").c_str());
+    // Name the output tensor as well, as the other converters in this change
+    // do.
+    layer->getOutput(0)->setName(output_name.c_str());
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(dropout);
+REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter);
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -89,6 +89,8 @@ class ElementwiseWeightOpConverter : public OpConverter {
        shift_weights.get(), scale_weights.get(), power_weights.get());
    auto output_name = op_desc.Output("Out")[0];

+    layer->setName(("elementwise_add (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
    engine_->weight_map[op_desc.Input("Y").front()] = std::move(weight_tensor);
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {  // the test framework can not determine which is the
@@ -137,6 +139,8 @@ class ElementwiseTensorOpConverter : public OpConverter {
        *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);

    auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("elementwise (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {  // the test framework can not determine which is the
                      // output, so place the declaration inside.

--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -107,6 +107,8 @@ class FcOpConverter : public OpConverter {
                                       n_output, tmp_weight.get(), bias.get());

    auto output_name = op_desc.Output("Out").front();
+    layer->setName(("fc (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
    engine_->SetITensor(output_name, layer->getOutput(0));
    engine_->weight_map[op_desc.Input("Y").front()] = std::move(tmp);
    if (test_mode) {

--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -72,6 +72,8 @@ class Pool2dOpConverter : public OpConverter {
    layer->setPadding(nv_paddings);

    auto output_name = op_desc.Output("Out")[0];
+    layer->setName(("pool2d (Output: " + output_name + ")").c_str());
+    layer->getOutput(0)->setName(output_name.c_str());
    engine_->SetITensor(output_name, layer->getOutput(0));
    if (test_mode) {
      engine_->DeclareOutput(output_name);

--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -20,18 +20,18 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {

-TEST(ReluOpConverter, main) {
+void test_activation(std::string act_type) {
  framework::Scope scope;
  std::unordered_set<std::string> parameters;
  TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("relu-X", nvinfer1::Dims2(10, 6));
-  validator.DeclOutputVar("relu-Out", nvinfer1::Dims2(10, 6));
+  validator.DeclInputVar("act-X", nvinfer1::Dims2(10, 6));
+  validator.DeclOutputVar("act-Out", nvinfer1::Dims2(10, 6));

  // Prepare Op description
  framework::OpDesc desc;
-  desc.SetType("relu");
-  desc.SetInput("X", {"relu-X"});
-  desc.SetOutput("Out", {"relu-Out"});
+  desc.SetType(act_type);
+  desc.SetInput("X", {"act-X"});
+  desc.SetOutput("Out", {"act-Out"});

  LOG(INFO) << "set OP";
  validator.SetOp(*desc.Proto());
@@ -40,8 +40,16 @@ TEST(ReluOpConverter, main) {
  validator.Execute(5);
 }

+TEST(ReluOpConverter, main) { test_activation("relu"); }
+
+TEST(SigmoidOpConverter, main) { test_activation("sigmoid"); }
+
+TEST(TanhOpConverter, main) { test_activation("tanh"); }
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle

 USE_OP(relu);
+USE_OP(sigmoid);
+USE_OP(tanh);
--- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+// Builds one fluid dropout op (is_test = 1, dropout_prob = 0.4), hands it to
+// the converter validator and executes it with a batch size of 8. "mask-Out"
+// is passed to Execute() as a neglected output.
+TEST(DropoutOpConverter, main) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  TRTConvertValidation validator(8, parameters, scope, 1000);
+
+  // Fluid-side shape of the input, and the TensorRT dims shared by the input
+  // and both outputs.
+  std::vector<int> fluid_shape{8, 10};
+  const nvinfer1::DimsCHW trt_dims(10, 1, 1);
+  validator.DeclInputVar("dropout-X", fluid_shape, trt_dims);
+  validator.DeclOutputVar("dropout-Out", trt_dims);
+  validator.DeclOutputVar("mask-Out", trt_dims);
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("dropout");
+  desc.SetInput("X", {"dropout-X"});
+  desc.SetOutput("Mask", {"mask-Out"});
+  desc.SetOutput("Out", {"dropout-Out"});
+
+  int is_test_attr = 1;
+  float prob_attr = 0.4;
+  desc.SetAttr("is_test", is_test_attr);
+  desc.SetAttr("dropout_prob", prob_attr);
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  std::unordered_set<std::string> skipped_outputs = {"mask-Out"};
+  validator.Execute(8, skipped_outputs);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(dropout);
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
-function (inference_download_and_uncompress install_dir url)
-    get_filename_component(filename ${url} NAME)
-    message(STATUS "Download inference test stuff ${filename} from ${url}")
+# Base URL used by download_model_and_data and the anakin downloads.
+set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com")
+set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
+    "A path setting inference demo download directories.")
+# Dependencies passed as EXTRA_DEPS by inference_analysis_api_test.
+set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
+# inference_download(install_dir url filename)
+# Creates install_dir and runs `wget -q ${url}/${filename}` inside it. The
+# commands run through execute_process, and their results are not checked.
+function (inference_download install_dir url filename)
+    message(STATUS "Download inference test stuff from ${url}/${filename}")
    execute_process(COMMAND bash -c "mkdir -p ${install_dir}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}")
-    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
+    execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}")
    message(STATUS "finish downloading ${filename}")
-endfunction(inference_download_and_uncompress)
+endfunction()
+
+# inference_download_and_uncompress(install_dir url filename)
+# Downloads ${url}/${filename} into install_dir via inference_download, then
+# unpacks it there with `tar xzf`.
+function (inference_download_and_uncompress install_dir url filename)
+    inference_download(${install_dir} ${url} ${filename})
+    execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}")
+endfunction()

-function(download_model_and_data install_dir model_url data_url)
-    if (NOT EXISTS ${install_dir} AND WITH_INFERENCE)
-        inference_download_and_uncompress(${install_dir} ${model_url})
-        inference_download_and_uncompress(${install_dir} ${data_url})
+# download_model_and_data(install_dir model_name data_name)
+# Fetches and unpacks the model and data archives from ${INFERENCE_URL}.
+# Nothing is downloaded when install_dir already exists.
+function(download_model_and_data install_dir model_name data_name)
+    if (NOT EXISTS ${install_dir})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
+        inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_name})
    endif()
 endfunction()

+# inference_analysis_api_test(target install_dir filename)
+# Registers `target` through inference_analysis_test with `filename` as its
+# source, ${INFERENCE_EXTRA_DEPS} as extra dependencies, and the model/data
+# paths under install_dir passed as --infer_model / --infer_data.
+function(inference_analysis_api_test target install_dir filename)
+    inference_analysis_test(${target} SRCS ${filename}
+        EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
+        ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
+endfunction()
+
 # RNN1
-set(RNN1_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fmodel.tar.gz")
-set(RNN1_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/rnn1%2Fdata.txt.tar.gz")
-set(RNN1_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/rnn1")
-download_model_and_data(${RNN1_INSTALL_DIR} ${RNN1_MODEL_URL} ${RNN1_DATA_URL})
-inference_analysis_test(test_analyzer_rnn1 SRCS analyzer_rnn1_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
-    ARGS --infer_model=${RNN1_INSTALL_DIR}/model
-         --infer_data=${RNN1_INSTALL_DIR}/data.txt)
+if(NOT APPLE)
+    set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
+    download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz")
+    inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc)
+else()
+    # TODO: fix this test on MACOS, the reason is that
+    # fusion_seqexpand_concat_fc_op is not supported on MACOS
+    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_analyzer_rnn1")
+endif()
+
+# RNN2
+set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
+download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)

 # chinese_ner
-set(CHINESE_NER_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner_model.tar.gz")
-set(CHINESE_NER_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/chinese_ner-data.txt.tar.gz")
-set(CHINESE_NER_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/chinese_ner")
-download_model_and_data(${CHINESE_NER_INSTALL_DIR} ${CHINESE_NER_MODEL_URL} ${CHINESE_NER_DATA_URL})
-inference_analysis_test(test_analyzer_ner SRCS analyzer_ner_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
-    ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model
-        --infer_data=${CHINESE_NER_INSTALL_DIR}/data.txt)
+set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
+download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc)

 # lac
-set(LAC_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/lac_model.tar.gz")
-set(LAC_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/lac_data.txt.tar.gz")
-set(LAC_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/lac")
-download_model_and_data(${LAC_INSTALL_DIR} ${LAC_MODEL_URL} ${LAC_DATA_URL})
-inference_analysis_test(test_analyzer_lac SRCS analyzer_lac_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor
-    ARGS --infer_model=${LAC_INSTALL_DIR}/model
-        --infer_data=${LAC_INSTALL_DIR}/data.txt)
+set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac")
+download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc)

 # text_classification
-set(TEXT_CLASSIFICATION_MODEL_URL "http://paddle-inference-dist.bj.bcebos.com/text-classification-Senta.tar.gz")
-set(TEXT_CLASSIFICATION_DATA_URL "http://paddle-inference-dist.bj.bcebos.com/text_classification_data.txt.tar.gz")
-set(TEXT_CLASSIFICATION_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo/text_classification")
-download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} ${TEXT_CLASSIFICATION_MODEL_URL} ${TEXT_CLASSIFICATION_DATA_URL})
-inference_analysis_test(test_text_classification SRCS analyzer_text_classification_tester.cc
-    EXTRA_DEPS paddle_inference_api paddle_fluid_api analysis_predictor
-    ARGS --infer_model=${TEXT_CLASSIFICATION_INSTALL_DIR}/text-classification-Senta
-         --infer_data=${TEXT_CLASSIFICATION_INSTALL_DIR}/data.txt
-         --topn=1 # Just run top 1 batch.
-         )
+set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification")
+download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz")
+inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc)
+
+# ocr
+set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
+if (NOT EXISTS ${OCR_INSTALL_DIR})
+    # No trailing slash on the base URL: inference_download joins it to the
+    # file name as "${url}/${filename}", so a trailing slash would put "//"
+    # in the request path.
+    inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com" "inference-vis-demos%2Focr.tar.gz")
+endif()
+inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
+
+# anakin
+if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
+   # anakin rnn1
+   set(ANAKIN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/anakin")
+   set(ANAKIN_RNN1_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/rnn1")
+   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn.anakin2.model.bin")
+   inference_download(${ANAKIN_RNN1_INSTALL_DIR} ${INFERENCE_URL} "anakin_test%2Fditu_rnn_data.txt")
+   cc_test(test_anakin_rnn1 SRCS anakin_rnn1_tester.cc 
+           ARGS --model=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn.anakin2.model.bin 
+                --datapath=${ANAKIN_RNN1_INSTALL_DIR}/anakin_test%2Fditu_rnn_data.txt
+           DEPS inference_anakin_api_shared SERIAL)
+   # anakin mobilenet
+   if(WITH_GPU)
+       set(ANAKIN_MOBILENET_INSTALL_DIR "${ANAKIN_INSTALL_DIR}/mobilenet")
+       inference_download(${ANAKIN_MOBILENET_INSTALL_DIR} ${INFERENCE_URL} "mobilenet_v2.anakin.bin")
+       cc_test(test_anakin_mobilenet SRCS anakin_mobilenet_tester.cc 
+               ARGS --model=${ANAKIN_MOBILENET_INSTALL_DIR}/mobilenet_v2.anakin.bin
+               DEPS inference_anakin_api_shared dynload_cuda SERIAL)
+   endif()
+endif()
--- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc
--- a/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
+++ b/paddle/fluid/inference/api/api_anakin_engine_rnn_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
@@ -12,21 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_string(infer_model, "", "model path for LAC");
-DEFINE_string(infer_data, "", "data file for LAC");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(burning, 0, "Burning before repeat.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@@ -124,48 +110,38 @@ const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,

 void TestLACPrediction(const std::string &model_path,
                       const std::string &data_file, const int batch_size,
-                       const int repeat, bool test_all_data,
-                       bool use_analysis = false) {
-  NativeConfig config;
-  config.model_dir = model_path;
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  std::vector<PaddleTensor> input_slots, outputs_slots;
-  DataRecord data(data_file, batch_size);
-  GetOneBatch(&input_slots, &data, batch_size);
-  std::unique_ptr<PaddlePredictor> predictor;
-  if (use_analysis) {
+                       const int repeat, bool use_analysis = false) {
  AnalysisConfig cfg;
  cfg.model_dir = model_path;
  cfg.use_gpu = false;
  cfg.device = 0;
  cfg.specify_input_name = true;
  cfg.enable_ir_optim = true;
+
+  std::vector<PaddleTensor> input_slots, outputs_slots;
+  DataRecord data(data_file, batch_size);
+  GetOneBatch(&input_slots, &data, batch_size);
+  std::unique_ptr<PaddlePredictor> predictor;
+  if (use_analysis) {
    predictor =
        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
  } else {
    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
  }
  for (int i = 0; i < FLAGS_burning; i++) {
    predictor->Run(input_slots, &outputs_slots);
  }
  Timer timer;
-  if (test_all_data) {
-    double sum = 0;
-    LOG(INFO) << "Total number of samples: " << data.datasets.size();
-    for (int i = 0; i < repeat; i++) {
+  if (FLAGS_test_all_data) {
+    LOG(INFO) << "test all data";
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
      GetOneBatch(&input_slots, &data, batch_size);
-        timer.tic();
-        predictor->Run(input_slots, &outputs_slots);
-        sum += timer.toc();
-      }
+      input_slots_all.emplace_back(input_slots);
    }
-    PrintTime(batch_size, repeat, 1, 0, sum / repeat);
-    LOG(INFO) << "Average latency of each sample: "
-              << sum / repeat / data.datasets.size() << " ms";
+    LOG(INFO) << "total number of samples: " << data.datasets.size();
+    TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
    return;
  }
  timer.tic();
@@ -190,19 +166,10 @@ void TestLACPrediction(const std::string &model_path,
  if (use_analysis) {
    // run once for comparion as reference
    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
    std::vector<PaddleTensor> ref_outputs_slots;
    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    EXPECT_EQ(ref_outputs_slots.size(), outputs_slots.size());
-    auto &ref_out = ref_outputs_slots[0];
-    size_t ref_size =
-        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
-                        [](int a, int b) { return a * b; });
-    EXPECT_EQ(size, ref_size);
-    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-    for (size_t i = 0; i < size; ++i) {
-      EXPECT_EQ(pdata_ref[i], pdata[i]);
-    }
+    CompareResult(ref_outputs_slots, outputs_slots);

    AnalysisPredictor *analysis_predictor =
        dynamic_cast<AnalysisPredictor *>(predictor.get());
@@ -231,13 +198,13 @@ void TestLACPrediction(const std::string &model_path,
 TEST(Analyzer_LAC, native) {
  LOG(INFO) << "LAC with native";
  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat, FLAGS_test_all_data);
+                    FLAGS_repeat);
 }

 TEST(Analyzer_LAC, analysis) {
  LOG(INFO) << "LAC with analysis";
  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat, FLAGS_test_all_data, true);
+                    FLAGS_repeat, true);
 }

 }  // namespace analysis

--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
@@ -12,20 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gtest/gtest.h>
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/platform/profiler.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@@ -113,17 +100,6 @@ const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
                                       48, 39, 38, 16, 25};

 void TestChineseNERPrediction(bool use_analysis) {
-  NativeConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-
-  std::vector<PaddleTensor> input_slots, outputs;
-  std::unique_ptr<PaddlePredictor> predictor;
-  Timer timer;
-  if (use_analysis) {
  AnalysisConfig cfg;
  cfg.prog_file = FLAGS_infer_model + "/__model__";
  cfg.param_file = FLAGS_infer_model + "/param";
@@ -131,32 +107,28 @@ void TestChineseNERPrediction(bool use_analysis) {
  cfg.device = 0;
  cfg.specify_input_name = true;
  cfg.enable_ir_optim = true;
+
+  std::vector<PaddleTensor> input_slots, outputs;
+  std::unique_ptr<PaddlePredictor> predictor;
+  Timer timer;
+  if (use_analysis) {
    predictor =
        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
  } else {
    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
  }

  if (FLAGS_test_all_data) {
    LOG(INFO) << "test all data";
-    double sum = 0;
-    size_t num_samples;
-    for (int i = 0; i < FLAGS_repeat; i++) {
    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-      // Just one batch, the num_samples remains the same.
-      num_samples = data.num_samples;
-      for (size_t bid = 0; bid < num_samples / FLAGS_batch_size; ++bid) {
+    std::vector<std::vector<PaddleTensor>> input_slots_all;
+    for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
      PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-        timer.tic();
-        predictor->Run(input_slots, &outputs);
-        sum += timer.toc();
-      }
+      input_slots_all.emplace_back(input_slots);
    }
-    LOG(INFO) << "total number of samples: " << num_samples;
-    PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
-    LOG(INFO) << "average latency of each sample: "
-              << sum / FLAGS_repeat / num_samples;
+    LOG(INFO) << "total number of samples: " << data.num_samples;
+    TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
    return;
  }
  // Prepare inputs.
@@ -182,19 +154,10 @@ void TestChineseNERPrediction(bool use_analysis) {
  if (use_analysis) {
    // run once for comparion as reference
    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
    std::vector<PaddleTensor> ref_outputs_slots;
    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    EXPECT_EQ(ref_outputs_slots.size(), outputs.size());
-    auto &ref_out = ref_outputs_slots[0];
-    size_t ref_size =
-        std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1,
-                        [](int a, int b) { return a * b; });
-    EXPECT_EQ(size, ref_size);
-    int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
-    for (size_t i = 0; i < size; ++i) {
-      EXPECT_EQ(pdata_ref[i], result[i]);
-    }
+    CompareResult(ref_outputs_slots, outputs);

    AnalysisPredictor *analysis_predictor =
        dynamic_cast<AnalysisPredictor *>(predictor.get());

--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -12,24 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-
-#include <google/protobuf/text_format.h>
-#include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 10, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@@ -164,26 +147,6 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
  }
 }

-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<PaddleTensor> &base_outputs) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  PADDLE_ENFORCE_EQ(outputs.size(), base_outputs.size());
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    auto &base_out = base_outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    size_t size1 = std::accumulate(base_out.shape.begin(), base_out.shape.end(),
-                                   1, [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_EQ(size, size1);
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    float *base_data = static_cast<float *>(base_out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_data[i], 1e-3);
-    }
-  }
-}
 // Test with a really complicate model.
 void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
  AnalysisConfig config;
@@ -198,7 +161,6 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
  config.ir_passes.clear();  // Do not exclude any pass.

  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;

  auto base_predictor =
      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
@@ -213,45 +175,14 @@ void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {

  base_predictor->Run(input_slots, &base_outputs);

+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  input_slots_all.emplace_back(input_slots);
  if (num_threads == 1) {
-    // Prepare inputs.
-    Timer timer;
-    timer.tic();
-    for (int i = 0; i < num_times; i++) {
-      predictor->Run(input_slots, &outputs);
-    }
-    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
+    TestOneThreadPrediction(config, input_slots_all, &outputs);
    CompareResult(outputs, base_outputs);
  } else {
-    std::vector<std::thread> threads;
-    std::vector<std::unique_ptr<PaddlePredictor>> predictors;
-    // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
-    // because AttentionLSTM's hard code nodeid will be damanged.
-    for (int tid = 0; tid < num_threads; ++tid) {
-      predictors.emplace_back(
-          CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-              config));
-    }
-    for (int tid = 0; tid < num_threads; ++tid) {
-      threads.emplace_back([&, tid]() {
-        // Each thread should have local input_slots and outputs.
-        std::vector<PaddleTensor> input_slots;
-        DataRecord data(FLAGS_infer_data, batch_size);
-        PrepareInputs(&input_slots, &data, batch_size);
-        std::vector<PaddleTensor> outputs;
-        Timer timer;
-        timer.tic();
-        for (int i = 0; i < num_times; i++) {
-          predictors[tid]->Run(input_slots, &outputs);
-        }
-        PrintTime(batch_size, num_times, num_threads, tid,
-                  timer.toc() / num_times);
-        CompareResult(outputs, base_outputs);
-      });
-    }
-    for (int i = 0; i < num_threads; ++i) {
-      threads[i].join();
-    }
+    // only return the output of first thread
+    TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
  }

  if (use_analysis && activate_ir) {
@@ -293,8 +224,7 @@ TEST(Analyzer, RNN_tests) {
    // Directly infer with the original model.
    TestRNN1Prediction(false, false, i);
    // Inference with the original model with the analysis turned on, the
-    // analysis
-    // module will transform the program to a data flow graph.
+    // analysis module will transform the program to a data flow graph.
    TestRNN1Prediction(true, false, i);
    // Inference with analysis and IR. The IR module will fuse some large
    // kernels.

--- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/analysis/analyzer.h"
+
+#include <google/protobuf/text_format.h>
+#include <gtest/gtest.h>
+#include <thread>  // NOLINT
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+
+// Command-line flags (gflags) for this tester.
+// infer_model is used as a directory: "/__model__" and "/param" are appended
+// to it when the predictor config is built.
+DEFINE_string(infer_model, "", "model path");
+// Path of the text file parsed by DataRecord::Load.
+DEFINE_string(infer_data, "", "data path");
+DEFINE_int32(batch_size, 1, "batch size.");
+// Number of timed Run() calls per predictor.
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+// NOTE(review): num_threads is defined here but never read in this file.
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+
+namespace paddle {
+namespace inference {
+
+using namespace framework;  // NOLINT
+
+// Holds the RNN2 test data set and hands it out one batch at a time.
+//
+// The data file is read as alternating lines: odd lines carry the features of
+// one sample, even lines carry the expected results (see Load).
+struct DataRecord {
+  // One entry per feature line; each entry is a list of feature groups, every
+  // group being the concatenation of 11 consecutive steps of that line.
+  std::vector<std::vector<std::vector<float>>> link_step_data_all;
+  // LoD offsets of a batch; only filled in the record returned by NextBatch.
+  std::vector<size_t> lod;
+  // Feature groups of a batch; only filled in the record returned by
+  // NextBatch.
+  std::vector<std::vector<float>> rnn_link_data;
+  // Expected results of all result lines, concatenated in file order.
+  std::vector<float> result_data;
+  // Index of the first sample of the next batch.
+  size_t batch_iter{0};
+  size_t batch_size{1};
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  // Returns the next batch_size samples as a new DataRecord with
+  // link_step_data_all, rnn_link_data and lod filled. When fewer than
+  // batch_size samples remain, the returned record is empty. The iterator is
+  // advanced in both cases.
+  DataRecord NextBatch() {
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE skip the final batch, if no enough data is provided.
+    if (batch_end <= link_step_data_all.size()) {
+      data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter,
+                                     link_step_data_all.begin() + batch_end);
+      // Prepare LoDs
+      data.lod.push_back(0);
+      CHECK(!data.link_step_data_all.empty()) << "empty";
+      for (size_t j = 0; j < data.link_step_data_all.size(); j++) {
+        for (const auto &d : data.link_step_data_all[j]) {
+          data.rnn_link_data.push_back(d);
+          // calculate lod: every feature group advances the offset by 11.
+          data.lod.push_back(data.lod.back() + 11);
+        }
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
+  // Parses the data file at `path` into link_step_data_all and result_data.
+  //
+  // Every line is split on ':' and only the second field is used. Aborts (via
+  // CHECK) when the file cannot be opened or a line has no second field,
+  // instead of silently producing an empty data set or indexing out of range.
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    CHECK(file.is_open()) << "failed to open data file: " << path;
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector<std::string> data;
+      split(line, ':', &data);
+      CHECK_GE(data.size(), 2UL)
+          << "malformed line " << num_lines << " in " << path
+          << ", expected <name>:<values>";
+      if (num_lines % 2) {  // feature
+        std::vector<std::string> feature_data;
+        split(data[1], ' ', &feature_data);
+        std::vector<std::vector<float>> link_step_data;
+        int feature_count = 1;
+        std::vector<float> feature;
+        for (auto &step_data : feature_data) {
+          std::vector<float> tmp;
+          split_to_float(step_data, ',', &tmp);
+          feature.insert(feature.end(), tmp.begin(), tmp.end());
+          if (feature_count % 11 == 0) {  // each sample has 11 features
+            link_step_data.push_back(feature);
+            feature.clear();
+          }
+          feature_count++;
+        }
+        link_step_data_all.push_back(std::move(link_step_data));
+      } else {  // result
+        std::vector<float> tmp;
+        split_to_float(data[1], ',', &tmp);
+        result_data.insert(result_data.end(), tmp.begin(), tmp.end());
+      }
+    }
+  }
+};
+// Pulls the next batch out of `data` and turns it into the single "feed"
+// input tensor expected by the model, replacing the contents of
+// `input_slots`. The `batch_size` argument is not used here; the batch size
+// is the one `data` was constructed with.
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                   int batch_size) {
+  auto batch = data->NextBatch();
+  const int num_tokens = batch.rnn_link_data.size();
+
+  PaddleTensor feed;
+  feed.name = "feed";
+  feed.dtype = PaddleDType::FLOAT32;
+  // each token has 11 features, each feature's dim is 54.
+  feed.shape = std::vector<int>({num_tokens * 11, 54});
+  feed.lod.assign({batch.lod});
+  TensorAssignData<float>(&feed, batch.rnn_link_data);
+
+  // The model takes exactly one input slot.
+  input_slots->assign({feed});
+}
+
+// Checks every output tensor, interpreted as float data, against the leading
+// elements of `base_result` with an absolute tolerance of 1e-3.
+//
+// Enforces that there is at least one output, that each output is non-empty
+// and that `base_result` holds at least as many values as the output being
+// compared, so the reference vector is never read out of range.
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<float> &base_result) {
+  PADDLE_ENFORCE_GT(outputs.size(), 0);
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    // Element count is the product of all dimensions.
+    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+    PADDLE_ENFORCE_GT(size, 0);
+    PADDLE_ENFORCE_LE(size, base_result.size());
+    float *data = static_cast<float *>(out.data.data());
+    // Use a distinct index so the outer loop variable is not shadowed.
+    for (size_t j = 0; j < size; j++) {
+      EXPECT_NEAR(data[j], base_result[j], 1e-3);
+    }
+  }
+}
+// Test with a really complicated model: runs it FLAGS_repeat times through
+// the native predictor and through the analysis predictor (IR optimization
+// enabled), prints the average time of each, and compares both outputs with
+// the reference results loaded from the data file.
+void TestRNN2Prediction() {
+  const int batch_size = FLAGS_batch_size;
+  const int num_times = FLAGS_repeat;
+
+  AnalysisConfig config;
+  config.prog_file = FLAGS_infer_model + "/__model__";
+  config.param_file = FLAGS_infer_model + "/param";
+  config.use_gpu = false;
+  config.device = 0;
+  config.specify_input_name = true;
+  config.enable_ir_optim = true;
+  PADDLE_ENFORCE(config.ir_mode ==
+                 AnalysisConfig::IrPassMode::kExclude);  // default
+
+  // The same config drives both engines.
+  auto base_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+
+  // Only the first batch of the data file is used.
+  DataRecord data(FLAGS_infer_data, batch_size);
+  std::vector<PaddleTensor> input_slots;
+  PrepareInputs(&input_slots, &data, batch_size);
+  std::vector<PaddleTensor> outputs, base_outputs;
+
+  // Runs `engine` num_times on the prepared inputs and prints the mean time.
+  auto run_and_time = [&](PaddlePredictor *engine,
+                          std::vector<PaddleTensor> *results) {
+    Timer timer;
+    timer.tic();
+    for (int i = 0; i < num_times; i++) {
+      engine->Run(input_slots, results);
+    }
+    PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times);
+  };
+  run_and_time(base_predictor.get(), &base_outputs);
+  run_and_time(predictor.get(), &outputs);
+
+  CompareResult(base_outputs, data.result_data);
+  CompareResult(outputs, data.result_data);
+}
+
+// gtest entry point: the whole check lives in TestRNN2Prediction.
+TEST(Analyzer, rnn2) { TestRNN2Prediction(); }
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -12,23 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <gflags/gflags.h>
-#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
-#include <gtest/gtest.h>
-#include <fstream>
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-#include "paddle/fluid/inference/api/timer.h"
-
-DEFINE_string(infer_model, "", "Directory of the inference model.");
-DEFINE_string(infer_data, "", "Path of the dataset.");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(repeat, 1, "How many times to repeat run.");
-DEFINE_int32(topn, -1, "Run top n batches of data to save time");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"

 namespace paddle {
 namespace inference {
@@ -37,24 +21,25 @@ struct DataReader {
  explicit DataReader(const std::string &path)
      : file(new std::ifstream(path)) {}

-  bool NextBatch(PaddleTensor *tensor, int batch_size) {
+  bool NextBatch(std::vector<PaddleTensor> *input, int batch_size) {
    PADDLE_ENFORCE_EQ(batch_size, 1);
    std::string line;
-    tensor->lod.clear();
-    tensor->lod.emplace_back(std::vector<size_t>({0}));
+    PaddleTensor tensor;
+    tensor.dtype = PaddleDType::INT64;
+    tensor.lod.emplace_back(std::vector<size_t>({0}));
    std::vector<int64_t> data;

    for (int i = 0; i < batch_size; i++) {
      if (!std::getline(*file, line)) return false;
      inference::split_to_int64(line, ' ', &data);
    }
-    tensor->lod.front().push_back(data.size());
+    tensor.lod.front().push_back(data.size());

-    tensor->data.Resize(data.size() * sizeof(int64_t));
-    memcpy(tensor->data.data(), data.data(), data.size() * sizeof(int64_t));
-    tensor->shape.clear();
-    tensor->shape.push_back(data.size());
-    tensor->shape.push_back(1);
+    tensor.data.Resize(data.size() * sizeof(int64_t));
+    memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t));
+    tensor.shape.push_back(data.size());
+    tensor.shape.push_back(1);
+    input->assign({tensor});
    return true;
  }

@@ -68,32 +53,28 @@ void Main(int batch_size) {
  config.model_dir = FLAGS_infer_model;
  config.use_gpu = false;
  config.enable_ir_optim = true;
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);

-  std::vector<PaddleTensor> input_slots(1);
-  // one batch starts
-  // data --
-  auto &input = input_slots[0];
-  input.dtype = PaddleDType::INT64;
-
-  inference::Timer timer;
-  double sum = 0;
-  std::vector<PaddleTensor> output_slots;
+  std::vector<PaddleTensor> input_slots, output_slots;
+  DataReader reader(FLAGS_infer_data);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;

+  if (FLAGS_test_all_data) {
+    LOG(INFO) << "test all data";
    int num_batches = 0;
-  for (int t = 0; t < FLAGS_repeat; t++) {
-    DataReader reader(FLAGS_infer_data);
-    while (reader.NextBatch(&input, FLAGS_batch_size)) {
-      if (FLAGS_topn > 0 && num_batches > FLAGS_topn) break;
-      timer.tic();
-      CHECK(predictor->Run(input_slots, &output_slots));
-      sum += timer.toc();
+    while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
+      input_slots_all.emplace_back(input_slots);
      ++num_batches;
    }
+    LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
+    TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
+    return;
  }
-  PrintTime(batch_size, FLAGS_repeat, 1, 0, sum / FLAGS_repeat);
+
+  // one batch starts
+  // data --
+  reader.NextBatch(&input_slots, FLAGS_batch_size);
+  input_slots_all.emplace_back(input_slots);
+  TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);

  // Get output
  LOG(INFO) << "get outputs " << output_slots.size();

--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fstream>
+#include <iostream>
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+// One parsed input line: the flat float values and the tensor shape that
+// ProcessALine reads from the two tab-separated columns.
+struct Record {
+  // Values parsed from the first column.
+  std::vector<float> data;
+  // Dimensions parsed from the second column.
+  std::vector<int32_t> shape;
+};
+
+// Parses one "<data>\t<shape>" line into a Record. Both columns are
+// space-separated lists: floats for the data, integers for the shape. Aborts
+// (CHECK) when the line does not consist of exactly two tab-separated
+// columns.
+Record ProcessALine(const std::string &line) {
+  VLOG(3) << "process a line";
+  std::vector<std::string> fields;
+  split(line, '\t', &fields);
+  CHECK_EQ(fields.size(), 2UL)
+      << "data format error, should be <data>\t<shape>";
+
+  // Tokenize both columns first, then convert.
+  std::vector<std::string> value_tokens;
+  std::vector<std::string> dim_tokens;
+  split(fields[0], ' ', &value_tokens);
+  split(fields[1], ' ', &dim_tokens);
+
+  Record parsed;
+  for (size_t i = 0; i < value_tokens.size(); ++i) {
+    parsed.data.push_back(std::stof(value_tokens[i]));
+  }
+  for (size_t i = 0; i < dim_tokens.size(); ++i) {
+    parsed.shape.push_back(std::stoi(dim_tokens[i]));
+  }
+
+  VLOG(3) << "data size " << parsed.data.size();
+  VLOG(3) << "data shape size " << parsed.shape.size();
+  return parsed;
+}
+
+/*
+ * Use the native and analysis fluid engine to inference the demo.
+ * ocr, mobilenet and se_resnext50
+ *
+ * Runs the analysis predictor FLAGS_repeat times on the first record of the
+ * data file, compares its output with a single run of the native predictor,
+ * and logs the fuse statistics and the number of function nodes of the
+ * analyzed graph.
+ *
+ * use_mkldnn: forwarded to the config's _use_mkldnn switch.
+ *
+ * Aborts (CHECK) when the data file cannot be opened or read, or when the
+ * created predictor is not an AnalysisPredictor.
+ */
+void TestVisualPrediction(bool use_mkldnn) {
+  std::unique_ptr<PaddlePredictor> predictor;
+  AnalysisConfig cfg;
+  cfg.param_file = FLAGS_infer_model + "/__params__";
+  cfg.prog_file = FLAGS_infer_model + "/__model__";
+  cfg.use_gpu = false;
+  cfg._use_mkldnn = use_mkldnn;
+  cfg.device = 0;
+  cfg.enable_ir_optim = true;
+  // NOTE(review): ir_passes presumably lists the passes to exclude (the
+  // default ir_mode) -- confirm against AnalysisConfig.
+  // TODO(TJ): fix fusion gru
+  cfg.ir_passes.push_back("fc_gru_fuse_pass");
+#ifdef PADDLE_WITH_MKLDNN
+  // The MKLDNN conv+relu fuse pass is suspected to be buggy, so it is listed
+  // here as well.
+  cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
+#endif
+  predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+
+  // Only have single batch of data.
+  std::string line;
+  std::ifstream file(FLAGS_infer_data);
+  CHECK(file.is_open()) << "failed to open data file: " << FLAGS_infer_data;
+  std::getline(file, line);
+  // fail() rather than good(): a last line without a trailing newline sets
+  // eofbit but is still a successful read.
+  CHECK(!file.fail()) << "failed to read a record from: " << FLAGS_infer_data;
+  auto record = ProcessALine(line);
+  file.close();
+
+  // Inference.
+  PaddleTensor input;
+  input.shape = record.shape;
+  input.data =
+      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
+  input.dtype = PaddleDType::FLOAT32;
+
+  std::vector<PaddleTensor> outputs_slots;
+  Timer timer;
+  timer.tic();
+  for (int i = 0; i < FLAGS_repeat; i++) {
+    predictor->Run({input}, &outputs_slots);
+  }
+  PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0,
+            timer.toc() / FLAGS_repeat);
+
+  VLOG(3) << "output.size " << outputs_slots.size();
+
+  // run native as reference
+  auto ref_predictor =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
+  std::vector<PaddleTensor> ref_outputs_slots;
+  ref_predictor->Run({input}, &ref_outputs_slots);
+  CompareResult(outputs_slots, ref_outputs_slots);
+  // print what are fused
+  AnalysisPredictor *analysis_predictor =
+      dynamic_cast<AnalysisPredictor *>(predictor.get());
+  // dynamic_cast yields nullptr on a type mismatch; fail loudly instead of
+  // dereferencing it below.
+  CHECK(analysis_predictor != nullptr)
+      << "predictor is not an AnalysisPredictor";
+  auto &fuse_statis = analysis_predictor->analysis_argument()
+                          .Get<std::unordered_map<std::string, int>>(
+                              framework::ir::kFuseStatisAttr);
+  for (auto &item : fuse_statis) {
+    LOG(INFO) << "fused " << item.first << " " << item.second;
+  }
+  // Count the function nodes of the analyzed data flow graph.
+  int num_ops = 0;
+  for (auto &node :
+       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+    if (node->IsFunction()) {
+      ++num_ops;
+    }
+  }
+  LOG(INFO) << "has num ops: " << num_ops;
+}
+
+// gtest entry points. The MKLDNN variant is only compiled when Paddle is
+// built with MKLDNN support.
+TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_vis, analysis_mkldnn) {
+  TestVisualPrediction(/*use_mkldnn*/ true);
+}
+#endif
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <gtest/gtest.h>
+#include <thread>  // NOLINT
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
+#include "paddle/fluid/inference/analysis/ut_helper.h"
+#include "paddle/fluid/inference/api/analysis_predictor.h"
+#include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_inference_pass.h"
+#include "paddle/fluid/platform/profiler.h"
+
+// Command-line flags (gflags) shared by the testers that include this header.
+// NOTE(review): these are definitions, not declarations, so this header is
+// presumably meant to be included from a single translation unit per test
+// binary -- confirm before including it from more than one source file.
+DEFINE_string(infer_model, "", "model path");
+DEFINE_string(infer_data, "", "data file");
+DEFINE_int32(batch_size, 1, "batch size.");
+DEFINE_int32(burning, 0, "Burning before repeat.");
+DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
+DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
+DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+
+namespace paddle {
+namespace inference {
+
+// Checks that `outputs` matches `ref_outputs` tensor by tensor.
+//
+// For every pair the element count (product of `shape`) and `dtype` must
+// agree; INT64 data must be identical and FLOAT32 data equal within 1e-3.
+//
+// Mismatches in tensor count, element count or dtype are fatal (ASSERT_*):
+// continuing would index past the end of `ref_outputs` or walk the reference
+// buffer with the wrong length or element type. A fatal gtest assertion
+// returns from this function only; the calling test keeps running but is
+// already marked as failed.
+void CompareResult(const std::vector<PaddleTensor> &outputs,
+                   const std::vector<PaddleTensor> &ref_outputs) {
+  ASSERT_GT(outputs.size(), 0UL);
+  ASSERT_EQ(outputs.size(), ref_outputs.size());
+  // Element count of a tensor: the product of its dimensions.
+  auto numel = [](const PaddleTensor &tensor) -> size_t {
+    return std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1,
+                           [](int a, int b) { return a * b; });
+  };
+  for (size_t i = 0; i < outputs.size(); i++) {
+    auto &out = outputs[i];
+    auto &ref_out = ref_outputs[i];
+    size_t size = numel(out);
+    size_t ref_size = numel(ref_out);
+    EXPECT_GT(size, 0UL);
+    ASSERT_EQ(size, ref_size);
+    ASSERT_EQ(out.dtype, ref_out.dtype);
+    switch (out.dtype) {
+      case PaddleDType::INT64: {
+        int64_t *pdata = static_cast<int64_t *>(out.data.data());
+        int64_t *pdata_ref = static_cast<int64_t *>(ref_out.data.data());
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_EQ(pdata_ref[j], pdata[j]);
+        }
+        break;
+      }
+      case PaddleDType::FLOAT32: {
+        float *pdata = static_cast<float *>(out.data.data());
+        float *pdata_ref = static_cast<float *>(ref_out.data.data());
+        for (size_t j = 0; j < size; ++j) {
+          EXPECT_NEAR(pdata_ref[j], pdata[j], 1e-3);
+        }
+        break;
+      }
+      default: {
+        // A dtype this helper cannot compare must not pass silently.
+        ADD_FAILURE() << "CompareResult: unsupported dtype in output " << i;
+        break;
+      }
+    }
+  }
+}
+
+// Runs every batch of `inputs` through a freshly created analysis predictor
+// on the calling thread, FLAGS_repeat times, then reports the average time
+// of one pass through PrintTime().
+//
+// config:  handed to CreatePaddlePredictor() to build the predictor.
+// inputs:  one std::vector<PaddleTensor> per Run() call. Taken by const
+//          reference so the (deep-copied) tensors are not duplicated.
+// outputs: passed to every Run() call, so on return it holds whatever the
+//          last call left in it.
+//
+// A predictor that could not be created aborts the helper with a fatal
+// failure; a Run() call that returns false is reported as a test failure.
+void TestOneThreadPrediction(
+    AnalysisConfig config,
+    const std::vector<std::vector<PaddleTensor>> &inputs,
+    std::vector<PaddleTensor> *outputs) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+  // Creation failure would otherwise surface as a null dereference below.
+  ASSERT_TRUE(predictor != nullptr);
+  Timer timer;
+  timer.tic();
+  for (int i = 0; i < num_times; i++) {
+    for (const auto &batch : inputs) {
+      EXPECT_TRUE(predictor->Run(batch, outputs));
+    }
+  }
+  PrintTime(batch_size, num_times, 1, 0, timer.toc() / num_times,
+            inputs.size());
+}
+
+// Runs the whole input set FLAGS_repeat times on `num_threads` threads in
+// parallel; each thread reports its own average pass time via PrintTime().
+//
+// config:      handed to CreatePaddlePredictor(), once per thread.
+// inputs:      one std::vector<PaddleTensor> per Run() call. Taken by const
+//              reference; every thread makes its own private copy.
+// outputs:     NOT written by this function. Each thread uses a private
+//              output vector, so callers must not expect results here.
+// num_threads: number of predictors and worker threads to create.
+//
+// Run() failures are counted per thread and reported from the calling
+// thread after all workers have been joined.
+void TestMultiThreadPrediction(
+    AnalysisConfig config,
+    const std::vector<std::vector<PaddleTensor>> &inputs,
+    std::vector<PaddleTensor> *outputs, int num_threads) {
+  int batch_size = FLAGS_batch_size;
+  int num_times = FLAGS_repeat;
+  std::vector<std::thread> threads;
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  // One counter per thread, so workers never write to the same element.
+  std::vector<int> failed_runs(num_threads > 0 ? num_threads : 0, 0);
+  // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized
+  // because AttentionLSTM's hard-coded node id will be damaged. All
+  // predictors are therefore created here, before any thread is started.
+  for (int tid = 0; tid < num_threads; ++tid) {
+    predictors.emplace_back(
+        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+            config));
+    // No worker is running yet, so returning early here is safe.
+    ASSERT_TRUE(predictors.back() != nullptr);
+  }
+  for (int tid = 0; tid < num_threads; ++tid) {
+    threads.emplace_back([&, tid]() {
+      // Each thread should have local inputs and outputs.
+      // The inputs of each thread are all the same.
+      std::vector<std::vector<PaddleTensor>> inputs_tid = inputs;
+      std::vector<PaddleTensor> outputs_tid;
+      Timer timer;
+      timer.tic();
+      for (int i = 0; i < num_times; i++) {
+        for (size_t j = 0; j < inputs_tid.size(); j++) {
+          if (!predictors[tid]->Run(inputs_tid[j], &outputs_tid)) {
+            ++failed_runs[tid];
+          }
+        }
+      }
+      PrintTime(batch_size, num_times, num_threads, tid,
+                timer.toc() / num_times, inputs_tid.size());
+    });
+  }
+  for (auto &worker : threads) {
+    worker.join();
+  }
+  for (int tid = 0; tid < num_threads; ++tid) {
+    EXPECT_EQ(failed_runs[tid], 0) << "Run() failed in thread " << tid;
+  }
+}
+
+// Entry point of the prediction helpers: runs the single-threaded variant
+// when `num_threads` is exactly 1 and the multi-threaded variant otherwise.
+//
+// config:      forwarded unchanged to the selected helper.
+// inputs:      one std::vector<PaddleTensor> per Run() call. Taken by const
+//              reference so dispatching adds no extra copy of the data set.
+// outputs:     forwarded to the selected helper; the multi-threaded helper
+//              does not use it in its Run() calls.
+// num_threads: selects the helper and, for the multi-threaded one, the
+//              number of worker threads.
+void TestPrediction(AnalysisConfig config,
+                    const std::vector<std::vector<PaddleTensor>> &inputs,
+                    std::vector<PaddleTensor> *outputs, int num_threads) {
+  if (num_threads == 1) {
+    TestOneThreadPrediction(config, inputs, outputs);
+  } else {
+    TestMultiThreadPrediction(config, inputs, outputs, num_threads);
+  }
+}
+
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -167,6 +167,8 @@ void BuddyAllocator::Free(void* p) {
 }

 size_t BuddyAllocator::Used() { return total_used_; }
+size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
+size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }

 void* BuddyAllocator::SystemAlloc(size_t size) {
  size_t index = 0;

--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -42,6 +42,8 @@ class BuddyAllocator {
  void* Alloc(size_t unaligned_size);
  void Free(void* ptr);
  size_t Used();
+  size_t GetMinChunkSize();
+  size_t GetMaxChunkSize();

 public:
  // Disable copy and assignment

--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -119,8 +119,8 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
                 << place.device << ", available " << avail << " bytes";
    LOG(WARNING) << "total " << total;
-    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
-    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
+    LOG(WARNING) << "GpuMinChunkSize " << buddy_allocator->GetMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << buddy_allocator->GetMaxChunkSize();
    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
    platform::SetDeviceId(cur_dev);
  }

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -296,6 +296,7 @@ op_library(flatten_op DEPS reshape_op)
 op_library(sequence_pad_op DEPS sequence_padding)
 op_library(unstack_op DEPS stack_op)
 op_library(fake_quantize_op DEPS memory)
+op_library(fusion_lstm_op DEPS cpu_lstm_compute)

 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)

--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -300,10 +300,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
+    bool fuse_eltwise = ctx.Attr<bool>("fuse_eltwise");
    int groups = ctx.Attr<int>("groups");

-    // TODO(pzelazko-intel) add support for group convolution and dilation
-    PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
+    // TODO: add support for dilation
    PADDLE_ENFORCE(
        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
        "dilation in convolution is not implemented yet");
@@ -314,6 +314,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> weights_tz =
        paddle::framework::vectorize2int(filter->dims());
+    int g = std::max(groups, 1);
+    if (g > 1) {
+      int o = weights_tz[0];
+      int i = weights_tz[1];
+      int h = weights_tz[2];
+      int w = weights_tz[3];
+      weights_tz.resize(5);
+      weights_tz[0] = g;
+      weights_tz[1] = o / g;
+      weights_tz[2] = i;
+      weights_tz[3] = h;
+      weights_tz[4] = w;
+    }
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());

    // Get unique name for storing MKLDNN primitives
@@ -327,7 +340,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto user_src_md = platform::MKLDNNMemDesc(
        {src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
    auto user_weights_md = platform::MKLDNNMemDesc(
-        {weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format());
+        {weights_tz}, platform::MKLDNNGetDataType<T>(),
+        (g == 1) ? filter->format() : mkldnn::memory::format::goihw);

    /* create memory descriptor for convolution without specified format
     * ('any') which lets a primitive (convolution in this case) choose
@@ -340,7 +354,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto src_md = platform::MKLDNNMemDesc(
        src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
    auto weights_md = platform::MKLDNNMemDesc(
-        weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
+        weights_tz, platform::MKLDNNGetDataType<T>(),
+        (g == 1) ? chosen_memory_format : mkldnn::memory::format::goihw);
    std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
                               // Currently used whenever bias is != nullptr.
    auto dst_md = platform::MKLDNNMemDesc(
@@ -352,12 +367,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
      bias_tz = paddle::framework::vectorize2int(bias->dims());
      auto bias_md = platform::MKLDNNMemDesc(
          bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
-      conv_pd =
-          ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, strides,
-                               paddings, mkldnn_engine, fuse_relu);
+      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
+                                     strides, paddings, mkldnn_engine,
+                                     fuse_relu, fuse_eltwise);
    } else {
-      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
-                                     paddings, mkldnn_engine, fuse_relu);
+      conv_pd =
+          ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
+                               mkldnn_engine, fuse_relu, fuse_eltwise);
    }
    // Save conv_pd/src_memory/weights_memory for backward pass
    dev_ctx.SetBlob(key_conv_pd, conv_pd);
@@ -407,16 +423,26 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  }

 private:
-  mkldnn::primitive_attr AddRelu() const {
+  mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
+                                       bool fuse_eltwise) const {
+    mkldnn::primitive_attr conv_attr;
+    mkldnn::post_ops post_operations;
+    // Fusion with Elementwise layer relies on adding a sum post-operation with
+    // the scale parameter. It is assumed that when fuse_eltwise is true, the
+    // Output tensor contains the data coming from residual connection. The
+    // result of this post_op is: Output = scale * Output + Conv_Out.
+    if (fuse_eltwise) {
+      post_operations.append_sum(1.0f);
+    }
    // Fusion with ReLU layer is executed through the PostOps feature. Create a
    // PostOps object and configure it to execute an eltwise relu operation.
-    mkldnn::primitive_attr conv_attr;
+    if (fuse_relu) {
      constexpr float scale = 1.0f;
      constexpr float negative_slope = 0.0f;
      constexpr float placeholder = 0.0f;
-    mkldnn::post_ops post_operations;
      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
                                     negative_slope, placeholder);
+    }
    conv_attr.set_post_ops(post_operations);
    return conv_attr;
  }
@@ -425,8 +451,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
  ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                       const memory::desc& dst, const std::vector<int>& strides,
                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine,
-                       const bool fuse_relu) const {
+                       const mkldnn::engine& engine, const bool fuse_relu,
+                       const bool fuse_eltwise) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};

@@ -435,10 +461,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        dst, stride_dims, padding_dims, padding_dims,
        mkldnn::padding_kind::zero);

-    mkldnn::primitive_attr conv_attr;
-    if (fuse_relu) {
-      conv_attr = AddRelu();
-    }
+    mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);

    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
        conv_desc, conv_attr, engine);
@@ -452,8 +475,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
                       const memory::desc& bias, const memory::desc& dst,
                       const std::vector<int>& strides,
                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine,
-                       const bool fuse_relu) const {
+                       const mkldnn::engine& engine, const bool fuse_relu,
+                       const bool fuse_eltwise) const {
    memory::dims stride_dims = {strides[0], strides[1]};
    memory::dims padding_dims = {paddings[0], paddings[1]};

@@ -462,10 +485,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
        bias, dst, stride_dims, padding_dims, padding_dims,
        mkldnn::padding_kind::zero);

-    mkldnn::primitive_attr conv_attr;
-    if (fuse_relu) {
-      conv_attr = AddRelu();
-    }
+    mkldnn::primitive_attr conv_attr = CreatePostOps(fuse_relu, fuse_eltwise);

    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
        conv_desc, conv_attr, engine);

--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -164,6 +164,11 @@ void Conv2DOpMaker::Make() {
      .SetDefault(false);
  AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
+  AddAttr<bool>("fuse_eltwise",
+                "(bool, default false) Only used in mkldnn kernel. Used "
+                "whenever convolution output is connected via skip connection "
+                "to a previous layer.")
+      .SetDefault(false);
  AddAttr<std::string>(
      "data_format",
      "(string, default NCHW) Only used in "

--- a/paddle/fluid/operators/detection/bbox_util.h
+++ b/paddle/fluid/operators/detection/bbox_util.h
@@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <algorithm>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"

@@ -21,7 +22,7 @@ namespace operators {
 */
 template <typename T>
 inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes,
-                       const framework::Tensor& gt_boxes, const T* weights,
+                       const framework::Tensor& gt_boxes, const float* weights,
                       const bool normalized, framework::Tensor* box_delta) {
  auto ex_boxes_et = framework::EigenTensor<T, 2>::From(ex_boxes);
  auto gt_boxes_et = framework::EigenTensor<T, 2>::From(gt_boxes);
@@ -62,5 +63,35 @@ void Gather(const T* in, const int in_stride, const int* index, const int num,
  }
 }

+template <typename T>  // T: element type of the three rank-2 tensors
+void BboxOverlaps(const framework::Tensor& r_boxes,
+                  const framework::Tensor& c_boxes,
+                  framework::Tensor* overlaps) {  // out: (i, j) = IoU
+  auto r_boxes_et = framework::EigenTensor<T, 2>::From(r_boxes);
+  auto c_boxes_et = framework::EigenTensor<T, 2>::From(c_boxes);
+  auto overlaps_et = framework::EigenTensor<T, 2>::From(*overlaps);
+  int r_num = r_boxes.dims()[0];  // one box per row
+  int c_num = c_boxes.dims()[0];
+  auto zero = static_cast<T>(0.0);  // lower bound for overlap extents
+  T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h,
+      inter_area;
+  for (int i = 0; i < r_num; ++i) {  // box columns: x0, y0, x1, y1
+    r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) *
+                 (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1);
+    for (int j = 0; j < c_num; ++j) {
+      c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) *
+                   (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1);
+      x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0));
+      y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1));
+      x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2));
+      y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3));
+      inter_w = std::max(x_max - x_min + 1, zero);  // clamped at 0
+      inter_h = std::max(y_max - y_min + 1, zero);  // clamped at 0
+      inter_area = inter_w * inter_h;  // divided by the union area below
+      overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area);
+    }
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -42,10 +42,11 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
                   "Input(RpnRois) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("GtClasses"),
                   "Input(GtClasses) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
+                   "Input(IsCrowd) shouldn't be null.");
    PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
                   "Input(GtBoxes) shouldn't be null.");
-    PADDLE_ENFORCE(ctx->HasInput("ImScales"),
-                   "Input(ImScales) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null.");

    PADDLE_ENFORCE(ctx->HasOutput("Rois"),
                   "Output(Rois) of RpnTargetAssignOp should not be null");
@@ -64,22 +65,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {

    auto rpn_rois_dims = ctx->GetInputDim("RpnRois");
    auto gt_classes_dims = ctx->GetInputDim("GtClasses");
+    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
-    auto im_scales_dims = ctx->GetInputDim("ImScales");
+    auto im_info_dims = ctx->GetInputDim("ImInfo");

    PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2,
                      "The rank of Input(RpnRois) must be 2.");
-    PADDLE_ENFORCE_EQ(gt_classes_dims.size(), 1,
-                      "The rank of Input(GtClasses) must be 1.");
    PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
                      "The rank of Input(GtBoxes) must be 2.");
-    PADDLE_ENFORCE_EQ(im_scales_dims.size(), 1,
-                      "The rank of Input(ImScales) must be 1.");
+    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                      "The rank of Input(ImInfo) must be 2.");

    int class_nums = ctx->Attrs().Get<int>("class_nums");

    ctx->SetOutputDim("Rois", {-1, 4});
-    ctx->SetOutputDim("LabelsInt32", {-1});
+    ctx->SetOutputDim("LabelsInt32", {-1, 1});
    ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums});
    ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums});
    ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums});
@@ -105,45 +105,18 @@ void Concat(const platform::CPUDeviceContext& context,
  concat_functor(context, inputs, axis, out_tensor);
 }

-template <typename T>
-void BboxOverlaps(const Tensor& r_boxes, const Tensor& c_boxes,
-                  Tensor* overlaps) {
-  auto r_boxes_et = framework::EigenTensor<T, 2>::From(r_boxes);
-  auto c_boxes_et = framework::EigenTensor<T, 2>::From(c_boxes);
-  auto overlaps_et = framework::EigenTensor<T, 2>::From(*overlaps);
-  int r_num = r_boxes.dims()[0];
-  int c_num = c_boxes.dims()[0];
-  auto zero = static_cast<T>(0.0);
-  T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h,
-      inter_area;
-  for (int i = 0; i < r_num; ++i) {
-    r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) *
-                 (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1);
-    for (int j = 0; j < c_num; ++j) {
-      c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) *
-                   (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1);
-      x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0));
-      y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1));
-      x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2));
-      y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3));
-      inter_w = std::max(x_max - x_min + 1, zero);
-      inter_h = std::max(y_max - y_min + 1, zero);
-      inter_area = inter_w * inter_h;
-      overlaps_et(i, j) = inter_area / (r_box_area + c_box_area - inter_area);
-    }
-  }
-}
-
 template <typename T>
 std::vector<std::vector<int>> SampleFgBgGt(
    const platform::CPUDeviceContext& context, Tensor* iou,
-    const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
-    const float bg_thresh_hi, const float bg_thresh_lo,
-    std::minstd_rand engine) {
+    const Tensor& is_crowd, const int batch_size_per_im,
+    const float fg_fraction, const float fg_thresh, const float bg_thresh_hi,
+    const float bg_thresh_lo, std::minstd_rand engine, const bool use_random) {
  std::vector<int> fg_inds;
  std::vector<int> bg_inds;
  std::vector<int> gt_inds;
-  T* proposal_to_gt_overlaps = iou->mutable_data<T>(context.GetPlace());
+  int64_t gt_num = is_crowd.numel();
+  const int* crowd_data = is_crowd.data<int>();
+  T* proposal_to_gt_overlaps = iou->data<T>();
  int64_t row = iou->dims()[0];
  int64_t col = iou->dims()[1];
  float epsilon = 0.00001;
@@ -152,6 +125,9 @@ std::vector<std::vector<int>> SampleFgBgGt(
  for (int64_t i = 0; i < row; ++i) {
    const T* v = proposal_to_gt_overlaps + i * col;
    T max_overlap = *std::max_element(v, v + col);
+    if ((i < gt_num) && (crowd_data[i])) {
+      max_overlap = -1.0;
+    }
    if (max_overlap > fg_thresh) {
      for (int64_t j = 0; j < col; ++j) {
        T val = proposal_to_gt_overlaps[i * col + j];
@@ -170,10 +146,11 @@ std::vector<std::vector<int>> SampleFgBgGt(
  }

  // Reservoir Sampling
+  std::uniform_real_distribution<float> uniform(0, 1);
  int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction);
  int fg_rois_this_image = fg_inds.size();
  int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image);
-  std::uniform_real_distribution<float> uniform(0, 1);
+  if (use_random) {
    const int64_t fg_size = static_cast<int64_t>(fg_inds.size());
    if (fg_size > fg_rois_per_this_image) {
      for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) {
@@ -184,6 +161,7 @@ std::vector<std::vector<int>> SampleFgBgGt(
        }
      }
    }
+  }
  std::vector<int> new_fg_inds(fg_inds.begin(),
                               fg_inds.begin() + fg_rois_per_this_image);
  std::vector<int> new_gt_inds(gt_inds.begin(),
@@ -192,6 +170,7 @@ std::vector<std::vector<int>> SampleFgBgGt(
  int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image;
  int bg_rois_this_image = bg_inds.size();
  int bg_rois_per_this_image = std::min(bg_rois_per_image, bg_rois_this_image);
+  if (use_random) {
    const int64_t bg_size = static_cast<int64_t>(bg_inds.size());
    if (bg_size > bg_rois_per_this_image) {
      for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) {
@@ -200,6 +179,7 @@ std::vector<std::vector<int>> SampleFgBgGt(
          std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i);
      }
    }
+  }
  std::vector<int> new_bg_inds(bg_inds.begin(),
                               bg_inds.begin() + bg_rois_per_this_image);
  std::vector<std::vector<int>> res;
@@ -248,14 +228,14 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
 template <typename T>
 std::vector<Tensor> SampleRoisForOneImage(
    const platform::CPUDeviceContext& context, Tensor* rpn_rois,
-    Tensor* gt_classes, Tensor* gt_boxes, Tensor* im_scale,
+    Tensor* gt_classes, Tensor* is_crowd, Tensor* gt_boxes, Tensor* im_info,
    const int batch_size_per_im, const float fg_fraction, const float fg_thresh,
    const float bg_thresh_hi, const float bg_thresh_lo,
    const std::vector<float>& bbox_reg_weights, const int class_nums,
-    std::minstd_rand engine) {
+    std::minstd_rand engine, bool use_random) {
  auto rpn_rois_et = framework::EigenTensor<T, 2>::From(*rpn_rois);
-  auto im_scale_data = im_scale->data<T>()[0];
-  rpn_rois_et = rpn_rois_et / im_scale_data;
+  auto im_scale = im_info->data<T>()[2];
+  rpn_rois_et = rpn_rois_et / im_scale;

  Tensor boxes;
  int proposals_num = gt_boxes->dims()[0] + rpn_rois->dims()[0];
@@ -270,8 +250,8 @@ std::vector<Tensor> SampleRoisForOneImage(

  // Generate proposal index
  std::vector<std::vector<int>> fg_bg_gt = SampleFgBgGt<T>(
-      context, &proposal_to_gt_overlaps, batch_size_per_im, fg_fraction,
-      fg_thresh, bg_thresh_hi, bg_thresh_lo, engine);
+      context, &proposal_to_gt_overlaps, *is_crowd, batch_size_per_im,
+      fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, engine, use_random);
  std::vector<int> fg_inds = fg_bg_gt[0];
  std::vector<int> bg_inds = fg_bg_gt[1];
  std::vector<int> gt_inds = fg_bg_gt[2];
@@ -291,15 +271,15 @@ std::vector<Tensor> SampleRoisForOneImage(
  // Compute targets
  Tensor bbox_targets_single;
  bbox_targets_single.mutable_data<T>(bbox_dim, context.GetPlace());
-  BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, nullptr, false,
-                &bbox_targets_single);
+  BoxToDelta<T>(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(),
+                false, &bbox_targets_single);

  // Scale rois
  Tensor sampled_rois;
  sampled_rois.mutable_data<T>(sampled_boxes.dims(), context.GetPlace());
  auto sampled_rois_et = framework::EigenTensor<T, 2>::From(sampled_rois);
  auto sampled_boxes_et = framework::EigenTensor<T, 2>::From(sampled_boxes);
-  sampled_rois_et = sampled_boxes_et * im_scale_data;
+  sampled_rois_et = sampled_boxes_et * im_scale;

  // Expand box targets
  Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights;
@@ -351,8 +331,9 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& context) const override {
    auto* rpn_rois = context.Input<LoDTensor>("RpnRois");
    auto* gt_classes = context.Input<LoDTensor>("GtClasses");
+    auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
    auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
-    auto* im_scales = context.Input<LoDTensor>("ImScales");
+    auto* im_info = context.Input<LoDTensor>("ImInfo");

    auto* rois = context.Output<LoDTensor>("Rois");
    auto* labels_int32 = context.Output<LoDTensor>("LabelsInt32");
@@ -369,18 +350,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
    std::vector<float> bbox_reg_weights =
        context.Attr<std::vector<float>>("bbox_reg_weights");
    int class_nums = context.Attr<int>("class_nums");
+    bool use_random = context.Attr<bool>("use_random");

    PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL,
                      "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD");
    PADDLE_ENFORCE_EQ(
        gt_classes->lod().size(), 1UL,
        "GenerateProposalLabelsOp gt_classes needs 1 level of LoD");
+    PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
+                      "GenerateProposalLabelsOp is_crowd needs 1 level of LoD");
    PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
                      "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD");
    int64_t n = static_cast<int64_t>(rpn_rois->lod().back().size() - 1);

    rois->mutable_data<T>({n * batch_size_per_im, kBoxDim}, context.GetPlace());
-    labels_int32->mutable_data<int>({n * batch_size_per_im},
+    labels_int32->mutable_data<int>({n * batch_size_per_im, 1},
                                    context.GetPlace());
    bbox_targets->mutable_data<T>({n * batch_size_per_im, kBoxDim * class_nums},
                                  context.GetPlace());
@@ -391,8 +375,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {

    std::random_device rnd;
    std::minstd_rand engine;
-    int seed =
-        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+    int seed = rnd();
    engine.seed(seed);

    framework::LoD lod;
@@ -403,19 +386,23 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {

    auto rpn_rois_lod = rpn_rois->lod().back();
    auto gt_classes_lod = gt_classes->lod().back();
+    auto is_crowd_lod = is_crowd->lod().back();
    auto gt_boxes_lod = gt_boxes->lod().back();
    for (int i = 0; i < n; ++i) {
      Tensor rpn_rois_slice =
          rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]);
      Tensor gt_classes_slice =
          gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]);
+      Tensor is_crowd_slice =
+          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
      Tensor gt_boxes_slice =
          gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
-      Tensor im_scales_slice = im_scales->Slice(i, i + 1);
+      Tensor im_info_slice = im_info->Slice(i, i + 1);
      std::vector<Tensor> tensor_output = SampleRoisForOneImage<T>(
-          dev_ctx, &rpn_rois_slice, &gt_classes_slice, &gt_boxes_slice,
-          &im_scales_slice, batch_size_per_im, fg_fraction, fg_thresh,
-          bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, engine);
+          dev_ctx, &rpn_rois_slice, &gt_classes_slice, &is_crowd_slice,
+          &gt_boxes_slice, &im_info_slice, batch_size_per_im, fg_fraction,
+          fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums,
+          engine, use_random);
      Tensor sampled_rois = tensor_output[0];
      Tensor sampled_labels_int32 = tensor_output[1];
      Tensor sampled_bbox_targets = tensor_output[2];
@@ -442,7 +429,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
    bbox_inside_weights->set_lod(lod);
    bbox_outside_weights->set_lod(lod);
    rois->Resize({num_rois, kBoxDim});
-    labels_int32->Resize({num_rois});
+    labels_int32->Resize({num_rois, 1});
    bbox_targets->Resize({num_rois, kBoxDim * class_nums});
    bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums});
    bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums});
@@ -455,8 +442,9 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
    // TODO(buxingyuan): Add Document
    AddInput("RpnRois", "RpnRois.");
    AddInput("GtClasses", "GtClasses.");
+    AddInput("IsCrowd", "IsCrowd.");
    AddInput("GtBoxes", "GtBoxes.");
-    AddInput("ImScales", "ImScales.");
+    AddInput("ImInfo", "ImInfo.");

    AddOutput("Rois", "Rois.");
    AddOutput("LabelsInt32", "LabelsInt32.");
@@ -471,8 +459,7 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<float>("bg_thresh_lo", "bg_thresh_lo");
    AddAttr<std::vector<float>>("bbox_reg_weights", "bbox_reg_weights");
    AddAttr<int>("class_nums", "class_nums");
-    AddAttr<bool>("fix_seed", "fix_seed").SetDefault(false);
-    AddAttr<int>("seed", "seed").SetDefault(0);
+    AddAttr<bool>("use_random", "use_random").SetDefault(true);

    AddComment(R"DOC(
 Generate Proposals Labels Operator.

--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
@@ -89,12 +89,11 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
  }

  for (int64_t i = 0; i < row; ++i) {
-    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len];
-    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1];
+    T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0;
+    T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0;

-    T anchor_center_x = (anchor_data[i * len + 2] + anchor_data[i * len]) / 2;
-    T anchor_center_y =
-        (anchor_data[i * len + 3] + anchor_data[i * len + 1]) / 2;
+    T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width;
+    T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height;

    T bbox_center_x = 0, bbox_center_y = 0;
    T bbox_width = 0, bbox_height = 0;
@@ -106,25 +105,31 @@ void BoxCoder(const platform::DeviceContext &ctx, Tensor *all_anchors,
      bbox_center_y = variances_data[i * len + 1] *
                          bbox_deltas_data[i * len + 1] * anchor_height +
                      anchor_center_y;
-      bbox_width = std::exp(variances_data[i * len + 2] *
-                            bbox_deltas_data[i * len + 2]) *
+      bbox_width = std::exp(std::min<T>(variances_data[i * len + 2] *
+                                            bbox_deltas_data[i * len + 2],
+                                        std::log(1000.0 / 16.0))) *
                   anchor_width;
-      bbox_height = std::exp(variances_data[i * len + 3] *
-                             bbox_deltas_data[i * len + 3]) *
+      bbox_height = std::exp(std::min<T>(variances_data[i * len + 3] *
+                                             bbox_deltas_data[i * len + 3],
+                                         std::log(1000.0 / 16.0))) *
                    anchor_height;
    } else {
      bbox_center_x =
          bbox_deltas_data[i * len] * anchor_width + anchor_center_x;
      bbox_center_y =
          bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y;
-      bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width;
-      bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height;
+      bbox_width = std::exp(std::min<T>(bbox_deltas_data[i * len + 2],
+                                        std::log(1000.0 / 16.0))) *
+                   anchor_width;
+      bbox_height = std::exp(std::min<T>(bbox_deltas_data[i * len + 3],
+                                         std::log(1000.0 / 16.0))) *
+                    anchor_height;
    }

    proposals_data[i * len] = bbox_center_x - bbox_width / 2;
    proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2;
-    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2;
-    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2;
+    proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1;
+    proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1;
  }
  // return proposals;
 }
@@ -156,18 +161,23 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes,
                 float min_size, const Tensor &im_info, Tensor *keep) {
  const T *im_info_data = im_info.data<T>();
  T *boxes_data = boxes->mutable_data<T>(ctx.GetPlace());
-  min_size *= im_info_data[2];
+  T im_scale = im_info_data[2];
  keep->Resize({boxes->dims()[0], 1});
+  min_size = std::max(min_size, 1.0f);
  int *keep_data = keep->mutable_data<int>(ctx.GetPlace());

  int keep_len = 0;
  for (int i = 0; i < boxes->dims()[0]; ++i) {
    T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1;
    T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1;
+    T ws_origin_scale =
+        (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1;
+    T hs_origin_scale =
+        (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1;
    T x_ctr = boxes_data[4 * i] + ws / 2;
    T y_ctr = boxes_data[4 * i + 1] + hs / 2;
-    if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] &&
-        y_ctr <= im_info_data[0]) {
+    if (ws_origin_scale >= min_size && hs_origin_scale >= min_size &&
+        x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) {
      keep_data[keep_len++] = i;
    }
  }
@@ -218,8 +228,8 @@ T JaccardOverlap(const T *box1, const T *box2, const bool normalized) {
    const T inter_ymin = std::max(box1[1], box2[1]);
    const T inter_xmax = std::min(box1[2], box2[2]);
    const T inter_ymax = std::min(box1[3], box2[3]);
-    const T inter_w = inter_xmax - inter_xmin;
-    const T inter_h = inter_ymax - inter_ymin;
+    const T inter_w = std::max(0.0f, inter_xmax - inter_xmin + 1);
+    const T inter_h = std::max(0.0f, inter_ymax - inter_ymin + 1);
    const T inter_area = inter_w * inter_h;
    const T bbox1_area = BBoxArea<T>(box1, normalized);
    const T bbox2_area = BBoxArea<T>(box2, normalized);

--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -31,8 +31,14 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
-                   "Input(DistMat) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Anchor"),
+                   "Input(Anchor) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("GtBoxes"),
+                   "Input(GtBoxes) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("IsCrowd"),
+                   "Input(IsCrowd) of RpnTargetAssignOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("ImInfo"),
+                   "Input(ImInfo) of RpnTargetAssignOp should not be null");

    PADDLE_ENFORCE(
        ctx->HasOutput("LocationIndex"),
@@ -43,10 +49,20 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(
        ctx->HasOutput("TargetLabel"),
        "Output(TargetLabel) of RpnTargetAssignOp should not be null");
-
-    auto in_dims = ctx->GetInputDim("DistMat");
-    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
-                      "The rank of Input(DistMat) must be 2.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("TargetBBox"),
+        "Output(TargetBBox) of RpnTargetAssignOp should not be null");
+
+    auto anchor_dims = ctx->GetInputDim("Anchor");
+    auto gt_boxes_dims = ctx->GetInputDim("GtBoxes");
+    auto is_crowd_dims = ctx->GetInputDim("IsCrowd");
+    auto im_info_dims = ctx->GetInputDim("ImInfo");
+    PADDLE_ENFORCE_EQ(anchor_dims.size(), 2,
+                      "The rank of Input(Anchor) must be 2.");
+    PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2,
+                      "The rank of Input(GtBoxes) must be 2.");
+    PADDLE_ENFORCE_EQ(im_info_dims.size(), 2,
+                      "The rank of Input(ImInfo) must be 2.");

    ctx->SetOutputDim("LocationIndex", {-1});
    ctx->SetOutputDim("ScoreIndex", {-1});
@@ -59,198 +75,383 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(
-            ctx.Input<framework::LoDTensor>("DistMat")->type()),
+            ctx.Input<framework::LoDTensor>("Anchor")->type()),
        platform::CPUPlace());
  }
 };

 template <typename T>
-class RpnTargetAssignKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* anchor_t = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
-    auto* gt_bbox_t = context.Input<Tensor>("GtBox");
-    auto* dist_t = context.Input<LoDTensor>("DistMat");
-
-    auto* loc_index_t = context.Output<Tensor>("LocationIndex");
-    auto* score_index_t = context.Output<Tensor>("ScoreIndex");
-    auto* tgt_bbox_t = context.Output<Tensor>("TargetBBox");
-    auto* tgt_lbl_t = context.Output<Tensor>("TargetLabel");
-
-    auto lod = dist_t->lod().back();
-    int64_t batch_num = static_cast<int64_t>(lod.size() - 1);
-    int64_t anchor_num = dist_t->dims()[1];
-    PADDLE_ENFORCE_EQ(anchor_num, anchor_t->dims()[0]);
-
-    int rpn_batch_size = context.Attr<int>("rpn_batch_size_per_im");
-    float pos_threshold = context.Attr<float>("rpn_positive_overlap");
-    float neg_threshold = context.Attr<float>("rpn_negative_overlap");
-    float fg_fraction = context.Attr<float>("fg_fraction");
-
-    int fg_num_per_batch = static_cast<int>(rpn_batch_size * fg_fraction);
-
-    int64_t max_num = batch_num * anchor_num;
-    auto place = context.GetPlace();
+void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) {
+  auto* out_data = out->data<T>();
+  auto* to_add_data = to_add->data<T>();
+  memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T));
+}

-    tgt_bbox_t->mutable_data<T>({max_num, 4}, place);
-    auto* loc_index = loc_index_t->mutable_data<int>({max_num}, place);
-    auto* score_index = score_index_t->mutable_data<int>({max_num}, place);
-
-    Tensor tmp_tgt_lbl;
-    auto* tmp_lbl_data = tmp_tgt_lbl.mutable_data<int64_t>({max_num}, place);
-    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
-    math::SetConstant<platform::CPUDeviceContext, int64_t> iset;
-    iset(dev_ctx, &tmp_tgt_lbl, static_cast<int64_t>(-1));
-
-    std::random_device rnd;
-    std::minstd_rand engine;
-    int seed =
-        context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
-    engine.seed(seed);
-
-    int fg_num = 0;
-    int bg_num = 0;
-    for (int i = 0; i < batch_num; ++i) {
-      Tensor dist = dist_t->Slice(lod[i], lod[i + 1]);
-      Tensor gt_bbox = gt_bbox_t->Slice(lod[i], lod[i + 1]);
-      auto fg_bg_gt = SampleFgBgGt(dev_ctx, dist, pos_threshold, neg_threshold,
-                                   rpn_batch_size, fg_num_per_batch, engine,
-                                   tmp_lbl_data + i * anchor_num);
-
-      int cur_fg_num = fg_bg_gt[0].size();
-      int cur_bg_num = fg_bg_gt[1].size();
-      std::transform(fg_bg_gt[0].begin(), fg_bg_gt[0].end(), loc_index,
-                     [i, anchor_num](int d) { return d + i * anchor_num; });
-      memcpy(score_index, loc_index, cur_fg_num * sizeof(int));
-      std::transform(fg_bg_gt[1].begin(), fg_bg_gt[1].end(),
-                     score_index + cur_fg_num,
-                     [i, anchor_num](int d) { return d + i * anchor_num; });
-
-      // get target bbox deltas
-      if (cur_fg_num) {
-        Tensor fg_gt;
-        T* gt_data = fg_gt.mutable_data<T>({cur_fg_num, 4}, place);
-        Tensor tgt_bbox = tgt_bbox_t->Slice(fg_num, fg_num + cur_fg_num);
-        T* tgt_data = tgt_bbox.data<T>();
-        Gather<T>(anchor_t->data<T>(), 4,
-                  reinterpret_cast<int*>(&fg_bg_gt[0][0]), cur_fg_num,
-                  tgt_data);
-        Gather<T>(gt_bbox.data<T>(), 4, reinterpret_cast<int*>(&fg_bg_gt[2][0]),
-                  cur_fg_num, gt_data);
-        BoxToDelta<T>(cur_fg_num, tgt_bbox, fg_gt, nullptr, false, &tgt_bbox);
-      }
-
-      loc_index += cur_fg_num;
-      score_index += cur_fg_num + cur_bg_num;
-      fg_num += cur_fg_num;
-      bg_num += cur_bg_num;
-    }
-
-    int lbl_num = fg_num + bg_num;
-    PADDLE_ENFORCE_LE(fg_num, max_num);
-    PADDLE_ENFORCE_LE(lbl_num, max_num);
-
-    tgt_bbox_t->Resize({fg_num, 4});
-    loc_index_t->Resize({fg_num});
-    score_index_t->Resize({lbl_num});
-    auto* lbl_data = tgt_lbl_t->mutable_data<int64_t>({lbl_num, 1}, place);
-    Gather<int64_t>(tmp_lbl_data, 1, score_index_t->data<int>(), lbl_num,
-                    lbl_data);
+template <typename T>
+std::vector<Tensor> FilterStraddleAnchor(
+    const platform::CPUDeviceContext& context, const Tensor* anchor,
+    const float rpn_straddle_thresh, T im_height, T im_width) {
+  std::vector<int> inds_inside;
+  int anchor_num = anchor->dims()[0];
+  auto* anchor_data = anchor->data<T>();
+  if (rpn_straddle_thresh >= 0) {
+    int index;
+    for (int i = 0; i < anchor_num; ++i) {
+      index = i * 4;
+      if ((anchor_data[index + 0] >= -rpn_straddle_thresh) &&
+          (anchor_data[index + 1] >= -rpn_straddle_thresh) &&
+          (anchor_data[index + 2] < im_width + rpn_straddle_thresh) &&
+          (anchor_data[index + 3] < im_height + rpn_straddle_thresh)) {
+        inds_inside.emplace_back(i);
      }
-
- private:
-  void ScoreAssign(const T* dist_data, const Tensor& anchor_to_gt_max,
-                   const int row, const int col, const float pos_threshold,
-                   const float neg_threshold, int64_t* target_label,
-                   std::vector<int>* fg_inds, std::vector<int>* bg_inds) const {
-    float epsilon = 0.0001;
-    for (int64_t i = 0; i < row; ++i) {
-      const T* v = dist_data + i * col;
-      T max = *std::max_element(v, v + col);
-      for (int64_t j = 0; j < col; ++j) {
-        if (std::abs(max - v[j]) < epsilon) {
-          target_label[j] = 1;
    }
+  } else {
+    for (int i = 0; i < anchor_num; ++i) {
+      inds_inside.emplace_back(i);
    }
  }
+  int inside_num = inds_inside.size();
+  Tensor inds_inside_t;
+  int* inds_inside_data =
+      inds_inside_t.mutable_data<int>({inside_num}, context.GetPlace());
+  std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data);
+  Tensor inside_anchor_t;
+  T* inside_anchor_data =
+      inside_anchor_t.mutable_data<T>({inside_num, 4}, context.GetPlace());
+  Gather<T>(anchor->data<T>(), 4, inds_inside_data, inside_num,
+            inside_anchor_data);
+  std::vector<Tensor> res;
+  res.emplace_back(inds_inside_t);
+  res.emplace_back(inside_anchor_t);
+  return res;
+}

-    // Pick the fg/bg
-    const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
-    for (int64_t j = 0; j < col; ++j) {
-      if (anchor_to_gt_max_data[j] >= pos_threshold) {
-        target_label[j] = 1;
-      } else if (anchor_to_gt_max_data[j] < neg_threshold) {
-        target_label[j] = 0;
-      }
-      if (target_label[j] == 1) {
-        fg_inds->push_back(j);
-      } else if (target_label[j] == 0) {
-        bg_inds->push_back(j);
-      }
+template <typename T>
+Tensor FilterCrowdGt(const platform::CPUDeviceContext& context,
+                     Tensor* gt_boxes, Tensor* is_crowd) {
+  int gt_num = gt_boxes->dims()[0];
+  std::vector<int> not_crowd_inds;
+  auto* is_crowd_data = is_crowd->data<int>();
+  for (int i = 0; i < gt_num; ++i) {
+    if (is_crowd_data[i] == 0) {
+      not_crowd_inds.emplace_back(i);
    }
  }
-
-  void ReservoirSampling(const int num, std::minstd_rand engine,
-                         std::vector<int>* inds) const {
+  int ncrowd_num = not_crowd_inds.size();
+  Tensor ncrowd_gt_boxes;
+  T* ncrowd_gt_boxes_data =
+      ncrowd_gt_boxes.mutable_data<T>({ncrowd_num, 4}, context.GetPlace());
+  Gather<T>(gt_boxes->data<T>(), 4, not_crowd_inds.data(), ncrowd_num,
+            ncrowd_gt_boxes_data);
+  return ncrowd_gt_boxes;
+}
+
+void ReservoirSampling(const int num, std::vector<int>* inds,
+                       std::minstd_rand engine, bool use_random) {
  std::uniform_real_distribution<float> uniform(0, 1);
  size_t len = inds->size();
  if (len > static_cast<size_t>(num)) {
+    if (use_random) {
      for (size_t i = num; i < len; ++i) {
        int rng_ind = std::floor(uniform(engine) * i);
        if (rng_ind < num)
          std::iter_swap(inds->begin() + rng_ind, inds->begin() + i);
      }
+    }
    inds->resize(num);
  }
+}
+
+template <typename T>
+void ScoreAssign(const T* anchor_by_gt_overlap_data,
+                 const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max,
+                 const int rpn_batch_size_per_im, const float rpn_fg_fraction,
+                 const float rpn_positive_overlap,
+                 const float rpn_negative_overlap, std::vector<int>* fg_inds,
+                 std::vector<int>* bg_inds, std::vector<int>* tgt_lbl,
+                 std::minstd_rand engine, bool use_random) {
+  float epsilon = 0.00001;
+  int anchor_num = anchor_to_gt_max.dims()[0];
+  int gt_num = gt_to_anchor_max.dims()[0];
+  std::vector<int> target_label(anchor_num, -1);
+  std::vector<int> fg_inds_fake;
+  std::vector<int> bg_inds_fake;
+  const T* anchor_to_gt_max_data = anchor_to_gt_max.data<T>();
+  const T* gt_to_anchor_max_data = gt_to_anchor_max.data<T>();
+  // TODO(buxingyuan): Match with Detectron now
+  // but there seems to be a bug in the two-direction assignment,
+  // in which the later one may overwrite the former one.
+  for (int64_t i = 0; i < anchor_num; ++i) {
+    bool is_anchors_with_max_overlap = false;
+    for (int64_t j = 0; j < gt_num; ++j) {
+      T value = anchor_by_gt_overlap_data[i * gt_num + j];
+      T diff = std::abs(value - gt_to_anchor_max_data[j]);
+      if (diff < epsilon) {
+        is_anchors_with_max_overlap = true;
+        break;
+      }
+    }
+    bool is_anchor_great_than_thresh =
+        (anchor_to_gt_max_data[i] >= rpn_positive_overlap);
+    if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) {
+      fg_inds_fake.push_back(i);
+    }
  }

-  // std::vector<std::vector<int>> RpnTargetAssign(
-  std::vector<std::vector<int>> SampleFgBgGt(
-      const platform::CPUDeviceContext& ctx, const Tensor& dist,
-      const float pos_threshold, const float neg_threshold,
-      const int rpn_batch_size, const int fg_num, std::minstd_rand engine,
-      int64_t* target_label) const {
-    auto* dist_data = dist.data<T>();
-    int row = dist.dims()[0];
-    int col = dist.dims()[1];
+  // Reservoir Sampling
+  int fg_num = static_cast<int>(rpn_fg_fraction * rpn_batch_size_per_im);
+  ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random);
+  fg_num = static_cast<int>(fg_inds_fake.size());
+  for (int64_t i = 0; i < fg_num; ++i) {
+    target_label[fg_inds_fake[i]] = 1;
+  }
+
+  int bg_num = rpn_batch_size_per_im - fg_num;
+  for (int64_t i = 0; i < anchor_num; ++i) {
+    if (anchor_to_gt_max_data[i] < rpn_negative_overlap) {
+      bg_inds_fake.push_back(i);
+    }
+  }
+  ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random);
+  bg_num = static_cast<int>(bg_inds_fake.size());
+  for (int64_t i = 0; i < bg_num; ++i) {
+    target_label[bg_inds_fake[i]] = 0;
+  }
+
+  for (int64_t i = 0; i < anchor_num; ++i) {
+    if (target_label[i] == 1) fg_inds->emplace_back(i);
+    if (target_label[i] == 0) bg_inds->emplace_back(i);
+  }
+  fg_num = fg_inds->size();
+  bg_num = bg_inds->size();
+
+  tgt_lbl->resize(fg_num + bg_num, 0);
+  std::vector<int> fg_lbl(fg_num, 1);
+  std::vector<int> bg_lbl(bg_num, 0);
+  std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data());
+  std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num);
+}
+
+template <typename T>
+std::vector<Tensor> SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx,
+                                    const Tensor& anchor_by_gt_overlap,
+                                    const int rpn_batch_size_per_im,
+                                    const float rpn_positive_overlap,
+                                    const float rpn_negative_overlap,
+                                    const float rpn_fg_fraction,
+                                    std::minstd_rand engine, bool use_random) {
+  auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data<T>();
+  int anchor_num = anchor_by_gt_overlap.dims()[0];
+  int gt_num = anchor_by_gt_overlap.dims()[1];

  std::vector<int> fg_inds;
  std::vector<int> bg_inds;
  std::vector<int> gt_inds;
+  std::vector<int> tgt_lbl;

  // Calculate the max IoU between anchors and gt boxes
  // Map from anchor to gt box that has highest overlap
  auto place = ctx.GetPlace();
-    Tensor anchor_to_gt_max, anchor_to_gt_argmax;
-    anchor_to_gt_max.mutable_data<T>({col}, place);
-    int* argmax = anchor_to_gt_argmax.mutable_data<int>({col}, place);
-
-    auto x = framework::EigenMatrix<T>::From(dist);
-    auto x_col_max = framework::EigenVector<T>::Flatten(anchor_to_gt_max);
-    auto x_col_argmax =
+  Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max;
+  anchor_to_gt_max.mutable_data<T>({anchor_num}, place);
+  int* argmax = anchor_to_gt_argmax.mutable_data<int>({anchor_num}, place);
+  gt_to_anchor_max.mutable_data<T>({gt_num}, place);
+
+  auto anchor_by_gt_overlap_et =
+      framework::EigenMatrix<T>::From(anchor_by_gt_overlap);
+  auto anchor_to_gt_max_et =
+      framework::EigenVector<T>::Flatten(anchor_to_gt_max);
+  auto gt_to_anchor_max_et =
+      framework::EigenVector<T>::Flatten(gt_to_anchor_max);
+  auto anchor_to_gt_argmax_et =
      framework::EigenVector<int>::Flatten(anchor_to_gt_argmax);
-    x_col_max = x.maximum(Eigen::DSizes<int, 1>(0));
-    x_col_argmax = x.argmax(0).template cast<int>();
+  anchor_to_gt_max_et =
+      anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(1));
+  anchor_to_gt_argmax_et =
+      anchor_by_gt_overlap_et.argmax(1).template cast<int>();
+  gt_to_anchor_max_et =
+      anchor_by_gt_overlap_et.maximum(Eigen::DSizes<int, 1>(0));

  // Follow the Faster RCNN's implementation
-    ScoreAssign(dist_data, anchor_to_gt_max, row, col, pos_threshold,
-                neg_threshold, target_label, &fg_inds, &bg_inds);
-    // Reservoir Sampling
-    ReservoirSampling(fg_num, engine, &fg_inds);
-    int fg_num2 = static_cast<int>(fg_inds.size());
-    int bg_num = rpn_batch_size - fg_num2;
-    ReservoirSampling(bg_num, engine, &bg_inds);
-
-    gt_inds.reserve(fg_num2);
-    for (int i = 0; i < fg_num2; ++i) {
+  ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max,
+              rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap,
+              rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, engine,
+              use_random);
+
+  int fg_num = fg_inds.size();
+  int bg_num = bg_inds.size();
+  gt_inds.reserve(fg_num);
+  for (int i = 0; i < fg_num; ++i) {
    gt_inds.emplace_back(argmax[fg_inds[i]]);
  }
-    std::vector<std::vector<int>> fg_bg_gt;
-    fg_bg_gt.emplace_back(fg_inds);
-    fg_bg_gt.emplace_back(bg_inds);
-    fg_bg_gt.emplace_back(gt_inds);

-    return fg_bg_gt;
+  Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t;
+  int* loc_index_data = loc_index_t.mutable_data<int>({fg_num}, place);
+  int* score_index_data =
+      score_index_t.mutable_data<int>({fg_num + bg_num}, place);
+  int* tgt_lbl_data = tgt_lbl_t.mutable_data<int>({fg_num + bg_num}, place);
+  int* gt_inds_data = gt_inds_t.mutable_data<int>({fg_num}, place);
+  std::copy(fg_inds.begin(), fg_inds.end(), loc_index_data);
+  std::copy(fg_inds.begin(), fg_inds.end(), score_index_data);
+  std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num);
+  std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data);
+  std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data);
+  std::vector<Tensor> loc_score_tgtlbl_gt;
+  loc_score_tgtlbl_gt.emplace_back(loc_index_t);
+  loc_score_tgtlbl_gt.emplace_back(score_index_t);
+  loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t);
+  loc_score_tgtlbl_gt.emplace_back(gt_inds_t);
+
+  return loc_score_tgtlbl_gt;
+}
+
+template <typename T>
+class RpnTargetAssignKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* anchor = context.Input<Tensor>("Anchor");  // (H*W*A) * 4
+    auto* gt_boxes = context.Input<LoDTensor>("GtBoxes");
+    auto* is_crowd = context.Input<LoDTensor>("IsCrowd");
+    auto* im_info = context.Input<LoDTensor>("ImInfo");
+
+    auto* loc_index = context.Output<LoDTensor>("LocationIndex");
+    auto* score_index = context.Output<LoDTensor>("ScoreIndex");
+    auto* tgt_bbox = context.Output<LoDTensor>("TargetBBox");
+    auto* tgt_lbl = context.Output<LoDTensor>("TargetLabel");
+
+    PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL,
+                      "RpnTargetAssignOp gt_boxes needs 1 level of LoD");
+    PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL,
+                      "RpnTargetAssignOp is_crowd needs 1 level of LoD");
+    int64_t anchor_num = static_cast<int64_t>(anchor->dims()[0]);
+    int64_t batch_num = static_cast<int64_t>(gt_boxes->lod().back().size() - 1);
+
+    int rpn_batch_size_per_im = context.Attr<int>("rpn_batch_size_per_im");
+    float rpn_straddle_thresh = context.Attr<float>("rpn_straddle_thresh");
+    float rpn_positive_overlap = context.Attr<float>("rpn_positive_overlap");
+    float rpn_negative_overlap = context.Attr<float>("rpn_negative_overlap");
+    float rpn_fg_fraction = context.Attr<float>("rpn_fg_fraction");
+    bool use_random = context.Attr<bool>("use_random");
+
+    int64_t max_num = batch_num * rpn_batch_size_per_im;
+    auto place = context.GetPlace();
+
+    loc_index->mutable_data<int>({max_num}, place);
+    score_index->mutable_data<int>({max_num}, place);
+    tgt_bbox->mutable_data<T>({max_num, 4}, place);
+    tgt_lbl->mutable_data<int>({max_num, 1}, place);
+
+    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
+
+    std::random_device rnd;
+    std::minstd_rand engine;
+    int seed = rnd();
+    engine.seed(seed);
+
+    framework::LoD lod_loc, loc_score;
+    std::vector<size_t> lod0_loc(1, 0);
+    std::vector<size_t> lod0_score(1, 0);
+
+    int total_loc_num = 0;
+    int total_score_num = 0;
+    auto gt_boxes_lod = gt_boxes->lod().back();
+    auto is_crowd_lod = is_crowd->lod().back();
+    for (int i = 0; i < batch_num; ++i) {
+      Tensor gt_boxes_slice =
+          gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]);
+      Tensor is_crowd_slice =
+          is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]);
+      Tensor im_info_slice = im_info->Slice(i, i + 1);
+      auto* im_info_data = im_info_slice.data<T>();
+      auto im_height = im_info_data[0];
+      auto im_width = im_info_data[1];
+      auto im_scale = im_info_data[2];
+
+      // Filter straddle anchor
+      std::vector<Tensor> filter_output = FilterStraddleAnchor<T>(
+          dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width);
+      Tensor inds_inside = filter_output[0];
+      Tensor inside_anchor = filter_output[1];
+
+      // Filter crowd gt
+      Tensor ncrowd_gt_boxes =
+          FilterCrowdGt<T>(dev_ctx, &gt_boxes_slice, &is_crowd_slice);
+      auto ncrowd_gt_boxes_et =
+          framework::EigenTensor<T, 2>::From(ncrowd_gt_boxes);
+      ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale;
+
+      Tensor anchor_by_gt_overlap;
+      anchor_by_gt_overlap.mutable_data<T>(
+          {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place);
+      BboxOverlaps<T>(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap);
+
+      auto loc_score_tgtlbl_gt = SampleRpnFgBgGt<T>(
+          dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im,
+          rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine,
+          use_random);
+
+      Tensor sampled_loc_index = loc_score_tgtlbl_gt[0];
+      Tensor sampled_score_index = loc_score_tgtlbl_gt[1];
+      Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2];
+      Tensor sampled_gt_index = loc_score_tgtlbl_gt[3];
+
+      int loc_num = sampled_loc_index.dims()[0];
+      int score_num = sampled_score_index.dims()[0];
+      // unmap to all anchor
+      Tensor sampled_loc_index_unmap, sampled_score_index_unmap;
+      sampled_loc_index_unmap.mutable_data<int>({loc_num}, place);
+      sampled_score_index_unmap.mutable_data<int>({score_num}, place);
+      Gather<int>(inds_inside.data<int>(), 1, sampled_loc_index.data<int>(),
+                  loc_num, sampled_loc_index_unmap.data<int>());
+      Gather<int>(inds_inside.data<int>(), 1, sampled_score_index.data<int>(),
+                  score_num, sampled_score_index_unmap.data<int>());
+
+      // get target bbox deltas
+      Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox;
+      auto* sampled_anchor_data =
+          sampled_anchor.mutable_data<T>({loc_num, 4}, place);
+      auto* sampled_gt_data = sampled_gt.mutable_data<T>({loc_num, 4}, place);
+      Gather<T>(anchor->data<T>(), 4, sampled_loc_index_unmap.data<int>(),
+                loc_num, sampled_anchor_data);
+      Gather<T>(ncrowd_gt_boxes.data<T>(), 4, sampled_gt_index.data<int>(),
+                loc_num, sampled_gt_data);
+      sampled_tgt_bbox.mutable_data<T>({loc_num, 4}, place);
+      BoxToDelta<T>(loc_num, sampled_anchor, sampled_gt, nullptr, false,
+                    &sampled_tgt_bbox);
+
+      // Add anchor offset
+      int anchor_offset = i * anchor_num;
+      auto sampled_loc_index_unmap_et =
+          framework::EigenTensor<int, 1>::From(sampled_loc_index_unmap);
+      sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset;
+      auto sampled_score_index_unmap_et =
+          framework::EigenTensor<int, 1>::From(sampled_score_index_unmap);
+      sampled_score_index_unmap_et =
+          sampled_score_index_unmap_et + anchor_offset;
+      AppendRpns<int>(loc_index, total_loc_num, &sampled_loc_index_unmap);
+      AppendRpns<int>(score_index, total_score_num, &sampled_score_index_unmap);
+      AppendRpns<T>(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox);
+      AppendRpns<int>(tgt_lbl, total_score_num, &sampled_tgtlbl);
+      total_loc_num += loc_num;
+
+      total_score_num += score_num;
+      lod0_loc.emplace_back(total_loc_num);
+      lod0_score.emplace_back(total_score_num);
+    }
+
+    PADDLE_ENFORCE_LE(total_loc_num, max_num);
+    PADDLE_ENFORCE_LE(total_score_num, max_num);
+
+    lod_loc.emplace_back(lod0_loc);
+    loc_score.emplace_back(lod0_score);
+    loc_index->set_lod(lod_loc);
+    score_index->set_lod(loc_score);
+    tgt_bbox->set_lod(lod_loc);
+    tgt_lbl->set_lod(loc_score);
+    loc_index->Resize({total_loc_num});
+    score_index->Resize({total_score_num});
+    tgt_bbox->Resize({total_loc_num, 4});
+    tgt_lbl->Resize({total_score_num, 1});
  }
 };

@@ -259,18 +460,22 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("Anchor",
             "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4].");
-    AddInput("GtBox", "(LoDTensor) input groud-truth bbox with shape [K, 4].");
-    AddInput(
-        "DistMat",
-        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
-        "[K, M]. It is pair-wise distance matrix between the entities "
-        "represented by each row and each column. For example, assumed one "
-        "entity is A with shape [K], another entity is B with shape [M]. The "
-        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
-        "the distance is, the better macthing the pairs are. Please note, "
-        "This tensor can contain LoD information to represent a batch of "
-        "inputs. One instance of this batch can contain different numbers of "
-        "entities.");
+    AddInput("GtBoxes",
+             "(LoDTensor) input ground-truth bbox with shape [K, 4].");
+    AddInput("IsCrowd",
+             "(LoDTensor) input which indicates ground-truth is crowd.");
+    AddInput("ImInfo",
+             "(LoDTensor) input image information with shape [N, 3]. "
+             "N is the batch size, each image information includes height, "
+             "width and scale.");
+    AddAttr<int>("rpn_batch_size_per_im",
+                 "Total number of RPN examples per image.")
+        .SetDefault(256);
+    AddAttr<float>(
+        "rpn_straddle_thresh",
+        "Remove RPN anchors that go outside the image by straddle_thresh "
+        "pixels. "
+        "Set to -1 or a large value, e.g. 100000, to disable pruning anchors.");
    AddAttr<float>(
        "rpn_positive_overlap",
        "Minimum overlap required between an anchor and ground-truth "
@@ -282,20 +487,15 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
        "box for the (anchor, gt box) pair to be a negative examples.")
        .SetDefault(0.3);
    AddAttr<float>(
-        "fg_fraction",
+        "rpn_fg_fraction",
        "Target fraction of RoI minibatch that "
        "is labeled foreground (i.e. class > 0), 0-th class is background.")
        .SetDefault(0.25);
-    AddAttr<int>("rpn_batch_size_per_im",
-                 "Total number of RPN examples per image.")
-        .SetDefault(256);
-    AddAttr<bool>("fix_seed",
-                  "A flag indicating whether to use a fixed seed to generate "
-                  "random mask. NOTE: DO NOT set this flag to true in "
-                  "training. Setting this flag to true is only useful in "
-                  "unittest.")
-        .SetDefault(false);
-    AddAttr<int>("seed", "RpnTargetAssign random seed.").SetDefault(0);
+    AddAttr<bool>("use_random",
+                  "A flag indicating whether to use ReservoirSampling. "
+                  "NOTE: DO NOT set this flag to false in training. "
+                  "Setting this flag to false is only useful in unittest.")
+        .SetDefault(true);
    AddOutput(
        "LocationIndex",
        "(Tensor), The indexes of foreground anchors in all RPN anchors, the "
@@ -308,16 +508,16 @@ class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker {
        "ScoreIndex is [F + B], F and B are sampled foreground and backgroud "
        " number.");
    AddOutput("TargetBBox",
-              "(Tensor<int64_t>), The target bbox deltas with shape "
+              "(Tensor), The target bbox deltas with shape "
              "[F, 4], F is the sampled foreground number.");
    AddOutput(
        "TargetLabel",
-        "(Tensor<int64_t>), The target labels of each anchor with shape "
+        "(Tensor<int>), The target labels of each anchor with shape "
        "[F + B, 1], F and B are sampled foreground and backgroud number.");
    AddComment(R"DOC(
-This operator can be, for given the IoU between the ground truth bboxes and the
+This operator can be, for a given set of ground truth bboxes and the
 anchors, to assign classification and regression targets to each prediction.
-The Score index and LocationIndex will be generated according to the DistMat.
+The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU.
 The rest anchors would not contibute to the RPN training loss

 ScoreIndex is composed of foreground anchor indexes(positive labels) and

--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -125,7 +125,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
  VarHandlePtr h(new VarHandle(ep, "Get", var_name_val, p_ctx, p_scope));
  s->Prepare(h, time_out);

-  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, this] {
+  framework::AsyncIO([var_name_val, s, this] {
    // prepare input
    sendrecv::VariableMessage req;
    req.set_varname(var_name_val);
@@ -166,7 +166,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
  s->Prepare(h, time_out);

  framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
-                      time_out, s, this] {
+                      s, this] {
    auto* var = p_scope->FindVar(in_var_name_val);

    ::grpc::ByteBuffer req;
@@ -290,13 +290,19 @@ void GRPCClient::Proceed() {
      c->Finish(false);
    }

-    delete c;
+    bool notify = false;
    {
      std::lock_guard<std::mutex> lk(sync_mutex_);
      req_count_--;
+      notify = (req_count_ <= 0 || !c->status_.ok());
    }
+
+    delete c;
+
+    if (notify) {
      sync_cond_.notify_all();
    }
+  }
  VLOG(3) << "GRPCClient Proceed end";
 }


--- a/paddle/fluid/operators/distributed/proto_encoder_helper.h
+++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h
@@ -82,8 +82,10 @@ class ProtoEncodeHelper {
      : base_(buf), p_(buf), limit_(base_ + max_size) {}

  ~ProtoEncodeHelper() {
+#define REPLACE_ENFORCE_GLOG 1
    // Make sure callers didn't do operations that went over max_size promised
-    PADDLE_ENFORCE_LE(p_, limit_);
+    paddle::platform::throw_on_error(p_ <= limit_);
+#undef REPLACE_ENFORCE_GLOG
  }

  const char* data() const { return base_; }

--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -56,7 +56,7 @@ class VarHandle {
            const std::string& name,
            const platform::DeviceContext* p_ctx = nullptr,
            const framework::Scope* p_scope = nullptr)
-      : ok_(kVarHandleDefaultState) {
+      : status_(kDefaultState) {
    ep_ = ep;
    ctx_ = p_ctx;
    scope_ = p_scope;
@@ -68,18 +68,20 @@ class VarHandle {

 public:
  bool Wait() {
+    int ret = kDefaultState;
    {
      std::unique_lock<std::mutex> lk(sync_mutex_);
-      wait_cond_.wait(lk, [this] { return ok_ != kVarHandleDefaultState; });
+      wait_cond_.wait(lk, [this] { return status_ != kDefaultState; });
+      ret = status_;
    }
-    VLOG(7) << "VarHandle wait:" << ok_;
-    return ok_ != 0;
+    VLOG(7) << "VarHandle wait:" << ret;
+    return ret != kErrorState;
  }

  void Finish(bool ok) {
    {
      std::unique_lock<std::mutex> lk(sync_mutex_);
-      ok_ = ok;
+      status_ = ok ? kFinishState : kErrorState;
    }
    VLOG(7) << "VarHandle finish:" << ok;
    wait_cond_.notify_all();
@@ -87,8 +89,8 @@ class VarHandle {

  std::string String() const {
    std::ostringstream s;
-    s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], ok:[" << ok_
-      << "]";
+    s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:["
+      << status_ << "]";
    return s.str();
  }

@@ -111,9 +113,13 @@ class VarHandle {
 protected:
  std::mutex sync_mutex_;
  std::condition_variable wait_cond_;
-  int ok_;

-  static const int kVarHandleDefaultState = -1;
+  enum VarHandleStatus {
+    kDefaultState = -1,
+    kErrorState = 0,
+    kFinishState = 1,
+  };
+  VarHandleStatus status_;

 private:
  DISABLE_COPY_AND_ASSIGN(VarHandle);

--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fusion_lstm_op.h"
 #include <string>
 #include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc_compute.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
@@ -269,7 +270,6 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
  blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
            wh_data, D4, static_cast<T>(1), out, D4)

-// gates: W_ch, W_ih, W_fh, W_oh
 #define GET_Ct(ct_1, gates, ct)                   \
  /* C_t = C_t-1 * fgated + cand_gated * igated*/ \
  act_cand(D, gates, gates);                      \
@@ -395,11 +395,22 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
        }
      }
    } else {
+      // TODO(TJ): ugly workaround, clean me
+      std::function<void(T*, const T*, T*, T*)> compute_ctht;
+      if (platform::jit::MayIUse(platform::jit::avx) &&
+          act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
+          act_cell_str == "tanh" && D == 8) {
+        compute_ctht = math::lstm_compute_ctht<T>;
+      } else {
+        compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
+          COMPUTE_CtHt(gates, ct_1, ct, ht);
+        };
+      }
      for (int i = 0; i < N; ++i) {
        PROCESS_H0C0
        for (int step = tstart; step < seq_len; ++step) {
          GEMM_WH_ADDON(1, prev_h_data, xx_data);
-          COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
+          compute_ctht(xx_data, prev_c_data, c_out_data, h_out_data);
          MOVE_ONE_STEP;
        }
      }
@@ -532,12 +543,23 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
        MOVE_ONE_STEP;
      }
    } else {
+      // TODO(TJ): ugly workaround, clean me
+      std::function<void(T*, const T*, T*, T*)> compute_ctht;
+      if (platform::jit::MayIUse(platform::jit::avx) &&
+          act_gate_str == "sigmoid" && act_cand_str == "tanh" &&
+          act_cell_str == "tanh" && D == 8) {
+        compute_ctht = math::lstm_compute_ctht<T>;
+      } else {
+        compute_ctht = [&](T* gates, const T* ct_1, T* ct, T* ht) {
+          COMPUTE_CtHt(gates, ct_1, ct, ht);
+        };
+      }
      for (int step = tstart; step < max_seq_len; ++step) {
        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
        DEFINE_CUR;
        for (int i = 0; i < cur_bs; ++i) {
-          COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
+          compute_ctht(cur_in_data, cur_prev_c_data, cur_c_out_data,
                       cur_h_out_data);
          MOVE_ONE_BATCH;
        }

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -59,8 +59,7 @@ static void ParallelExecuteBlocks(
    framework::ProgramDesc *program, framework::Scope *scope) {
  std::vector<std::future<void>> fs;
  for (size_t idx : parallel_blkids) {
-    fs.push_back(
-        framework::Async([&executor, &prepared, &program, &scope, idx]() {
+    fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
      int run_block = idx;  // thread local
      try {
        VLOG(3) << "running server block: " << run_block

--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -45,6 +45,8 @@ math_library(im2col)
 if (NOT WIN32) # windows do not support avx functions yet.
 math_library(gru_compute DEPS activation_functions math_function)
 math_library(lstm_compute DEPS activation_functions)
+# TODO(TJ): ugly workaround, clean me
+cc_library(cpu_lstm_compute SRCS cpu_lstm_compute.cc DEPS activation_functions cblas cpu_info)
 endif (NOT WIN32)

 cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)

--- a/paddle/fluid/operators/math/cpu_lstm_compute.cc
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/math/cpu_lstm_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/cpu_lstm_compute.h
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/platform/cpu_info.h"
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// TODO(TJ): ugly workaround, clean me
+//
+// Computes one LSTM cell/hidden update for a fixed width of 8 elements.
+//   gates: 32 values laid out as four groups of 8 (W_ch, W_ih, W_fh, W_oh);
+//          activated in place, so the buffer is overwritten.
+//   ct_1:  previous cell state (8 values).
+//   ct:    receives the new cell state (8 values).
+//   ht:    receives the new hidden state (8 values).
+template <typename T>
+void lstm_compute_ctht(T* gates, const T* ct_1, T* ct, T* ht) {
+  // gates: W_ch, W_ih, W_fh, W_oh
+  T* const cand = gates;
+  T* const gated = gates + 8;
+  // The input, forget and output groups are contiguous, so a single sigmoid
+  // call over 24 values activates all three; the candidate group gets tanh.
+  vec_sigmoid<T, platform::jit::avx>(24, gated, gated);
+  vec_tanh<T, platform::jit::avx>(8, cand, cand);
+  const T* const in_gate = gated;
+  const T* const forget_gate = gated + 8;
+  const T* const out_gate = gated + 16;
+  const T lower = SIGMOID_THRESHOLD_MIN;
+  const T upper = SIGMOID_THRESHOLD_MAX;
+  for (int k = 0; k < 8; ++k) {
+    // C_t = C_t-1 * fgated + cand_gated * igated
+    ct[k] = ct_1[k] * forget_gate[k] + cand[k] * in_gate[k];
+    // H_t = act_cell(C_t) * ogated, where the cell activation is evaluated as
+    // 2 / (1 + exp(-2x)) - 1 with 2x clamped to the sigmoid thresholds.
+    T x = ct[k] * 2;
+    if (x < lower) {
+      x = lower;
+    } else if (x > upper) {
+      x = upper;
+    }
+    x = static_cast<T>(0) - x;
+    vec_exp<T>(1, &x, &x);
+    x = static_cast<T>(2) / (static_cast<T>(1) + x) - static_cast<T>(1);
+    ht[k] = x * out_gate[k];
+  }
+}
+
+#ifdef __AVX__
+namespace detail {
+namespace forward {
+namespace avx {
+// AVX activation kernels. They are only declared here; a definition must be
+// supplied by another translation unit at link time.
+__m256 Sigmoid(const __m256 a);
+__m256 Tanh(const __m256 a);
+}  // namespace avx
+}  // namespace forward
+}  // namespace detail
+
+// AVX specialization of lstm_compute_ctht for float: one LSTM cell/hidden
+// update over exactly 8 floats per gate, using unaligned 256-bit loads/stores.
+//   gates: 32 floats laid out as W_ch, W_ih, W_fh, W_oh (read only here).
+//   ct_1:  previous cell state (8 floats).
+//   ct:    receives the new cell state (8 floats).
+//   ht:    receives the new hidden state (8 floats).
+//
+// NOTE: an explicit full specialization is an ordinary function rather than a
+// template, so it is not implicitly inline. Because it is defined in a header,
+// it must be marked `inline`; otherwise every translation unit that includes
+// this header emits its own definition and the program violates the ODR
+// (typically a "multiple definition" link error).
+template <>
+inline void lstm_compute_ctht<float>(float* gates, const float* ct_1,
+                                     float* ct, float* ht) {
+  namespace act = detail::forward::avx;
+  // gates: W_ch, W_ih, W_fh, W_oh
+  __m256 c, i, f, o;
+  c = _mm256_loadu_ps(gates);
+  i = _mm256_loadu_ps(gates + 8);
+  f = _mm256_loadu_ps(gates + 16);
+  o = _mm256_loadu_ps(gates + 24);
+
+  /* C_t = C_t-1 * fgated + cand_gated * igated*/
+  c = _mm256_mul_ps(act::Tanh(c), act::Sigmoid(i));
+  // `i` is no longer needed as the input gate; reuse it to hold C_t-1.
+  i = _mm256_loadu_ps(ct_1);
+  f = _mm256_mul_ps(i, act::Sigmoid(f));
+  f = _mm256_add_ps(c, f);
+  _mm256_storeu_ps(ct, f);
+
+  /* H_t = act_cell(C_t) * ogated */
+  o = _mm256_mul_ps(act::Tanh(f), act::Sigmoid(o));
+  _mm256_storeu_ps(ht, o);
+}
+#endif
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/cpu_vec.h
+++ b/paddle/fluid/operators/math/cpu_vec.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <functional>
 #include <string>
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/fluid/platform/enforce.h"
 #ifdef __AVX__
 #include <immintrin.h>
 #endif
@@ -476,7 +477,7 @@ class VecActivations {
    } else if (type == "identity" || type == "") {
      return vec_identity<T, isa>;
    }
-    LOG(FATAL) << "Not support type: " << type;
+    PADDLE_THROW("Not support type: %s", type);
  }
 };


--- a/paddle/fluid/operators/math/sequence_pooling.cc
+++ b/paddle/fluid/operators/math/sequence_pooling.cc
@@ -103,6 +103,58 @@ class MaxSeqPoolGradFunctor {
  }
 };

+// CPU functor for "LAST" sequence pooling: for every sequence described by
+// the first LoD level of `input`, copies that sequence's final item into the
+// next output slot.
+template <typename T>
+class LastSeqPoolFunctor {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input,
+                  framework::Tensor* output) {
+    auto* src = input.data<T>();
+    auto* dst = output->data<T>();
+
+    // Number of elements in one item (one row along dimension 0).
+    int64_t width = input.numel() / input.dims()[0];
+    auto level0 = input.lod()[0];
+    int num_seqs = static_cast<int>(level0.size()) - 1;
+    for (int s = 0; s < num_seqs; ++s) {
+      int64_t rows = static_cast<int64_t>(level0[s + 1] - level0[s]);
+      // Step past the whole sequence; its last item then sits exactly one
+      // item before the cursor.
+      src += rows * width;
+      std::memcpy(dst, (src - width), width * sizeof(T));
+      dst += width;
+    }
+  }
+};
+
+// CPU functor for "FIRST" sequence pooling: for every sequence described by
+// the first LoD level of `input`, copies that sequence's first item into the
+// next output slot.
+template <typename T>
+class FirstSeqPoolFunctor {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input,
+                  framework::Tensor* output) {
+    auto* src = input.data<T>();
+    auto* dst = output->data<T>();
+
+    // Number of elements in one item (one row along dimension 0).
+    int64_t width = input.numel() / input.dims()[0];
+    auto level0 = input.lod()[0];
+    int num_seqs = static_cast<int>(level0.size()) - 1;
+    for (int s = 0; s < num_seqs; ++s) {
+      // The cursor is at the start of sequence `s`, i.e. at its first item.
+      std::memcpy(dst, src, width * sizeof(T));
+      // Advance the cursor over the full sequence to reach the next one.
+      int64_t rows = static_cast<int64_t>(level0[s + 1] - level0[s]);
+      src += rows * width;
+      dst += width;
+    }
+  }
+};
+
 template <typename T>
 class SequencePoolFunctor<platform::CPUDeviceContext, T> {
 public:
@@ -116,6 +168,16 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
      max_pool(context, input, output, index);
      return;
    }
+    if (pooltype == "LAST") {
+      math::LastSeqPoolFunctor<T> last_pool;
+      last_pool(context, input, output);
+      return;
+    }
+    if (pooltype == "FIRST") {
+      math::FirstSeqPoolFunctor<T> first_pool;
+      first_pool(context, input, output);
+      return;
+    }
    auto lod = input.lod()[0];
    auto& place = *context.eigen_device();
    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
@@ -133,10 +195,6 @@ class SequencePoolFunctor<platform::CPUDeviceContext, T> {
      } else if (pooltype == "SQRT") {
        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
                              std::sqrt(static_cast<T>(h));
-      } else if (pooltype == "LAST") {
-        out_e.device(place) = in_e.chip(h - 1, 0);
-      } else if (pooltype == "FIRST") {
-        out_e.device(place) = in_e.chip(0, 0);
      } else {
        PADDLE_THROW("unsupported pooling pooltype");
      }

--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -71,8 +71,7 @@ class MaxOutOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of MaxoutOp"
-                   "should not be null.");
+                   "Input(X) of MaxoutOpshould not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of MaxoutOp should not be null.");
    auto in_x_dims = ctx->GetInputDim("X");
@@ -90,9 +89,10 @@ class MaxOutOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MaxOutOpGrad must not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Input(X@GRAD) should not be null.");
+                   "Output(Grad@X) of MaxOutOpGrad should not be null.");
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }
 };

--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -26,10 +26,13 @@ class PReluOp : public framework::OperatorWithKernel {
    std::string mode = ctx->Attrs().Get<std::string>("mode");

    auto x_dim = ctx->GetInputDim("X");
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of PreluOp should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Alpha"),
+                   "Input(Alpha) of PreluOp should not be null");

-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of PreluOp should not be null");
    if (mode == "all") {
      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
                     "For mode 'all', size of weight Alpha must be one.");

--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -42,7 +42,7 @@ class RNNMemoryHelperOp : public framework::OperatorBase {

    auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
    auto &mem_tensor = mem_var->Get<framework::LoDTensor>();
-    out_tensor->ShareDataWith(mem_tensor);
+    framework::TensorCopySync(mem_tensor, dev_place, out_tensor);
    out_tensor->set_lod(mem_tensor.lod());
  }
 };
@@ -50,8 +50,10 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
 class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"), "");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of rnn_memory_helper op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output of rnn_memory_helper op should not be null.");
    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
    ctx->ShareLoD("X", /*->*/ "Out");
  }
@@ -107,7 +109,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
    } else {
      auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
      auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
-      in_grad_tensor->ShareDataWith(out_grad_tensor);
+      framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor);
      in_grad_tensor->set_lod(out_grad_tensor.lod());
    }
  }
@@ -133,8 +135,11 @@ class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    auto x_grad_name = framework::GradVarName("X");
-    PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), "");
-    PADDLE_ENFORCE(ctx->HasInput("X"), "");
+    PADDLE_ENFORCE(ctx->HasOutput(x_grad_name),
+                   "Gradient of Input(X) in rnn_memory_helper_grad of should "
+                   "not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of rnn_memory_helper_grad of should not be null.");
    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
    ctx->ShareLoD("X", /*->*/ x_grad_name);
  }

--- a/paddle/fluid/operators/slice_op.cc
+++ b/paddle/fluid/operators/slice_op.cc
@@ -25,7 +25,7 @@ class SliceOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input (Input) of slice op should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -58,7 +58,7 @@ class SliceOp : public framework::OperatorWithKernel {

 protected:
  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
+      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
        ctx.GetPlace());
@@ -119,15 +119,54 @@ Following examples will explain how slice works:
  }
 };

+// Gradient operator of slice: the gradient w.r.t. Input takes the shape of
+// Input itself.
+class SliceOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks that Input and Out@GRAD are present and, when Input@GRAD is
+  // requested as an output, gives it the dimensions of Input.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto input_grad = framework::GradVarName("Input");
+    if (!ctx->HasOutput(input_grad)) {
+      return;
+    }
+    ctx->SetOutputDim(input_grad, input_dims);
+  }
+};
+
+// Builds the description of the "slice_grad" op from the forward slice op.
+class SliceOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  // Wires Input and Out@GRAD in, Input@GRAD out, and forwards all attributes
+  // of the forward op unchanged.
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> grad_op(new framework::OpDesc());
+    grad_op->SetType("slice_grad");
+    grad_op->SetInput("Input", Input("Input"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    grad_op->SetAttrMap(Attrs());
+    return grad_op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker,
-                  paddle::framework::EmptyGradOpMaker);
+                  ops::SliceOpGradMaker);
+REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad);

 REGISTER_OP_CPU_KERNEL(
    slice, ops::SliceKernel<paddle::platform::CPUDeviceContext, int>,
    ops::SliceKernel<paddle::platform::CPUDeviceContext, int64_t>,
    ops::SliceKernel<paddle::platform::CPUDeviceContext, float>,
    ops::SliceKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    slice_grad, ops::SliceGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SliceGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SliceGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SliceGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/slice_op.cu
+++ b/paddle/fluid/operators/slice_op.cu
@@ -20,3 +20,10 @@ REGISTER_OP_CUDA_KERNEL(
    ops::SliceKernel<paddle::platform::CUDADeviceContext, double>,
    ops::SliceKernel<paddle::platform::CUDADeviceContext, int>,
    ops::SliceKernel<paddle::platform::CUDADeviceContext, int64_t>);
+
+REGISTER_OP_CUDA_KERNEL(
+    slice_grad,
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SliceGradKernel<paddle::platform::CUDADeviceContext, int64_t>);
--- a/paddle/fluid/operators/slice_op.h
+++ b/paddle/fluid/operators/slice_op.h
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once
 #include <algorithm>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"

@@ -84,5 +85,79 @@ class SliceKernel : public framework::OpKernel<T> {
    out_t.device(place) = in_t.slice(offsets, extents);
  }
 };
+
+// Gradient kernel of slice: writes Out@GRAD into Input@GRAD at the position
+// the forward slice read from and fills everything around it with zeros.
+template <typename DeviceContext, typename T>
+class SliceGradKernel : public framework::OpKernel<T> {
+ public:
+  // Dispatches on the rank of Out@GRAD. Only ranks 1 through 6 are handled;
+  // any other rank matches no case and nothing is computed.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    size_t rank = out_grad->dims().size();
+    switch (rank) {
+      case 1:
+        SliceCompute<1>(ctx);
+        break;
+      case 2:
+        SliceCompute<2>(ctx);
+        break;
+      case 3:
+        SliceCompute<3>(ctx);
+        break;
+      case 4:
+        SliceCompute<4>(ctx);
+        break;
+      case 5:
+        SliceCompute<5>(ctx);
+        break;
+      case 6:
+        SliceCompute<6>(ctx);
+        break;
+    }
+  }
+
+ private:
+  // Rank-D implementation: pads Out@GRAD with zeros up to the shape of
+  // Input@GRAD.
+  template <size_t D>
+  void SliceCompute(const framework::ExecutionContext& context) const {
+    auto* d_out =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_input =
+        context.Output<framework::Tensor>(framework::GradVarName("Input"));
+    d_input->mutable_data<T>(context.GetPlace());
+    auto out_dims = d_out->dims();
+    auto in_dims = d_input->dims();
+    auto axes = context.Attr<std::vector<int>>("axes");
+    auto starts = context.Attr<std::vector<int>>("starts");
+
+    // Per-dimension start of the slice; dimensions not listed in `axes`
+    // start at 0.
+    auto offsets = Eigen::array<int, D>();
+    for (size_t d = 0; d < D; ++d) {
+      offsets[d] = 0;
+    }
+    for (size_t k = 0; k < axes.size(); ++k) {
+      const int axis = axes[k];
+      int begin = starts[k];
+      if (begin < 0) {
+        // A negative start counts from the end of that dimension.
+        begin = (begin + in_dims[axis]);
+      }
+      offsets[axis] = std::max(begin, 0);
+    }
+
+    // Leading padding is the slice offset; trailing padding is whatever is
+    // left to reach the input extent.
+    Eigen::array<std::pair<int, int>, D> paddings;
+    for (size_t d = 0; d < paddings.size(); ++d) {
+      paddings[d].first = offsets[d];
+      paddings[d].second = (in_dims[d] - out_dims[d]) - offsets[d];
+    }
+
+    auto d_in_t =
+        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *d_input);
+    auto d_out_t =
+        framework::EigenTensor<T, D, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *d_out);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    d_in_t.device(place) = d_out_t.pad(paddings, 0);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
@@ -160,11 +160,7 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
          fluid_t->mutable_data<float>(platform::CUDAPlace(
              boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
          size * sizeof(float));
-      //} else {
-      // engine->GetOutputInGPU(
-      // y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
-      // size * sizeof(float));
-      //}
+
      output_index += 1;
    }


--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
@@ -63,7 +63,7 @@ class WhileOp : public framework::OperatorBase {
    while (cond.data<bool>()[0]) {
      auto &current_scope = scope.NewScope();
      step_scopes->push_back(&current_scope);
-      executor.RunPreparedContext(ctx.get(), &current_scope, false);
+      executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
      if (is_test) {
        scope.DeleteScope(&current_scope);
      }
@@ -169,7 +169,8 @@ class WhileGradOp : public framework::OperatorBase {
          }
        }
      }
-      executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false);
+      executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false, true,
+                                  true);

      auto &pg_names = Outputs(kXGRAD);
      auto &p_names = Inputs(kX);

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -51,7 +51,7 @@ ENDIF()

 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
+cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc
    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)


--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -210,11 +210,14 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
  if (dynload::HasCUDNN()) {
    cudnn_holder_.reset(new CudnnHolder(&stream_, place));
  }
+
+  callback_manager_.reset(new StreamCallbackManager(stream_));
 }

 CUDADeviceContext::~CUDADeviceContext() {
  SetDeviceId(place_.device);
  Wait();
+  WaitStreamCallback();
  PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
  eigen_stream_.reset();
  eigen_device_.reset();

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -31,6 +31,9 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/stream_callback_manager.h"
+#endif
 #include "unsupported/Eigen/CXX11/Tensor"

 namespace paddle {
@@ -112,6 +115,17 @@ class CUDADeviceContext : public DeviceContext {
    PADDLE_ENFORCE(cudaEventRecord(ev, stream_));
  }

+  // Hands `callback` to the stream callback manager while holding
+  // callback_mtx_. The callback is passed on as an lvalue (it is not
+  // forwarded).
+  template <typename Callback>
+  void AddStreamCallback(Callback&& callback) const {
+    std::lock_guard<std::mutex> callback_lock(callback_mtx_);
+    callback_manager_->AddCallback(callback);
+  }
+
+  // Calls Wait() on the stream callback manager while holding callback_mtx_.
+  void WaitStreamCallback() const {
+    std::lock_guard<std::mutex> callback_lock(callback_mtx_);
+    callback_manager_->Wait();
+  }
+
 private:
  CUDAPlace place_;

@@ -125,7 +139,12 @@ class CUDADeviceContext : public DeviceContext {
  int multi_process;
  int max_threads_per_mp;

-  std::mutex mtx_;
+  mutable std::mutex mtx_;
+
+  // This lock is only used by callback
+  // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes
+  mutable std::mutex callback_mtx_;
+  std::unique_ptr<StreamCallbackManager> callback_manager_;
 };

 template <>

--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <functional>
+#include <memory>
+#include "ThreadPool.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+
+using StreamCallback = std::function<void(cudaStream_t, cudaError_t)>;
+
+class StreamCallbackManager;
+
+// Record passed as the `user_data` pointer of cudaStreamAddCallback: it pairs
+// the callable to run with the manager whose thread pool will run it.
+struct StreamCallbackContext {
+  // Stores `callback` in a type-erased StreamCallback; `manager` is kept as a
+  // raw, non-owning pointer.
+  template <typename Callback>
+  inline StreamCallbackContext(const StreamCallbackManager *manager,
+                               Callback &&callback)
+      : manager_(manager), callback_(callback) {}
+
+  const StreamCallbackManager *manager_;  // do not own
+  StreamCallback callback_;  // invoked with the stream and its status
+};
+
+// Attaches user callbacks to a CUDA stream via cudaStreamAddCallback and runs
+// them on a private single-thread pool instead of inside the CUDA callback
+// itself.
+class StreamCallbackManager {
+ public:
+  // `stream` is the CUDA stream callbacks are attached to. One worker thread
+  // is created to execute the callbacks.
+  explicit inline StreamCallbackManager(cudaStream_t stream = nullptr)
+      : stream_(stream), thread_pool_(new ThreadPool(1)) {}
+
+  // Schedules a nullary `callback`. The stream handle and status reported by
+  // CUDA are discarded. `callback` is copied into the wrapper lambda.
+  template <typename Callback>
+  inline void AddCallback(Callback &&callback) const {
+    AddCallbackWithStreamAndErrorInfo(
+        [=](cudaStream_t, cudaError_t) { callback(); });
+  }
+
+  // Schedules `callback(stream, status)` to run on the thread pool once CUDA
+  // fires the stream callback registered here.
+  template <typename Callback>
+  inline void AddCallbackWithStreamAndErrorInfo(Callback &&callback) const {
+    // Keep the context in a unique_ptr until CUDA has accepted the callback,
+    // so it is freed (not leaked) if PADDLE_ENFORCE raises on a failed
+    // registration.
+    std::unique_ptr<StreamCallbackContext> stream_callback_context(
+        new StreamCallbackContext(this, callback));
+    PADDLE_ENFORCE(cudaStreamAddCallback(
+        stream_, StreamCallbackManager::StreamCallbackFunc,
+        stream_callback_context.get(), 0));
+    // Registration succeeded: StreamCallbackFunc now owns the context and
+    // deletes it after running the callback. release() only drops our
+    // pointer; it never touches the (possibly already freed) object.
+    stream_callback_context.release();
+  }
+
+  // Replaces the thread pool with a fresh single-thread pool; the previous
+  // pool is destroyed by unique_ptr::reset.
+  // NOTE(review): presumably ThreadPool's destructor drains and joins its
+  // queued tasks, which is what makes this a "wait" -- confirm in ThreadPool.h.
+  void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
+
+ private:
+  const cudaStream_t stream_;
+  // mutable so that the const Wait() can swap the pool.
+  mutable std::unique_ptr<ThreadPool> thread_pool_;
+
+  // A CUDA stream callback cannot call the CUDA API itself, so this function
+  // only re-queues the user callback onto the manager's thread pool.
+  // `user_data` is the StreamCallbackContext registered in
+  // AddCallbackWithStreamAndErrorInfo; the queued task takes ownership of it
+  // and frees it after the callback has run.
+  static void CUDART_CB StreamCallbackFunc(cudaStream_t stream,
+                                           cudaError_t status,
+                                           void *user_data) {
+    auto *callback_context_ptr =
+        reinterpret_cast<StreamCallbackContext *>(user_data);
+    callback_context_ptr->manager_->thread_pool_->enqueue([=]() {
+      std::unique_ptr<StreamCallbackContext> callback_context(
+          callback_context_ptr);
+      callback_context->callback_(stream, status);
+    });
+  }
+};
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -683,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle.
                  const std::string &, Scope *, std::vector<Scope *> &,
                  const ExecutionStrategy &, const BuildStrategy &, size_t,
                  size_t>())
-      .def("_bcast_params", &ParallelExecutor::BCastParamsToDevices)
      // NOTE: even we return a vec<Scope*>* to Python use reference policy.
      // We still cannot get local_scope from this vector, since the element
      // of vec<Scope*> will be freed by Python GC. We can only return Scope*

--- a/paddle/fluid/string/CMakeLists.txt
+++ b/paddle/fluid/string/CMakeLists.txt
 cc_library(stringpiece SRCS piece.cc)
 cc_library(pretty_log SRCS pretty_log.cc)
-cc_test(test_pretty_log SRCS pretty_log.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
 cc_test(to_string_test SRCS to_string_test.cc)
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -33,6 +33,7 @@ function print_usage() {
    ${BLUE}single_test${NONE}: run a single unit test
    ${BLUE}bind_test${NONE}: parallel tests bind to different GPU
    ${BLUE}doc${NONE}: generate paddle documents
+    ${BLUE}gen_doc_lib${NONE}: generate paddle documents library
    ${BLUE}html${NONE}: convert C++ source code into HTML
    ${BLUE}dockerfile${NONE}: generate paddle release dockerfile
    ${BLUE}capi${NONE}: generate paddle CAPI package
@@ -67,6 +68,23 @@ function cmake_gen() {
    # Support build for all python versions, currently
    # including cp27-cp27m and cp27-cp27mu.
    PYTHON_FLAGS=""
+    SYSTEM=`uname -s`
+    if [ "$SYSTEM" == "Darwin" ]; then
+        if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then
+            echo "using python abi: $1"
+            if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then
+                export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
+                export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7
+                export PATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/:${PATH}
+                PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/bin/python2.7
+            -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/2.7/include/python2.7
+            -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/2.7/lib/libpython2.7.dylib"
+            else
+                exit 1
+            fi
+        # TODO: qiyang add python3 part here 
+        fi
+    else 
        if [ "$1" != "" ]; then
            echo "using python abi: $1"     
            if [ "$1" == "cp27-cp27m" ]; then
@@ -89,6 +107,7 @@ function cmake_gen() {
            -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so"
           fi
        fi
+    fi

    cat <<EOF
    ========================================
@@ -116,6 +135,8 @@ function cmake_gen() {
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
        -DWITH_INFERENCE=${WITH_INFERENCE:-ON}
+        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
+        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo}
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
        -DPY_VERSION=${PY_VERSION:-2.7}
    ========================================
@@ -146,6 +167,8 @@ EOF
        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
        -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
        -DWITH_INFERENCE=${WITH_INFERENCE:-ON} \
+        -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
+        -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} \
        -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
        -DPY_VERSION=${PY_VERSION:-2.7}
 }
@@ -200,6 +223,19 @@ EOF
    make install -j `nproc`
 }

+# Builds and installs Paddle from ${PADDLE_ROOT}/build.
+# Called by the `maccheck` task in main(), after cmake_gen.
+function build_mac() {
+    # Create the build directory if it is missing, then work inside it.
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    cat <<EOF
+    ============================================
+    Building in /paddle/build ...
+    ============================================
+EOF
+    # Start from a clean tree, then build and install with 8 parallel jobs.
+    make clean
+    sudo make -j 8
+    sudo make install -j 8
+}
+
 function build_android() {
    if [ $ANDROID_ABI == "arm64-v8a" ]; then
      ANDROID_ARCH=arm64
@@ -323,6 +359,27 @@ EOF
    fi
 }

+# Runs the unit tests from ${PADDLE_ROOT}/build, then installs the build and
+# the generated wheel. Does nothing beyond entering the build directory
+# unless WITH_TESTING is "ON" (the default when the variable is unset).
+function run_mac_test() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    if [ ${WITH_TESTING:-ON} == "ON" ] ; then
+    cat <<EOF
+    ========================================
+    Running unit tests ...
+    ========================================
+EOF
+
+        # TODO(jiabin): refine this part once the failing tests are fixed on macOS.
+        ctest --output-on-failure -j8     
+        # `make install` is exercised as part of the unit-test run as well.
+        make install -j 8
+        pip install /usr/local/opt/paddle/share/wheels/*.whl
+        # `paddle version` is only run when WITH_FLUID_ONLY is "OFF" (the default).
+        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
+            paddle version
+        fi
+    fi
+}
+
 function assert_api_not_changed() {
    mkdir -p ${PADDLE_ROOT}/build/.check_api_workspace
    cd ${PADDLE_ROOT}/build/.check_api_workspace
@@ -431,24 +488,42 @@ EOF
    linkchecker doc/v2/cn/html/index.html
    linkchecker doc/v2/api/en/html/index.html

-    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
+}

-    # Deploy to the the content server if its a "develop" or "release/version" branch
-    # The "develop_doc" branch is reserved to test full deploy process without impacting the real content.
-    if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then
-        PPO_SCRIPT_BRANCH=develop
-    elif [[ "$TRAVIS_BRANCH" == "develop"  ||  "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then
-        PPO_SCRIPT_BRANCH=master
-    else
-        # Early exit, this branch doesn't require documentation build
-        return 0;
-    fi
-     # Fetch the paddlepaddle.org deploy_docs.sh from the appopriate branch
-    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh
-    export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python
-    cd ..
-    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH}
-    cd -
+# Configures a documentation build in ${PADDLE_ROOT}/build and builds the
+# make targets selected by the first argument.
+#   $1 - library type: "full", "pybind" or "proto". Any other value
+#        (including none) exits the script with status 0 after the cmake
+#        step, without building any make target.
+function gen_doc_lib() {
+    mkdir -p ${PADDLE_ROOT}/build
+    cd ${PADDLE_ROOT}/build
+    cat <<EOF
+    ========================================
+    Building documentation library ...
+    In /paddle/build
+    ========================================
+EOF
+    # Release configuration with docs enabled; GPU and MKL are disabled and
+    # only the fluid part is configured.
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=Release \
+        -DWITH_DOC=ON \
+        -DWITH_GPU=OFF \
+        -DWITH_MKL=OFF \
+        -DWITH_FLUID_ONLY=ON
+
+    local LIB_TYPE=$1
+    case $LIB_TYPE in
+      full)
+        # Build full Paddle Python module. Will timeout without caching 'copy_paddle_pybind' first
+        make -j `nproc` gen_proto_py framework_py_proto copy_paddle_pybind paddle_python
+        ;;
+      pybind)
+        # Build paddle pybind library. Takes 49 minutes to build. Might timeout
+        make -j `nproc` copy_paddle_pybind
+        ;;
+      proto)
+        # Even smaller library.
+        make -j `nproc` framework_py_proto
+        ;;
+      *)
+        # Unknown or missing library type: leave the whole script successfully.
+        exit 0
+        ;;
+      esac
 }

 function gen_html() {
@@ -608,6 +683,9 @@ function main() {
      doc)
        gen_docs
        ;;
+      gen_doc_lib)
+        gen_doc_lib $2
+        ;;
      html)
        gen_html
        ;;
@@ -637,6 +715,17 @@ function main() {
        test_fluid_inference_lib
        assert_api_spec_approvals
        ;;
+      maccheck)
+        cmake_gen ${PYTHON_ABI:-""}
+        build_mac
+        run_mac_test
+        ;;
+      cicheck_py35)
+        cmake_gen ${PYTHON_ABI:-""}
+        build
+        run_test
+        assert_api_not_changed
+        ;;
      *)
        print_usage
        exit 0

--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@@ -67,7 +67,7 @@ def get_word_dict():
        for field in movie_reviews.fileids(category):
            for words in movie_reviews.words(field):
                word_freq_dict[words] += 1
-    words_sort_list = six.iteritems(word_freq_dict)
+    words_sort_list = list(six.iteritems(word_freq_dict))
    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
    for index, word in enumerate(words_sort_list):
        words_freq_sorted.append((word[0], index))

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -122,7 +122,7 @@ def __bootstrap__():
        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'cpu_deterministic'
+        "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb'
    ]
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -55,15 +55,19 @@ for _OP in set(__auto__):
    globals()[_OP] = generate_layer_fn(_OP)


-def rpn_target_assign(loc,
-                      scores,
+def rpn_target_assign(bbox_pred,
+                      cls_logits,
                      anchor_box,
                      anchor_var,
-                      gt_box,
+                      gt_boxes,
+                      is_crowd,
+                      im_info,
                      rpn_batch_size_per_im=256,
-                      fg_fraction=0.25,
+                      rpn_straddle_thresh=0.0,
+                      rpn_fg_fraction=0.5,
                      rpn_positive_overlap=0.7,
-                      rpn_negative_overlap=0.3):
+                      rpn_negative_overlap=0.3,
+                      use_random=True):
    """
    ** Target Assign Layer for region proposal network (RPN) in Faster-RCNN detection. **

@@ -83,14 +87,13 @@ def rpn_target_assign(loc,
    the positive anchors.

    Args:
-        loc(Variable): A 3-D Tensor with shape [N, M, 4] represents the
+        bbox_pred(Variable): A 3-D Tensor with shape [N, M, 4] represents the
            predicted locations of M bounding bboxes. N is the batch size,
            and each bounding box has four coordinate values and the layout
            is [xmin, ymin, xmax, ymax].
-        scores(Variable): A 3-D Tensor with shape [N, M, C] represents the
-            predicted confidence predictions. N is the batch size, C is the
-            class number, M is number of bounding boxes. For each category
-            there are total M scores which corresponding M bounding boxes.
+        cls_logits(Variable): A 3-D Tensor with shape [N, M, 1] represents the
+            predicted confidence predictions. N is the batch size, 1 is the
+            foreground and background sigmoid, M is number of bounding boxes.
        anchor_box(Variable): A 2-D Tensor with shape [M, 4] holds M boxes,
            each box is represented as [xmin, ymin, xmax, ymax],
            [xmin, ymin] is the left top coordinate of the anchor box,
@@ -99,11 +102,16 @@ def rpn_target_assign(loc,
            coordinate of the anchor box.
        anchor_var(Variable): A 2-D Tensor with shape [M,4] holds expanded 
            variances of anchors.
-        gt_box (Variable): The ground-truth boudding boxes (bboxes) are a 2D
+        gt_boxes (Variable): The ground-truth bounding boxes (bboxes) are a 2D
            LoDTensor with shape [Ng, 4], Ng is the total number of ground-truth
            bboxes of mini-batch input.
+        is_crowd (Variable): A 1-D LoDTensor which indicates ground-truth is crowd.
+        im_info (Variable): A 2-D LoDTensor with shape [N, 3]. N is the batch size,
+            3 is the height, width and scale.
        rpn_batch_size_per_im(int): Total number of RPN examples per image.
-        fg_fraction(float): Target fraction of RoI minibatch that is labeled
+        rpn_straddle_thresh(float): Remove RPN anchors that go outside the image
+            by straddle_thresh pixels.
+        rpn_fg_fraction(float): Target fraction of RoI minibatch that is labeled
            foreground (i.e. class > 0), 0-th class is background.
        rpn_positive_overlap(float): Minimum overlap required between an anchor
            and ground-truth box for the (anchor, gt box) pair to be a positive
@@ -129,45 +137,48 @@ def rpn_target_assign(loc,
    Examples:
        .. code-block:: python

-        loc = layers.data(name='location', shape=[2, 80],
+        bbox_pred = layers.data(name='bbox_pred', shape=[100, 4],
                          append_batch_size=False, dtype='float32')
-        scores = layers.data(name='scores', shape=[2, 40],
+        cls_logits = layers.data(name='cls_logits', shape=[100, 1],
                          append_batch_size=False, dtype='float32')
        anchor_box = layers.data(name='anchor_box', shape=[20, 4],
                          append_batch_size=False, dtype='float32')
-        gt_box = layers.data(name='gt_box', shape=[10, 4],
+        gt_boxes = layers.data(name='gt_boxes', shape=[10, 4],
                         append_batch_size=False, dtype='float32')
        loc_pred, score_pred, loc_target, score_target =
-            fluid.layers.detection_output(loc=location,
-                                          scores=scores,
+            fluid.layers.rpn_target_assign(bbox_pred=bbox_pred,
+                                          cls_logits=cls_logits,
                                          anchor_box=anchor_box,
-                                          gt_box=gt_box)
+                                          gt_boxes=gt_boxes)
    """

    helper = LayerHelper('rpn_target_assign', **locals())
-    # Compute overlaps between the prior boxes and the gt boxes overlaps
-    iou = iou_similarity(x=gt_box, y=anchor_box)
    # Assign target label to anchors
    loc_index = helper.create_tmp_variable(dtype='int32')
    score_index = helper.create_tmp_variable(dtype='int32')
-    target_label = helper.create_tmp_variable(dtype='int64')
+    target_label = helper.create_tmp_variable(dtype='int32')
    target_bbox = helper.create_tmp_variable(dtype=anchor_box.dtype)
    helper.append_op(
        type="rpn_target_assign",
-        inputs={'Anchor': anchor_box,
-                'GtBox': gt_box,
-                'DistMat': iou},
+        inputs={
+            'Anchor': anchor_box,
+            'GtBoxes': gt_boxes,
+            'IsCrowd': is_crowd,
+            'ImInfo': im_info
+        },
        outputs={
            'LocationIndex': loc_index,
            'ScoreIndex': score_index,
            'TargetLabel': target_label,
-            'TargetBBox': target_bbox,
+            'TargetBBox': target_bbox
        },
        attrs={
            'rpn_batch_size_per_im': rpn_batch_size_per_im,
+            'rpn_straddle_thresh': rpn_straddle_thresh,
            'rpn_positive_overlap': rpn_positive_overlap,
            'rpn_negative_overlap': rpn_negative_overlap,
-            'fg_fraction': fg_fraction
+            'rpn_fg_fraction': rpn_fg_fraction,
+            'use_random': use_random
        })

    loc_index.stop_gradient = True
@@ -175,12 +186,12 @@ def rpn_target_assign(loc,
    target_label.stop_gradient = True
    target_bbox.stop_gradient = True

-    scores = nn.reshape(x=scores, shape=(-1, 1))
-    loc = nn.reshape(x=loc, shape=(-1, 4))
-    predicted_scores = nn.gather(scores, score_index)
-    predicted_location = nn.gather(loc, loc_index)
+    cls_logits = nn.reshape(x=cls_logits, shape=(-1, 1))
+    bbox_pred = nn.reshape(x=bbox_pred, shape=(-1, 4))
+    predicted_cls_logits = nn.gather(cls_logits, score_index)
+    predicted_bbox_pred = nn.gather(bbox_pred, loc_index)

-    return predicted_scores, predicted_location, target_label, target_bbox
+    return predicted_cls_logits, predicted_bbox_pred, target_label, target_bbox


 def detection_output(loc,
@@ -1258,15 +1269,17 @@ def anchor_generator(input,

 def generate_proposal_labels(rpn_rois,
                             gt_classes,
+                             is_crowd,
                             gt_boxes,
-                             im_scales,
+                             im_info,
                             batch_size_per_im=256,
                             fg_fraction=0.25,
                             fg_thresh=0.25,
                             bg_thresh_hi=0.5,
                             bg_thresh_lo=0.0,
                             bbox_reg_weights=[0.1, 0.1, 0.2, 0.2],
-                             class_nums=None):
+                             class_nums=None,
+                             use_random=True):
    """
    ** Generate proposal labels Faster-RCNN **
    TODO(buxingyuan): Add Document
@@ -1285,8 +1298,9 @@ def generate_proposal_labels(rpn_rois,
        inputs={
            'RpnRois': rpn_rois,
            'GtClasses': gt_classes,
+            'IsCrowd': is_crowd,
            'GtBoxes': gt_boxes,
-            'ImScales': im_scales
+            'ImInfo': im_info
        },
        outputs={
            'Rois': rois,
@@ -1302,7 +1316,8 @@ def generate_proposal_labels(rpn_rois,
            'bg_thresh_hi': bg_thresh_hi,
            'bg_thresh_lo': bg_thresh_lo,
            'bbox_reg_weights': bbox_reg_weights,
-            'class_nums': class_nums
+            'class_nums': class_nums,
+            'use_random': use_random
        })

    rois.stop_gradient = True

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -142,11 +142,6 @@ class ParallelExecutor(object):
        main = main if main else framework.default_main_program()
        if scope == None:
            scope = executor.global_scope()
-        # FIXME(Yancey1989): it's a temporary approach to determinate the distribute
-        # train program, call self.bcast_param() at the end of each mini-batch.
-        self.is_dist = True if "recv" in [
-            op.type for op in main.global_block().ops
-        ] else False

        if share_vars_from and not isinstance(share_vars_from,
                                              ParallelExecutor):
@@ -286,21 +281,11 @@ class ParallelExecutor(object):
        self.executor.run(fetch_list, fetch_var_name)
        arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array()

-        if self.is_dist:
-            self._bcast_params()
-
        if return_numpy:
            return executor.as_numpy(arr)

        return [arr[i] for i in range(len(arr))]

-    def _bcast_params(self):
-        """
-        Broadcast the parameters to other devices. It is used during
-        distributed training.
-        """
-        self.executor._bcast_params(set(self.persistable_vars))
-
    @property
    def device_count(self):
        return len(self._act_places)
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -148,6 +148,8 @@ class TestAnchorGenerator(unittest.TestCase):

 class TestGenerateProposalLabels(unittest.TestCase):
    def test_generate_proposal_labels(self):
+        program = Program()
+        with program_guard(program):
            rpn_rois = layers.data(
                name='rpn_rois',
                shape=[4, 4],
@@ -160,15 +162,21 @@ class TestGenerateProposalLabels(unittest.TestCase):
                dtype='int32',
                lod_level=1,
                append_batch_size=False)
+            is_crowd = layers.data(
+                name='is_crowd',
+                shape=[6],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
            gt_boxes = layers.data(
                name='gt_boxes',
                shape=[6, 4],
                dtype='float32',
                lod_level=1,
                append_batch_size=False)
-        im_scales = layers.data(
-            name='im_scales',
-            shape=[1],
+            im_info = layers.data(
+                name='im_info',
+                shape=[1, 3],
                dtype='float32',
                lod_level=1,
                append_batch_size=False)
@@ -176,8 +184,9 @@ class TestGenerateProposalLabels(unittest.TestCase):
            rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights = fluid.layers.generate_proposal_labels(
                rpn_rois=rpn_rois,
                gt_classes=gt_classes,
+                is_crowd=is_crowd,
                gt_boxes=gt_boxes,
-            im_scales=im_scales,
+                im_info=im_info,
                batch_size_per_im=2,
                fg_fraction=0.5,
                fg_thresh=0.5,
@@ -254,18 +263,18 @@ class TestRpnTargetAssign(unittest.TestCase):
    def test_rpn_target_assign(self):
        program = Program()
        with program_guard(program):
-            loc_shape = [10, 50, 4]
-            score_shape = [10, 50, 2]
+            bbox_pred_shape = [10, 50, 4]
+            cls_logits_shape = [10, 50, 2]
            anchor_shape = [50, 4]

-            loc = layers.data(
-                name='loc',
-                shape=loc_shape,
+            bbox_pred = layers.data(
+                name='bbox_pred',
+                shape=bbox_pred_shape,
                append_batch_size=False,
                dtype='float32')
-            scores = layers.data(
-                name='scores',
-                shape=score_shape,
+            cls_logits = layers.data(
+                name='cls_logits',
+                shape=cls_logits_shape,
                append_batch_size=False,
                dtype='float32')
            anchor_box = layers.data(
@@ -278,17 +287,31 @@ class TestRpnTargetAssign(unittest.TestCase):
                shape=anchor_shape,
                append_batch_size=False,
                dtype='float32')
-            gt_box = layers.data(
-                name='gt_box', shape=[4], lod_level=1, dtype='float32')
-
+            gt_boxes = layers.data(
+                name='gt_boxes', shape=[4], lod_level=1, dtype='float32')
+            is_crowd = layers.data(
+                name='is_crowd',
+                shape=[10],
+                dtype='int32',
+                lod_level=1,
+                append_batch_size=False)
+            im_info = layers.data(
+                name='im_info',
+                shape=[1, 3],
+                dtype='float32',
+                lod_level=1,
+                append_batch_size=False)
            pred_scores, pred_loc, tgt_lbl, tgt_bbox = layers.rpn_target_assign(
-                loc=loc,
-                scores=scores,
+                bbox_pred=bbox_pred,
+                cls_logits=cls_logits,
                anchor_box=anchor_box,
                anchor_var=anchor_var,
-                gt_box=gt_box,
+                gt_boxes=gt_boxes,
+                is_crowd=is_crowd,
+                im_info=im_info,
                rpn_batch_size_per_im=256,
-                fg_fraction=0.25,
+                rpn_straddle_thresh=0.0,
+                rpn_fg_fraction=0.5,
                rpn_positive_overlap=0.7,
                rpn_negative_overlap=0.3)


--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -29,6 +29,19 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl
 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
 list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test

+if(APPLE)
+    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_desc_clone \n test_debugger \n test_program_code \n test_dist_transformer \n test_dist_se_resnext")
+    # this op is not supported on mac
+    list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
+    # TODO: add these unit tests back once they are fixed
+    list(REMOVE_ITEM TEST_OPS test_detection_map_op)
+    list(REMOVE_ITEM TEST_OPS test_desc_clone)
+    list(REMOVE_ITEM TEST_OPS test_debugger)
+    list(REMOVE_ITEM TEST_OPS test_program_code)
+    list(REMOVE_ITEM TEST_OPS test_dist_transformer)
+    list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+endif()
+
 function(py_test_modules TARGET_NAME)
  if(WITH_TESTING)
    set(options SERIAL)
@@ -46,6 +59,7 @@ function(py_test_modules TARGET_NAME)
 endfunction()
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_train)
+list(REMOVE_ITEM TEST_OPS test_dist_transpiler)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
@@ -61,11 +75,12 @@ if(WITH_DISTRIBUTE)
    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
    set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
    set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
+    py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
+    py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
+    py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 150)
-py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
-py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
 py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
@@ -92,7 +92,7 @@ class TrainTaskConfig(object):
    src_vocab_fpath = data_path + "vocab.bpe.32000"
    trg_vocab_fpath = data_path + "vocab.bpe.32000"
    train_file_pattern = data_path + "train.tok.clean.bpe.32000.en-de"
-    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de"
+    val_file_pattern = data_path + "newstest2013.tok.bpe.32000.en-de.cut"
    pool_size = 2000
    sort_type = None
    local = True
@@ -624,6 +624,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
            init = True

            # Validate and save the model for inference.
+            if batch_id == 0 or batch_id == 4:
                if TrainTaskConfig.val_file_pattern is not None:
                    val_avg_cost, val_ppl = test()
                    print("[%f]" % val_avg_cost)
@@ -1701,8 +1702,9 @@ class DistTransformer2x2(TestDistRunnerBase):
        exe.run(startup_prog)
        exe.run(pserver_prog)

-    def run_trainer(self, place, args):
-
+    def run_trainer(self, use_cuda, args):
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        TrainTaskConfig.use_gpu = use_cuda
        sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model(
            args.is_dist, not args.sync_mode)


--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
@@ -109,15 +109,20 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
    return t


+from paddle.fluid.transpiler.details import op_to_code
+
+
 def operator_equal(a, b):
+    if op_to_code(a) != op_to_code(b):
+        raise ValueError("In operator_equal not equal\n")
+
    for k, v in six.iteritems(a.__dict__):
        if isinstance(v, fluid.framework.Program) or \
                isinstance(v, fluid.framework.Block):
            continue

        elif isinstance(v, core.OpDesc):
-            if v.serialize_to_string() != b.__dict__[k].serialize_to_string():
-                raise ValueError("In operator_equal not equal:{0}\n".format(k))
+            continue

        elif isinstance(v, collections.OrderedDict):
            v0 = sorted(list(six.iteritems(v)), key=lambda x: x[0])

--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -61,9 +61,10 @@ class TestDistRunnerBase(object):
        exe.run(startup_prog)
        exe.run(pserver_prog)

-    def run_trainer(self, place, args):
+    def run_trainer(self, use_cuda, args):
        import paddle
        import paddle.fluid as fluid
+        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=2)
        if args.mem_opt:
@@ -91,7 +92,7 @@ class TestDistRunnerBase(object):
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        exe = fluid.ParallelExecutor(
-            True,
+            use_cuda,
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            build_strategy=build_stra)
@@ -142,9 +143,8 @@ def runtime_main(test_class):
    if args.role == "pserver" and args.is_dist:
        model.run_pserver(args)
    else:
-        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
-        ) else fluid.CPUPlace()
-        model.run_trainer(p, args)
+        use_cuda = True if core.is_compiled_with_cuda() else False
+        model.run_trainer(use_cuda, args)


 import paddle.compat as cpt
@@ -225,11 +225,12 @@ class TestDistBase(unittest.TestCase):
    def check_with_place(self, model_file, delta=1e-3, check_error_log=False):
        # TODO(typhoonzero): should auto adapt GPU count on the machine.
        required_envs = {
-            "PATH": os.getenv("PATH"),
-            "PYTHONPATH": os.getenv("PYTHONPATH"),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"),
+            "PATH": os.getenv("PATH", ""),
+            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
+            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_cudnn_deterministic": "1"
+            "FLAGS_cudnn_deterministic": "1",
+            "CPU_NUM": "1"
        }

        if check_error_log:

--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
@@ -14,6 +14,7 @@

 from __future__ import print_function

+import os
 import unittest
 import paddle
 from test_dist_base import TestDistBase
@@ -44,6 +45,14 @@ def download_files():
    test_url = url_prefix + 'newstest2013.tok.bpe.32000.en-de'
    test_md5 = '9dd74a266dbdb25314183899f269b4a2'
    paddle.dataset.common.download(test_url, 'test_dist_transformer', test_md5)
+    # cut test data for faster CI
+    orig_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "test_dist_transformer",
+                             "newstest2013.tok.bpe.32000.en-de")
+    head_path = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "test_dist_transformer",
+                             "newstest2013.tok.bpe.32000.en-de.cut")
+    os.system("head -n10 %s > %s" % (orig_path, head_path))


 class TestDistTransformer2x2Sync(TestDistBase):

--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
@@ -62,7 +62,7 @@ class TranspilerTest(unittest.TestCase):

        t = self._transpiler_instance(config)

-        trainer_main = t.get_trainer_program()
+        trainer_main = t.get_trainer_program(wait_port=False)
        trainer_startup = fluid.default_startup_program()

        assert (src.num_blocks == 1)

--- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels.py
@@ -20,10 +20,10 @@ import paddle.fluid as fluid
 from op_test import OpTest


-def generate_proposal_labels_in_python(
-        rpn_rois, gt_classes, gt_boxes, im_scales, batch_size_per_im,
-        fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights,
-        class_nums):
+def generate_proposal_labels_in_python(rpn_rois, gt_classes, is_crowd, gt_boxes,
+                                       im_info, batch_size_per_im, fg_fraction,
+                                       fg_thresh, bg_thresh_hi, bg_thresh_lo,
+                                       bbox_reg_weights, class_nums):
    rois = []
    labels_int32 = []
    bbox_targets = []
@@ -31,13 +31,13 @@ def generate_proposal_labels_in_python(
    bbox_outside_weights = []
    lod = []
    assert len(rpn_rois) == len(
-        im_scales), 'batch size of rpn_rois and ground_truth is not matched'
+        im_info), 'batch size of rpn_rois and ground_truth is not matched'

-    for im_i in range(len(im_scales)):
+    for im_i in range(len(im_info)):
        frcn_blobs = _sample_rois(
-            rpn_rois[im_i], gt_classes[im_i], gt_boxes[im_i], im_scales[im_i],
-            batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
-            bg_thresh_lo, bbox_reg_weights, class_nums)
+            rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i],
+            im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh,
+            bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums)

        lod.append(frcn_blobs['rois'].shape[0])

@@ -50,13 +50,14 @@ def generate_proposal_labels_in_python(
    return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod


-def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
-                 fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo,
-                 bbox_reg_weights, class_nums):
+def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info,
+                 batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi,
+                 bg_thresh_lo, bbox_reg_weights, class_nums):
    rois_per_image = int(batch_size_per_im)
    fg_rois_per_im = int(np.round(fg_fraction * rois_per_image))

    # Roidb
+    im_scale = im_info[2]
    inv_im_scale = 1. / im_scale
    rpn_rois = rpn_rois * inv_im_scale

@@ -78,6 +79,9 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
        box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[
            overlapped_boxes_ind]

+    crowd_ind = np.where(is_crowd)[0]
+    gt_overlaps[crowd_ind] = -1
+
    max_overlaps = gt_overlaps.max(axis=1)
    max_classes = gt_overlaps.argmax(axis=1)

@@ -85,9 +89,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
    fg_inds = np.where(max_overlaps >= fg_thresh)[0]
    fg_rois_per_this_image = np.minimum(fg_rois_per_im, fg_inds.shape[0])
    # Sample foreground if there are too many
-    if fg_inds.shape[0] > fg_rois_per_this_image:
-        fg_inds = np.random.choice(
-            fg_inds, size=fg_rois_per_this_image, replace=False)
+    # if fg_inds.shape[0] > fg_rois_per_this_image:
+    #     fg_inds = np.random.choice(
+    #         fg_inds, size=fg_rois_per_this_image, replace=False)
+    fg_inds = fg_inds[:fg_rois_per_this_image]

    # Background
    bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >=
@@ -96,9 +101,10 @@ def _sample_rois(rpn_rois, gt_classes, gt_boxes, im_scale, batch_size_per_im,
    bg_rois_per_this_image = np.minimum(bg_rois_per_this_image,
                                        bg_inds.shape[0])
    # Sample background if there are too many
-    if bg_inds.shape[0] > bg_rois_per_this_image:
-        bg_inds = np.random.choice(
-            bg_inds, size=bg_rois_per_this_image, replace=False)
+    # if bg_inds.shape[0] > bg_rois_per_this_image:
+    #     bg_inds = np.random.choice(
+    #         bg_inds, size=bg_rois_per_this_image, replace=False)
+    bg_inds = bg_inds[:bg_rois_per_this_image]

    keep_inds = np.append(fg_inds, bg_inds)
    sampled_labels = max_classes[keep_inds]
@@ -208,8 +214,9 @@ class TestGenerateProposalLabelsOp(OpTest):
        self.inputs = {
            'RpnRois': (self.rpn_rois[0], self.rpn_rois_lod),
            'GtClasses': (self.gt_classes[0], self.gts_lod),
+            'IsCrowd': (self.is_crowd[0], self.gts_lod),
            'GtBoxes': (self.gt_boxes[0], self.gts_lod),
-            'ImScales': self.im_scales[0]
+            'ImInfo': self.im_info
        }
        self.attrs = {
            'batch_size_per_im': self.batch_size_per_im,
@@ -218,14 +225,15 @@ class TestGenerateProposalLabelsOp(OpTest):
            'bg_thresh_hi': self.bg_thresh_hi,
            'bg_thresh_lo': self.bg_thresh_lo,
            'bbox_reg_weights': self.bbox_reg_weights,
-            'class_nums': self.class_nums
+            'class_nums': self.class_nums,
+            'use_random': False
        }
        self.outputs = {
-            'Rois': (self.rois[0], [self.lod]),
-            'LabelsInt32': (self.labels_int32[0], [self.lod]),
-            'BboxTargets': (self.bbox_targets[0], [self.lod]),
-            'BboxInsideWeights': (self.bbox_inside_weights[0], [self.lod]),
-            'BboxOutsideWeights': (self.bbox_outside_weights[0], [self.lod]),
+            'Rois': (self.rois, [self.lod]),
+            'LabelsInt32': (self.labels_int32, [self.lod]),
+            'BboxTargets': (self.bbox_targets, [self.lod]),
+            'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]),
+            'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]),
        }

    def test_check_output(self):
@@ -236,8 +244,8 @@ class TestGenerateProposalLabelsOp(OpTest):
        self.set_data()

    def init_test_params(self):
-        self.batch_size_per_im = 10
-        self.fg_fraction = 1.0
+        self.batch_size_per_im = 512
+        self.fg_fraction = 0.25
        self.fg_thresh = 0.5
        self.bg_thresh_hi = 0.5
        self.bg_thresh_lo = 0.0
@@ -246,14 +254,14 @@ class TestGenerateProposalLabelsOp(OpTest):

    def init_test_input(self):
        np.random.seed(0)
-        image_nums = 1
        gt_nums = 6  # Keep same with batch_size_per_im for unittest
-        proposal_nums = self.batch_size_per_im - gt_nums
-        images_shape = []
-        self.im_scales = []
-        for i in range(image_nums):
-            images_shape.append(np.random.randint(200, size=2))
-            self.im_scales.append(np.ones((1)).astype(np.float32))
+        proposal_nums = 2000  #self.batch_size_per_im - gt_nums
+        images_shape = [[64, 64]]
+        self.im_info = np.ones((len(images_shape), 3)).astype(np.float32)
+        for i in range(len(images_shape)):
+            self.im_info[i, 0] = images_shape[i][0]
+            self.im_info[i, 1] = images_shape[i][1]
+            self.im_info[i, 2] = 0.8  #scale

        self.rpn_rois, self.rpn_rois_lod = _generate_proposals(images_shape,
                                                               proposal_nums)
@@ -261,16 +269,23 @@ class TestGenerateProposalLabelsOp(OpTest):
            images_shape, self.class_nums, gt_nums)
        self.gt_classes = [gt['gt_classes'] for gt in ground_truth]
        self.gt_boxes = [gt['boxes'] for gt in ground_truth]
+        self.is_crowd = [gt['is_crowd'] for gt in ground_truth]

    def init_test_output(self):
        self.rois, self.labels_int32, self.bbox_targets, \
        self.bbox_inside_weights, self.bbox_outside_weights, \
        self.lod = generate_proposal_labels_in_python(
-                self.rpn_rois, self.gt_classes, self.gt_boxes, self.im_scales,
+                self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info,
                self.batch_size_per_im, self.fg_fraction,
                self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo,
                self.bbox_reg_weights, self.class_nums
            )
+        self.rois = np.vstack(self.rois)
+        self.labels_int32 = np.hstack(self.labels_int32)
+        self.labels_int32 = self.labels_int32[:, np.newaxis]
+        self.bbox_targets = np.vstack(self.bbox_targets)
+        self.bbox_inside_weights = np.vstack(self.bbox_inside_weights)
+        self.bbox_outside_weights = np.vstack(self.bbox_outside_weights)


 def _generate_proposals(images_shape, proposal_nums):
@@ -280,7 +295,7 @@ def _generate_proposals(images_shape, proposal_nums):
    for i, image_shape in enumerate(images_shape):
        proposals = _generate_boxes(image_shape, proposal_nums)
        rpn_rois.append(proposals)
-        num_proposals += len(proposals)
+        num_proposals = len(proposals)
        rpn_rois_lod.append(num_proposals)
    return rpn_rois, [rpn_rois_lod]

@@ -294,7 +309,11 @@ def _generate_groundtruth(images_shape, class_nums, gt_nums):
        gt_classes = np.random.randint(
            low=1, high=class_nums, size=gt_nums).astype(np.int32)
        gt_boxes = _generate_boxes(image_shape, gt_nums)
-        ground_truth.append(dict(gt_classes=gt_classes, boxes=gt_boxes))
+        is_crowd = np.zeros((gt_nums), dtype=np.int32)
+        is_crowd[0] = 1
+        ground_truth.append(
+            dict(
+                gt_classes=gt_classes, boxes=gt_boxes, is_crowd=is_crowd))
        num_gts += len(gt_classes)
        gts_lod.append(num_gts)
    return ground_truth, [gts_lod]

--- a/python/paddle/fluid/tests/unittests/test_generate_proposals.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals.py
@@ -114,10 +114,10 @@ def box_coder(all_anchors, bbox_deltas, variances):
    #anchor_loc: width, height, center_x, center_y
    anchor_loc = np.zeros_like(bbox_deltas, dtype=np.float32)

-    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0]
-    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1]
-    anchor_loc[:, 2] = (all_anchors[:, 2] + all_anchors[:, 0]) / 2
-    anchor_loc[:, 3] = (all_anchors[:, 3] + all_anchors[:, 1]) / 2
+    anchor_loc[:, 0] = all_anchors[:, 2] - all_anchors[:, 0] + 1
+    anchor_loc[:, 1] = all_anchors[:, 3] - all_anchors[:, 1] + 1
+    anchor_loc[:, 2] = all_anchors[:, 0] + 0.5 * anchor_loc[:, 0]
+    anchor_loc[:, 3] = all_anchors[:, 1] + 0.5 * anchor_loc[:, 1]

    #predicted bbox: bbox_center_x, bbox_center_y, bbox_width, bbox_height 
    pred_bbox = np.zeros_like(bbox_deltas, dtype=np.float32)
@@ -127,23 +127,29 @@ def box_coder(all_anchors, bbox_deltas, variances):
                i, 0] + anchor_loc[i, 2]
            pred_bbox[i, 1] = variances[i, 1] * bbox_deltas[i, 1] * anchor_loc[
                i, 1] + anchor_loc[i, 3]
-            pred_bbox[i, 2] = math.exp(variances[i, 2] *
-                                       bbox_deltas[i, 2]) * anchor_loc[i, 0]
-            pred_bbox[i, 3] = math.exp(variances[i, 3] *
-                                       bbox_deltas[i, 3]) * anchor_loc[i, 1]
+            pred_bbox[i, 2] = math.exp(
+                min(variances[i, 2] * bbox_deltas[i, 2], math.log(
+                    1000 / 16.0))) * anchor_loc[i, 0]
+            pred_bbox[i, 3] = math.exp(
+                min(variances[i, 3] * bbox_deltas[i, 3], math.log(
+                    1000 / 16.0))) * anchor_loc[i, 1]
    else:
        for i in range(bbox_deltas.shape[0]):
            pred_bbox[i, 0] = bbox_deltas[i, 0] * anchor_loc[i, 0] + anchor_loc[
                i, 2]
            pred_bbox[i, 1] = bbox_deltas[i, 1] * anchor_loc[i, 1] + anchor_loc[
                i, 3]
-            pred_bbox[i, 2] = math.exp(bbox_deltas[i, 2]) * anchor_loc[i, 0]
-            pred_bbox[i, 3] = math.exp(bbox_deltas[i, 3]) * anchor_loc[i, 1]
+            pred_bbox[i, 2] = math.exp(
+                min(bbox_deltas[i, 2], math.log(1000 / 16.0))) * anchor_loc[i,
+                                                                            0]
+            pred_bbox[i, 3] = math.exp(
+                min(bbox_deltas[i, 3], math.log(1000 / 16.0))) * anchor_loc[i,
+                                                                            1]

    proposals[:, 0] = pred_bbox[:, 0] - pred_bbox[:, 2] / 2
    proposals[:, 1] = pred_bbox[:, 1] - pred_bbox[:, 3] / 2
-    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2
-    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2
+    proposals[:, 2] = pred_bbox[:, 0] + pred_bbox[:, 2] / 2 - 1
+    proposals[:, 3] = pred_bbox[:, 1] + pred_bbox[:, 3] / 2 - 1

    return proposals

@@ -170,13 +176,16 @@ def filter_boxes(boxes, min_size, im_info):
    """Only keep boxes with both sides >= min_size and center within the image.
    """
    # Scale min_size to match image scale
-    min_size *= im_info[2]
+    im_scale = im_info[2]
+    min_size = max(min_size, 1.0)
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
+    ws_orig_scale = (boxes[:, 2] - boxes[:, 0]) / im_scale + 1
+    hs_orig_scale = (boxes[:, 3] - boxes[:, 1]) / im_scale + 1
    x_ctr = boxes[:, 0] + ws / 2.
    y_ctr = boxes[:, 1] + hs / 2.
-    keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_info[1]) &
-                    (y_ctr < im_info[0]))[0]
+    keep = np.where((ws_orig_scale >= min_size) & (hs_orig_scale >= min_size) &
+                    (x_ctr < im_info[1]) & (y_ctr < im_info[0]))[0]
    return keep


@@ -204,7 +213,7 @@ def iou(box_a, box_b):
    xb = min(xmax_a, xmax_b)
    yb = min(ymax_a, ymax_b)

-    inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0)
+    inter_area = max(xb - xa + 1, 0.0) * max(yb - ya + 1, 0.0)

    iou_ratio = inter_area / (area_a + area_b - inter_area)


--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
@@ -13,7 +13,7 @@
 # limitations under the License.

 from __future__ import print_function
-
+import os
 import paddle.fluid as fluid
 import paddle
 import numpy as np
@@ -41,6 +41,8 @@ class TestReaderReset(unittest.TestCase):
                self.data_file_name, reader, feeder)

    def setUp(self):
+        # set parallel threads to fit 20 batches in line 49
+        os.environ['CPU_NUM'] = str(20)
        self.use_cuda = fluid.core.is_compiled_with_cuda()
        self.data_file_name = './reader_reset_test.recordio'
        self.ins_shape = [3]

--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
@@ -19,48 +19,58 @@ import numpy as np
 import paddle.fluid.core as core
 from op_test import OpTest
 from test_anchor_generator_op import anchor_generator_in_python
-from test_generate_proposal_labels import _generate_groundtruth
-from test_generate_proposal_labels import _bbox_overlaps, _box_to_delta
-
-
-def rpn_target_assign(gt_anchor_iou, rpn_batch_size_per_im,
-                      rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
-    iou = np.transpose(gt_anchor_iou)
-    anchor_to_gt_max = iou.max(axis=1)
-    anchor_to_gt_argmax = iou.argmax(axis=1)
-
-    gt_to_anchor_argmax = iou.argmax(axis=0)
-    gt_to_anchor_max = iou[gt_to_anchor_argmax, np.arange(iou.shape[1])]
-    anchors_with_max_overlap = np.where(iou == gt_to_anchor_max)[0]
-
-    tgt_lbl = np.ones((iou.shape[0], ), dtype=np.int32) * -1
-    tgt_lbl[anchors_with_max_overlap] = 1
-    tgt_lbl[anchor_to_gt_max >= rpn_positive_overlap] = 1
-
-    num_fg = int(fg_fraction * rpn_batch_size_per_im)
-    fg_inds = np.where(tgt_lbl == 1)[0]
-    if len(fg_inds) > num_fg:
+from test_generate_proposal_labels_op import _generate_groundtruth
+from test_generate_proposal_labels_op import _bbox_overlaps, _box_to_delta
+
+
+def rpn_target_assign(anchor_by_gt_overlap,
+                      rpn_batch_size_per_im,
+                      rpn_positive_overlap,
+                      rpn_negative_overlap,
+                      rpn_fg_fraction,
+                      use_random=True):
+    anchor_to_gt_argmax = anchor_by_gt_overlap.argmax(axis=1)
+    anchor_to_gt_max = anchor_by_gt_overlap[np.arange(
+        anchor_by_gt_overlap.shape[0]), anchor_to_gt_argmax]
+
+    gt_to_anchor_argmax = anchor_by_gt_overlap.argmax(axis=0)
+    gt_to_anchor_max = anchor_by_gt_overlap[gt_to_anchor_argmax, np.arange(
+        anchor_by_gt_overlap.shape[1])]
+    anchors_with_max_overlap = np.where(
+        anchor_by_gt_overlap == gt_to_anchor_max)[0]
+
+    labels = np.ones((anchor_by_gt_overlap.shape[0], ), dtype=np.int32) * -1
+    labels[anchors_with_max_overlap] = 1
+    labels[anchor_to_gt_max >= rpn_positive_overlap] = 1
+
+    num_fg = int(rpn_fg_fraction * rpn_batch_size_per_im)
+    fg_inds = np.where(labels == 1)[0]
+    if len(fg_inds) > num_fg and use_random:
        disable_inds = np.random.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
-        tgt_lbl[disable_inds] = -1
-    fg_inds = np.where(tgt_lbl == 1)[0]
+    else:
+        disable_inds = fg_inds[num_fg:]
+    labels[disable_inds] = -1
+    fg_inds = np.where(labels == 1)[0]

-    num_bg = rpn_batch_size_per_im - np.sum(tgt_lbl == 1)
+    num_bg = rpn_batch_size_per_im - np.sum(labels == 1)
    bg_inds = np.where(anchor_to_gt_max < rpn_negative_overlap)[0]
-    tgt_lbl[bg_inds] = 0
-    if len(bg_inds) > num_bg:
+    if len(bg_inds) > num_bg and use_random:
        enable_inds = bg_inds[np.random.randint(len(bg_inds), size=num_bg)]
-        tgt_lbl[enable_inds] = 0
-    bg_inds = np.where(tgt_lbl == 0)[0]
-    tgt_lbl[bg_inds] = 0
+    else:
+        enable_inds = bg_inds[:num_bg]
+    labels[enable_inds] = 0
+    fg_inds = np.where(labels == 1)[0]
+    bg_inds = np.where(labels == 0)[0]

    loc_index = fg_inds
    score_index = np.hstack((fg_inds, bg_inds))
-    tgt_lbl = np.expand_dims(tgt_lbl, axis=1)
+    labels = labels[score_index]
+    assert not np.any(labels == -1), "Wrong labels with -1"

    gt_inds = anchor_to_gt_argmax[fg_inds]

-    return loc_index, score_index, tgt_lbl, gt_inds
+    return loc_index, score_index, labels, gt_inds


 def get_anchor(n, c, h, w):
@@ -75,85 +85,129 @@ def get_anchor(n, c, h, w):
    return anchors


-def rpn_blob(anchor, gt_boxes, iou, lod, rpn_batch_size_per_im,
-             rpn_positive_overlap, rpn_negative_overlap, fg_fraction):
-
-    loc_indexes = []
-    score_indexes = []
-    tmp_tgt_labels = []
-    tgt_bboxes = []
-    anchor_num = anchor.shape[0]
-
+def rpn_target_assign_in_python(all_anchors,
+                                gt_boxes,
+                                is_crowd,
+                                im_info,
+                                lod,
+                                rpn_straddle_thresh,
+                                rpn_batch_size_per_im,
+                                rpn_positive_overlap,
+                                rpn_negative_overlap,
+                                rpn_fg_fraction,
+                                use_random=True):
+    anchor_num = all_anchors.shape[0]
    batch_size = len(lod) - 1
    for i in range(batch_size):
+        im_height = im_info[i][0]
+        im_width = im_info[i][1]
+        im_scale = im_info[i][2]
+        if rpn_straddle_thresh >= 0:
+            # Only keep anchors inside the image by a margin of straddle_thresh
+            inds_inside = np.where(
+                (all_anchors[:, 0] >= -rpn_straddle_thresh) &
+                (all_anchors[:, 1] >= -rpn_straddle_thresh) & (
+                    all_anchors[:, 2] < im_width + rpn_straddle_thresh) & (
+                        all_anchors[:, 3] < im_height + rpn_straddle_thresh))[0]
+            # keep only inside anchors
+            inside_anchors = all_anchors[inds_inside, :]
+        else:
+            inds_inside = np.arange(all_anchors.shape[0])
+            inside_anchors = all_anchors
+
        b, e = lod[i], lod[i + 1]
-        iou_slice = iou[b:e, :]
-        bboxes_slice = gt_boxes[b:e, :]
+        gt_boxes_slice = gt_boxes[b:e, :] * im_scale
+        is_crowd_slice = is_crowd[b:e]
+
+        not_crowd_inds = np.where(is_crowd_slice == 0)[0]
+        gt_boxes_slice = gt_boxes_slice[not_crowd_inds]
+        iou = _bbox_overlaps(inside_anchors, gt_boxes_slice)

-        loc_idx, score_idx, tgt_lbl, gt_inds = rpn_target_assign(
-            iou_slice, rpn_batch_size_per_im, rpn_positive_overlap,
-            rpn_negative_overlap, fg_fraction)
+        loc_inds, score_inds, labels, gt_inds = rpn_target_assign(
+            iou, rpn_batch_size_per_im, rpn_positive_overlap,
+            rpn_negative_overlap, rpn_fg_fraction, use_random)
+        # unmap to all anchor 
+        loc_inds = inds_inside[loc_inds]
+        score_inds = inds_inside[score_inds]

-        fg_bboxes = bboxes_slice[gt_inds]
-        fg_anchors = anchor[loc_idx]
-        box_deltas = _box_to_delta(fg_anchors, fg_bboxes, [1., 1., 1., 1.])
+        sampled_gt = gt_boxes_slice[gt_inds]
+        sampled_anchor = all_anchors[loc_inds]
+        box_deltas = _box_to_delta(sampled_anchor, sampled_gt, [1., 1., 1., 1.])

        if i == 0:
-            loc_indexes = loc_idx
-            score_indexes = score_idx
-            tmp_tgt_labels = tgt_lbl
+            loc_indexes = loc_inds
+            score_indexes = score_inds
+            tgt_labels = labels
            tgt_bboxes = box_deltas
        else:
            loc_indexes = np.concatenate(
-                [loc_indexes, loc_idx + i * anchor_num])
+                [loc_indexes, loc_inds + i * anchor_num])
            score_indexes = np.concatenate(
-                [score_indexes, score_idx + i * anchor_num])
-            tmp_tgt_labels = np.concatenate([tmp_tgt_labels, tgt_lbl])
+                [score_indexes, score_inds + i * anchor_num])
+            tgt_labels = np.concatenate([tgt_labels, labels])
            tgt_bboxes = np.vstack([tgt_bboxes, box_deltas])

-    tgt_labels = tmp_tgt_labels[score_indexes]
    return loc_indexes, score_indexes, tgt_bboxes, tgt_labels


 class TestRpnTargetAssignOp(OpTest):
    def setUp(self):
        n, c, h, w = 2, 4, 14, 14
-        anchor = get_anchor(n, c, h, w)
+        all_anchors = get_anchor(n, c, h, w)
        gt_num = 10
-        anchor = anchor.reshape(-1, 4)
-        anchor_num = anchor.shape[0]
-
-        im_shapes = [[64, 64], [64, 64]]
-        gt_box, lod = _generate_groundtruth(im_shapes, 3, 4)
-        bbox = np.vstack([v['boxes'] for v in gt_box])
-
-        iou = _bbox_overlaps(bbox, anchor)
-
-        anchor = anchor.astype('float32')
-        bbox = bbox.astype('float32')
-        iou = iou.astype('float32')
-
-        loc_index, score_index, tgt_bbox, tgt_lbl = rpn_blob(
-            anchor, bbox, iou, [0, 4, 8], 25600, 0.95, 0.03, 0.25)
+        all_anchors = all_anchors.reshape(-1, 4)
+        anchor_num = all_anchors.shape[0]
+
+        images_shape = [[64, 64], [64, 64]]
+        #images_shape = [[64, 64]]
+        groundtruth, lod = _generate_groundtruth(images_shape, 3, 4)
+        lod = [0, 4, 8]
+        #lod = [0, 4]
+
+        im_info = np.ones((len(images_shape), 3)).astype(np.float32)
+        for i in range(len(images_shape)):
+            im_info[i, 0] = images_shape[i][0]
+            im_info[i, 1] = images_shape[i][1]
+            im_info[i, 2] = 0.8  #scale
+        gt_boxes = np.vstack([v['boxes'] for v in groundtruth])
+        is_crowd = np.hstack([v['is_crowd'] for v in groundtruth])
+
+        all_anchors = all_anchors.astype('float32')
+        gt_boxes = gt_boxes.astype('float32')
+
+        rpn_straddle_thresh = 0.0
+        rpn_batch_size_per_im = 256
+        rpn_positive_overlap = 0.7
+        rpn_negative_overlap = 0.3
+        rpn_fg_fraction = 0.5
+        use_random = False
+
+        loc_index, score_index, tgt_bbox, labels = rpn_target_assign_in_python(
+            all_anchors, gt_boxes, is_crowd, im_info, lod, rpn_straddle_thresh,
+            rpn_batch_size_per_im, rpn_positive_overlap, rpn_negative_overlap,
+            rpn_fg_fraction, use_random)
+        labels = labels[:, np.newaxis]

        self.op_type = "rpn_target_assign"
        self.inputs = {
-            'Anchor': anchor,
-            'GtBox': (bbox, [[4, 4]]),
-            'DistMat': (iou, [[4, 4]]),
+            'Anchor': all_anchors,
+            'GtBoxes': (gt_boxes, [[4, 4]]),
+            'IsCrowd': (is_crowd, [[4, 4]]),
+            'ImInfo': (im_info, [[1, 1]])
        }
        self.attrs = {
-            'rpn_batch_size_per_im': 25600,
-            'rpn_positive_overlap': 0.95,
-            'rpn_negative_overlap': 0.03,
-            'fg_fraction': 0.25,
-            'fix_seed': True
+            'rpn_batch_size_per_im': rpn_batch_size_per_im,
+            'rpn_straddle_thresh': rpn_straddle_thresh,
+            'rpn_positive_overlap': rpn_positive_overlap,
+            'rpn_negative_overlap': rpn_negative_overlap,
+            'rpn_fg_fraction': rpn_fg_fraction,
+            'use_random': use_random
        }
        self.outputs = {
            'LocationIndex': loc_index.astype('int32'),
            'ScoreIndex': score_index.astype('int32'),
            'TargetBBox': tgt_bbox.astype('float32'),
-            'TargetLabel': tgt_lbl.astype('int64'),
+            'TargetLabel': labels.astype('int32')
        }

    def test_check_output(self):

--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -41,6 +41,9 @@ class TestSliceOp(OpTest):
    def test_check_output(self):
        self.check_output()

+    def test_check_grad_normal(self):
+        self.check_grad(['Input'], 'Out', max_relative_error=0.006)
+

 class TestCase1(TestSliceOp):
    def config(self):

--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_op.py
@@ -30,8 +30,10 @@ class TestWhileOp(unittest.TestCase):
            "d1", shape=[10], append_batch_size=False, dtype='float32')
        d2 = layers.data(
            "d2", shape=[10], append_batch_size=False, dtype='float32')
+
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True
+
        init = layers.zeros(shape=[10], dtype='float32')
        mem_array = layers.array_write(x=init, i=i)
        data_array = layers.array_write(x=d0, i=i)
@@ -45,11 +47,19 @@ class TestWhileOp(unittest.TestCase):
        i = layers.zeros(shape=[1], dtype='int64')
        i.stop_gradient = True

-        array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
+        array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
        array_len.stop_gradient = True
        cond = layers.less_than(x=i, y=array_len)

+        j = layers.fill_constant(shape=[1], dtype='int64', value=1)
+        j.stop_gradient = True
+
+        array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
+        array_len2.stop_gradient = True
+        cond2 = layers.less_than(x=j, y=array_len2)
+
        while_op = layers.While(cond=cond)
+        while_op2 = layers.While(cond=cond2)
        with while_op.block():
            d = layers.array_read(array=data_array, i=i)
            prev = layers.array_read(array=mem_array, i=i)
@@ -59,7 +69,16 @@ class TestWhileOp(unittest.TestCase):
            layers.array_write(result, i=i, array=mem_array)
            layers.less_than(x=i, y=array_len, cond=cond)

-        sum_result = layers.array_read(array=mem_array, i=i)
+            with while_op2.block():
+                d2 = layers.array_read(array=data_array, i=j)
+                prev2 = layers.array_read(array=mem_array, i=j)
+                result2 = layers.sums(input=[d2, prev2])
+
+                j = layers.increment(x=j, in_place=True)
+                layers.array_write(result2, i=j, array=mem_array)
+                layers.less_than(x=j, y=array_len2, cond=cond2)
+
+        sum_result = layers.array_read(array=mem_array, i=j)
        loss = layers.mean(sum_result)

        append_backward(loss)

--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
@@ -16,3 +16,4 @@ from __future__ import print_function

 from .program_utils import *
 from .ufind import *
+from .checkport import *
--- a/python/paddle/fluid/transpiler/details/checkport.py
+++ b/python/paddle/fluid/transpiler/details/checkport.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import time
+import socket
+from contextlib import closing
+
+
+def wait_server_ready(endpoints):
+    """
+    Block until every parameter server endpoint accepts TCP connections.
+
+    Each endpoint is probed with ``socket.connect_ex`` using a 2 second
+    timeout. While at least one endpoint cannot be reached, the endpoints
+    that failed are reported on stderr and every endpoint is probed again
+    after a 3 second pause. There is no overall timeout: the function only
+    returns once all endpoints are reachable in the same round.
+
+    Args:
+        endpoints (list): endpoints string list, like:
+                         ["127.0.0.1:8080", "127.0.0.1:8081"]
+
+    Examples:
+        .. code-block:: python
+
+           wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"])
+    """
+    while True:
+        not_ready_endpoints = []
+        for ep in endpoints:
+            ip_port = ep.split(":")
+            with closing(socket.socket(socket.AF_INET,
+                                       socket.SOCK_STREAM)) as sock:
+                sock.settimeout(2)
+                # connect_ex reports a failed connection through its return
+                # value (non-zero) instead of raising, so 0 means "ready".
+                result = sock.connect_ex((ip_port[0], int(ip_port[1])))
+                if result != 0:
+                    not_ready_endpoints.append(ep)
+        if not not_ready_endpoints:
+            break
+        sys.stderr.write("pserver not ready, wait 3 sec to retry...\n")
+        # Name the unreachable endpoints so a stuck trainer can be diagnosed.
+        sys.stderr.write("not ready endpoints: " + str(not_ready_endpoints) +
+                         "\n")
+        sys.stderr.flush()
+        time.sleep(3)
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
@@ -113,27 +113,32 @@ def op_to_code(op):
            inputs_str += ", "
    inputs_str += "}"

+    attr_names = sorted(op.attr_names)
    attrs_str = ""
-    for i in range(0, len(op.attr_names)):
-        name = op.attr_names[i]
+    for i in range(0, len(attr_names)):
+        name = attr_names[i]

        attr_type = op.desc.attr_type(name)
        if attr_type == core.AttrType.BLOCK:
            a = "{name} = block[{value}]".format(
                name=name, type=attr_type, value=op.block_attr_id(name))
            attrs_str += a
+            if i != len(attr_names) - 1:
+                attrs_str += ", "
            continue

        if attr_type == core.AttrType.BLOCKS:
            a = "{name} = blocks{value}".format(
                name=name, type=attr_type, value=op.blocks_attr_ids(name))
            attrs_str += a
+            if i != len(attr_names) - 1:
+                attrs_str += ", "
            continue

        a = "{name} = {value}".format(
            name=name, type=attr_type, value=op.desc.attr(name))
        attrs_str += a
-        if i != len(op.attr_names) - 1:
+        if i != len(attr_names) - 1:
            attrs_str += ", "

    if outputs_str != "{}":

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -381,7 +381,7 @@ class DistributeTranspiler(object):
                                                        pserver_endpoints)
            self._split_table_grad_and_add_send_vars(program, pserver_endpoints)

-    def get_trainer_program(self):
+    def get_trainer_program(self, wait_port=True):
        """
        Get transpiled trainer side program.

@@ -393,6 +393,9 @@ class DistributeTranspiler(object):
        delete_ops(self.origin_program.global_block(), self.optimize_ops)
        self.origin_program.__str__()

+        if wait_port:
+            wait_server_ready(self.pserver_endpoints)
+
        return self.origin_program

    def _get_trainer_startup_program(self, recv_vars, eplist):

--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -65,8 +65,43 @@ class InferenceTranspiler(object):
        if use_mkldnn:
            self._fuse_conv_bias_mkldnn(program)
            self._fuse_conv_relu_mkldnn(program)
+            self._fuse_conv_eltwise_mkldnn(program)
+            self._fuse_conv_relu_mkldnn(
+                program)  # ResNet residual block merging
            self._fuse_bn_relu_mkldnn(program)

+    def _fuse_conv_eltwise_mkldnn(self, program):
+        '''
+        Fuse elementwise_add into the preceding conv2d for MKLDNN programs.
+        An elementwise_add directly following a convolution OP is fused by
+        setting the 'fuse_eltwise' attribute on the convolution OP and
+        replacing its output Tensor with the second parameter of
+        elementwise_add. The elementwise_add OP is then removed.
+        The result of fuse is:
+            - before: conv->elementwise_add->any_other_op
+            - after:  conv->any_other_op
+        :param program: program to transpile
+        :type program: Program
+        '''
+        self.block = program.block(0)
+
+        i = 0
+        # Stop one short: the last op has no successor, so ops[i + 1] would fail.
+        while i < len(self.block.ops) - 1:
+            current_op = self.block.ops[i]
+            if current_op.type == 'conv2d':
+                next_op = self.block.ops[i + 1]
+                if next_op.type == 'elementwise_add':
+                    self._fuse_conv_eltwise(current_op, next_op)
+                    self.block._remove_op(i + 1)  # Remove elementwise_add
+            i = i + 1
+        self._adjust_input()
+        self._remove_unused_var()
+        # TODO(luotao): use clone() method to flush the program.desc in force,
+        # since some large program.desc will not be flushed immediately.
+        # And a better solution will be considered later.
+        program = program.clone()
+
    def _fuse_conv_relu_mkldnn(self, program):
        '''
        Transpile the program by fused relu activation for MKLDNN program.
@@ -88,9 +123,9 @@ class InferenceTranspiler(object):
            if current_op.type in ['conv2d']:
                next_op = self.block.ops[i + 1]
                if next_op.type == 'relu':
-                    # modify conv OP to include relu
+                    # modify conv OP to include relu
                    current_op.set_attr("fuse_relu", True)
-                    # remove conv OP
+                    # remove relu OP
                    self.block._remove_op(i + 1)
            i = i + 1

@@ -409,6 +444,20 @@ class InferenceTranspiler(object):
            outputs={"Output": out_var},
            attrs=attrs)

+    def _fuse_conv_eltwise(self, conv_op, eltwise_op):
+        '''
+        Flag conv_op as fused with eltwise_op and record the input renames.
+
+        :param conv_op: convolution operator
+        :type conv_op: Operator
+        :param eltwise_op: operator adding data from skip connection
+        :type eltwise_op: Operator
+        '''
+        residual = eltwise_op.input("Y")[0]
+        conv_op.set_attr("fuse_eltwise", True)
+        self.input_map[conv_op.output("Output")[0]] = residual
+        self.input_map[eltwise_op.output("Out")[0]] = residual
+
    def _adjust_input(self):
        for i in range(len(self.block.ops)):
            current_op = self.block.ops[i]