Commit 2c4fcaa6 authored by: S sneaxiy

merge develop

......@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in the IPluginFactory class of NvInfer.h, otherwise it will not work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
tar -xz -C /usr/local && \
RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
cp -rf /usr/local/TensorRT/include /usr && \
cp -rf /usr/local/TensorRT/lib /usr
......
......@@ -238,7 +238,7 @@ paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], var
paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae'))
paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8'))
paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4'))
paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '60cb8f843d625abf33f8bf12455b8f99'))
paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3'))
paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb'))
paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535'))
paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816'))
......@@ -262,7 +262,7 @@ paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keyword
paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77'))
paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713'))
paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823'))
paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
......@@ -287,7 +287,7 @@ paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=N
paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '29a25ba78de79152076cacfc5443137d'))
paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
......@@ -329,6 +329,8 @@ paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varar
paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f'))
paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd'))
paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47'))
paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51'))
......
......@@ -20,6 +20,9 @@
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
......@@ -302,7 +305,10 @@ std::string OrderedSet::ToString() const {
bool NodeCanReused(ir::Node* node) {
// validate that the node is a var node
if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
if (node == nullptr || !node->IsVar() || node->IsCtrlVar() ||
node->Name() == kEmptyVarName)
return false;
bool flag = true;
// Op outputs forced to be generated on the CPU cannot be reused.
......@@ -348,10 +354,6 @@ bool NodeCanReused(const VarDesc& node) {
if (shape.empty() || size < MinChunkSize()) {
return false;
}
// vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
std::string name = node.Name();
if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
return false;
return true;
}
......
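For context, the two hunks above consolidate the special-name check for memory reuse: the ir::Node overload now rejects kEmptyVarName directly, while the old '@...@' name test is removed from the VarDesc overload. A standalone sketch of that name predicate (the helper name is illustrative, and kEmptyVarName is assumed to be "@EMPTY@" as in framework/operator.h):

#include <string>

// Stand-in for paddle::framework::kEmptyVarName (assumed to be "@EMPTY@").
static const char kEmptyVarNameSketch[] = "@EMPTY@";

// Illustrative helper: true for names such as @EMPTY@ or @LR_DECAY_REUSE_ID@
// that must never be picked up by the memory-reuse pass.
bool IsReuseForbiddenName(const std::string& name) {
  if (name == kEmptyVarNameSketch) return true;
  return !name.empty() && name.front() == '@' && name.back() == '@';
}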
......@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
......@@ -29,6 +31,11 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
auto &g = graphs.back();
g->Set(kGraphVars, new GraphVars(1UL));
g->Set(kGraphDepVars, new GraphDepVars);
auto &stale_ops =
graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);
g->Erase(details::kStaleProgramOpDescs);
g->Set<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs,
new std::vector<OpDesc *>(stale_ops));
}
auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
......
......@@ -14,7 +14,6 @@
#pragma once
#include <array>
#include <deque>
#include "paddle/fluid/platform/enforce.h"
......@@ -22,7 +21,7 @@ namespace paddle {
namespace framework {
template <typename T, size_t N>
class SmallStack {
class InlinedStack {
static_assert(N > 0, "N must be larger than 0");
public:
......@@ -66,8 +65,8 @@ class SmallStack {
private:
T head_[N];
size_t size_{0};
std::deque<T> tail_;
size_t size_;
};
} // namespace framework
......
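The hunk above renames SmallStack to InlinedStack: the first N elements live in the inline head_ array and any overflow spills into the std::deque tail_. A self-contained sketch of that storage scheme, independent of the Paddle class (names are illustrative):

#include <cstddef>
#include <deque>

template <typename T, size_t N>
class InlinedStackSketch {
  static_assert(N > 0, "N must be larger than 0");

 public:
  void push(const T& v) {
    if (size_ < N) {
      head_[size_] = v;    // still fits in the inline buffer
    } else {
      tail_.push_back(v);  // spill to the heap-backed deque
    }
    ++size_;
  }

  void pop() {
    if (size_ > N) tail_.pop_back();
    --size_;
  }

  const T& top() const {
    return size_ <= N ? head_[size_ - 1] : tail_.back();
  }

  size_t size() const { return size_; }

 private:
  T head_[N];
  std::deque<T> tail_;
  size_t size_{0};
};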
......@@ -14,6 +14,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
......@@ -24,6 +25,10 @@ namespace ir {
static const char kParamScopeAttr[] = "__param_scope__";
static const char kFuseStatisAttr[] = "__fuse_statis__";
// When we use TRT or another third-party lib, the parameters are managed by
// that lib rather than by fluid, so we need to record them to avoid duplicate
// allocation.
static const char kRepetitiveParamAttr[] = "__repetitive_param__";
enum FuseOptions {
DO_NOT_FUSE, // fusing will not be done
......
......@@ -130,15 +130,21 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>();
}
std::vector<ir::Node *> nodes;
for (auto &var : n->inputs) {
for (auto &adj_n : var->inputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
adj_list[n].insert(adj_n);
nodes.push_back(adj_n);
}
}
std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
return node1->id() > node2->id();
});
adj_list[n].insert(std::make_move_iterator(nodes.begin()),
std::make_move_iterator(nodes.end()));
}
return adj_list;
}
......
......@@ -467,12 +467,6 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
return it->second.empty() ? nullptr : it->second[0];
}
const Variable* ExecutionContext::LegacyInputVar(
const std::string& name) const {
auto ipt = op_.Input(name);
return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
}
Variable* ExecutionContext::OutputVar(const std::string& name) const {
auto it = ctx_.outputs.find(name);
if (it == ctx_.outputs.end()) return nullptr;
......@@ -483,22 +477,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
return it->second.empty() ? nullptr : it->second[0];
}
Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
auto opt = op_.Output(name);
return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
}
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
return Input<LoDTensor>(name);
}
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const {
return LegacyInput<LoDTensor>(name);
}
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const {
......@@ -521,35 +504,11 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
return res;
}
template <>
const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
const std::string& name) const {
auto names = op().Inputs(name);
std::vector<const Tensor*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> const Tensor* {
auto var = scope_.FindVar(sub_name);
if (var == nullptr) return nullptr;
PADDLE_ENFORCE(
var->IsType<LoDTensor>(),
"%s should be LoDTensor, but the received type is %s",
sub_name, ToTypeName(var->Type()));
return &(var->Get<LoDTensor>());
});
return res;
}
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
return Output<LoDTensor>(name);
}
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const {
return LegacyOutput<LoDTensor>(name);
}
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const {
......
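With the Legacy* accessors deleted above, kernels read their variables only through the name-indexed ExecutionContext lookups (Input/Output/MultiInput/MultiOutput), which resolve against ctx_.inputs/outputs rather than going back to the scope. A hedged sketch of the usual kernel pattern on top of that interface (the op and slot names are illustrative):

#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/operator.h"

namespace paddle {
namespace operators {

template <typename T>
class CopyLikeKernelSketch : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // Name-based lookup; returns nullptr when the slot is not bound.
    auto* x = ctx.Input<framework::LoDTensor>("X");
    auto* out = ctx.Output<framework::LoDTensor>("Out");
    out->Resize(x->dims());
    out->mutable_data<T>(ctx.GetPlace());
    // ... the element-wise body of the kernel would go here ...
  }
};

}  // namespace operators
}  // namespace paddle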
......@@ -16,9 +16,11 @@ limitations under the License. */
#include <algorithm>
#include <atomic>
#include <memory>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "glog/logging.h" // For VLOG
......@@ -253,31 +255,6 @@ class ExecutionContext {
return it->second;
}
const std::vector<Variable*> LegacyMultiInputVar(
const std::string& name) const {
auto names = op_.Inputs(name);
std::vector<Variable*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) {
return name == kEmptyVarName ? nullptr
: scope_.FindVar(name);
});
return res;
}
std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector<Variable*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[this](const std::string& name) {
return name == kEmptyVarName ? nullptr
: scope_.FindVar(name);
});
return res;
}
template <typename T>
const T* Input(const std::string& name) const {
auto* var = InputVar(name);
......@@ -290,22 +267,6 @@ class ExecutionContext {
return var == nullptr ? nullptr : var->GetMutable<T>();
}
template <typename T>
const T* LegacyInput(const std::string& name) const {
auto* var = LegacyInputVar(name);
return var == nullptr ? nullptr : &var->Get<T>();
}
template <typename T>
T* LegacyOutput(const std::string& name) const {
auto var = LegacyOutputVar(name);
return var == nullptr ? nullptr : var->GetMutable<T>();
}
const Variable* LegacyInputVar(const std::string& name) const;
Variable* LegacyOutputVar(const std::string& name) const;
template <typename T>
const std::vector<const T*> MultiInput(const std::string& name) const {
auto it = ctx_.inputs.find(name);
......@@ -338,32 +299,6 @@ class ExecutionContext {
return res;
}
template <typename T>
const std::vector<const T*> LegacyMultiInput(const std::string& name) const {
auto names = op_.Inputs(name);
std::vector<const T*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> const T* {
auto var = scope_.FindVar(sub_name);
return var == nullptr ? nullptr : &var->Get<T>();
});
return res;
}
template <typename T>
std::vector<T*> LegacyMultiOutput(const std::string& name) const {
auto names = op_.Outputs(name);
std::vector<T*> res;
res.reserve(names.size());
std::transform(names.begin(), names.end(), std::back_inserter(res),
[&](const std::string& sub_name) -> T* {
auto var = scope_.FindVar(sub_name);
return var == nullptr ? nullptr : var->GetMutable<T>();
});
return res;
}
platform::Place GetPlace() const { return device_context_.GetPlace(); }
template <typename DeviceContextType>
......@@ -433,24 +368,13 @@ class ExecutionContext {
template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
template <>
const Tensor* ExecutionContext::LegacyInput<Tensor>(
const std::string& name) const;
template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
const std::string& name) const;
template <>
const std::vector<const Tensor*> ExecutionContext::LegacyMultiInput<Tensor>(
const std::string& name) const;
template <>
Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
template <>
Tensor* ExecutionContext::LegacyOutput<Tensor>(const std::string& name) const;
template <>
std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
const std::string& name) const;
......
......@@ -23,8 +23,12 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
......@@ -133,6 +137,8 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
AnalysisConfig::Precision);
DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
bool);
// Memory optimized related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
......
......@@ -17,10 +17,12 @@ limitations under the License. */
#include <sys/stat.h>
#include <cstdio>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
......@@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
return "";
}
static std::string GetTrtEngineSerializedPath(const std::string &model_root,
const std::string &engine_key) {
return model_root + "/trt_serialized_" + engine_key;
}
static std::string GetTrtEngineSerializedData(
const std::string &model_opt_cache_dir, const std::string &engine_key) {
std::string trt_serialized_path =
GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
if (FileExists(trt_serialized_path)) {
VLOG(3) << "Trt serialized file: " << trt_serialized_path
<< "is found here";
std::ifstream infile(trt_serialized_path, std::ios::in);
std::stringstream buffer;
buffer << infile.rdbuf();
std::string trt_engine_serialized_data(buffer.str());
return trt_engine_serialized_data;
}
return "";
}
static void SaveTrtEngineSerializedDataToFile(
const std::string &trt_serialized_path,
const std::string &engine_serialized_data) {
std::ofstream outfile(trt_serialized_path);
outfile << engine_serialized_data;
outfile.close();
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......
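The helpers added above implement a plain file cache for serialized TensorRT engines, keyed by engine_key under model_opt_cache_dir; GetTrtEngineSerializedData returns an empty string on a cache miss. A hedged sketch of the intended round trip (directory, key and payload are illustrative):

#include <string>

#include "paddle/fluid/inference/analysis/helper.h"

std::string LoadOrStoreEngineSketch(const std::string& cache_dir,
                                    const std::string& engine_key,
                                    const std::string& serialized) {
  namespace analysis = paddle::inference::analysis;
  std::string cached =
      analysis::GetTrtEngineSerializedData(cache_dir, engine_key);
  if (cached.empty()) {
    // First run: nothing cached yet, persist the freshly built engine.
    analysis::SaveTrtEngineSerializedDataToFile(
        analysis::GetTrtEngineSerializedPath(cache_dir, engine_key),
        serialized);
    return serialized;
  }
  // Later runs deserialize the cached bytes instead of rebuilding the engine.
  return cached;
}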
......@@ -81,6 +81,9 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set(
"model_opt_cache_dir",
new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine",
new bool(argument->tensorrt_use_static_engine()));
}
pre_pass = pass_name;
......
......@@ -22,7 +22,10 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
......
......@@ -14,13 +14,13 @@
#include <algorithm>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h"
......@@ -33,8 +33,15 @@ using framework::ir::Node;
std::vector<std::string> ExtractParameters(
const std::unordered_set<Node *> &nodes);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map);
std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
std::unique_ptr<framework::ir::Graph> graph) const {
framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
......@@ -47,9 +54,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
Get<int>("min_subgraph_size") /*min subgraph size*/);
fuser();
std::vector<std::string> graph_param_names =
ExtractParameters(graph->Nodes());
// Those parameters already exist in TRT and should not have another copy in
// fluid.
std::vector<std::string> repetitive_params;
for (auto *node : graph->Nodes()) {
if (node->IsOp() && !Agent(node).subgraph()->empty()) {
CreateTensorRTOp(node, graph.get());
CreateTensorRTOp(node, graph.get(), graph_param_names,
&repetitive_params);
std::unordered_set<const Node *> nodes2remove(
Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
......@@ -64,12 +78,15 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
}
}
framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
graph->Set(framework::ir::kRepetitiveParamAttr,
new std::vector<std::string>(repetitive_params));
return graph;
}
std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
const std::set<std::string> &engine_outputs) {
const std::set<std::string> &engine_outputs,
const std::string &predictor_id) {
std::string engine_hash_key = "";
for (auto name : engine_inputs) {
engine_hash_key += name;
......@@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
for (auto name : engine_outputs) {
engine_hash_key += name;
}
engine_hash_key += predictor_id;
auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
return engine_key;
}
void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
Graph *graph) const {
void TensorRtSubgraphPass::CreateTensorRTOp(
framework::ir::Node *node, Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const {
auto *op_desc = node->Op();
auto &subgraph = *Agent(node).subgraph();
PADDLE_ENFORCE(!subgraph.empty());
......@@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// is unique.
std::set<std::string> input_names;
std::set<std::string> input_names_with_id;
std::vector<std::string> params;
// The node->inputs contains input tensors and parameters.
for (auto *x : node->inputs) {
input_names.insert(x->Name());
input_names_with_id.insert(x->Name() + std::to_string(x->id()));
if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
params.push_back(x->Name());
}
}
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
std::set<std::string> output_names;
std::set<std::string> output_names_with_id;
......@@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
output_names_with_id.insert(x->Name() + std::to_string(x->id()));
}
op_desc->SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
op_desc->SetType("tensorrt_engine");
std::unordered_map<std::string, std::string> output_name_map;
auto &subgraph_nodes = *Agent(node).subgraph();
// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
......@@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
// input of an OP but also the output of an OP, there will be problems.
// So we have to rename the variables in the subgraph to make sure
// each one is either an OP's input or an OP's output.
auto &subgraph_nodes = *Agent(node).subgraph();
for (size_t index = 0; index < block_desc.OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id.count(arg_value_with_id)) {
output_name_map[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
&output_names_with_id, &output_names, &output_name_map);
// When tensorrt engine runs at the end of the operation,
// output_mapping help us copy the data from the renamed ITensor
......@@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
PADDLE_ENFORCE(output_name_map.count(name) != 0);
output_mapping.push_back(output_name_map[name]);
}
PADDLE_ENFORCE(!output_mapping.empty());
auto *vars = block_desc.Proto()->mutable_vars();
for (framework::ir::Node *node : graph->Nodes()) {
......@@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty());
// Set attrs
op_desc->SetType("tensorrt_engine");
op_desc->SetInput(
"Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
op_desc->SetOutput(
"Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
op_desc->SetBlockAttr("sub_block", new_block);
SetAttr(op_desc->Proto(), "subgraph",
block_desc.Proto()->SerializeAsString());
// Set attrs
SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
SetAttr(op_desc->Proto(), "parameters", params);
auto enable_int8 = Get<bool>("enable_int8");
auto engine_key =
GenerateEngineKey(input_names_with_id, output_names_with_id);
auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0));
// Get "" when there is no cached calibration table data.
std::string calibration_data = GetTrtCalibTableData(
Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "engine_key", engine_key);
SetAttr(op_desc->Proto(), "engine_serialized_data", std::string(""));
std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
if (enable_int8 && calibration_data.size() != 0) {
calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
}
bool use_static_engine = Get<bool>("use_static_engine");
// When in int8 mode and calibration mode, the program just produces the
// calibration table data.
bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
if (!calibration_mode && use_static_engine) {
std::copy(params.begin(), params.end(),
std::back_inserter(*repetitive_params));
std::string trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
if (trt_engine_serialized_data.empty()) {
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
new tensorrt::TensorRTEngine(
Get<int>("max_batch_size"), Get<int>("workspace_size"),
enable_int8, calibrator.get(), Get<int>("gpu_device_id")));
auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
std::unordered_set<std::string> param_set(params.begin(), params.end());
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine.get());
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
trt_engine_serialized_data =
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
SaveTrtEngineSerializedDataToFile(
GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
engine_key),
trt_engine_serialized_data);
} else {
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
}
}
std::vector<std::string> ExtractParameters(
......@@ -253,7 +279,7 @@ std::vector<std::string> ExtractParameters(
for (const auto &node : nodes) {
if (!node->IsOp()) continue;
std::string op_type = node->Op()->Type();
if (op_type == "feed") {
if (op_type == "feed" || op_type == "fetch") {
std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
std::copy(output_names.begin(), output_names.end(),
std::back_inserter(feed_outputs));
......@@ -272,6 +298,99 @@ std::vector<std::string> ExtractParameters(
return parameters;
}
void RenameAndGetOutputs(
const std::vector<framework::ir::Node *> &subgraph_nodes,
framework::BlockDesc *block_desc,
const std::set<std::string> &input_names_with_id,
std::set<std::string> *output_names_with_id,
std::set<std::string> *output_names,
std::unordered_map<std::string, std::string> *output_name_map) {
// In the normal case, paddle-trt has a bug when running GoogLeNet:
// when there are more than two 1 * 1 convolutions with the same input,
// paddle-tensorrt performs a merging optimization that fuses those convs
// into one conv and then triggers the bug. So we use a strategy to avoid
// this optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
same_hierarchy_conv2d_num_map;
for (size_t index = 0; index < block_desc->OpSize(); ++index) {
framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
framework::OpDesc op_desc(*op, nullptr);
auto correspond_node = subgraph_nodes[index];
PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
std::unordered_map<std::string, size_t> var2id;
std::unordered_map<std::string, framework::ir::Node *> in_vars;
for (auto *in_var : correspond_node->inputs) {
var2id[in_var->Name()] = in_var->id();
in_vars[in_var->Name()] = in_var;
}
// rename for the input variables of op inside subgraph
for (int i = 0; i < op->inputs_size(); i++) {
// one input
auto *in_var = op->mutable_inputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments
std::string arg_value = in_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (input_names_with_id.count(arg_value_with_id)) {
replaced_names.push_back(arg_value);
} else {
replaced_names.push_back(arg_value_with_id);
}
}
in_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
in_var->add_arguments(replaced_names[k]);
}
}
var2id.clear();
for (auto out_var : correspond_node->outputs) {
var2id[out_var->Name()] = out_var->id();
}
if (op_desc.Type() == "conv2d") {
auto input_var_name = op_desc.Input("Input").front();
auto filter_var_name = op_desc.Input("Filter").front();
auto out_var_name = op_desc.Output("Output").front();
auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
const std::vector<int> strides =
boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
const std::vector<int> paddings =
boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
(*output_names_with_id)
.insert(out_var_name + std::to_string(var2id[out_var_name]));
(*output_names).insert(out_var_name);
} else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
paddings[1] == 0) {
same_hierarchy_conv2d_num_map[input_var_name] += 1;
}
}
// rename for the output variables of op inside subgraph
for (int i = 0; i < op->outputs_size(); i++) {
framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
std::vector<std::string> replaced_names;
for (int k = 0; k < out_var->arguments_size(); k++) {
std::string arg_value = out_var->arguments(k);
std::string arg_value_with_id =
arg_value + std::to_string(var2id[arg_value]);
if (output_names_with_id->count(arg_value_with_id)) {
(*output_name_map)[arg_value] = arg_value_with_id;
}
replaced_names.push_back(arg_value_with_id);
}
out_var->clear_arguments();
for (size_t k = 0; k < replaced_names.size(); k++) {
out_var->add_arguments(replaced_names[k]);
}
}
}
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......
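The RenameAndGetOutputs routine above disambiguates variables inside the fused subgraph by appending each graph node's id to the variable name, while engine inputs keep their original names; output_name_map records original → renamed pairs so the tensorrt_engine op can copy results back afterwards. A small standalone illustration of that naming scheme (plain C++; the node id and names are made up):

#include <string>
#include <unordered_map>

// Illustrative only: mirrors the "name + node id" suffixing used in
// RenameAndGetOutputs. A subgraph-internal tensor "conv_out" produced by
// graph node 42 becomes "conv_out42"; engine inputs keep their names.
std::string RenamedArg(const std::string& arg, int node_id,
                       bool is_engine_input) {
  return is_engine_input ? arg : arg + std::to_string(node_id);
}

void RenameExampleSketch() {
  std::unordered_map<std::string, std::string> output_name_map;
  // "conv_out" is an engine output produced by node 42.
  output_name_map["conv_out"] = RenamedArg("conv_out", 42, false);
  // output_name_map["conv_out"] == "conv_out42"; the tensorrt_engine op
  // later uses this mapping to copy the renamed ITensor back to "conv_out".
}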
......@@ -13,7 +13,12 @@
// limitations under the License.
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
......@@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
std::unique_ptr<framework::ir::Graph> graph) const override;
private:
void CreateTensorRTOp(framework::ir::Node *x,
framework::ir::Graph *graph) const;
void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
const std::vector<std::string> &graph_params,
std::vector<std::string> *repetitive_params) const;
void CleanIntermediateOutputs(framework::ir::Node *node);
};
......
......@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return;
auto &graph = argument->main_graph();
std::vector<std::string> repetitive_params;
if (graph.Has(framework::ir::kRepetitiveParamAttr))
repetitive_params = graph.Get<std::vector<std::string>>(
framework::ir::kRepetitiveParamAttr);
LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE(argument->gpu_device_id_valid());
......@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// Because there are cases where new parameter variables are not added to
// the program during the analysis pass.
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
continue;
}
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE(var != nullptr);
if (var->IsType<framework::LoDTensor>() ||
......
......@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
......
......@@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_);
CP_MEMBER(trt_use_static_engine_);
// MKLDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
......@@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() {
void AnalysisConfig::EnableTensorRtEngine(
int workspace_size, int max_batch_size, int min_subgraph_size,
AnalysisConfig::Precision precision_mode) {
AnalysisConfig::Precision precision_mode, bool use_static) {
#ifdef PADDLE_WITH_CUDA
if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
......@@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine(
tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size;
tensorrt_precision_mode_ = precision_mode;
trt_use_static_engine_ = use_static;
Update();
#else
......
......@@ -183,6 +183,9 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
}
VLOG(3) << "Predictor::predict";
inference::Timer timer;
timer.tic();
......@@ -362,6 +365,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
}
if (config_.use_mkldnn_) {
......@@ -435,12 +439,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
}
feeds_[idx] = op;
feed_names_[op->Output("Out")[0]] = idx;
idx2feeds_[idx] = op->Output("Out")[0];
} else if (op->Type() == "fetch") {
int idx = boost::get<int>(op->GetAttr("col"));
if (fetches_.size() <= static_cast<size_t>(idx)) {
fetches_.resize(idx + 1);
}
fetches_[idx] = op;
idx2fetches_[idx] = op->Input("X")[0];
}
}
}
......@@ -453,6 +459,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
var->GetMutable<framework::FeedFetchList>();
}
std::vector<std::string> AnalysisPredictor::GetInputNames() {
std::vector<std::string> input_names;
for (auto &item : idx2feeds_) {
input_names.push_back(item.second);
}
return input_names;
}
std::vector<std::string> AnalysisPredictor::GetOutputNames() {
std::vector<std::string> output_names;
for (auto &item : idx2fetches_) {
output_names.push_back(item.second);
}
return output_names;
}
std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
const std::string &name) {
PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
......@@ -460,6 +482,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = true;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
......@@ -470,6 +499,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
res->input_or_output_ = false;
res->SetName(name);
if (platform::is_cpu_place(place_)) {
res->SetPlace(PaddlePlace::kCPU);
} else {
auto gpu_place = boost::get<platform::CUDAPlace>(place_);
res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
}
return res;
}
......
......@@ -15,12 +15,14 @@
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_TESTING
......@@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor {
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
std::unique_ptr<ZeroCopyTensor> GetInputTensor(
const std::string &name) override;
std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
......@@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor {
std::shared_ptr<framework::ProgramDesc> inference_program_;
std::vector<framework::OpDesc *> feeds_;
std::map<std::string, size_t> feed_names_;
// Sorted according to the idx.
std::map<size_t, std::string> idx2feeds_;
std::vector<framework::OpDesc *> fetches_;
std::map<size_t, std::string> idx2fetches_;
// Memory buffer for feed inputs. The temporary LoDTensor will cause serious
// concurrency problems, wrong results and memory leaks, so we cache them.
std::vector<framework::LoDTensor> feed_tensors_;
......
......@@ -131,6 +131,9 @@ NativePaddlePredictor::~NativePaddlePredictor() {
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
}
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
return res;
}
template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
EAGER_GET_TENSOR;
PADDLE_ENFORCE_GE(
tensor->numel(), 0,
"You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
"function before copy data from cpu.");
size_t ele_size = tensor->numel() * sizeof(T);
if (place_ == PaddlePlace::kCPU) {
auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
std::memcpy(static_cast<void *>(t_data), data, ele_size);
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
platform::CUDAPlace gpu_place(device_);
auto *t_data = tensor->mutable_data<T>(gpu_place);
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
data, ele_size, dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
}
}
template <typename T>
void ZeroCopyTensor::copy_to_cpu(T *data) {
EAGER_GET_TENSOR;
auto ele_num = tensor->numel();
auto *t_data = tensor->data<T>();
auto t_place = tensor->place();
if (platform::is_cpu_place(t_place)) {
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
t_data, ele_num * sizeof(T), dev_ctx->stream());
#else
PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
}
}
template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
int *size) const;
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
......@@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const {
return tensor;
}
std::vector<int64_t> ZeroCopyTensor::shape() const {
std::vector<int> ZeroCopyTensor::shape() const {
EAGER_GET_TENSOR;
PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
return framework::vectorize(tensor->dims());
return framework::vectorize2int(tensor->dims());
}
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
......
......@@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const { return nullptr; }
std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; }
std::vector<int> ZeroCopyTensor::shape() const { return {}; }
void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
......
......@@ -50,6 +50,11 @@ class Timer {
}
};
static int GetUniqueId() {
static int id = 0;
return id++;
}
static void split(const std::string &str, char sep,
std::vector<std::string> *pieces) {
pieces->clear();
......
......@@ -135,7 +135,8 @@ struct AnalysisConfig {
*/
void EnableTensorRtEngine(int workspace_size = 1 << 20,
int max_batch_size = 1, int min_subgraph_size = 3,
Precision precision = Precision::kFloat32);
Precision precision = Precision::kFloat32,
bool use_static = true);
/** A boolean state telling whether the TensorRT engine is used.
*/
bool tensorrt_engine_enabled() const { return use_tensorrt_; }
......@@ -233,6 +234,7 @@ struct AnalysisConfig {
// subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3};
Precision tensorrt_precision_mode_;
bool trt_use_static_engine_;
// memory reuse related.
bool enable_memory_optim_{false};
......
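The new use_static parameter of EnableTensorRtEngine (stored in trt_use_static_engine_ above) is what allows the subgraph pass to reuse a serialized engine cached under model_opt_cache_dir. A hedged configuration sketch; SetModel and EnableUseGpu are assumed from the existing AnalysisConfig API, and only EnableTensorRtEngine's new argument comes from this diff:

#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::AnalysisConfig MakeTrtConfigSketch(const std::string& model_dir) {
  paddle::AnalysisConfig config;
  config.SetModel(model_dir);   // assumed existing setter
  config.EnableUseGpu(100, 0);  // assumed existing setter: 100 MB pool, GPU 0
  // workspace_size, max_batch_size, min_subgraph_size, precision, use_static
  config.EnableTensorRtEngine(1 << 20, 1, 3,
                              paddle::AnalysisConfig::Precision::kFloat32,
                              /*use_static=*/true);
  return config;
}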
......@@ -160,11 +160,21 @@ class ZeroCopyTensor {
template <typename T>
T* data(PaddlePlace* place, int* size) const;
std::vector<int64_t> shape() const;
template <typename T>
void copy_from_cpu(const T* data);
template <typename T>
void copy_to_cpu(T* data);
std::vector<int> shape() const;
void SetLoD(const std::vector<std::vector<size_t>>& x);
std::vector<std::vector<size_t>> lod() const;
const std::string& name() const { return name_; }
void SetPlace(PaddlePlace place, int device = -1) {
place_ = place;
device_ = device;
}
protected:
explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
......@@ -179,6 +189,8 @@ class ZeroCopyTensor {
// The corresponding tensor pointer inside Paddle workspace is cached for
// performance.
mutable void* tensor_{nullptr};
PaddlePlace place_;
int device_;
};
/** A simple Inference API for Paddle.
......@@ -200,6 +212,14 @@ class PaddlePredictor {
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
/** \brief Get input names of the model
*/
virtual std::vector<std::string> GetInputNames() { return {}; }
/** \brief Get output names of the model
*/
virtual std::vector<std::string> GetOutputNames() { return {}; }
/** \brief Get a mutable tensor directly.
*
* NOTE Only works in AnalysisPredictor.
......
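Together, the additions above (GetInputNames/GetOutputNames on the predictor, copy_from_cpu/copy_to_cpu and the int-based shape() on ZeroCopyTensor) support a zero-copy inference loop roughly like the following. This is a hedged sketch: CreatePaddlePredictor, Reshape and ZeroCopyRun are assumed from the existing API, the input shape is illustrative, and additional config switches may be needed to run without feed/fetch ops:

#include <functional>
#include <numeric>
#include <vector>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

void ZeroCopyRunSketch(const paddle::AnalysisConfig& config,
                       const std::vector<float>& input_data) {
  auto predictor = paddle::CreatePaddlePredictor(config);  // assumed factory

  auto input_names = predictor->GetInputNames();
  auto input = predictor->GetInputTensor(input_names[0]);
  input->Reshape({1, 3, 224, 224});         // illustrative NCHW shape
  input->copy_from_cpu(input_data.data());  // host -> predictor place

  predictor->ZeroCopyRun();                 // assumed existing run method

  auto output_names = predictor->GetOutputNames();
  auto output = predictor->GetOutputTensor(output_names[0]);
  std::vector<int> shape = output->shape();  // now returns std::vector<int>
  int numel = std::accumulate(shape.begin(), shape.end(), 1,
                              std::multiplies<int>());
  std::vector<float> result(numel);
  output->copy_to_cpu(result.data());        // predictor place -> host
}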
......@@ -49,11 +49,6 @@ class EngineBase {
// Execute the engine, that will run the inference network.
virtual void Execute(int batch_size) = 0;
// Return the IO buffer that is allocated in the engine. One can read/write directly
// on the buffer. If the buffer's buffer is nullptr, one can also allocate
// memory and maintain it outside the engine.
virtual Buffer& buffer(const std::string& name) = 0;
virtual ~EngineBase() {}
}; // class EngineBase
......
......@@ -18,21 +18,6 @@ namespace paddle {
namespace inference {
namespace tensorrt {
bool to_skip_merging_optimize(TensorRTEngine* engine,
const std::vector<int>& filters,
const std::vector<int>& strides,
const std::vector<int>& paddings,
std::string input_name) {
if (engine->itensor_quote_num[input_name] > 0) {
return true;
}
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
engine->itensor_quote_num[input_name] += 1;
return false;
}
template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode,
......@@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
weight_tensor->Resize(Y_t->dims());
TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);
PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
const int n_output = weight_tensor->dims()[0];
......@@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
layer->getOutput(0)->setName(output_name.c_str());
engine->SetITensor(output_name, layer->getOutput(0));
if (test_mode ||
to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
op_desc.Input("Input").front())) {
if (test_mode) {
engine->DeclareOutput(output_name);
}
}
......
......@@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter {
if (CheckDims(dims_x, dims_y)) {
// The two input tensors should have the same dims
VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
*const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
......@@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
"ElementWisePluginLayer";
plugin::ElementWisePlugin* plugin =
new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
plugin->AddInput(X);
plugin->AddInput(Y);
nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
......
......@@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter {
Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
Y_t->memory_size() / sizeof(float)};
static_cast<size_t>(Y_t->numel())};
TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
static_cast<void*>(tmp->data<float>()),
Y_t->memory_size() / sizeof(float));
static_cast<size_t>(Y_t->numel()));
weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
tmp_weight.dims = weight.dims;
......
......@@ -16,9 +16,12 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
......@@ -26,6 +29,37 @@ namespace paddle {
namespace inference {
namespace tensorrt {
using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;
namespace { // NOLINT
TRT_DT FluidDataType2TRT(FluidDT type) {
switch (type) {
case FluidDT::VarType_Type_FP32:
return TRT_DT::kFLOAT;
case FluidDT::VarType_Type_INT32:
return TRT_DT::kINT32;
default:
return TRT_DT::kINT32;
}
PADDLE_THROW("unkown type");
return TRT_DT::kINT32;
}
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
PADDLE_ENFORCE_GT(shape.size(), 1UL,
"TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions");
PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
if (shape.size() == 4UL)
return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
return nvinfer1::DimsCHW(shape[1], 1, 1);
}
} // namespace // NOLINT
/*
* Convert Op from Fluid to TensorRT Engine.
*/
......@@ -110,6 +144,34 @@ class OpConverter {
}
}
// The scope here should be initialized with the parameter vars.
void ConvertBlockToTRTEngine(
framework::BlockDesc* block_desc, const framework::Scope& scope,
const std::vector<std::string>& inputs,
const std::unordered_set<std::string>& parameters,
const std::vector<std::string>& outputs, TensorRTEngine* engine) {
engine->InitNetwork();
for (auto& input : inputs) {
if (parameters.count(input)) continue;
auto* var = block_desc->FindVar(input);
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
auto var_shape = var->GetShape();
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(var_shape));
}
framework::proto::BlockDesc* block_proto = block_desc->Proto();
ConvertBlock(*block_proto, parameters, scope, engine);
for (auto& output : outputs) {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
}
void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
virtual ~OpConverter() {}
......
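Vec2TRT_Dims above drops the batch dimension when declaring engine inputs: a 4-D fluid shape [N, C, H, W] becomes DimsCHW(C, H, W) and a 2-D [N, C] becomes DimsCHW(C, 1, 1), because TensorRT dimensions exclude the batch. A plain-C++ illustration of just that mapping (no TensorRT types involved):

#include <array>
#include <cstdint>
#include <vector>

// Stand-in for nvinfer1::DimsCHW: only the three non-batch extents.
std::array<int64_t, 3> ToCHWSketch(const std::vector<int64_t>& shape) {
  // Mirrors Vec2TRT_Dims: only 2-D [N, C] and 4-D [N, C, H, W] are accepted.
  if (shape.size() == 4) return {shape[1], shape[2], shape[3]};
  return {shape[1], 1, 1};  // 2-D case: channels only
}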
......@@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter {
PADDLE_ENFORCE_NOT_NULL(alpha_var);
auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
platform::CUDAPlace place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
platform::CPUPlace cpu_place;
std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
new framework::LoDTensor());
alpha_tensor_device->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
alpha_tensor_temp->Resize(alpha_tensor->dims());
TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
// Transform alpha to TensorRTEngine::Weight
TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
static_cast<void*>(alpha_data),
alpha_tensor_device->numel());
plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
plugin::PReluPlugin* plugin =
new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
nvinfer1::IPluginLayer* layer =
engine_->AddPlugin(&input, input_num, plugin);
// Keep the alpha tensor alive to avoid releasing its memory.
engine_->weight_map[op_desc.Input("Alpha")[0]] =
std::move(alpha_tensor_device);
std::move(alpha_tensor_temp);
std::string layer_name = "prelu (Output: ";
auto output_name = op_desc.Output("Out")[0];
......
......@@ -19,7 +19,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -79,7 +81,8 @@ class TRTConvertValidation {
if_add_batch_(if_add_batch),
max_batch_size_(max_batch_size) {
PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
engine_.reset(
new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
engine_->InitNetwork();
}
......@@ -114,13 +117,12 @@ class TRTConvertValidation {
}
void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
platform::CUDADeviceContext ctx(place_);
auto* x = scope_.Var(name);
auto* x_tensor = x->GetMutable<framework::LoDTensor>();
x_tensor->Resize(framework::make_ddim(dim_vec));
RandomizeTensor(x_tensor, place, ctx);
RandomizeTensor(x_tensor, place_, ctx);
}
// Declare a variable in a fluid Scope.
void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
......@@ -146,19 +148,6 @@ class TRTConvertValidation {
// Declare outputs.
op_desc_.reset(new framework::OpDesc(desc, nullptr));
// Set Inputs.
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
auto* var = scope_.FindVar(input);
PADDLE_ENFORCE(var);
auto tensor = var->GetMutable<framework::LoDTensor>();
engine_->SetInputFromGPU(
input, static_cast<void*>(tensor->data<void>()),
sizeof(float) *
analysis::AccuDims(tensor->dims(), tensor->dims().size()));
}
}
// We use the set 'neglected_output' here, because some Ops like batch norm,
......@@ -168,43 +157,71 @@ class TRTConvertValidation {
std::unordered_set<std::string> neglected_output = {}) {
// Execute Fluid Op
PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
platform::CUDAPlace place;
platform::CUDADeviceContext ctx(place);
op_->Run(scope_, place);
// Execute TRT.
engine_->Execute(batch_size);
cudaStreamSynchronize(engine_->stream());
platform::CUDADeviceContext ctx(place_);
op_->Run(scope_, place_);
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
const size_t output_space_size = 3000;
std::vector<std::string> input_output_names;
// Note: we need to filter out the parameters
for (const auto& input : op_desc_->InputArgumentNames()) {
if (parameters_.count(input)) continue;
input_output_names.push_back(input);
}
// Collect the fluid outputs.
std::vector<std::vector<float>> fluid_outs;
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
input_output_names.push_back(output);
std::vector<float> fluid_out;
std::vector<float> trt_out(output_space_size);
engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
cudaStreamSynchronize(engine_->stream());
auto* var = scope_.FindVar(output);
auto tensor = var->GetMutable<framework::LoDTensor>();
auto* tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &fluid_out);
fluid_outs.push_back(fluid_out);
}
// Bind input and output for TRT.
const int num_bindings = input_output_names.size();
std::vector<void*> buffers(num_bindings);
for (const std::string& name : input_output_names) {
auto* var = scope_.FindVar(name);
auto* tensor = var->GetMutable<framework::LoDTensor>();
const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
buffers[bind_index] =
static_cast<void*>(tensor->mutable_data<float>(place_));
}
// Execute TRT.
engine_->Execute(batch_size, &buffers, stream_);
size_t fluid_out_size = fluid_out.size();
ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
int index = 0;
for (const auto& output : op_desc_->OutputArgumentNames()) {
if (neglected_output.count(output)) continue;
std::vector<float> trt_out;
auto* var = scope_.FindVar(output);
auto* tensor = var->GetMutable<framework::LoDTensor>();
framework::TensorToVector(*tensor, ctx, &trt_out);
size_t fluid_out_size = fluid_outs[index].size();
if (if_add_batch_ == true) {
fluid_out_size =
batch_size * (framework::product(tensor->dims()) / max_batch_size_);
}
// Compare the two outputs
ASSERT_FALSE(fluid_out.empty());
for (size_t i = 0; i < fluid_out_size; i++) {
// Loosen the threshold for CI on different machine models.
EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
}
index += 1;
}
}
framework::Scope& scope() { return scope_; }
private:
platform::CUDAPlace place_;
std::unique_ptr<TensorRTEngine> engine_;
cudaStream_t stream_;
std::unique_ptr<framework::OperatorBase> op_;
......
......@@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
PADDLE_ENFORCE(false, "not implemented");
}
void TensorRTEngine::Execute(int batch_size) {
void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
cudaStream_t stream) {
freshDeviceId();
batch_size_ = batch_size;
std::vector<void *> buffers;
for (auto &buf : buffers_) {
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
PADDLE_ENFORCE_GT(buf.max_size, 0);
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buffers.push_back(buf.buffer);
}
infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
cudaStreamSynchronize(stream_);
infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr);
cudaStreamSynchronize(stream);
SetRuntimeBatch(batch_size);
}
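With this change the engine no longer owns per-binding device buffers or a CUDA stream; callers gather the binding pointers and supply their own stream. A minimal calling sketch (not part of this commit), assuming a frozen engine and two hypothetical bindings "x" and "y" whose device memory is already allocated:
std::vector<void *> buffers(2);
buffers[engine->engine()->getBindingIndex("x")] = x_gpu_ptr;  // input binding (assumed pointer)
buffers[engine->engine()->getBindingIndex("y")] = y_gpu_ptr;  // output binding (assumed pointer)
engine->Execute(batch_size, &buffers, stream);  // stream is created and owned by the caller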
TensorRTEngine::~TensorRTEngine() {
cudaStreamSynchronize(stream_);
// clean buffer
for (auto &buf : buffers_) {
if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
buf.buffer = nullptr;
buf.max_size = 0;
}
}
}
void TensorRTEngine::FreezeNetwork() {
VLOG(3) << "TRT to freeze network";
freshDeviceId();
VLOG(3) << "TRT to freeze network";
PADDLE_ENFORCE(infer_builder_ != nullptr,
"Call InitNetwork first to initialize network.");
PADDLE_ENFORCE(infer_network_ != nullptr,
......@@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() {
PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
infer_context_.reset(infer_engine_->createExecutionContext());
// allocate GPU buffers.
buffers_.resize(buffer_sizes_.size());
for (auto &item : buffer_sizes_) {
// The output buffers are not set in the network building phase; they need to
// be inferred from the TensorRT network.
if (item.second == 0) {
auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
auto dims = infer_engine_->getBindingDimensions(slot_offset);
item.second = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
PADDLE_ENFORCE_GT(item.second, 0);
}
auto &buf = buffer(item.first);
buf.max_size = item.second * max_batch_;
CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
buf.size = 0;
PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 1G
buf.device = DeviceType::GPU;
}
}
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
......@@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
buffer_sizes_[name] = 0;
}
void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
return buffer(name).buffer;
}
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
size_t max_size) {
// determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToDevice, stream_),
0);
}
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
size_t max_size) {
// determine data size
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
PADDLE_ENFORCE_GE(max_size, dst_size);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToHost, stream_));
}
Buffer &TensorRTEngine::buffer(const std::string &name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
name);
auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
return buffers_[slot_offset];
}
void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
size_t size) {
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_NOT_NULL(data);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buf.size = size;
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyHostToDevice, stream_));
}
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
size_t size) {
auto &buf = buffer(name);
buf.size = size;
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyDeviceToDevice, stream_));
}
void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr);
......@@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
void TensorRTEngine::freshDeviceId() {
int count;
cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_, count);
cudaSetDevice(device_);
}
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
nvinfer1::ITensor *const *inputs, int num_inputs,
plugin::PluginTensorRT *plugin) {
......@@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
}
void TensorRTEngine::freshDeviceId() {
int count;
cudaGetDeviceCount(&count);
PADDLE_ENFORCE_LT(device_id_, count);
cudaSetDevice(device_id_);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
......@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h"
......@@ -37,7 +38,9 @@ class TRTInt8Calibrator;
* There are two alternative ways to use it: one is to build from a paddle
* protobuf model, the other is to manually construct the network.
*/
class TensorRTEngine : public EngineBase {
class TensorRTEngine {
using DescType = ::paddle::framework::proto::BlockDesc;
public:
// Weight is model parameter.
class Weight {
......@@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase {
nvinfer1::Weights w_;
};
TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
int device = 0, bool enable_int8 = false,
TRTInt8Calibrator* calibrator = nullptr,
TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false,
TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
nvinfer1::ILogger& logger = NaiveLogger::Global())
: max_batch_(max_batch),
max_workspace_(max_workspace),
stream_(stream),
device_(device),
enable_int8_(enable_int8),
calibrator_(calibrator),
device_id_(device_id),
logger_(logger) {}
virtual ~TensorRTEngine();
~TensorRTEngine() {}
// TODO(Superjomn) implement it later when graph segmentation is supported.
void Build(const DescType& paddle_model) override;
void Build(const DescType& paddle_model);
void Execute(int batch_size) override;
void Execute(int batch_size, std::vector<void*>* buffers,
cudaStream_t stream);
// Initialize the inference network, so that TensorRT layers can add to this
// network.
void InitNetwork() {
freshDeviceId();
infer_builder_.reset(createInferBuilder(&logger_));
infer_network_.reset(infer_builder_->createNetwork());
}
......@@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase {
// Check if the ITensor has been declared
bool HasDeclared(const std::string& name);
// GPU memory address for an ITensor with a specific name. One can operate on
// this memory directly for acceleration, for example, write the converted
// data directly into the buffer to save the data-copy overhead.
// NOTE this should be used after calling `FreezeNetwork`.
Buffer& buffer(const std::string& name) override;
cudaStream_t stream() { return stream_; }
// Fill an input from CPU memory with name and size.
void SetInputFromCPU(const std::string& name, const void* data, size_t size);
// TODO(Superjomn) is this method necessary given that buffer(xxx) can be
// accessed directly. Fill an input from GPU memory with name and size.
void SetInputFromGPU(const std::string& name, const void* data, size_t size);
// Get an output called name; the output of tensorrt is on the GPU, so this
// method returns the output's GPU memory address without a copy.
void* GetOutputInGPU(const std::string& name);
// Copy data into dst inside the GPU device.
void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
// LOW EFFICIENCY! Get the output to the CPU; this will trigger a memory copy from GPU
// to CPU.
void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
// Fill an ITensor into map itensor_map_.
void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
// Get an ITensor called name.
nvinfer1::ITensor* GetITensor(const std::string& name);
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
nvinfer1::IHostMemory* Serialize() {
PADDLE_ENFORCE(infer_engine_ != nullptr,
"You should build engine first and then serialize");
ihost_memory_.reset(infer_engine_->serialize());
return ihost_memory_.get();
}
void Deserialize(const std::string& engine_serialized_data) {
freshDeviceId();
infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
infer_engine_.reset(runtime->deserializeCudaEngine(
engine_serialized_data.c_str(), engine_serialized_data.size(),
&inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
PADDLE_ENFORCE(infer_engine_ != nullptr,
"build cuda engine failed when deserialize engine info.!");
infer_context_.reset(infer_engine_->createExecutionContext());
}
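A hedged sketch of how Serialize and Deserialize pair up; everything except the engine API shown above is an assumption, and the constructor follows the new argument order (max_batch, max_workspace, enable_int8, calibrator, device_id):
nvinfer1::IHostMemory *plan = src_engine.Serialize();  // src_engine must be built and frozen first
std::string blob(static_cast<const char *>(plan->data()), plan->size());
TensorRTEngine dst_engine(max_batch, max_workspace, false, nullptr, 0);
dst_engine.Deserialize(blob);  // re-creates registered plugins via PluginFactoryTensorRT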
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
int GetDevice() { return device_; }
int GetDeviceId() { return device_id_; }
nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
int num_inputs, plugin::PluginTensorRT*);
......@@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map;
// TODO(NHZLX)
// In the normal case, paddle-trt has a bug when running googlenet.
// When there are more than two 1 * 1 convolutions with the same input,
// paddle-tensorrt performs a merging optimization that fuses those convolutions
// into one, which then triggers the bug. So we should use a strategy to avoid
// this optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
itensor_quote_num;
private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
// freshDeviceId().
void freshDeviceId();
// the max batch size
int max_batch_;
// the runtime batch size
......@@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase {
// the max memory size the engine uses
int max_workspace_;
cudaStream_t stream_;
// The specific GPU id that the TensorRTEngine bounded to.
int device_;
bool enable_int8_;
TRTInt8Calibrator* calibrator_;
// batch size of the current data; will be updated on each Execution.
int batch_size_{-1};
int device_id_;
nvinfer1::ILogger& logger_;
std::vector<Buffer> buffers_;
// max data size for the buffers.
std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
......@@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase {
infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
infer_ptr<nvinfer1::IExecutionContext> infer_context_;
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
// freshDeviceId().
void freshDeviceId();
infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
}; // class TensorRTEngine
// Add a layer__ into engine__ with args ARGS.
// For example:
// TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
//
// Reference
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
......
......@@ -17,6 +17,9 @@
#include <NvInfer.h>
#include <cuda.h>
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/dynload/tensorrt.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger {
~NaiveLogger() override {}
};
class NaiveProfiler : public nvinfer1::IProfiler {
public:
typedef std::pair<std::string, float> Record;
std::vector<Record> mProfile;
virtual void reportLayerTime(const char* layerName, float ms) {
auto record =
std::find_if(mProfile.begin(), mProfile.end(),
[&](const Record& r) { return r.first == layerName; });
if (record == mProfile.end())
mProfile.push_back(std::make_pair(layerName, ms));
else
record->second += ms;
}
void printLayerTimes() {
float totalTime = 0;
for (size_t i = 0; i < mProfile.size(); i++) {
printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
mProfile[i].second);
totalTime += mProfile[i].second;
}
printf("Time over all layers: %4.3f\n", totalTime);
}
};
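NaiveProfiler is only a debugging aid and is not wired up anywhere in this diff; a rough usage sketch, assuming access to the engine's nvinfer1::IExecutionContext:
NaiveProfiler profiler;
infer_context->setProfiler(&profiler);  // standard IExecutionContext hook
// ... run enqueue() a few times ...
profiler.printLayerTimes();  // prints per-layer milliseconds and the total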
} // namespace tensorrt
} // namespace inference
} // namespace paddle
nv_library(tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
prelu_op_plugin.cu trt_plugin_factory.cc
avg_pool_op_plugin.cu
DEPS enforce tensorrt_engine prelu)
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace paddle {
......@@ -20,6 +21,12 @@ namespace inference {
namespace tensorrt {
namespace plugin {
AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer,
size_t length) {
return new AvgPoolPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize);
nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
int index, const nvinfer1::Dims* inputDims, int nbInputs) {
assert(nbInputs == 1);
......
......@@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT {
protected:
size_t getSerializationSize() override {
return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
SerializedSize(strides_) + SerializedSize(paddings_) +
SerializedSize(input_shape_) + getBaseSerializationSize();
return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) +
SerializedSize(ksize_) + SerializedSize(strides_) +
SerializedSize(paddings_) + SerializedSize(input_shape_) +
SerializedSize(output_shape_) + getBaseSerializationSize();
}
// TRT will call this function when it needs to serialize the configuration
// of the plugin.
// It should not be called by users.
void serialize(void *buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, ceil_mode_);
SerializeValue(&buffer, ksize_);
SerializeValue(&buffer, strides_);
SerializeValue(&buffer, paddings_);
SerializeValue(&buffer, input_shape_);
SerializeValue(&buffer, output_shape_);
}
public:
AvgPoolPlugin() {}
AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
std::vector<int> strides, std::vector<int> paddings,
std::vector<int> input_shape)
......@@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT {
DeserializeValue(&serialData, &serialLength, &strides_);
DeserializeValue(&serialData, &serialLength, &paddings_);
DeserializeValue(&serialData, &serialLength, &input_shape_);
DeserializeValue(&serialData, &serialLength, &output_shape_);
}
AvgPoolPlugin *clone() const override {
......@@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT {
input_shape_);
}
const char *getPluginType() const override { return "avg_pool"; }
const char *getPluginType() const override { return "avg_pool_plugin"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override;
......
......@@ -14,12 +14,19 @@ limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer,
size_t length) {
return new ElementWisePlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);
namespace details {
template <typename T>
......@@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
const float* y = reinterpret_cast<const float*>(inputs[1]);
float* out = reinterpret_cast<float*>(outputs[0]);
if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
if (type_ == "add") {
details::ElementWise(details::Add<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
} else if (type_ == "mul") {
details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
prev_size_, midd_size_, post_size_, stream);
} else {
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
......@@ -24,9 +25,8 @@ namespace plugin {
class ElementWisePlugin : public PluginTensorRT {
public:
ElementWisePlugin(nvinfer1::ElementWiseOperation type,
nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y,
int axis)
ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x,
nvinfer1::Dims const &dims_y, int axis)
: type_(type),
dims_x_(dims_x),
dims_y_(dims_y),
......@@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT {
ElementWisePlugin(void const *serial_data, size_t serial_length) {
deserializeBase(serial_data, serial_length);
const char *elementwise_type;
DeserializeValue(&serial_data, &serial_length, &elementwise_type);
type_ = std::string(elementwise_type);
DeserializeValue(&serial_data, &serial_length, &axis_);
DeserializeValue(&serial_data, &serial_length, &dims_x_);
DeserializeValue(&serial_data, &serial_length, &dims_y_);
......@@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT {
return nullptr;
}
const char *getPluginType() const override { return "elementwise"; }
const char *getPluginType() const override { return "elementwise_plugin"; }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims,
......@@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT {
protected:
size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(dims_x_) +
SerializedSize(dims_y_) + getBaseSerializationSize();
return SerializedSize(getPluginType()) + SerializedSize(axis_) +
SerializedSize(dims_x_) + SerializedSize(dims_y_) +
getBaseSerializationSize();
}
void serialize(void *buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, type_.c_str());
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, dims_x_);
SerializeValue(&buffer, dims_y_);
}
nvinfer1::ElementWiseOperation type_;
std::string type_;
nvinfer1::Dims dims_x_;
nvinfer1::Dims dims_y_;
int axis_;
......
......@@ -17,6 +17,7 @@
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle {
......@@ -24,6 +25,17 @@ namespace inference {
namespace tensorrt {
namespace plugin {
PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) {
return new PReluPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize);
int PReluPlugin::initialize() {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
cudaMemcpyHostToDevice);
// IPluginExt::initialize() should return 0 on success.
return 0;
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims,
int nbInputs) {
......@@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
// input dims is CHW.
const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
// const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
const float *alpha = p_gpu_weight_;
float *output = reinterpret_cast<float **>(outputs)[0];
std::vector<int> input_shape;
......
......@@ -14,7 +14,12 @@
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
......@@ -24,39 +29,51 @@ namespace tensorrt {
namespace plugin {
class PReluPlugin : public PluginTensorRT {
TensorRTEngine::Weight alpha_;
std::vector<float> weight_;
float *p_gpu_weight_;
std::string mode_;
protected:
size_t getSerializationSize() override {
// return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
return 0;
return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
SerializedSize(weight_) + SerializedSize(getPluginType());
}
// TRT will call this function when it needs to serialize the configuration
// of the plugin.
// It should not be called by users.
void serialize(void *buffer) override {
// serializeBase(buffer);
// SerializeValue(&buffer, alpha_);
// SerializeValue(&buffer, mode_);
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, weight_);
SerializeValue(&buffer, mode_.c_str());
}
public:
PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode)
: alpha_(alpha), mode_(mode) {}
PReluPlugin(const float *weight, const int weight_num,
std::string const &mode)
: mode_(mode) {
weight_.resize(weight_num);
std::copy(weight, weight + weight_num, weight_.data());
}
// It is used for tensorrt deserialization.
// It should not be called by users.
PReluPlugin(void const *serialData, size_t serialLength) {
// deserializeBase(serialData, serialLength);
// DeserializeValue(&serialData, &serialLength, &alpha_);
// DeserializeValue(&serialData, &serialLength, &mode_);
deserializeBase(serialData, serialLength);
DeserializeValue(&serialData, &serialLength, &weight_);
const char *prelu_mode;
DeserializeValue(&serialData, &serialLength, &prelu_mode);
mode_ = std::string(prelu_mode);
}
~PReluPlugin() { cudaFree(p_gpu_weight_); }
int initialize() override;
PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); }
PReluPlugin *clone() const override {
return new PReluPlugin(weight_.data(), weight_.size(), mode_);
}
const char *getPluginType() const override { return "prelu"; }
const char *getPluginType() const override { return "prelu_plugin"; }
int getNbOutputs() const override { return 1; }
nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
int nbInputDims) override;
......
......@@ -15,12 +15,18 @@
#include <cuda_fp16.h>
#include <algorithm>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
return new SplitPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);
// copied from operators::math::SplitFunctor
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
......
......@@ -15,6 +15,7 @@
#pragma once
#include <thrust/device_vector.h>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
......@@ -25,6 +26,7 @@ namespace plugin {
class SplitPlugin : public PluginTensorRT {
public:
SplitPlugin() {}
SplitPlugin(int axis, std::vector<int> const &output_lengths)
: axis_(axis), same_shape_(true), output_length_(output_lengths) {}
......@@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
return new SplitPlugin(axis_, output_length_);
}
const char *getPluginType() const override { return "split"; }
const char *getPluginType() const override { return "split_plugin"; }
int getNbOutputs() const override { return output_length_.size(); }
nvinfer1::Dims getOutputDimensions(int index,
const nvinfer1::Dims *input_dims,
......@@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT {
protected:
size_t getSerializationSize() override {
return SerializedSize(axis_) + SerializedSize(output_length_) +
getBaseSerializationSize();
return SerializedSize(getPluginType()) + SerializedSize(axis_) +
SerializedSize(output_length_) + getBaseSerializationSize();
}
void serialize(void *buffer) override {
SerializeValue(&buffer, getPluginType());
serializeBase(buffer);
SerializeValue(&buffer, axis_);
SerializeValue(&buffer, output_length_);
......
......@@ -17,9 +17,10 @@
#include <NvInfer.h>
#include <cstring>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -30,6 +31,13 @@ namespace inference {
namespace tensorrt {
namespace plugin {
class PluginTensorRT;
typedef std::function<PluginTensorRT*(const void*, size_t)>
PluginDeserializeFunc;
typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
class PluginTensorRT : public nvinfer1::IPluginExt {
public:
PluginTensorRT() {}
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
const void* serial_data,
size_t serial_length) {
const char* plugin_type;
DeserializeValue(&serial_data, &serial_length, &plugin_type);
PADDLE_ENFORCE(Has(plugin_type),
"trt plugin type %s does not exists, check it.", plugin_type);
auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
owned_plugins_.emplace_back(plugin);
return plugin;
}
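createPlugin relies on the serialization convention this commit introduces across all plugins: serialize() writes getPluginType() as the very first field, and only the remaining bytes reach the registered deserialize function. A rough sketch of the blob layout (illustrative only):
// [plugin_type c-string][fields written by serializeBase()][plugin-specific fields]
// createPlugin pops plugin_type, then calls plugin_registry_[plugin_type](rest, rest_len).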
bool PluginFactoryTensorRT::RegisterPlugin(
const std::string& op_name, PluginDeserializeFunc deserialize_func) {
if (Has(op_name)) return false;
auto ret = plugin_registry_.emplace(op_name, deserialize_func);
return ret.second;
}
void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <NvInfer.h>
#include <cstring>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {
class PluginFactoryTensorRT : public nvinfer1::IPluginFactory,
public DeleteHelper {
public:
// Deserialization method
PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
size_t serial_length) override;
bool RegisterPlugin(const std::string& op_name,
PluginDeserializeFunc deserialize_func);
bool Has(const std::string& op_name) {
return plugin_registry_.find(op_name) != plugin_registry_.end();
}
void DestroyPlugins();
protected:
std::unordered_map<std::string, PluginDeserializeFunc> plugin_registry_;
std::list<std::unique_ptr<PluginTensorRT>> owned_plugins_;
};
class TrtPluginRegistrar {
public:
TrtPluginRegistrar(const std::string& name,
PluginDeserializeFunc deserialize_func) {
inference::Singleton<PluginFactoryTensorRT>::Global().RegisterPlugin(
name, deserialize_func);
}
};
#define REGISTER_TRT_PLUGIN(name, deserialize_func) \
REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func)
#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \
static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \
trt_plugin_registrar##ctr __attribute__((unused)) = \
paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
name, deserialize_func)
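A minimal registration sketch for a hypothetical plugin; MyPlugin and "my_plugin" are assumptions, but the real plugins in this commit (e.g. "split_plugin") follow exactly this pattern:
MyPlugin *CreateMyPluginDeserialize(const void *buffer, size_t length) {
  return new MyPlugin(buffer, length);  // ctor must mirror what serialize() wrote after the type
}
REGISTER_TRT_PLUGIN("my_plugin", CreateMyPluginDeserialize);
// "my_plugin" must match MyPlugin::getPluginType(), which serialize() writes first.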
} // namespace plugin
} // namespace tensorrt
} // namespace inference
} // namespace paddle
......@@ -13,8 +13,8 @@
// limitations under the License.
#pragma once
#include <cstring>
#include <string>
#include <type_traits>
#include <vector>
#include "paddle/fluid/platform/enforce.h"
......@@ -24,6 +24,13 @@ namespace inference {
namespace tensorrt {
namespace plugin {
// Some trt base classes lack a virtual destructor.
// We use a helper class to fix this.
struct DeleteHelper {
protected:
virtual ~DeleteHelper() {}
};
template <typename T>
inline void SerializeValue(void** buffer, T const& value);
......
......@@ -17,6 +17,8 @@ limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -27,19 +29,34 @@ namespace tensorrt {
class TensorRTEngineTest : public ::testing::Test {
protected:
void SetUp() override {
ASSERT_EQ(0, cudaStreamCreate(&stream_));
engine_ = new TensorRTEngine(10, 1 << 10, stream_);
ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
engine_ = new TensorRTEngine(10, 1 << 10);
engine_->InitNetwork();
}
void TearDown() override {
delete engine_;
cudaStreamDestroy(stream_);
if (engine_) {
delete engine_;
engine_ = nullptr;
}
}
void PrepareInputOutput(const std::vector<float> &input,
std::vector<int> output_shape) {
TensorFromVector(input, *ctx_, &input_);
output_.Resize(framework::make_ddim(output_shape));
}
void GetOutput(std::vector<float> *output) {
TensorToVector(output_, *ctx_, output);
}
protected:
TensorRTEngine* engine_;
cudaStream_t stream_;
framework::Tensor input_;
framework::Tensor output_;
TensorRTEngine *engine_;
platform::CUDADeviceContext *ctx_;
};
TEST_F(TensorRTEngineTest, add_layer) {
......@@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) {
float raw_weight[size] = {2.}; // Weight in CPU memory.
float raw_bias[size] = {3.};
std::vector<void *> buffers(2);  // TRT bound inputs
LOG(INFO) << "create weights";
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 1, 1});
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr);
......@@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) {
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
// fill in real data
float x_v = 1234;
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
1 * sizeof(float));
std::vector<float> x_v = {1234};
std::vector<float> y_cpu;
PrepareInputOutput(x_v, {1});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
LOG(INFO) << "to execute";
engine_->Execute(1);
engine_->Execute(1, &buffers, ctx_->stream());
LOG(INFO) << "to get output";
float y_cpu;
engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
GetOutput(&y_cpu);
LOG(INFO) << "to checkout output";
ASSERT_EQ(y_cpu, x_v * 2 + 3);
ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
}
TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
......@@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
// instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
float raw_bias[2] = {1.3, 2.4};
std::vector<void *> buffers(2);  // TRT bound inputs
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::DimsCHW{1, 2, 1});
auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
weight.get(), bias.get());
PADDLE_ENFORCE(fc_layer != nullptr);
......@@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[2] = {1.0, 2.0};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
2 * sizeof(float));
engine_->Execute(1);
// fill in real data
std::vector<float> x_v = {1.0, 2.0};
std::vector<float> y_cpu;
PrepareInputOutput(x_v, {2});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
engine_->Execute(1, &buffers, ctx_->stream());
LOG(INFO) << "to get output";
float y_cpu[2] = {-1., -1.};
GetOutput(&y_cpu);
auto dims = engine_->GetITensor("y")->getDimensions();
ASSERT_EQ(dims.nbDims, 3);
ASSERT_EQ(dims.d[0], 2);
ASSERT_EQ(dims.d[1], 1);
engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
ASSERT_EQ(y_cpu[0], 4.5);
ASSERT_EQ(y_cpu[1], 14.5);
}
......@@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
// Weight in CPU memory.
float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
float raw_bias[1] = {0};
std::vector<void *> buffers(2);  // TRT bound inputs
TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3{1, 3, 3});
auto* conv_layer =
auto *conv_layer =
TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
weight.get(), bias.get());
PADDLE_ENFORCE(conv_layer != nullptr);
......@@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
18 * sizeof(float));
engine_->Execute(2);
// fill in real data
std::vector<float> x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
std::vector<float> y_cpu;
PrepareInputOutput(x_v, {18});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
engine_->Execute(2, &buffers, ctx_->stream());
LOG(INFO) << "to get output";
float* y_cpu = new float[18];
engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
GetOutput(&y_cpu);
ASSERT_EQ(y_cpu[0], 4.0);
ASSERT_EQ(y_cpu[1], 6.0);
}
TEST_F(TensorRTEngineTest, test_pool2d) {
// Weight in CPU memory.
auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
nvinfer1::Dims3{1, 2, 2});
std::vector<void *> buffers(2);  // TRT bound inputs
nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
auto* pool_layer =
TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast<nvinfer1::ITensor*>(x),
pool_t, nvinfer1::DimsHW{2, 2});
auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
nvinfer1::DimsHW{2, 2});
PADDLE_ENFORCE(pool_layer != nullptr);
pool_layer->setStride(nvinfer1::DimsHW{1, 1});
......@@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
engine_->FreezeNetwork();
ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
8 * sizeof(float));
engine_->Execute(2);
// fill in real data
std::vector<float> x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
std::vector<float> y_cpu;
PrepareInputOutput(x_v, {2});
auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data);
engine_->Execute(2, &buffers, ctx_->stream());
LOG(INFO) << "to get output";
float* y_cpu = new float[2];
engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
GetOutput(&y_cpu);
ASSERT_EQ(y_cpu[0], 2.0);
ASSERT_EQ(y_cpu[1], 5.0);
......
......@@ -366,15 +366,17 @@ TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
#define NEW_TENSOR(name__) \
auto name__##_tensor = predictor->GetInputTensor(#name__);
auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
for (int tid = 1; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(predictors.front()->Clone());
}
double total_time_of_threads{0};
std::vector<std::thread> threads;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([&, tid] {
// To ensure the threads bind correctly,
// please clone inside the thread pool.
auto predictor = base_predictor->Clone();
auto &predictor = predictors[tid];
NEW_TENSOR(data_lod_attention);
NEW_TENSOR(cell_init);
NEW_TENSOR(data);
......
......@@ -266,15 +266,17 @@ TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
SetConfig(&config);
config.SwitchUseFeedFetchOps(false);
auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
for (int tid = 1; tid < FLAGS_num_threads; tid++) {
predictors.emplace_back(predictors.front()->Clone());
}
double total_time_of_threads{0};
std::vector<std::thread> threads;
for (int tid = 0; tid < FLAGS_num_threads; tid++) {
threads.emplace_back([&, tid] {
// To ensure the threads bind correctly,
// please clone inside the thread pool.
auto predictor = base_predictor->Clone();
auto &predictor = predictors[tid];
std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
PrepareZeroCopyInputs(predictor, &inputs);
auto output_tensor = predictor->GetOutputTensor(out_var_name);
......
......@@ -17,8 +17,10 @@
#include <gtest/gtest.h>
#include <algorithm>
#include <memory>
#include <string>
#include <thread> // NOLINT
#include <unordered_map>
#include <vector>
#ifdef WITH_GPERFTOOLS
#include <gperftools/profiler.h>
......@@ -252,7 +254,11 @@ void TestMultiThreadPrediction(
int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat;
std::vector<std::thread> threads;
auto main_predictor = CreateTestPredictor(config, use_analysis);
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreateTestPredictor(config, use_analysis));
for (int tid = 1; tid < num_threads; tid++) {
predictors.emplace_back(predictors.front()->Clone());
}
size_t total_time{0};
for (int tid = 0; tid < num_threads; ++tid) {
......@@ -260,9 +266,7 @@ void TestMultiThreadPrediction(
// Each thread should have local inputs and outputs.
// The inputs of each thread are all the same.
std::vector<PaddleTensor> outputs_tid;
// To ensure the threads bind correctly,
// please clone inside the thread pool.
auto predictor = main_predictor->Clone();
auto &predictor = predictors[tid];
#ifdef PADDLE_WITH_MKLDNN
if (use_analysis) {
static_cast<AnalysisPredictor *>(predictor.get())
......
......@@ -54,7 +54,8 @@ void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
if (use_gpu) {
config->EnableUseGpu(100, 0);
if (use_tensorrt) {
config->EnableTensorRtEngine(1 << 10, batch_size);
config->EnableTensorRtEngine(1 << 10, batch_size, 3,
AnalysisConfig::Precision::kFloat32, false);
config->pass_builder()->DeletePass("conv_bn_fuse_pass");
config->pass_builder()->DeletePass("fc_fuse_pass");
config->pass_builder()->TurnOnDebug();
......
......@@ -26,20 +26,17 @@ Allocator::~Allocator() {}
bool Allocator::IsAllocThreadSafe() const { return false; }
AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
VLOG(2) << "Alloc allocation on " << typeid(*this).name();
auto ptr = AllocateImpl(size, attr);
ptr->RegisterAllocatorChain(this);
VLOG(2) << "Alloc success";
return AllocationPtr(ptr);
}
void Allocator::FreeImpl(Allocation* allocation) {
auto* allocator = allocation->TopAllocator();
Allocator* allocator = allocation->TopAllocator();
allocator->Free(allocation);
}
void Allocator::Free(Allocation* allocation) {
VLOG(2) << "Free allocation on " << typeid(*this).name();
allocation->PopAllocator();
FreeImpl(allocation);
}
......@@ -47,7 +44,7 @@ void Allocator::Free(Allocation* allocation) {
const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
void AllocationDeleter::operator()(Allocation* allocation) const {
auto* allocator = allocation->TopAllocator();
Allocator* allocator = allocation->TopAllocator();
allocator->Free(allocation);
}
......
......@@ -16,7 +16,7 @@
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/small_stack.h"
#include "paddle/fluid/framework/inlined_stack.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......@@ -82,7 +82,7 @@ class Allocation {
std::vector<Allocator*> GetAllocatorChain() const {
std::vector<Allocator*> allocators;
for (size_t i = 0; i < allocator_chain_.size(); ++i) {
allocators[i] = allocator_chain_[i];
allocators.push_back(allocator_chain_[i]);
}
return allocators;
}
......@@ -100,7 +100,7 @@ class Allocation {
void* ptr_;
size_t size_;
platform::Place place_;
framework::SmallStack<Allocator*, 8> allocator_chain_;
framework::InlinedStack<Allocator*, 8> allocator_chain_;
friend class Allocator;
friend class AllocationDeleter;
......
......@@ -36,6 +36,8 @@ DEFINE_bool(init_allocated_mem, false,
"that initializing the allocated memory with a small value "
"during unit testing.");
DECLARE_double(fraction_of_gpu_memory_to_use);
DECLARE_double(initial_gpu_memory_in_mb);
DECLARE_double(reallocate_gpu_memory_in_mb);
DECLARE_bool(benchmark);
namespace paddle {
......@@ -69,7 +71,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
platform::CpuMaxChunkSize());
});
return a;
......@@ -131,40 +134,53 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
}
#ifdef PADDLE_WITH_CUDA
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag;
static detail::BuddyAllocator **a_arr = nullptr;
static std::vector<int> devices;
std::call_once(init_flag, [gpu_id]() {
devices = platform::GetSelectedDevices();
int gpu_num = devices.size();
allocation::GPUMemMonitor.Initialize(devices.size());
class GPUBuddyAllocatorList {
public:
GPUBuddyAllocatorList()
: allocators_(platform::GetCUDADeviceCount()),
flags_(platform::GetCUDADeviceCount()) {
allocation::GPUMemMonitor.Initialize(allocators_.size());
}
a_arr = new BuddyAllocator *[gpu_num];
for (size_t i = 0; i < devices.size(); ++i) {
int dev_id = devices[i];
a_arr[i] = nullptr;
BuddyAllocator *Get(size_t dev_id) {
PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
std::call_once(flags_[dev_id], [this, dev_id] {
platform::SetDeviceId(dev_id);
a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' to change the fraction of GPU usage.\n\n";
}
});
size_t first_size = platform::GpuFirstAllocateChunkSize();
size_t re_size = platform::GpuReAllocateChunkSize();
allocators_[dev_id] =
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), first_size, re_size);
VLOG(2) << "\n\nNOTE: each GPU device use "
<< string::HumanReadableSize(first_size) << "(initial chunk) "
<< string::HumanReadableSize(re_size) << "(reallocate chunk) "
<< "% of GPU memory.\n"
<< "You can set GFlags environment variable '"
<< "FLAGS_fraction_of_gpu_memory_to_use"
<< "' or "
"'FLAGS_initial_gpu_memory_in_mb/"
"FLAGS_reallocate_gpu_memory_in_mb' to change the fraction "
"of GPU usage.\n\n";
VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use="
<< FLAGS_fraction_of_gpu_memory_to_use << ", "
<< "FLAGS_initial_gpu_memory_in_mb="
<< FLAGS_initial_gpu_memory_in_mb << ", "
<< "FLAGS_reallocate_gpu_memory_in_mb="
<< FLAGS_reallocate_gpu_memory_in_mb;
});
return allocators_[dev_id];
}
private:
std::vector<BuddyAllocator *> allocators_;
std::vector<std::once_flag> flags_;
};
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static GPUBuddyAllocatorList allocators;
platform::SetDeviceId(gpu_id);
auto pos = std::distance(devices.begin(),
std::find(devices.begin(), devices.end(), gpu_id));
return a_arr[pos];
return allocators.Get(gpu_id);
}
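With the list wrapper, each device's BuddyAllocator is created lazily on first use via std::call_once instead of eagerly for every selected device. A rough usage sketch using only the interfaces shown in this diff; the size and device id are arbitrary:
auto *allocator = GetGPUBuddyAllocator(/*gpu_id=*/0);  // first call builds the allocator for GPU 0
void *p = allocator->Alloc(1 << 20);                   // 1 MiB served from the initial chunk
allocator->Free(p);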
#endif
......@@ -183,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
#ifdef PADDLE_WITH_CUDA
auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
auto *ptr = buddy_allocator->Alloc(size);
if (ptr == nullptr) {
if (ptr == nullptr && size > 0) {
int cur_dev = platform::GetCurrentDeviceId();
platform::SetDeviceId(place.device);
size_t avail, total;
......@@ -234,6 +250,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::CUDAPinnedAllocator),
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize(),
platform::CUDAPinnedMaxChunkSize());
});
......
......@@ -14,16 +14,90 @@
#include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
#include <algorithm>
#include <cctype>
#include <fstream>
#include <limits>
#include <sstream>
#include <string>
#include "paddle/fluid/platform/lock_guard_ptr.h"
DEFINE_double(tolerant_times, 2,
"Tolerant memory size times of buffered_allocator");
DEFINE_string(division_plan_path, "", "Division plan file path");
namespace paddle {
namespace memory {
namespace allocation {
std::string TrimStringAndToLowerCase(const std::string &str) {
auto not_space = [](char ch) { return std::isspace(ch) == 0; };
auto first_idx = static_cast<size_t>(
std::find_if(str.begin(), str.end(), not_space) - str.begin());
auto last_idx = static_cast<size_t>(
std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin());
if (first_idx == str.size() || last_idx == str.size()) return "";
last_idx = str.size() - 1 - last_idx;
auto ret = str.substr(first_idx, last_idx - first_idx);
std::for_each(ret.begin(), ret.end(),
[](char &ch) { ch = std::tolower(ch); });
return ret;
}
static size_t ParseStringToBytes(const std::string &str) {
std::string ret = str;
if (ret.back() == 'b') {
ret.pop_back();
}
PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str);
size_t multiples = 1;
switch (ret.back()) {
case 'g':
multiples *= (static_cast<size_t>(1) << 30);
break;
case 'm':
multiples *= (static_cast<size_t>(1) << 20);
break;
case 'k':
multiples *= (static_cast<size_t>(1) << 10);
break;
default:
break;
}
if (multiples != 1) ret.pop_back();
ret = TrimStringAndToLowerCase(ret);
double ret_val = 0.0;
std::stringstream ss(ret);
PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str);
return static_cast<size_t>(ret_val * multiples);
}
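A few worked examples of what ParseStringToBytes accepts, assuming the line has already been trimmed and lower-cased (ReadDivisionPlanFromFile does this before calling it):
// "512"   -> 512 bytes (no suffix)
// "4k"    -> 4 * 2^10 = 4096 bytes
// "128mb" -> 128 * 2^20 bytes (trailing 'b' is stripped first, then 'm')
// "1.5g"  -> static_cast<size_t>(1.5 * 2^30) bytes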
static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
std::string ret("[");
for (auto sz : plan) {
ret += string::HumanReadableSize(sz);
ret += ", ";
}
return ret + "]";
}
static std::vector<size_t> ReadDivisionPlanFromFile(
const std::string &filepath) {
std::ifstream is(filepath.c_str());
PADDLE_ENFORCE(is.good(), "File does not exist");
std::string str;
std::vector<size_t> plan;
while (std::getline(is, str).good()) {
str = TrimStringAndToLowerCase(str);
if (str.empty()) break;
plan.push_back(ParseStringToBytes(str));
}
return plan;
}
static void CheckAndModifyMemoryDivisionPlan(
std::vector<size_t> *division_plan) {
// Check whether the division plan is strictly sorted
......@@ -50,10 +124,21 @@ static void CheckAndModifyMemoryDivisionPlan(
}
static std::vector<size_t> GetDefaultDivisionPlan() {
if (!FLAGS_division_plan_path.empty()) {
return ReadDivisionPlanFromFile(FLAGS_division_plan_path);
}
constexpr size_t kMaxLogSize = 30;
std::vector<size_t> plan;
for (size_t i = 12; i <= kMaxLogSize; ++i) {
plan.push_back(static_cast<size_t>(1) << i);
}
/*
for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
plan.push_back(static_cast<size_t>(1) << i);
}
*/
return plan;
}
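As a concrete illustration: with kMaxLogSize = 30 and FLAGS_division_plan_path left empty, the default plan is the 19 power-of-two sizes from 2^12 to 2^30:
// {4 KiB, 8 KiB, 16 KiB, ..., 512 MiB, 1 GiB}   // i = 12 .. 30 inclusive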
......@@ -78,27 +163,32 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator(
: underlying_allocator_(std::move(underlying_allocator)),
division_plan_(division_plan) {
CheckAndModifyMemoryDivisionPlan(&division_plan_);
allocations_.resize(division_plan_.size());
mtx_.resize(division_plan_.size());
allocations_.resize(division_plan_.size() - 1);
mtx_.resize(division_plan_.size() - 1);
if (underlying_allocator_->IsAllocThreadSafe()) {
for (auto &mtx : mtx_) {
mtx.reset(new std::mutex());
}
}
VLOG(1) << "Division plan is: " << GetDebugStringOfPlan(division_plan_);
VLOG(1) << "FLAGS_tolerant_times = " << FLAGS_tolerant_times;
}
void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
auto bin_index = FindDivisionPlanBinIndex(division_plan_, allocation->size());
{
if (bin_index < allocations_.size()) {
platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
allocations_[bin_index].emplace(allocation->size(),
AllocationPtr(allocation));
} else {
underlying_allocator_->Free(allocation);
}
}
void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
// bin_index is not used currently.
// Maybe we can design more flexible FreeCache strategy based on bin_index
size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
size_t accumulated_size = 0;
// FIXME(zjl): free the largest first when there is no extra
for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
......@@ -110,33 +200,53 @@ void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
underlying_allocator_->Free(it->second.release());
allocations_[i].erase(it--);
if (accumulated_size >= size) {
return;
return accumulated_size;
}
} while (!allocations_[i].empty());
}
return accumulated_size;
}
Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
auto upper_size = TolerantUpperSize(size);
for (; upper_size >= division_plan_[bin_index]; ++bin_index) {
// if (bin_index >= allocations_.size()) {
// VLOG(2) << "Allocate " << size << " from underlying directly";
//}
for (; bin_index < allocations_.size() &&
upper_size >= division_plan_[bin_index];
++bin_index) {
auto &allocation = allocations_[bin_index];
platform::LockGuardPtr<std::mutex> lock(mtx_[bin_index]);
auto it = allocation.lower_bound(size);
if (it != allocation.end() && it->second->size() < upper_size) {
if (it != allocation.end() && it->second->size() <= upper_size) {
size_t sz = it->second->size();
auto ret = std::move(it->second);
allocation.erase(it);
VLOG(3) << "Allocate " << sz << "(required " << size
<< ") from cache directly";
return ret.release();
}
}
try {
return underlying_allocator_->Allocate(size, attr).release();
} catch (BadAlloc &) {
VLOG(2) << "BadAlloc raises, try to free " << size << " caches";
FreeCache(size, bin_index);
return underlying_allocator_->Allocate(size, attr).release();
size_t retry_time = 1;
while (true) {
try {
auto ret = underlying_allocator_->Allocate(size, attr).release();
VLOG(2) << "Allocate " << size << " from underlying directly";
return ret;
} catch (BadAlloc &) {
VLOG(1) << retry_time << "-th BadAlloc raised, try to free " << size
<< " bytes of caches";
// size_t actual_free_size = FreeCache(size, bin_index);
size_t actual_free_size = FreeCache(-1UL, bin_index);
VLOG(1) << retry_time << "-th retry freed " << actual_free_size
<< " bytes of caches";
if (actual_free_size == 0) throw;
}
++retry_time;
}
}
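FindDivisionPlanBinIndex and TolerantUpperSize are defined outside this hunk. Since allocations_ holds division_plan_.size() - 1 multimaps, bin i presumably caches allocations whose size falls in [division_plan_[i], division_plan_[i + 1]), with requests at or above the last boundary bypassing the cache. A standalone sketch of that assumed mapping (not the patch's actual helper):
#include <algorithm>
#include <cstddef>
#include <vector>
// Sketch of the assumed bin lookup; the real FindDivisionPlanBinIndex may differ.
static size_t BinIndexSketch(const std::vector<size_t> &plan, size_t size) {
  // Index of the last boundary <= size. Sizes below plan.front() map to bin 0;
  // sizes >= plan.back() yield plan.size() - 1, which is >= allocations_.size()
  // and therefore falls through to the underlying allocator, matching the
  // bin_index < allocations_.size() checks above.
  auto it = std::upper_bound(plan.begin(), plan.end(), size);
  return it == plan.begin() ? 0 : static_cast<size_t>(it - plan.begin()) - 1;
}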
......
......@@ -41,7 +41,7 @@ class MultiBinBufferedAllocator : public Allocator {
void FreeImpl(Allocation* allocation) override;
private:
void FreeCache(size_t size, size_t bin_index);
size_t FreeCache(size_t size, size_t bin_index);
std::shared_ptr<Allocator> underlying_allocator_;
std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
......
......@@ -25,9 +25,11 @@ namespace detail {
BuddyAllocator::BuddyAllocator(
std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
size_t max_chunk_size)
size_t first_allocate_chunk_size, size_t reallocate_chunk_size)
: min_chunk_size_(min_chunk_size),
max_chunk_size_(max_chunk_size),
first_allocate_chunk_size_(first_allocate_chunk_size),
reallocate_chunk_size_(reallocate_chunk_size),
max_chunk_size_(first_allocate_chunk_size),
cache_(system_allocator->UseGpu()),
system_allocator_(std::move(system_allocator)) {}
......@@ -36,9 +38,10 @@ BuddyAllocator::~BuddyAllocator() {
"have actually been freed";
while (!pool_.empty()) {
auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
auto desc = cache_.load(block);
VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, desc.size, desc.index);
cache_.invalidate(block);
pool_.erase(pool_.begin());
}
......@@ -63,7 +66,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// if the allocation is huge, send directly to the system allocator
if (size > max_chunk_size_) {
VLOG(10) << "Allocate from system allocator.";
return SystemAlloc(size);
return SystemAlloc(size, false);
}
// query and allocate from the existing chunk
......@@ -72,9 +75,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
// refill the pool if failure
if (it == pool_.end()) {
it = RefillPool();
// if still failure, fail fatally
// if still failure, try to allocate from SystemAllocator
if (it == pool_.end()) {
return nullptr;
return SystemAlloc(size, false);
}
} else {
VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
......@@ -98,7 +101,7 @@ void BuddyAllocator::Free(void* p) {
VLOG(10) << "Free from address " << block;
if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) {
VLOG(10) << "Free directly from system allocator";
system_allocator_->Free(block, block->total_size(cache_),
block->index(cache_));
......@@ -168,9 +171,12 @@ void BuddyAllocator::Free(void* p) {
size_t BuddyAllocator::Used() { return total_used_; }
size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
size_t BuddyAllocator::GetMaxChunkSize() {
std::lock_guard<std::mutex> lock(mutex_);
return max_chunk_size_;
}
void* BuddyAllocator::SystemAlloc(size_t size) {
void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
size_t index = 0;
void* p = system_allocator_->Alloc(&index, size);
......@@ -178,25 +184,23 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
if (p == nullptr) return nullptr;
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
size, nullptr, nullptr);
static_cast<MemoryBlock*>(p)->init(
&cache_, is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK
: MemoryBlock::UNMANAGED_HUGE_CHUNK,
index, size, nullptr, nullptr);
return static_cast<MemoryBlock*>(p)->data();
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
#ifdef PADDLE_WITH_CUDA
if (system_allocator_->UseGpu()) {
if ((total_used_ + total_free_) == 0) {
// Compute the maximum allocation size for the first allocation.
max_chunk_size_ = platform::GpuMaxChunkSize();
}
if (total_used_ + total_free_ > 0) {
max_chunk_size_ = reallocate_chunk_size_;
}
#endif
// Allocate a new maximum sized block
size_t index = 0;
void* p = system_allocator_->Alloc(&index, max_chunk_size_);
size_t chunk_size = max_chunk_size_;
void* p = system_allocator_->Alloc(&index, chunk_size);
if (p == nullptr) return pool_.end();
......@@ -204,7 +208,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
<< " from system allocator";
static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
max_chunk_size_, nullptr, nullptr);
chunk_size, nullptr, nullptr);
// gpu fallback allocation
if (system_allocator_->UseGpu() &&
......@@ -212,10 +216,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
fallback_alloc_count_++;
}
total_free_ += max_chunk_size_;
total_free_ += chunk_size;
// dump the block into pool
return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first;
}
BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
......@@ -271,27 +275,24 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
void BuddyAllocator::CleanIdleFallBackAlloc() {
// If fallback allocation does not exist, return directly
if (!fallback_alloc_count_) return;
if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
// If no GPU fallback allocator, return
if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
auto desc = cache_.load(block);
if (desc.index == 0) {
return;
}
VLOG(10) << "Return block " << block << " to fallback allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, desc.size, block->index(cache_));
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
total_free_ -= desc.size;
fallback_alloc_count_--;
// If no fallback allocation exists, return directly
......@@ -315,19 +316,21 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
if (!shall_free_alloc()) return;
for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
// If free memory block less than max_chunk_size_, return directly
if (std::get<1>(*pool) < max_chunk_size_) return;
MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
auto desc = cache_.load(block);
if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) {
return;
}
VLOG(10) << "Return block " << block << " to base allocator.";
system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
system_allocator_->Free(block, desc.size, desc.index);
cache_.invalidate(block);
pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
total_free_ -= max_chunk_size_;
total_free_ -= desc.size;
if (!shall_free_alloc()) return;
}
......
......@@ -34,7 +34,8 @@ namespace detail {
class BuddyAllocator {
public:
BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
size_t min_chunk_size, size_t max_chunk_size);
size_t min_chunk_size, size_t first_allocate_chunk_size,
size_t reallocate_chunk_size);
~BuddyAllocator();
......@@ -57,7 +58,7 @@ class BuddyAllocator {
using PoolSet = std::set<IndexSizeAddress>;
/*! \brief Allocate fixed-size memory from system */
void* SystemAlloc(size_t size);
void* SystemAlloc(size_t size, bool is_managed = true);
/*! \brief If existing chunks are not suitable, refill pool */
PoolSet::iterator RefillPool();
......@@ -87,7 +88,11 @@ class BuddyAllocator {
size_t total_free_ = 0; // the total size of free memory
size_t min_chunk_size_; // the minimum size of each chunk
size_t max_chunk_size_; // the maximum size of each chunk
size_t first_allocate_chunk_size_;
size_t reallocate_chunk_size_;
size_t max_chunk_size_;
private:
/**
......
......@@ -27,10 +27,11 @@ class MetadataCache;
// MemoryBlock::Desc and the payload.
struct MemoryBlock {
enum Type {
FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied
HUGE_CHUNK, // memory is out of management
INVALID_CHUNK // memory is invalid
FREE_CHUNK, // memory is free and idle
ARENA_CHUNK, // memory is being occupied
MANAGED_HUGE_CHUNK,    // memory is huge and managed by the buddy allocator
UNMANAGED_HUGE_CHUNK,  // memory is huge and not managed by the buddy allocator
INVALID_CHUNK // memory is invalid
};
// init saves the MemoryBlock::Desc of the memory block in a MetadataCache.
......
......@@ -42,8 +42,8 @@ void OpTester::Init(const OpTesterConfig &config) {
// Initialize the OpDesc
if (op_desc_info.Has(config_.op_type)) {
type_ = config_.op_type;
op_desc_.SetType(config_.op_type);
CreateOpDesc();
CreateInputVarDesc();
CreateOutputVarDesc();
} else {
......@@ -131,6 +131,40 @@ std::vector<std::string> OpTester::GetOpProtoOutputNames() {
return output_names;
}
std::unordered_map<std::string, framework::proto::AttrType>
OpTester::GetOpProtoAttrNames() {
std::unordered_map<std::string, framework::proto::AttrType> attr_types;
const framework::proto::OpProto &proto =
framework::OpInfoMap::Instance().Get(type_).Proto();
const std::vector<std::string> skipped_attrs = {
framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(),
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()};
for (int i = 0; i != proto.attrs_size(); ++i) {
const auto &attr = proto.attrs(i);
if (!Has(skipped_attrs, attr.name())) {
VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type();
attr_types[attr.name()] = attr.type();
}
}
return attr_types;
}
framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
if (str == "int32") {
return framework::proto::VarType::INT32;
} else if (str == "int64") {
return framework::proto::VarType::INT64;
} else if (str == "fp32") {
return framework::proto::VarType::FP32;
} else if (str == "fp64") {
return framework::proto::VarType::FP64;
} else {
PADDLE_THROW("Unsupported dtype %s.", str.c_str());
}
}
void OpTester::CreateInputVarDesc() {
std::vector<std::string> input_names = GetOpProtoInputNames();
for (auto &name : input_names) {
......@@ -145,11 +179,11 @@ void OpTester::CreateInputVarDesc() {
// Need to support more types
var->SetType(framework::proto::VarType::LOD_TENSOR);
var->SetPersistable(false);
var->SetDataType(framework::proto::VarType::FP32);
var->SetDataType(TransToVarType(input->dtype));
var->SetShape(input->dims);
op_desc_.SetInput(name, {var_name});
input_lods_[var_name] = input->lod;
inputs_[var_name] = *input;
}
}
......@@ -167,6 +201,49 @@ void OpTester::CreateOutputVarDesc() {
}
}
void OpTester::CreateOpDesc() {
op_desc_.SetType(config_.op_type);
std::unordered_map<std::string, framework::proto::AttrType> attr_types =
GetOpProtoAttrNames();
for (auto item : config_.attrs) {
const std::string &name = item.first;
if (attr_types.find(name) == attr_types.end()) {
LOG(FATAL) << "Operator " << type_ << " do not have attr " << name;
}
const std::string &value_str = item.second;
const framework::proto::AttrType &type = attr_types[name];
switch (type) {
case framework::proto::AttrType::BOOLEAN:
break;
case framework::proto::AttrType::INT: {
int value = StringTo<int>(value_str);
op_desc_.SetAttr(name, {value});
} break;
case framework::proto::AttrType::FLOAT: {
float value = StringTo<float>(value_str);
op_desc_.SetAttr(name, {value});
} break;
case framework::proto::AttrType::STRING: {
op_desc_.SetAttr(name, {value_str});
} break;
case framework::proto::AttrType::BOOLEANS:
case framework::proto::AttrType::INTS:
case framework::proto::AttrType::FLOATS:
case framework::proto::AttrType::STRINGS:
LOG(FATAL) << "Not supported yet.";
break;
case framework::proto::AttrType::LONG: {
int64_t value = StringTo<int64_t>(value_str);
op_desc_.SetAttr(name, value);
} break;
case framework::proto::AttrType::LONGS:
default:
PADDLE_THROW("Unsupport attr type %d", type);
}
}
}
framework::VarDesc *OpTester::Var(const std::string &name) {
auto it = vars_.find(name);
if (it != vars_.end()) {
......@@ -179,24 +256,41 @@ framework::VarDesc *OpTester::Var(const std::string &name) {
template <typename T>
void OpTester::SetupTensor(framework::LoDTensor *tensor,
const std::vector<int64_t> &shape, T lower,
T upper) {
const std::vector<int64_t> &shape, T lower, T upper,
const std::string &initializer) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
if (platform::is_cpu_place(place_)) {
for (int i = 0; i < tensor->numel(); ++i) {
ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
framework::LoDTensor cpu_tensor;
T *cpu_ptr = nullptr;
if (!platform::is_cpu_place(place_)) {
cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
platform::CPUPlace());
} else {
framework::LoDTensor cpu_tensor;
T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
platform::CPUPlace());
cpu_ptr = ptr;
}
if (initializer == "random") {
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
} else if (initializer == "natural") {
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = lower + i;
}
} else if (initializer == "zeros") {
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = 0;
}
} else {
PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
}
if (!platform::is_cpu_place(place_)) {
TensorCopySync(cpu_tensor, place_, tensor);
}
}
......@@ -219,7 +313,7 @@ void OpTester::CreateVariables(framework::Scope *scope) {
}
}
for (auto &item : input_lods_) {
for (auto &item : inputs_) {
// Allocate memory for input tensor
auto &var_name = item.first;
VLOG(3) << "Allocate memory for tensor " << var_name;
......@@ -229,11 +323,23 @@ void OpTester::CreateVariables(framework::Scope *scope) {
auto *var = scope->Var(var_name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
static_cast<float>(1.0));
const auto &data_type = var_desc->GetDataType();
if (data_type == framework::proto::VarType::INT32) {
SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer);
} else if (data_type == framework::proto::VarType::INT64) {
SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer);
} else if (data_type == framework::proto::VarType::FP32) {
SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
static_cast<float>(1.0), item.second.initializer);
} else if (data_type == framework::proto::VarType::FP64) {
SetupTensor<double>(tensor, shape, static_cast<double>(0.0),
static_cast<double>(1.0), item.second.initializer);
} else {
PADDLE_THROW("Unsupported dtype %d.", data_type);
}
VLOG(3) << "Set lod for tensor " << var_name;
std::vector<std::vector<size_t>> &lod_vec = item.second;
std::vector<std::vector<size_t>> &lod_vec = item.second.lod;
framework::LoD lod;
for (size_t i = 0; i < lod_vec.size(); ++i) {
lod.push_back(lod_vec[i]);
......@@ -261,7 +367,16 @@ std::string OpTester::DebugString() {
ss << GenSpaces(count) << "type: LOD_TENSOR\n";
ss << GenSpaces(count++) << "lod_tensor {\n";
ss << GenSpaces(count++) << "tensor {\n";
ss << GenSpaces(count) << "data_type: FP32\n";
const auto &data_type = var->GetDataType();
if (data_type == framework::proto::VarType::INT32) {
ss << GenSpaces(count) << "data_type: INT32\n";
} else if (data_type == framework::proto::VarType::INT64) {
ss << GenSpaces(count) << "data_type: INT64\n";
} else if (data_type == framework::proto::VarType::FP32) {
ss << GenSpaces(count) << "data_type: FP32\n";
} else if (data_type == framework::proto::VarType::FP64) {
ss << GenSpaces(count) << "data_type: FP64\n";
}
std::vector<int64_t> shape = var->GetShape();
for (auto d : shape) {
ss << GenSpaces(count) << "dims: " << d << "\n";
......@@ -288,6 +403,63 @@ std::string OpTester::DebugString() {
ss << GenSpaces(--count) << "}\n";
}
ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
for (auto &name : op_desc_.AttrNames()) {
ss << GenSpaces(count++) << "attrs {\n";
const auto &attr_type = op_desc_.GetAttrType(name);
const auto &attr = op_desc_.GetAttr(name);
ss << GenSpaces(count) << "name: \"" << name << "\"\n";
switch (attr_type) {
case framework::proto::AttrType::BOOLEAN: {
ss << GenSpaces(count) << "type: BOOLEAN\n";
ss << GenSpaces(count) << "b: " << boost::get<bool>(attr) << "\n";
} break;
case framework::proto::AttrType::INT: {
ss << GenSpaces(count) << "type: INT\n";
ss << GenSpaces(count) << "i: " << boost::get<int>(attr) << "\n";
} break;
case framework::proto::AttrType::FLOAT: {
ss << GenSpaces(count) << "type: FLOAT\n";
ss << GenSpaces(count) << "f: " << boost::get<float>(attr) << "\n";
} break;
case framework::proto::AttrType::STRING: {
ss << GenSpaces(count) << "type: STRING\n";
ss << GenSpaces(count) << "s: \"" << boost::get<std::string>(attr)
<< "\"\n";
} break;
case framework::proto::AttrType::BOOLEANS: {
ss << GenSpaces(count) << "type: BOOLEANS\n";
ss << GenSpaces(count) << "bools: "
<< "\n";
} break;
case framework::proto::AttrType::INTS: {
ss << GenSpaces(count) << "type: INTS\n";
ss << GenSpaces(count) << "ints: "
<< "\n";
} break;
case framework::proto::AttrType::FLOATS: {
ss << GenSpaces(count) << "type: FLOATS\n";
ss << GenSpaces(count) << "floats: "
<< "\n";
} break;
case framework::proto::AttrType::STRINGS: {
ss << GenSpaces(count) << "type: STRINGS\n";
ss << GenSpaces(count) << "strings: "
<< "\n";
} break;
case framework::proto::AttrType::LONG: {
ss << GenSpaces(count) << "type: LONG\n";
ss << GenSpaces(count) << "l: " << boost::get<int64_t>(attr) << "\n";
} break;
case framework::proto::AttrType::LONGS: {
ss << GenSpaces(count) << "type: LONGS\n";
ss << GenSpaces(count) << "longs: "
<< "\n";
} break;
default:
PADDLE_THROW("Unsupport attr type %d", attr_type);
}
ss << GenSpaces(--count) << "}\n";
}
ss << GenSpaces(--count) << "}\n";
return ss.str();
}
......@@ -299,6 +471,7 @@ TEST(op_tester, base) {
FLAGS_op_config_list.c_str());
std::vector<OpTesterConfig> op_configs;
while (!fin.eof()) {
VLOG(4) << "Reading config " << op_configs.size() << "...";
OpTesterConfig config;
bool result = config.Init(fin);
if (result) {
......
......@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/op_desc.h"
......@@ -39,16 +41,21 @@ class OpTester {
private:
std::vector<std::string> GetOpProtoInputNames();
std::vector<std::string> GetOpProtoOutputNames();
std::unordered_map<std::string, framework::proto::AttrType>
GetOpProtoAttrNames();
framework::proto::VarType::Type TransToVarType(std::string str);
void CreateInputVarDesc();
void CreateOutputVarDesc();
void CreateOpDesc();
framework::VarDesc *Var(const std::string &name);
void CreateVariables(framework::Scope *scope);
template <typename T>
void SetupTensor(framework::LoDTensor *input,
const std::vector<int64_t> &shape, T lower, T upper);
const std::vector<int64_t> &shape, T lower, T upper,
const std::string &initializer);
void RunImpl();
......@@ -57,7 +64,7 @@ class OpTester {
std::string type_;
framework::OpDesc op_desc_;
std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
std::unordered_map<std::string, OpInputConfig> inputs_;
std::unique_ptr<framework::OperatorBase> op_;
platform::Place place_;
std::unique_ptr<framework::Scope> scope_;
......
......@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
#include <fstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......@@ -40,6 +39,62 @@ static void EraseEndSep(std::string* str,
}
}
OpInputConfig::OpInputConfig(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "name" || sep == "name:") {
is >> name;
EraseEndSep(&name);
} else if (sep == "dtype" || sep == "dtype:") {
ParseDType(is);
} else if (sep == "initializer" || sep == "initializer:") {
ParseInitializer(is);
} else if (sep == "dims" || sep == "dims:") {
ParseDims(is);
} else if (sep == "lod" || sep == "lod:") {
ParseLoD(is);
}
}
}
}
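An input section of the benchmark config file is a stream of "key: value" tokens enclosed by kStartSeparator and kEndSeparator; their literal values, the enclosing section keyword, and the exact dims/lod syntax handled by ParseDims and ParseLoD are all defined outside this hunk. The commented block below is therefore only a hypothetical illustration of the accepted keys:
// Hypothetical input block (separator tokens and value syntax are assumptions):
//
//   {
//     name: X
//     dtype: fp32          // int32/int, int64/long, fp32/float, fp64/double
//     initializer: random  // random, natural or zeros
//     dims: ...            // exact format handled by ParseDims (not shown here)
//     lod: ...             // exact format handled by ParseLoD
//   }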
void OpInputConfig::ParseDType(std::istream& is) {
std::string dtype_str;
is >> dtype_str;
EraseEndSep(&dtype_str);
if (dtype_str == "int32" || dtype_str == "int") {
dtype = "int32";
} else if (dtype_str == "int64" || dtype_str == "long") {
dtype = "int64";
} else if (dtype_str == "fp32" || dtype_str == "float") {
dtype = "fp32";
} else if (dtype_str == "fp64" || dtype_str == "double") {
dtype = "fp64";
} else {
PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str());
}
VLOG(4) << "dtype of input " << name << " is: " << dtype;
}
void OpInputConfig::ParseInitializer(std::istream& is) {
std::string initializer_str;
is >> initializer_str;
EraseEndSep(&initializer_str);
const std::vector<std::string> supported_initializers = {"random", "natural",
"zeros"};
if (!Has(supported_initializers, initializer_str)) {
PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
}
initializer = initializer_str;
VLOG(4) << "initializer of input " << name << " is: " << initializer;
}
void OpInputConfig::ParseDims(std::istream& is) {
std::string dims_str;
is >> dims_str;
......@@ -84,7 +139,7 @@ void OpInputConfig::ParseLoD(std::istream& is) {
number += lod_str[i];
++i;
}
level.push_back(atoi(number.c_str()));
level.push_back(StringTo<size_t>(number));
}
lod.push_back(level);
} else if (lod_str[i] == '}') {
......@@ -93,24 +148,6 @@ void OpInputConfig::ParseLoD(std::istream& is) {
}
}
OpInputConfig::OpInputConfig(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "name" || sep == "name:") {
is >> name;
EraseEndSep(&name);
} else if (sep == "dims" || sep == "dims:") {
ParseDims(is);
} else if (sep == "lod" || sep == "lod:") {
ParseLoD(is);
}
}
}
}
OpTesterConfig::OpTesterConfig(const std::string& filename) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
......@@ -167,6 +204,7 @@ bool OpTesterConfig::ParseAttrs(std::istream& is) {
is >> value;
EraseEndSep(&key, ":");
EraseEndSep(&value);
VLOG(4) << "attrs: " << key << ", " << value;
attrs[key] = value;
}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <istream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
......@@ -27,10 +28,14 @@ struct OpInputConfig {
OpInputConfig() {}
explicit OpInputConfig(std::istream& is);
void ParseDType(std::istream& is);
void ParseInitializer(std::istream& is);
void ParseDims(std::istream& is);
void ParseLoD(std::istream& is);
std::string name;
std::string dtype{"fp32"}; // int32/int, int64/long, fp32/float, fp64/double
std::string initializer{"random"};  // random, natural, zeros
std::vector<int64_t> dims;
std::vector<std::vector<size_t>> lod;
};
......@@ -55,6 +60,23 @@ struct OpTesterConfig {
double runtime{0.0};
};
static bool Has(const std::vector<std::string>& vec, const std::string& item) {
for (size_t i = 0; i < vec.size(); ++i) {
if (vec[i] == item) {
return true;
}
}
return false;
}
template <typename T>
T StringTo(const std::string& str) {
std::istringstream is(str);
T value;
is >> value;
return value;
}
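A small usage sketch of the two helpers above, assuming Has and StringTo are in scope; added only to illustrate the intended calls:
#include <cassert>
// Illustrative usage sketch; not part of the patch.
inline void HelpersUsageSketch() {
  assert(Has({"random", "natural", "zeros"}, "zeros"));  // linear membership test
  assert(StringTo<int>("42") == 42);                     // istream-based parsing
  assert(StringTo<size_t>("7") == 7u);                   // as used by ParseLoD above
}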
} // namespace benchmark
} // namespace operators
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cast_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/float16.h"
......@@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
Cast Operator.
This Operator casts the input tensor to another data type and
returns tha Output Tensor.
returns the Output Tensor. Casting to the same dtype as the input is
meaningless, but it is allowed.
)DOC");
}
......
......@@ -33,11 +33,14 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
if(WITH_GPU)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
else()
detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
endif()
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(
ctx->HasInput("PriorBox"),
"Input(PriorBox) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("PriorBoxVar"),
"Input(PriorBoxVar) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("TargetBox"),
"Input(TargetBox) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("BoxScore"),
"Input(BoxScore) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("DecodeBox"),
"Output(DecodeBox) of BoxDecoderAndAssignOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("OutputAssignBox"),
"Output(OutputAssignBox) of BoxDecoderAndAssignOp should not be null.");
auto prior_box_dims = ctx->GetInputDim("PriorBox");
auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
auto target_box_dims = ctx->GetInputDim("TargetBox");
auto box_score_dims = ctx->GetInputDim("BoxScore");
PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
"The rank of Input of PriorBox must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 1,
"The rank of Input of PriorBoxVar must be 1");
PADDLE_ENFORCE_EQ(prior_box_var_dims[0], 4,
"The shape of PriorBoxVar is [4]");
PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
"The rank of Input of TargetBox must be 2");
PADDLE_ENFORCE_EQ(box_score_dims.size(), 2,
"The rank of Input of BoxScore must be 2");
PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0],
"The first dim of prior_box and target_box is roi nums "
"and should be same!");
PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0],
"The first dim of prior_box and box_score is roi nums "
"and should be same!");
PADDLE_ENFORCE_EQ(target_box_dims[1], box_score_dims[1] * prior_box_dims[1],
"The shape of target_box is [N, classnum * 4], The shape "
"of box_score is [N, classnum], The shape of prior_box "
"is [N, 4]");
ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0],
target_box_dims[1]}));
ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox");
ctx->SetOutputDim(
"OutputAssignBox",
framework::make_ddim({prior_box_dims[0], prior_box_dims[1]}));
ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox");
}
};
class BoxDecoderAndAssignOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"PriorBox",
"(Tensor, default Tensor<float>) "
"Box list PriorBox is a 2-D Tensor with shape [N, 4] which holds N "
"boxes and each box is represented as [xmin, ymin, xmax, ymax], "
"[xmin, ymin] is the left top coordinate of the anchor box, "
"if the input is image feature map, they are close to the origin "
"of the coordinate system. [xmax, ymax] is the right bottom "
"coordinate of the anchor box.");
AddInput("PriorBoxVar",
"(Tensor, default Tensor<float>, optional) "
"PriorBoxVar is a 2-D Tensor with shape [N, 4] which holds N "
"group of variance. PriorBoxVar will set all elements to 1 by "
"default.")
.AsDispensable();
AddInput("TargetBox",
"(LoDTensor or Tensor) "
"This input can be a 2-D LoDTensor with shape "
"[N, classnum*4]. It holds N targets for N boxes.");
AddInput("BoxScore",
"(LoDTensor or Tensor) "
"This input can be a 2-D LoDTensor with shape "
"[N, classnum], each box is represented as [classnum] which is "
"the classification probabilities.");
AddAttr<float>("box_clip",
"(float, default 4.135, np.log(1000. / 16.)) "
"clip box to prevent overflowing")
.SetDefault(4.135f);
AddOutput("DecodeBox",
"(LoDTensor or Tensor) "
"the output tensor of op with shape [N, classnum * 4] "
"representing the result of N target boxes decoded with "
"M Prior boxes and variances for each class.");
AddOutput("OutputAssignBox",
"(LoDTensor or Tensor) "
"the output tensor of op with shape [N, 4] "
"representing the result of N target boxes decoded with "
"M Prior boxes and variances with the best non-background class "
"by BoxScore.");
AddComment(R"DOC(
Bounding Box Coder.
Decode the target bounding box with the prior_box information.
The Decoding schema is described below:
$$
ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2}
$$
$$
oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2}
$$
$$
ow = \\exp (pwv \\times tw) \\times pw + \\frac{tw}{2}
$$
$$
oh = \\exp (phv \\times th) \\times ph + \\frac{th}{2}
$$
where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
`phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the
decoded coordinates, width and height in decode_box.
decode_box is obtained after box decoding; the assigning schema is then described below:
For each prior_box, use the best non-background class's decoded values to
update the prior_box locations and get output_assign_box. So, the shape of
output_assign_box is the same as PriorBox.
)DOC");
}
};
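A single-box numeric walk-through of the decode step implemented by the CPU/CUDA kernels later in this patch; the prior box, variance and delta values are made up for illustration, and the arithmetic simply mirrors the kernels:
#include <algorithm>
#include <cmath>
#include <cstdio>
// Standalone illustration; mirrors the kernel arithmetic for one box.
int main() {
  const double prior[4] = {0.0, 0.0, 9.0, 19.0};  // [xmin, ymin, xmax, ymax]
  const double var[4] = {0.1, 0.1, 0.2, 0.2};     // pxv, pyv, pwv, phv
  const double delta[4] = {0.5, 0.0, 0.0, 0.0};   // tx, ty, tw, th
  const double box_clip = 4.135;                  // ~log(1000/16), the op default

  double pw = prior[2] - prior[0] + 1, ph = prior[3] - prior[1] + 1;  // 10, 20
  double px = prior[0] + pw / 2, py = prior[1] + ph / 2;              // 5, 10
  double dw = std::min(var[2] * delta[2], box_clip);                  // 0
  double dh = std::min(var[3] * delta[3], box_clip);                  // 0
  double cx = var[0] * delta[0] * pw + px;                            // 5.5
  double cy = var[1] * delta[1] * ph + py;                            // 10
  double w = std::exp(dw) * pw, h = std::exp(dh) * ph;                // 10, 20
  std::printf("decoded box: [%g, %g, %g, %g]\n", cx - w / 2, cy - h / 2,
              cx + w / 2 - 1, cy + h / 2 - 1);  // prints [0.5, 0, 9.5, 19]
  return 0;
}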
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(box_decoder_and_assign, ops::BoxDecoderAndAssignOp,
ops::BoxDecoderAndAssignOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
box_decoder_and_assign,
ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, float>,
ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void DecodeBoxKernel(const T* prior_box_data,
const T* prior_box_var_data,
const T* target_box_data, const int roi_num,
const int class_num, const T box_clip,
T* output_box_data) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < roi_num * class_num) {
int i = idx / class_num;
int j = idx % class_num;
T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
T prior_box_height =
prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;
int offset = i * class_num * 4 + j * 4;
T dw = prior_box_var_data[2] * target_box_data[offset + 2];
T dh = prior_box_var_data[3] * target_box_data[offset + 3];
if (dw > box_clip) {
dw = box_clip;
}
if (dh > box_clip) {
dh = box_clip;
}
T target_box_center_x = 0, target_box_center_y = 0;
T target_box_width = 0, target_box_height = 0;
target_box_center_x =
prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
prior_box_center_x;
target_box_center_y =
prior_box_var_data[1] * target_box_data[offset + 1] * prior_box_height +
prior_box_center_y;
target_box_width = expf(dw) * prior_box_width;
target_box_height = expf(dh) * prior_box_height;
output_box_data[offset] = target_box_center_x - target_box_width / 2;
output_box_data[offset + 1] = target_box_center_y - target_box_height / 2;
output_box_data[offset + 2] =
target_box_center_x + target_box_width / 2 - 1;
output_box_data[offset + 3] =
target_box_center_y + target_box_height / 2 - 1;
}
}
template <typename T>
__global__ void AssignBoxKernel(const T* prior_box_data,
const T* box_score_data, T* output_box_data,
const int roi_num, const int class_num,
T* output_assign_box_data) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < roi_num) {
int i = idx;
T max_score = -1;
int max_j = -1;
for (int j = 0; j < class_num; ++j) {
T score = box_score_data[i * class_num + j];
if (score > max_score && j > 0) {
max_score = score;
max_j = j;
}
}
if (max_j > 0) {
for (int pno = 0; pno < 4; pno++) {
output_assign_box_data[i * 4 + pno] =
output_box_data[i * class_num * 4 + max_j * 4 + pno];
}
} else {
for (int pno = 0; pno < 4; pno++) {
output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
}
}
}
}
template <typename DeviceContext, typename T>
class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
"This kernel only runs on GPU device.");
auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
auto* output_box = context.Output<framework::Tensor>("DecodeBox");
auto* output_assign_box =
context.Output<framework::Tensor>("OutputAssignBox");
auto roi_num = target_box->dims()[0];
auto class_num = box_score->dims()[1];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
auto* prior_box_var_data = prior_box_var->data<T>();
auto* box_score_data = box_score->data<T>();
output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
T* output_box_data = output_box->data<T>();
T* output_assign_box_data = output_assign_box->data<T>();
int block = 512;
int grid = (roi_num * class_num + block - 1) / block;
auto& device_ctx = context.cuda_device_context();
const T box_clip = context.Attr<T>("box_clip");
DecodeBoxKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
prior_box_data, prior_box_var_data, target_box_data, roi_num, class_num,
box_clip, output_box_data);
context.device_context().Wait();
int assign_grid = (roi_num + block - 1) / block;
AssignBoxKernel<T><<<assign_grid, block, 0, device_ctx.stream()>>>(
prior_box_data, box_score_data, output_box_data, roi_num, class_num,
output_assign_box_data);
context.device_context().Wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
box_decoder_and_assign,
ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
float>,
ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class BoxDecoderAndAssignKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
auto* output_box = context.Output<framework::Tensor>("DecodeBox");
auto* output_assign_box =
context.Output<framework::Tensor>("OutputAssignBox");
int roi_num = target_box->dims()[0];
int class_num = box_score->dims()[1];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
auto* prior_box_var_data = prior_box_var->data<T>();
auto* box_score_data = box_score->data<T>();
output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
T* output_box_data = output_box->data<T>();
T* output_assign_box_data = output_assign_box->data<T>();
const T bbox_clip = context.Attr<T>("box_clip");
for (int i = 0; i < roi_num; ++i) {
T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
T prior_box_height =
prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;
for (int j = 0; j < class_num; ++j) {
int64_t offset = i * class_num * 4 + j * 4;
T dw = std::min(prior_box_var_data[2] * target_box_data[offset + 2],
bbox_clip);
T dh = std::min(prior_box_var_data[3] * target_box_data[offset + 3],
bbox_clip);
T target_box_center_x = 0, target_box_center_y = 0;
T target_box_width = 0, target_box_height = 0;
target_box_center_x =
prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
prior_box_center_x;
target_box_center_y = prior_box_var_data[1] *
target_box_data[offset + 1] *
prior_box_height +
prior_box_center_y;
target_box_width = std::exp(dw) * prior_box_width;
target_box_height = std::exp(dh) * prior_box_height;
output_box_data[offset] = target_box_center_x - target_box_width / 2;
output_box_data[offset + 1] =
target_box_center_y - target_box_height / 2;
output_box_data[offset + 2] =
target_box_center_x + target_box_width / 2 - 1;
output_box_data[offset + 3] =
target_box_center_y + target_box_height / 2 - 1;
}
T max_score = -1;
int max_j = -1;
for (int j = 0; j < class_num; ++j) {
T score = box_score_data[i * class_num + j];
if (score > max_score && j > 0) {
max_score = score;
max_j = j;
}
}
if (max_j > 0) {
for (int pno = 0; pno < 4; pno++) {
output_assign_box_data[i * 4 + pno] =
output_box_data[i * class_num * 4 + max_j * 4 + pno];
}
} else {
for (int pno = 0; pno < 4; pno++) {
output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
}
}
}
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
namespace paddle {
namespace operators {
class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("FpnRois"),
"Input(FpnRois) shouldn't be null");
PADDLE_ENFORCE_GE(
ctx->Outputs("MultiFpnRois").size(), 1UL,
"Outputs(MultiFpnRois) of DistributeOp should not be empty");
size_t min_level = static_cast<size_t>(ctx->Attrs().Get<int>("min_level"));
size_t max_level = static_cast<size_t>(ctx->Attrs().Get<int>("max_level"));
PADDLE_ENFORCE_GE(max_level, min_level,
"max_level must not lower than min_level");
// Set the output shape
size_t num_out_rois = max_level - min_level + 1;
std::vector<framework::DDim> outs_dims;
outs_dims.reserve(num_out_rois);
for (size_t i = 0; i < num_out_rois; ++i) {
framework::DDim out_dim = {-1, 4};
outs_dims.push_back(out_dim);
}
ctx->SetOutputsDim("MultiFpnRois", outs_dims);
ctx->SetOutputDim("RestoreIndex", {1, -1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois"));
return framework::OpKernelType(data_type, platform::CPUPlace());
}
};
class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)");
AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
.AsDuplicable();
AddOutput("RestoreIndex",
"(Tensor) An array of positive number which is "
"used to restore the order of FpnRois");
AddAttr<int>("min_level",
"The lowest level of FPN layer where the"
" proposals come from");
AddAttr<int>("max_level",
"The highest level of FPN layer where the"
" proposals come from");
AddAttr<int>("refer_level",
"The referring level of FPN layer with"
" specified scale");
AddAttr<int>("refer_scale",
"The referring scale of FPN layer with"
" specified level");
AddComment(R"DOC(
This operator distributes all proposals into different FPN levels
according to the scale of the proposals, the referring scale and
the referring level. Besides, to restore the order of proposals,
we return an array which indicates the original index of the RoIs
in the current proposals.
)DOC");
}
};
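Both the CPU and CUDA kernels below assign a target level as tgt_lvl = floor(log2(sqrt(roi_area) / refer_scale) + refer_level), clamped to [min_level, max_level]. A standalone sketch of that formula with example attribute values (224 and 4 are illustrative choices, not defaults taken from this patch), ignoring the +1 pixel convention of BBoxArea for clarity:
#include <algorithm>
#include <cmath>
// Illustrative sketch of the level-assignment formula used by the kernels.
static int TargetLevelSketch(double roi_w, double roi_h, int min_level,
                             int max_level, int refer_level, int refer_scale) {
  double roi_scale = std::sqrt(roi_w * roi_h);  // BBoxArea would use (w+1)*(h+1)
  int tgt_lvl = static_cast<int>(
      std::floor(std::log2(roi_scale / refer_scale) + refer_level));
  return std::min(max_level, std::max(tgt_lvl, min_level));
}
// TargetLevelSketch(112, 112, 2, 6, 4, 224) == 3  (half the reference scale)
// TargetLevelSketch(448, 448, 2, 6, 4, 224) == 5  (twice the reference scale)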
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp,
ops::DistributeFpnProposalsOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
ops::DistributeFpnProposalsOpKernel<float>,
ops::DistributeFpnProposalsOpKernel<double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/memory/allocation/allocator.h>
#include "cub/cub.cuh"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
int const BBoxSize = 4;
struct RangeInitFunctor {
int start_;
int delta_;
int* out_;
__device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
};
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
static inline void TransLoD(const int* length_lod, const int lod_size,
int* offset_lod) {
int offset = 0;
for (int i = 0; i < lod_size; ++i) {
offset_lod[i] = offset;
offset += length_lod[i];
}
}
template <typename T>
static __device__ inline T RoIArea(const T* box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <class T>
static __global__ void GPUDistFpnProposalsHelper(
const int nthreads, const T* rois, const int lod_size,
const int refer_level, const int refer_scale, const int max_level,
const int min_level, int* roi_batch_id_data, int* sub_lod_list,
int* target_lvls) {
CUDA_1D_KERNEL_LOOP(i, nthreads) {
const T* offset_roi = rois + i * BBoxSize;
int roi_batch_ind = roi_batch_id_data[i];
// get the target level of current rois
T roi_area = RoIArea(offset_roi, false);
T roi_scale = sqrt(roi_area);
int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level);
tgt_lvl = min(max_level, max(tgt_lvl, min_level));
target_lvls[i] = tgt_lvl;
// compute number of rois in the same batch and same target level
platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind,
1);
}
}
template <typename DeviceContext, typename T>
class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* fpn_rois = ctx.Input<paddle::framework::LoDTensor>("FpnRois");
auto multi_fpn_rois = ctx.MultiOutput<LoDTensor>("MultiFpnRois");
auto* restore_index = ctx.Output<Tensor>("RestoreIndex");
const int min_level = ctx.Attr<int>("min_level");
const int max_level = ctx.Attr<int>("max_level");
const int refer_level = ctx.Attr<int>("refer_level");
const int refer_scale = ctx.Attr<int>("refer_scale");
int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty
PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
"DistributeFpnProposalsOp need 1 level of LoD");
auto fpn_rois_lod = fpn_rois->lod().back();
int lod_size = fpn_rois_lod.size() - 1;
int roi_num = fpn_rois_lod[lod_size];
auto& dev_ctx = ctx.template device_context<DeviceContext>();
// get batch id by lod in CPU
Tensor roi_batch_id_list;
roi_batch_id_list.Resize({roi_num});
int* roi_batch_id_data =
roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
for (int n = 0; n < lod_size; ++n) {
for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) {
roi_batch_id_data[i] = n;
}
}
// copy batch id list to GPU
Tensor roi_batch_id_list_gpu;
framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(),
&roi_batch_id_list_gpu);
Tensor sub_lod_list;
sub_lod_list.Resize({num_level, lod_size});
int* sub_lod_list_data = sub_lod_list.mutable_data<int>(dev_ctx.GetPlace());
Tensor target_lvls;
target_lvls.Resize({roi_num});
int* target_lvls_data = target_lvls.mutable_data<int>(dev_ctx.GetPlace());
int blocks = NumBlocks(roi_num);
int threads = kNumCUDAThreads;
// get target levels and sub_lod list
GPUDistFpnProposalsHelper<T><<<blocks, threads>>>(
roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
max_level, min_level, roi_batch_id_list_gpu.data<int>(),
sub_lod_list_data, target_lvls_data);
Tensor index_in_t;
int* idx_in = index_in_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, roi_num);
for_range(RangeInitFunctor{0, 1, idx_in});
Tensor keys_out_t;
int* keys_out = keys_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
Tensor index_out_t;
int* idx_out = index_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
// Determine temporary device storage requirements
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairsDescending<int, int>(
nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in,
idx_out, roi_num);
// Allocate temporary storage
auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
auto d_temp_storage = memory::Alloc(place, temp_storage_bytes,
memory::Allocator::kScratchpad);
// Run sorting operation
// sort target level to get corresponding index
cub::DeviceRadixSort::SortPairsDescending<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
idx_in, idx_out, roi_num);
int* restore_idx_data =
restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
// sort current index to get restore index
cub::DeviceRadixSort::SortPairsDescending<int, int>(
d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
restore_idx_data, roi_num);
Tensor offset_lod;
int* offset_lod_data =
offset_lod.mutable_data<int>({lod_size + 1}, dev_ctx.GetPlace());
for (int i = 0; i < num_level; ++i) {
Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
int* sub_lod_data = sub_lod.data<int>();
// transfer length-based lod to offset-based lod
TransLoD(sub_lod_data, lod_size + 1, offset_lod_data);
int sub_rois_num = offset_lod_data[lod_size];
Tensor sub_idx = index_out_t.Slice(0, sub_rois_num);
multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
dev_ctx.GetPlace());
GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
framework::LoD lod;
// Copy the offsets back into a properly sized host buffer; writing through
// the data() pointer of an empty std::vector is undefined behavior.
std::vector<int> offset_int(lod_size + 1);
memory::Copy(platform::CPUPlace(), offset_int.data(), place, offset_lod_data,
             sizeof(int) * (lod_size + 1), 0);
std::vector<size_t> offset(offset_int.begin(), offset_int.end());
lod.emplace_back(offset);
multi_fpn_rois[i]->set_lod(lod);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
distribute_fpn_proposals,
ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
float>,
ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
const int kBoxDim = 4;
template <typename T>
static inline T BBoxArea(const T* box, bool normalized) {
if (box[2] < box[0] || box[3] < box[1]) {
// If coordinate values are invalid
// (e.g. xmax < xmin or ymax < ymin), return 0.
return static_cast<T>(0.);
} else {
const T w = box[2] - box[0];
const T h = box[3] - box[1];
if (normalized) {
return w * h;
} else {
// If coordinate values are not within range [0, 1].
return (w + 1) * (h + 1);
}
}
}
template <typename T>
class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* fpn_rois = context.Input<paddle::framework::LoDTensor>("FpnRois");
auto multi_fpn_rois =
context.MultiOutput<paddle::framework::LoDTensor>("MultiFpnRois");
auto* restore_index =
context.Output<paddle::framework::Tensor>("RestoreIndex");
const int min_level = context.Attr<int>("min_level");
const int max_level = context.Attr<int>("max_level");
const int refer_level = context.Attr<int>("refer_level");
const int refer_scale = context.Attr<int>("refer_scale");
const int num_level = max_level - min_level + 1;
// check that the fpn_rois is not empty
PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
"DistributeFpnProposalsOp need 1 level of LoD");
auto fpn_rois_lod = fpn_rois->lod().back();
int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
std::vector<int> target_level;
// std::vector<int> target_level(fpn_rois_num, -1);
// record the number of rois in each level
std::vector<int> num_rois_level(num_level, 0);
std::vector<int> num_rois_level_integral(num_level + 1, 0);
for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
Tensor fpn_rois_slice =
fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
const T* rois_data = fpn_rois_slice.data<T>();
for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
// get the target level of current rois
T roi_scale = std::sqrt(BBoxArea(rois_data, false));
int tgt_lvl =
std::floor(std::log2(roi_scale / refer_scale) + refer_level);
tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
target_level.push_back(tgt_lvl);
num_rois_level[tgt_lvl - min_level]++;
rois_data += kBoxDim;
}
}
// define the output rois
// pointer which point to each level fpn rois
std::vector<T*> multi_fpn_rois_data(num_level);
// lod0 which will record the offset information of each level rois
std::vector<std::vector<size_t>> multi_fpn_rois_lod0;
for (int i = 0; i < num_level; ++i) {
// allocate memory for each level rois
multi_fpn_rois[i]->mutable_data<T>({num_rois_level[i], kBoxDim},
context.GetPlace());
multi_fpn_rois_data[i] = multi_fpn_rois[i]->data<T>();
std::vector<size_t> lod0(1, 0);
multi_fpn_rois_lod0.push_back(lod0);
// statistic start point for each level rois
num_rois_level_integral[i + 1] =
num_rois_level_integral[i] + num_rois_level[i];
}
restore_index->mutable_data<int>({1, fpn_rois_num}, context.GetPlace());
int* restore_index_data = restore_index->data<int>();
std::vector<int> restore_index_inter(fpn_rois_num, -1);
// distribute the rois into different fpn level by target level
for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
Tensor fpn_rois_slice =
fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
const T* rois_data = fpn_rois_slice.data<T>();
size_t cur_offset = fpn_rois_lod[i];
// std::vector<size_t > lod_offset[num_level];
for (int j = 0; j < num_level; j++) {
multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
}
for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
int lvl = target_level[cur_offset + j];
memcpy(multi_fpn_rois_data[lvl - min_level], rois_data,
kBoxDim * sizeof(T));
multi_fpn_rois_data[lvl - min_level] += kBoxDim;
int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
multi_fpn_rois_lod0[lvl - min_level][i + 1];
restore_index_inter[index_in_shuffle] = cur_offset + j;
multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
rois_data += kBoxDim;
}
}
for (int i = 0; i < fpn_rois_num; ++i) {
restore_index_data[restore_index_inter[i]] = i;
}
// merge lod information into LoDTensor
for (int i = 0; i < num_level; ++i) {
framework::LoD lod;
lod.emplace_back(multi_fpn_rois_lod0[i]);
multi_fpn_rois[i]->set_lod(lod);
}
}
};
} // namespace operators
} // namespace paddle
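The level assignment above is the standard FPN heuristic. A standalone numeric sketch (hypothetical helper; refer_level = 4 and refer_scale = 224 are the values from the FPN paper, while the operator reads both from attributes):

#include <algorithm>
#include <cmath>

// tgt_lvl = floor(log2(sqrt(w * h) / refer_scale) + refer_level),
// clipped to [min_level, max_level].
inline int TargetLevel(float w, float h, int min_level, int max_level,
                       int refer_level, int refer_scale) {
  float roi_scale = std::sqrt(w * h);
  int tgt_lvl = static_cast<int>(
      std::floor(std::log2(roi_scale / refer_scale) + refer_level));
  return std::min(max_level, std::max(tgt_lvl, min_level));
}

// Example: TargetLevel(112, 112, 2, 5, 4, 224) == 3,
// since log2(112 / 224) + 4 = 3, so a 112 x 112 roi is routed to level 3.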
......@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
......@@ -47,7 +46,7 @@ struct EmbeddingVSumFunctor {
auto *output = output_t->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, "The LoD[0] could NOT be empty");
jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
out_width, jit::SeqPoolType::kSum);
......@@ -83,11 +82,11 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims());
const auto &ids_lod = ids_t->lod();
// at run time, the LoD level of ids must be 1
PADDLE_ENFORCE(ids_lod.size(), 1u, "The LoD level of Input(Ids) must be 1");
PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
PADDLE_ENFORCE_EQ(ids_lod.size(), 1UL,
                  "The LoD level of Input(Ids) must be 1");
int64_t batch_size = ids_lod[0].size() - 1;
// in run time, the shape from Ids -> output
// should be [seq_length, 1] -> [batch_size, embedding_size]
// should be [seq_length, 1] -> [batch_size, last_dim]
output_t->Resize({batch_size, last_dim});
if (combiner_type == "sum") {
......@@ -125,7 +124,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel();
auto lod = ids->lod()[0];
int64_t row_width = d_output->dims()[1];
int64_t out_width = d_output->dims()[1];
framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
new_rows->resize(ids_num);
......@@ -136,15 +135,13 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
const T *d_output_data = d_output->data<T>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
platform::CPUPlace>(out_width);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
int64_t in_offset = lod[i] * row_width;
const T *out_pos = d_output_data + i * row_width;
T *in_pos = d_table_data + in_offset;
for (int r = 0; r != h; ++r) {
blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
}
const T *src = d_output_data + i * out_width;
T *dst = d_table_data + lod[i] * out_width;
vbroadcast(src, dst, h, out_width);
}
} else {
LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
......
......@@ -474,6 +474,23 @@ void BenchCRFDecodingKernel() {
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchVBroadcastKernel() {
for (int64_t w : {1, 16, 64, 100, 256}) {
Tensor x;
x.Resize({w});
RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
const T* x_data = x.data<T>();
for (int h : TestSizes()) {
Tensor y;
y.Resize({h * w});
T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
w, x_data, y_data, static_cast<int64_t>(h), w);
}
}
}
using T = float;
using CPUPlace = paddle::platform::CPUPlace;
......@@ -498,6 +515,7 @@ BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }
// lstm and peephole
BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
......@@ -535,6 +553,11 @@ BENCH_FP32_CPU(kCRFDecoding) {
BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
}
// vbroadcast function
BENCH_FP32_CPU(kVBroadcast) {
BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
}
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// Options:
......
......@@ -33,3 +33,4 @@ USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum)
USE_JITKERNEL_GEN(kEmbSeqPool)
USE_JITKERNEL_GEN(kSgd)
USE_JITKERNEL_GEN(kVBroadcast)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
#include <memory>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
void VBroadcastJitCode::genCode() {
preCode();
constexpr int block = YMM_FLOAT_BLOCK;
constexpr int max_num_regs = 16;
const int num_block = w_ / block;
const int num_groups = num_block / max_num_regs;
const size_t block_size = sizeof(float) * block;
std::vector<int> groups(num_groups, max_num_regs);
int rest_num_regs = num_block % max_num_regs;
if (rest_num_regs > 0) {
groups.push_back(rest_num_regs);
}
// copy param_h into reg_height so the loop bound is not clobbered
mov(reg_height, param_h);
Label l_next_h;
xor_(reg_h_i, reg_h_i);
mov(reg_ptr_dst_i, param_dst);
L(l_next_h);
{
mov(reg_ptr_src_i, param_src);
for (int num_regs : groups) {
size_t w_offset = 0;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]);
w_offset += block_size;
}
add(reg_ptr_src_i, num_regs * block_size);
w_offset = 0;
for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i));
w_offset += block_size;
}
add(reg_ptr_dst_i, num_regs * block_size);
} // end of groups
inc(reg_h_i);
cmp(reg_h_i, reg_height);
jl(l_next_h, T_NEAR);
} // end of l_next_h
postCode();
}
class VBroadcastCreator : public JitCodeCreator<int64_t> {
public:
bool UseMe(const int64_t& w) const override {
return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const int64_t& w) const override {
return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
}
std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
PADDLE_ENFORCE_GT(w, 0);
return make_unique<VBroadcastJitCode>(w, CodeSize(w));
}
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
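For comparison, the semantics the AVX code above must reproduce can be written as a plain loop (a sketch of the broadcast behaviour, not the actual refer kernel):

#include <cstdint>

// Copy the width-w source row into each of the h destination rows,
// i.e. dst[i * w + j] = src[j].
template <typename T>
void VBroadcastRefer(const T* src, T* dst, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i) {
    for (int64_t j = 0; j < w; ++j) {
      dst[i * w + j] = src[j];
    }
  }
}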
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
class VBroadcastJitCode : public JitCode {
public:
explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024,
void* code_ptr = nullptr)
: JitCode(code_size, code_ptr), w_(w) {
this->genCode();
}
DECLARE_JIT_CODE(VBroadcastJitCode);
void genCode() override;
private:
int w_;
reg64_t param_src{abi_param1};
reg64_t param_dst{abi_param2};
reg64_t param_h{abi_param3};
reg64_t param_w{abi_param4};
reg64_t reg_height{r9};
reg64_t reg_h_i{r10};
reg64_t reg_ptr_src_i{r11};
reg64_t reg_ptr_dst_i{r12};
};
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
......@@ -36,6 +36,8 @@ const char* to_string(KernelType kt) {
ONE_CASE(kVScal);
ONE_CASE(kVAddBias);
ONE_CASE(kVRelu);
ONE_CASE(kVBroadcast);
ONE_CASE(kVCopy);
ONE_CASE(kVIdentity);
ONE_CASE(kVExp);
ONE_CASE(kVSquare);
......
......@@ -41,6 +41,8 @@ typedef enum {
kVAdd,
kVAddBias,
kVAddRelu,
kVBroadcast,
kVCopy,
kVExp,
kVIdentity,
kVMul,
......@@ -133,6 +135,13 @@ struct GRUTuples {
typedef void (*func_type)(gru_t*, const gru_attr_t*);
};
template <typename T>
struct VBroadcastTuples {
typedef T data_type;
typedef int64_t attr_type;
typedef void (*func_type)(const T*, T*, int64_t, int64_t);
};
typedef struct seq_pool_attr_s {
int h, w; // h should always be the first one
SeqPoolType type;
......
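For reference, this is how the new tuple is consumed, mirroring the fused_embedding_seq_pool grad kernel above; a sketch, assuming the usual jit/kernels.h and platform/place.h headers:

#include <cstdint>
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/place.h"

namespace jit = paddle::operators::jit;
namespace platform = paddle::platform;

// Replicate the width-w row src into h consecutive rows of dst, using
// whichever VBroadcast implementation (jitcode, mkl or refer) is picked for w.
template <typename T>
void BroadcastRows(const T* src, T* dst, int64_t h, int64_t w) {
  auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
                             platform::CPUPlace>(w);
  vbroadcast(src, dst, h, w);
}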
......@@ -24,6 +24,11 @@ size_t JitCodeKey<int>(const int& d) {
return d;
}
template <>
size_t JitCodeKey<int64_t>(const int64_t& d) {
return d;
}
// TODO(TJ): refine and benchmark JitCodeKey generation
constexpr int act_type_shift = 3;  // support 2^3 act types
static inline int act_type_convert(KernelType type) {
......
......@@ -9,9 +9,11 @@ USE_JITKERNEL_MORE(kVAdd, mkl)
USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSquare, mkl)
USE_JITKERNEL_MORE(kVCopy, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
USE_JITKERNEL_MORE(kSoftmax, mkl)
USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
USE_JITKERNEL_MORE(kSgd, mkl)
USE_JITKERNEL_MORE(kVBroadcast, mkl)
......@@ -13,6 +13,7 @@ USE_JITKERNEL_REFER(kVAddRelu)
USE_JITKERNEL_REFER(kVSub)
USE_JITKERNEL_REFER(kVScal)
USE_JITKERNEL_REFER(kVAddBias)
USE_JITKERNEL_REFER(kVCopy)
USE_JITKERNEL_REFER(kVRelu)
USE_JITKERNEL_REFER(kVIdentity)
USE_JITKERNEL_REFER(kVExp)
......@@ -34,3 +35,4 @@ USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool)
USE_JITKERNEL_REFER(kSgd)
USE_JITKERNEL_REFER(kVBroadcast)
......@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph.");
AddAttr<std::string>("calibration_data", "the calibration data for int8");
AddAttr<std::string>(
"engine_serialized_data",
"the serialized data contains the all info of the ICUDAEngine");
AddAttr<std::string>(
"engine_key",
"The engine_key here is used to distinguish different TRT Engines");
......