BaiXuePrincess / Paddle
Forked from PaddlePaddle / Paddle
Commit 2c4fcaa6
Authored on Mar 07, 2019 by sneaxiy

merge develop

Parents: 2a639d5c, 40f1dd81
Showing 125 changed files with 4345 additions and 1337 deletions (+4345, -1337)
Dockerfile  +3 -2
paddle/fluid/API.spec  +5 -3
paddle/fluid/framework/details/memory_optimize_helper.cc  +7 -5
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc  +7 -0
paddle/fluid/framework/inlined_stack.h  +2 -3
paddle/fluid/framework/ir/fuse_pass_base.h  +5 -0
paddle/fluid/framework/ir/graph_helper.cc  +7 -1
paddle/fluid/framework/operator.cc  +0 -41
paddle/fluid/framework/operator.h  +2 -78
paddle/fluid/inference/analysis/argument.h  +6 -0
paddle/fluid/inference/analysis/helper.h  +31 -0
paddle/fluid/inference/analysis/ir_pass_manager.cc  +3 -0
paddle/fluid/inference/analysis/ir_pass_manager.h  +3 -0
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc  +193 -74
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h  +9 -3
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc  +11 -0
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h  +1 -0
paddle/fluid/inference/api/analysis_config.cc  +3 -1
paddle/fluid/inference/api/analysis_predictor.cc  +35 -0
paddle/fluid/inference/api/analysis_predictor.h  +9 -0
paddle/fluid/inference/api/api_impl.cc  +3 -0
paddle/fluid/inference/api/details/zero_copy_tensor.cc  +58 -2
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc  +1 -1
paddle/fluid/inference/api/helper.h  +5 -0
paddle/fluid/inference/api/paddle_analysis_config.h  +3 -1
paddle/fluid/inference/api/paddle_api.h  +21 -1
paddle/fluid/inference/engine.h  +0 -5
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc  +2 -19
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc  +1 -2
paddle/fluid/inference/tensorrt/convert/fc_op.cc  +2 -2
paddle/fluid/inference/tensorrt/convert/op_converter.h  +62 -0
paddle/fluid/inference/tensorrt/convert/prelu_op.cc  +8 -11
paddle/fluid/inference/tensorrt/convert/ut_helper.h  +51 -34
paddle/fluid/inference/tensorrt/engine.cc  +12 -131
paddle/fluid/inference/tensorrt/engine.h  +39 -52
paddle/fluid/inference/tensorrt/helper.h  +29 -0
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt  +2 -1
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu  +7 -0
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h  +9 -5
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu  +9 -2
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h  +13 -7
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu  +14 -1
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h  +30 -13
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu  +6 -0
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h  +6 -3
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h  +9 -1
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc  +48 -0
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h  +78 -0
paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h  +8 -1
paddle/fluid/inference/tensorrt/test_engine.cc  +92 -42
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc  +6 -4
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc  +6 -4
paddle/fluid/inference/tests/api/tester_helper.h  +8 -4
paddle/fluid/inference/tests/api/trt_models_tester.cc  +2 -1
paddle/fluid/memory/allocation/allocator.cc  +2 -5
paddle/fluid/memory/allocation/allocator.h  +3 -3
paddle/fluid/memory/allocation/legacy_allocator.cc  +49 -32
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc  +123 -13
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h  +1 -1
paddle/fluid/memory/detail/buddy_allocator.cc  +39 -36
paddle/fluid/memory/detail/buddy_allocator.h  +8 -3
paddle/fluid/memory/detail/memory_block.h  +5 -4
paddle/fluid/operators/benchmark/op_tester.cc  +190 -17
paddle/fluid/operators/benchmark/op_tester.h  +9 -2
paddle/fluid/operators/benchmark/op_tester_config.cc  +58 -20
paddle/fluid/operators/benchmark/op_tester_config.h  +22 -0
paddle/fluid/operators/cast_op.cc  +3 -1
paddle/fluid/operators/detection/CMakeLists.txt  +3 -0
paddle/fluid/operators/detection/box_decoder_and_assign_op.cc  +169 -0
paddle/fluid/operators/detection/box_decoder_and_assign_op.cu  +147 -0
paddle/fluid/operators/detection/box_decoder_and_assign_op.h  +103 -0
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc  +93 -0
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu  +221 -0
paddle/fluid/operators/detection/distribute_fpn_proposals_op.h  +147 -0
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h  +10 -13
paddle/fluid/operators/jit/benchmark.cc  +23 -0
paddle/fluid/operators/jit/gen/CMakeLists.txt  +1 -0
paddle/fluid/operators/jit/gen/vbroadcast.cc  +91 -0
paddle/fluid/operators/jit/gen/vbroadcast.h  +53 -0
paddle/fluid/operators/jit/helper.cc  +2 -0
paddle/fluid/operators/jit/kernel_base.h  +9 -0
paddle/fluid/operators/jit/kernel_key.cc  +5 -0
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt  +2 -0
paddle/fluid/operators/jit/more/mkl/mkl.cc  +18 -0
paddle/fluid/operators/jit/more/mkl/mkl.h  +10 -0
paddle/fluid/operators/jit/refer/CMakeLists.txt  +2 -0
paddle/fluid/operators/jit/refer/refer.cc  +3 -0
paddle/fluid/operators/jit/refer/refer.h  +17 -0
paddle/fluid/operators/jit/test.cc  +67 -24
paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc  +94 -0
paddle/fluid/operators/recurrent_op.cc  +29 -16
paddle/fluid/operators/requantize_op.cc  +46 -0
paddle/fluid/operators/requantize_op.h  +47 -0
paddle/fluid/operators/reshape_op.cc  +4 -1
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc  +3 -0
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h  +63 -116
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc  +2 -0
paddle/fluid/platform/gpu_info.cc  +58 -1
paddle/fluid/platform/gpu_info.h  +6 -0
paddle/fluid/platform/temporary_allocator.cc  +1 -0
paddle/fluid/pybind/inference_api.cc  +2 -1
python/paddle/fluid/__init__.py  +1 -0
python/paddle/fluid/imperative/layer_object_helper.py  +220 -0
python/paddle/fluid/imperative/layers.py  +48 -1
python/paddle/fluid/imperative/nn.py  +32 -53
python/paddle/fluid/initializer.py  +8 -9
python/paddle/fluid/layer_helper.py  +14 -309
python/paddle/fluid/layer_helper_base.py  +381 -0
python/paddle/fluid/layers/control_flow.py  +2 -2
python/paddle/fluid/layers/detection.py  +137 -0
python/paddle/fluid/layers/nn.py  +3 -7
python/paddle/fluid/layers/tensor.py  +2 -1
python/paddle/fluid/optimizer.py  +1 -1
python/paddle/fluid/parallel_executor.py  +12 -7
python/paddle/fluid/tests/test_detection.py  +16 -0
python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py  +14 -0
python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py  +1 -14
python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py  +93 -0
python/paddle/fluid/tests/unittests/test_base_layer.py  +14 -24
python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py  +96 -0
python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py  +40 -0
python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py  +117 -0
python/paddle/fluid/tests/unittests/test_imperative_basic.py  +26 -26
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py  +1 -1
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py  +11 -12
Dockerfile
...
...
@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
# and its size is only one-third of the official one.
# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
-    tar -xz -C /usr/local && \
+RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
+    tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
+    cp -rf /usr/local/TensorRT/include /usr && \
    cp -rf /usr/local/TensorRT/lib /usr
...
...
paddle/fluid/API.spec
...
...
@@ -238,7 +238,7 @@ paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], var
paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'c0c3d0194f83fff8ea99ce0820657dae'))
paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'd62b866c899bc1fedb5385f95b88e1f8'))
paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', 'ab914fac893607e29ac6e52bbdbea1a4'))
-paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '60cb8f843d625abf33f8bf12455b8f99'))
+paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '992eb42590fc1c380841a6db72ce78b3'))
paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b12717d3d4567e6119589f7f655b0cbb'))
paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b19b79be4f05e85d1d6cec642c9fb535'))
paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '42912092418620b4be07f36af31e7816'))
...
...
@@ -262,7 +262,7 @@ paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keyword
paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', '73bb96ec4783ec1a11e760e8851b0e77'))
paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713'))
paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
-paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords='ignored', defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
+paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', '0275133f1dde2aed528b4d3230edf823'))
paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
...
...
@@ -287,7 +287,7 @@ paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=N
paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
-paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords='ignored', defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
+paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '29a25ba78de79152076cacfc5443137d'))
paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
...
...
@@ -329,6 +329,8 @@ paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varar
paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691'))
paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d'))
paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '005a5ae47d6c8fff721931d69d072b9f'))
paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', '9808534c12c5e739a10f73ebb0b4eafd'))
paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', 'e0e95334fce92d16c2d9db6e7caffc47'))
paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '98a5050bee8522fcea81aa795adaba51'))
...
...
paddle/fluid/framework/details/memory_optimize_helper.cc
...
...
@@ -20,6 +20,9 @@
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/cpu_info.h"
...
...
@@ -302,7 +305,10 @@ std::string OrderedSet::ToString() const {
bool NodeCanReused(ir::Node* node) {
  // valid the node is a var node
-  if (node == nullptr || !node->IsVar() || node->IsCtrlVar()) return false;
+  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
+  if (node == nullptr || !node->IsVar() || node->IsCtrlVar() ||
+      node->Name() == kEmptyVarName)
+    return false;

  bool flag = true;
  // op output force generated in cpu, can not be reused.
...
...
@@ -348,10 +354,6 @@ bool NodeCanReused(const VarDesc& node) {
  if (shape.empty() || size < MinChunkSize()) {
    return false;
  }
-  // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad
-  std::string name = node.Name();
-  if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@')
-    return false;
  return true;
}
...
...
paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
...
...
@@ -13,6 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
...
...
@@ -29,6 +31,11 @@ ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) {
    auto &g = graphs.back();
    g->Set(kGraphVars, new GraphVars(1UL));
    g->Set(kGraphDepVars, new GraphDepVars);
+    auto &stale_ops =
+        graph->Get<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs);
+    g->Erase(details::kStaleProgramOpDescs);
+    g->Set<const std::vector<OpDesc *>>(details::kStaleProgramOpDescs,
+                                        new std::vector<OpDesc *>(stale_ops));
  }

  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
...
...
paddle/fluid/framework/small_stack.h → paddle/fluid/framework/inlined_stack.h
...
...
@@ -14,7 +14,6 @@
#pragma once
#include <array>
#include <deque>
#include "paddle/fluid/platform/enforce.h"
...
...
@@ -22,7 +21,7 @@ namespace paddle {
namespace framework {

template <typename T, size_t N>
-class SmallStack {
+class InlinedStack {
  static_assert(N > 0, "N must be larger than 0");

 public:
...
...
@@ -66,8 +65,8 @@ class SmallStack {
 private:
  T head_[N];
-  size_t size_{0};
  std::deque<T> tail_;
+  size_t size_;
};

}  // namespace framework
...
...
paddle/fluid/framework/ir/fuse_pass_base.h
...
...
@@ -14,6 +14,7 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/scope.h"
...
...
@@ -24,6 +25,10 @@ namespace ir {
static const char kParamScopeAttr[] = "__param_scope__";
static const char kFuseStatisAttr[] = "__fuse_statis__";
+// When we use trt or other third_party lib, the parameters are managed by
+// the lib, but not the fluid. So we need to record them to avoid duplicate
+// allocation.
+static const char kRepetitiveParamAttr[] = "__repetitive_param__";

enum FuseOptions {
  DO_NOT_FUSE,  // fusing will not be done
...
...
paddle/fluid/framework/ir/graph_helper.cc
...
...
@@ -130,15 +130,21 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    if (adj_list.find(n) == adj_list.end()) {
      adj_list[n] = std::unordered_set<ir::Node *>();
    }
+    std::vector<ir::Node *> nodes;
    for (auto &var : n->inputs) {
      for (auto &adj_n : var->inputs) {
        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                << " -> " << n->Name() << reinterpret_cast<void *>(n)
                << " via " << var->Name() << reinterpret_cast<void *>(var);
-        adj_list[n].insert(adj_n);
+        nodes.push_back(adj_n);
      }
    }
+    std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
+      return node1->id() > node2->id();
+    });
+    adj_list[n].insert(std::make_move_iterator(nodes.begin()),
+                       std::make_move_iterator(nodes.end()));
  }
  return adj_list;
}
...
...
paddle/fluid/framework/operator.cc
...
...
@@ -467,12 +467,6 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
  return it->second.empty() ? nullptr : it->second[0];
}

-const Variable* ExecutionContext::LegacyInputVar(
-    const std::string& name) const {
-  auto ipt = op_.Input(name);
-  return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
-}

Variable* ExecutionContext::OutputVar(const std::string& name) const {
  auto it = ctx_.outputs.find(name);
  if (it == ctx_.outputs.end()) return nullptr;
...
...
@@ -483,22 +477,11 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
  return it->second.empty() ? nullptr : it->second[0];
}

-Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const {
-  auto opt = op_.Output(name);
-  return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
-}

template <>
const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
  return Input<LoDTensor>(name);
}

-template <>
-const Tensor* ExecutionContext::LegacyInput<Tensor>(
-    const std::string& name) const {
-  return LegacyInput<LoDTensor>(name);
-}

template <>
const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
    const std::string& name) const {
...
@@ -521,35 +504,11 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
return
res
;
}
template
<
>
const
std
::
vector
<
const
Tensor
*>
ExecutionContext
::
LegacyMultiInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
auto
names
=
op
().
Inputs
(
name
);
std
::
vector
<
const
Tensor
*>
res
;
res
.
reserve
(
names
.
size
());
std
::
transform
(
names
.
begin
(),
names
.
end
(),
std
::
back_inserter
(
res
),
[
&
](
const
std
::
string
&
sub_name
)
->
const
Tensor
*
{
auto
var
=
scope_
.
FindVar
(
sub_name
);
if
(
var
==
nullptr
)
return
nullptr
;
PADDLE_ENFORCE
(
var
->
IsType
<
LoDTensor
>
(),
"%s should be LoDTensor, but the received type is %s"
,
sub_name
,
ToTypeName
(
var
->
Type
()));
return
&
(
var
->
Get
<
LoDTensor
>
());
});
return
res
;
}
template
<
>
Tensor
*
ExecutionContext
::
Output
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
return
Output
<
LoDTensor
>
(
name
);
}
template
<
>
Tensor
*
ExecutionContext
::
LegacyOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
return
LegacyOutput
<
LoDTensor
>
(
name
);
}
template
<
>
std
::
vector
<
Tensor
*>
ExecutionContext
::
MultiOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
{
...
...
paddle/fluid/framework/operator.h
...
...
@@ -16,9 +16,11 @@ limitations under the License. */
#include <algorithm>
#include <atomic>
#include <memory>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>
#include "glog/logging.h" // For VLOG
...
...
@@ -253,31 +255,6 @@ class ExecutionContext {
    return it->second;
  }

-  const std::vector<Variable*> LegacyMultiInputVar(
-      const std::string& name) const {
-    auto names = op_.Inputs(name);
-    std::vector<Variable*> res;
-    res.reserve(names.size());
-    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [this](const std::string& name) {
-                     return name == kEmptyVarName ? nullptr
-                                                  : scope_.FindVar(name);
-                   });
-    return res;
-  }

-  std::vector<Variable*> LegacyMultiOutputVar(const std::string& name) const {
-    auto names = op_.Outputs(name);
-    std::vector<Variable*> res;
-    res.reserve(names.size());
-    std::transform(names.begin(), names.end(), std::back_inserter(res),
-                   [this](const std::string& name) {
-                     return name == kEmptyVarName ? nullptr
-                                                  : scope_.FindVar(name);
-                   });
-    return res;
-  }

  template <typename T>
  const T* Input(const std::string& name) const {
    auto* var = InputVar(name);
...
@@ -290,22 +267,6 @@ class ExecutionContext {
return
var
==
nullptr
?
nullptr
:
var
->
GetMutable
<
T
>
();
}
template
<
typename
T
>
const
T
*
LegacyInput
(
const
std
::
string
&
name
)
const
{
auto
*
var
=
LegacyInputVar
(
name
);
return
var
==
nullptr
?
nullptr
:
&
var
->
Get
<
T
>
();
}
template
<
typename
T
>
T
*
LegacyOutput
(
const
std
::
string
&
name
)
const
{
auto
var
=
LegacyOutputVar
(
name
);
return
var
==
nullptr
?
nullptr
:
var
->
GetMutable
<
T
>
();
}
const
Variable
*
LegacyInputVar
(
const
std
::
string
&
name
)
const
;
Variable
*
LegacyOutputVar
(
const
std
::
string
&
name
)
const
;
template
<
typename
T
>
const
std
::
vector
<
const
T
*>
MultiInput
(
const
std
::
string
&
name
)
const
{
auto
it
=
ctx_
.
inputs
.
find
(
name
);
...
...
@@ -338,32 +299,6 @@ class ExecutionContext {
return
res
;
}
template
<
typename
T
>
const
std
::
vector
<
const
T
*>
LegacyMultiInput
(
const
std
::
string
&
name
)
const
{
auto
names
=
op_
.
Inputs
(
name
);
std
::
vector
<
const
T
*>
res
;
res
.
reserve
(
names
.
size
());
std
::
transform
(
names
.
begin
(),
names
.
end
(),
std
::
back_inserter
(
res
),
[
&
](
const
std
::
string
&
sub_name
)
->
const
T
*
{
auto
var
=
scope_
.
FindVar
(
sub_name
);
return
var
==
nullptr
?
nullptr
:
&
var
->
Get
<
T
>
();
});
return
res
;
}
template
<
typename
T
>
std
::
vector
<
T
*>
LegacyMultiOutput
(
const
std
::
string
&
name
)
const
{
auto
names
=
op_
.
Outputs
(
name
);
std
::
vector
<
T
*>
res
;
res
.
reserve
(
names
.
size
());
std
::
transform
(
names
.
begin
(),
names
.
end
(),
std
::
back_inserter
(
res
),
[
&
](
const
std
::
string
&
sub_name
)
->
T
*
{
auto
var
=
scope_
.
FindVar
(
sub_name
);
return
var
==
nullptr
?
nullptr
:
var
->
GetMutable
<
T
>
();
});
return
res
;
}
platform
::
Place
GetPlace
()
const
{
return
device_context_
.
GetPlace
();
}
template
<
typename
DeviceContextType
>
...
...
@@ -433,24 +368,13 @@ class ExecutionContext {
template
<>
const
Tensor
*
ExecutionContext
::
Input
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
const
Tensor
*
ExecutionContext
::
LegacyInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
const
std
::
vector
<
const
Tensor
*>
ExecutionContext
::
MultiInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
const
std
::
vector
<
const
Tensor
*>
ExecutionContext
::
LegacyMultiInput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
Tensor
*
ExecutionContext
::
Output
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
Tensor
*
ExecutionContext
::
LegacyOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
template
<>
std
::
vector
<
Tensor
*>
ExecutionContext
::
MultiOutput
<
Tensor
>
(
const
std
::
string
&
name
)
const
;
...
...
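Note on the operator.h/operator.cc hunks above: the Legacy* accessors are removed from ExecutionContext, leaving the runtime-context based Input/Output/MultiInput/MultiOutput accessors. A minimal, hypothetical kernel body written against the surviving interface might look like the sketch below (the kernel name and slot names are invented; only the ExecutionContext calls come from this diff):

    #include "paddle/fluid/framework/op_registry.h"

    namespace paddle {
    namespace operators {

    template <typename DeviceContext, typename T>
    class ExampleKernel : public framework::OpKernel<T> {
     public:
      void Compute(const framework::ExecutionContext& ctx) const override {
        // Single input/output slots resolved through the kept accessors.
        auto* x = ctx.Input<framework::LoDTensor>("X");
        auto* out = ctx.Output<framework::LoDTensor>("Out");
        // Duplicable slot resolved through MultiInput (LegacyMultiInput is gone).
        auto xs = ctx.MultiInput<framework::LoDTensor>("Xs");
        out->mutable_data<T>(ctx.GetPlace());
        // ... computation elided ...
        (void)x;
        (void)xs;
      }
    };

    }  // namespace operators
    }  // namespace paddle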
paddle/fluid/inference/analysis/argument.h
...
...
@@ -23,8 +23,12 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
...
...
@@ -133,6 +137,8 @@ struct Argument {
  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
                      AnalysisConfig::Precision);
+  DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
+                      bool);

  // Memory optimized related.
  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
...
...
paddle/fluid/inference/analysis/helper.h
...
...
@@ -17,10 +17,12 @@ limitations under the License. */
#include <sys/stat.h>
#include <cstdio>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <typeindex>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/framework.pb.h"
...
...
@@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
  return "";
}

+static std::string GetTrtEngineSerializedPath(const std::string &model_root,
+                                              const std::string &engine_key) {
+  return model_root + "/trt_serialized_" + engine_key;
+}
+
+static std::string GetTrtEngineSerializedData(
+    const std::string &model_opt_cache_dir, const std::string &engine_key) {
+  std::string trt_serialized_path =
+      GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
+  if (FileExists(trt_serialized_path)) {
+    VLOG(3) << "Trt serialized file: " << trt_serialized_path
+            << "is found here";
+    std::ifstream infile(trt_serialized_path, std::ios::in);
+    std::stringstream buffer;
+    buffer << infile.rdbuf();
+    std::string trt_engine_serialized_data(buffer.str());
+    return trt_engine_serialized_data;
+  }
+  return "";
+}
+
+static void SaveTrtEngineSerializedDataToFile(
+    const std::string &trt_serialized_path,
+    const std::string &engine_serialized_data) {
+  std::ofstream outfile(trt_serialized_path);
+  outfile << engine_serialized_data;
+  outfile.close();
+}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
...
...
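The three helpers added to helper.h above form a small file cache for serialized TensorRT engines: GetTrtEngineSerializedPath derives the cache file name, GetTrtEngineSerializedData returns the cached bytes or "" on a miss, and SaveTrtEngineSerializedDataToFile stores them. A hedged sketch of the intended round trip (the directory and key below are placeholders, not values from this commit):

    #include <string>
    #include "paddle/fluid/inference/analysis/helper.h"

    // Sketch: cache a serialized engine on the first run, reuse it afterwards.
    void CacheEngine(const std::string &serialized_engine) {
      using namespace paddle::inference::analysis;     // helpers shown above
      const std::string cache_dir = "/tmp/model_opt_cache";  // hypothetical
      const std::string engine_key = "1234567890";           // hypothetical hash

      std::string cached = GetTrtEngineSerializedData(cache_dir, engine_key);
      if (cached.empty()) {  // miss: persist the freshly built engine
        SaveTrtEngineSerializedDataToFile(
            GetTrtEngineSerializedPath(cache_dir, engine_key), serialized_engine);
      }
      // A later run sees a hit and can skip rebuilding the engine.
    }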
paddle/fluid/inference/analysis/ir_pass_manager.cc
...
...
@@ -81,6 +81,9 @@ void IRPassManager::CreatePasses(Argument *argument,
      pass->Set("model_opt_cache_dir",
                new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+      pass->Set("use_static_engine",
+                new bool(argument->tensorrt_use_static_engine()));
    }

    pre_pass = pass_name;
...
...
paddle/fluid/inference/analysis/ir_pass_manager.h
...
...
@@ -22,7 +22,10 @@
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
...
...
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
...
...
@@ -14,13 +14,13 @@
#include <algorithm>
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h"
...
...
@@ -33,8 +33,15 @@ using framework::ir::Node;
std::vector<std::string> ExtractParameters(
    const std::unordered_set<Node *> &nodes);

-std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
+void RenameAndGetOutputs(
+    const std::vector<framework::ir::Node *> &subgraph_nodes,
+    framework::BlockDesc *block_desc,
+    const std::set<std::string> &input_names_with_id,
+    std::set<std::string> *output_names_with_id,
+    std::set<std::string> *output_names,
+    std::unordered_map<std::string, std::string> *output_name_map);
+
+std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
    std::unique_ptr<framework::ir::Graph> graph) const {
  framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
...
...
@@ -47,9 +54,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
      Get<int>("min_subgraph_size") /*min subgraph size*/);
  fuser();

+  std::vector<std::string> graph_param_names =
+      ExtractParameters(graph->Nodes());
+  // those parameter already exist in trt, and should not have another copy in
+  // fluid.
+  std::vector<std::string> repetitive_params;

  for (auto *node : graph->Nodes()) {
    if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateTensorRTOp(node, graph.get());
+      CreateTensorRTOp(node, graph.get(), graph_param_names,
+                       &repetitive_params);

      std::unordered_set<const Node *> nodes2remove(
          Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
...
...
@@ -64,12 +78,15 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
    }
  }
  framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));

  return graph;
}

std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
-                              const std::set<std::string> &engine_outputs) {
+                              const std::set<std::string> &engine_outputs,
+                              const std::string &predictor_id) {
  std::string engine_hash_key = "";
  for (auto name : engine_inputs) {
    engine_hash_key += name;
...
@@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
for
(
auto
name
:
engine_outputs
)
{
engine_hash_key
+=
name
;
}
engine_hash_key
+=
predictor_id
;
auto
engine_key
=
std
::
to_string
(
std
::
hash
<
std
::
string
>
()(
engine_hash_key
));
return
engine_key
;
}
void
TensorRtSubgraphPass
::
CreateTensorRTOp
(
framework
::
ir
::
Node
*
node
,
Graph
*
graph
)
const
{
void
TensorRtSubgraphPass
::
CreateTensorRTOp
(
framework
::
ir
::
Node
*
node
,
Graph
*
graph
,
const
std
::
vector
<
std
::
string
>
&
graph_params
,
std
::
vector
<
std
::
string
>
*
repetitive_params
)
const
{
auto
*
op_desc
=
node
->
Op
();
auto
&
subgraph
=
*
Agent
(
node
).
subgraph
();
PADDLE_ENFORCE
(
!
subgraph
.
empty
());
...
...
@@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // is unique.
  std::set<std::string> input_names;
  std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
+
+  // The node->inputs containes input tensors and parameters.
  for (auto *x : node->inputs) {
    input_names.insert(x->Name());
    input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
  }
-  op_desc->SetInput(
-      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));

  std::set<std::string> output_names;
  std::set<std::string> output_names_with_id;
...
...
@@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
    output_names_with_id.insert(x->Name() + std::to_string(x->id()));
  }

-  op_desc->SetOutput(
-      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-  op_desc->SetType("tensorrt_engine");

  std::unordered_map<std::string, std::string> output_name_map;
+  auto &subgraph_nodes = *Agent(node).subgraph();

  // The following procedure is used to rename all the intermediate
  // variables and the output variables of the subgraph.
...
...
@@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  // input of a OP, but also the output of a Op, there will be problems.
  // So we have to rename the variable in the subgraph to make sure
  // it is either an OP's input or an OP's output.
-  auto &subgraph_nodes = *Agent(node).subgraph();
-  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
-    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
-    auto correspond_node = subgraph_nodes[index];
-    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
-
-    std::unordered_map<std::string, size_t> var2id;
-    for (auto *in_var : correspond_node->inputs) {
-      var2id[in_var->Name()] = in_var->id();
-    }
-    // rename for the input variables of op inside subgraph
-    for (int i = 0; i < op->inputs_size(); i++) {
-      // one input
-      auto *in_var = op->mutable_inputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < in_var->arguments_size(); k++) {
-        // all the arguments
-        std::string arg_value = in_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (input_names_with_id.count(arg_value_with_id)) {
-          replaced_names.push_back(arg_value);
-        } else {
-          replaced_names.push_back(arg_value_with_id);
-        }
-      }
-      in_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        in_var->add_arguments(replaced_names[k]);
-      }
-    }
-    var2id.clear();
-    for (auto out_var : correspond_node->outputs) {
-      var2id[out_var->Name()] = out_var->id();
-    }
-
-    // rename for the output variables of op inside subgraph
-    for (int i = 0; i < op->outputs_size(); i++) {
-      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < out_var->arguments_size(); k++) {
-        std::string arg_value = out_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (output_names_with_id.count(arg_value_with_id)) {
-          output_name_map[arg_value] = arg_value_with_id;
-        }
-        replaced_names.push_back(arg_value_with_id);
-      }
-      out_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        out_var->add_arguments(replaced_names[k]);
-      }
-    }
-  }
+  RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
+                      &output_names_with_id, &output_names, &output_name_map);

  // When tensorrt engine runs at the end of the operation,
  // output_mapping help us copy the data from the renamed ITensor
...
...
@@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
    PADDLE_ENFORCE(output_name_map.count(name) != 0);
    output_mapping.push_back(output_name_map[name]);
  }
+  PADDLE_ENFORCE(!output_mapping.empty());

  auto *vars = block_desc.Proto()->mutable_vars();
  for (framework::ir::Node *node : graph->Nodes()) {
...
...
@@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                 "the block has no var-desc");
-  PADDLE_ENFORCE(!output_mapping.empty());

+  // Set attrs
+  op_desc->SetType("tensorrt_engine");
+  op_desc->SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
+  op_desc->SetOutput(
+      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
  op_desc->SetBlockAttr("sub_block", new_block);
  SetAttr(op_desc->Proto(), "subgraph",
          block_desc.Proto()->SerializeAsString());
-  // Set attrs
  SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
  SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
  SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  SetAttr(op_desc->Proto(), "parameters", params);

  auto enable_int8 = Get<bool>("enable_int8");
-  auto engine_key =
-      GenerateEngineKey(input_names_with_id, output_names_with_id);
+  auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
+                                      std::to_string(0));

  // Get "" when there is no cached calibration table data.
  std::string calibration_data = GetTrtCalibTableData(
      Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
  SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
  SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
  SetAttr(op_desc->Proto(), "engine_key", engine_key);
+  SetAttr(op_desc->Proto(), "engine_serialized_data", std::string(""));
+
+  std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
+  if (enable_int8 && calibration_data.size() != 0) {
+    calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
+  }
+
+  bool use_static_engine = Get<bool>("use_static_engine");
+  // When in int8 mode and calibration_mode, the program just produce the
+  // calibration table data.
+  bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
+  if (!calibration_mode && use_static_engine) {
+    std::copy(params.begin(), params.end(),
+              std::back_inserter(*repetitive_params));
+    std::string trt_engine_serialized_data = GetTrtEngineSerializedData(
+        Get<std::string>("model_opt_cache_dir"), engine_key);
+
+    if (trt_engine_serialized_data.empty()) {
+      LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
+                   "kernel etc). This process may cost a lot of time.";
+      std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
+          new tensorrt::TensorRTEngine(
+              Get<int>("max_batch_size"), Get<int>("workspace_size"),
+              enable_int8, calibrator.get(), Get<int>("gpu_device_id")));
+      auto *scope = param_scope();
+      framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
+      std::unordered_set<std::string> param_set(params.begin(), params.end());
+      inference::Singleton<inference::tensorrt::OpConverter>::Global()
+          .ConvertBlockToTRTEngine(
+              &block_desc_temp, *scope,
+              std::vector<std::string>(input_names.begin(), input_names.end()),
+              param_set, output_mapping, trt_engine.get());
+      nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
+      trt_engine_serialized_data =
+          std::string((const char *)serialized_engine_data->data(),
+                      serialized_engine_data->size());
+      SaveTrtEngineSerializedDataToFile(
+          GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
+                                     engine_key),
+          trt_engine_serialized_data);
+    } else {
+      LOG(INFO) << "Load TRT Optimized Info from "
+                << GetTrtEngineSerializedPath(
+                       Get<std::string>("model_opt_cache_dir"), engine_key);
+    }
+
+    SetAttr(op_desc->Proto(), "engine_serialized_data",
+            trt_engine_serialized_data);
+  }
}
std::vector<std::string> ExtractParameters(
...
...
@@ -253,7 +279,7 @@ std::vector<std::string> ExtractParameters(
  for (const auto &node : nodes) {
    if (!node->IsOp()) continue;
    std::string op_type = node->Op()->Type();
-    if (op_type == "feed") {
+    if (op_type == "feed" || op_type == "fetch") {
      std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
      std::copy(output_names.begin(), output_names.end(),
                std::back_inserter(feed_outputs));
...
...
@@ -272,6 +298,99 @@ std::vector<std::string> ExtractParameters(
  return parameters;
}

+void RenameAndGetOutputs(
+    const std::vector<framework::ir::Node *> &subgraph_nodes,
+    framework::BlockDesc *block_desc,
+    const std::set<std::string> &input_names_with_id,
+    std::set<std::string> *output_names_with_id,
+    std::set<std::string> *output_names,
+    std::unordered_map<std::string, std::string> *output_name_map) {
+  //// In the normal case, the paddle-trt exists bug when runing the googlenet.
+  // When there are more than two convolutions of 1 * 1 with the same input, the
+  // paddle-tensorrt will do the merging optimization, which fuse those conv
+  // into one conv, and then trigger bug. So, We should use strategy to avoid
+  // this optimization for the time being. This bug will be fixed in the future.
+  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
+      same_hierarchy_conv2d_num_map;
+
+  for (size_t index = 0; index < block_desc->OpSize(); ++index) {
+    framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
+    framework::OpDesc op_desc(*op, nullptr);
+    auto correspond_node = subgraph_nodes[index];
+    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
+
+    std::unordered_map<std::string, size_t> var2id;
+    std::unordered_map<std::string, framework::ir::Node *> in_vars;
+    for (auto *in_var : correspond_node->inputs) {
+      var2id[in_var->Name()] = in_var->id();
+      in_vars[in_var->Name()] = in_var;
+    }
+    // rename for the input variables of op inside subgraph
+    for (int i = 0; i < op->inputs_size(); i++) {
+      // one input
+      auto *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < in_var->arguments_size(); k++) {
+        // all the arguments
+        std::string arg_value = in_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (input_names_with_id.count(arg_value_with_id)) {
+          replaced_names.push_back(arg_value);
+        } else {
+          replaced_names.push_back(arg_value_with_id);
+        }
+      }
+      in_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        in_var->add_arguments(replaced_names[k]);
+      }
+    }
+    var2id.clear();
+    for (auto out_var : correspond_node->outputs) {
+      var2id[out_var->Name()] = out_var->id();
+    }
+
+    if (op_desc.Type() == "conv2d") {
+      auto input_var_name = op_desc.Input("Input").front();
+      auto filter_var_name = op_desc.Input("Filter").front();
+      auto out_var_name = op_desc.Output("Output").front();
+      auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
+      const std::vector<int> strides =
+          boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+      const std::vector<int> paddings =
+          boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+      if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
+        (*output_names_with_id)
+            .insert(out_var_name + std::to_string(var2id[out_var_name]));
+        (*output_names).insert(out_var_name);
+      } else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
+                 strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
+                 paddings[1] == 0) {
+        same_hierarchy_conv2d_num_map[input_var_name] += 1;
+      }
+    }
+
+    // rename for the output variables of op inside subgraph
+    for (int i = 0; i < op->outputs_size(); i++) {
+      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < out_var->arguments_size(); k++) {
+        std::string arg_value = out_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (output_names_with_id->count(arg_value_with_id)) {
+          (*output_name_map)[arg_value] = arg_value_with_id;
+        }
+        replaced_names.push_back(arg_value_with_id);
+      }
+      out_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        out_var->add_arguments(replaced_names[k]);
+      }
+    }
+  }
+}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
...
...
paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
...
...
@@ -13,7 +13,12 @@
// limitations under the License.
#pragma once
#include <paddle/fluid/framework/ir/fuse_pass_base.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
...
...
@@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
      std::unique_ptr<framework::ir::Graph> graph) const override;

 private:
-  void CreateTensorRTOp(framework::ir::Node *x,
-                        framework::ir::Graph *graph) const;
+  void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
+                        const std::vector<std::string> &graph_params,
+                        std::vector<std::string> *repetitive_params) const;
  void CleanIntermediateOutputs(framework::ir::Node *node);
};
...
...
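For reference, GenerateEngineKey in the pass above now concatenates the subgraph's input names, output names and a predictor id, then hashes the result with std::hash<std::string>. A standalone, self-contained sketch of that keying scheme (the tensor names below are invented):

    #include <functional>
    #include <iostream>
    #include <set>
    #include <string>

    std::string MakeEngineKey(const std::set<std::string> &inputs,
                              const std::set<std::string> &outputs,
                              const std::string &predictor_id) {
      std::string hash_key;
      for (const auto &name : inputs) hash_key += name;   // input names
      for (const auto &name : outputs) hash_key += name;  // output names
      hash_key += predictor_id;                           // added by this commit
      return std::to_string(std::hash<std::string>()(hash_key));
    }

    int main() {
      std::set<std::string> ins{"image0", "im_info1"};    // hypothetical names
      std::set<std::string> outs{"nms_out2"};             // hypothetical name
      std::cout << MakeEngineKey(ins, outs, "0") << std::endl;
      return 0;
    }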
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
...
...
@@ -31,6 +31,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
  // The parameters are on the cpu, therefore, synchronization is not necessary.
  if (!argument->use_gpu()) return;

+  auto &graph = argument->main_graph();
+  std::vector<std::string> repetitive_params;
+
+  if (graph.Has(framework::ir::kRepetitiveParamAttr))
+    repetitive_params = graph.Get<std::vector<std::string>>(
+        framework::ir::kRepetitiveParamAttr);
+
  LOG(INFO) << "Sync params from CPU to GPU";

  PADDLE_ENFORCE(argument->gpu_device_id_valid());
...
...
@@ -43,6 +50,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
  // Because there exists the case that new parameter variables are not added to
  // the program in the analysis pass.
  for (auto &var_name : all_vars) {
+    if (std::count(repetitive_params.begin(), repetitive_params.end(),
+                   var_name)) {
+      continue;
+    }
    auto *var = scope->FindLocalVar(var_name);
    PADDLE_ENFORCE(var != nullptr);
    if (var->IsType<framework::LoDTensor>() ||
...
...
paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
...
...
@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
...
...
paddle/fluid/inference/api/analysis_config.cc
...
...
@@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
  CP_MEMBER(tensorrt_max_batchsize_);
  CP_MEMBER(tensorrt_min_subgraph_size_);
  CP_MEMBER(tensorrt_precision_mode_);
+  CP_MEMBER(trt_use_static_engine_);
  // MKLDNN related.
  CP_MEMBER(use_mkldnn_);
  CP_MEMBER(mkldnn_enabled_op_types_);
...
...
@@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() {
void AnalysisConfig::EnableTensorRtEngine(
    int workspace_size, int max_batch_size, int min_subgraph_size,
-    AnalysisConfig::Precision precision_mode) {
+    AnalysisConfig::Precision precision_mode, bool use_static) {
#ifdef PADDLE_WITH_CUDA
  if (!use_gpu()) {
    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
...
...
@@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine(
  tensorrt_max_batchsize_ = max_batch_size;
  tensorrt_min_subgraph_size_ = min_subgraph_size;
  tensorrt_precision_mode_ = precision_mode;
+  trt_use_static_engine_ = use_static;

  Update();
#else
...
...
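The EnableTensorRtEngine change above adds a trailing use_static flag that turns on the serialized-engine cache. A hedged sketch of calling the configuration API after this commit (the model path, pool size and batch sizes are placeholders; Precision::kFloat32 is assumed to be the default precision enum value):

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    paddle::AnalysisConfig MakeTrtConfig() {
      paddle::AnalysisConfig config;
      config.SetModel("/path/to/model_dir");            // placeholder path
      config.EnableUseGpu(100 /*pool MB*/, 0 /*gpu id*/);
      // workspace_size, max_batch_size, min_subgraph_size, precision, use_static
      config.EnableTensorRtEngine(1 << 20, 1, 3,
                                  paddle::AnalysisConfig::Precision::kFloat32,
                                  true /*use_static: reuse serialized engines*/);
      return config;
    }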
paddle/fluid/inference/api/analysis_predictor.cc
...
...
@@ -183,6 +183,9 @@ void AnalysisPredictor::SetMkldnnThreadID(int tid) {
bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                            std::vector<PaddleTensor> *output_data,
                            int batch_size) {
+  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
+    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
+  }
  VLOG(3) << "Predictor::predict";
  inference::Timer timer;
  timer.tic();
...
...
@@ -362,6 +365,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
    argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
    argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
+    argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
  }

  if (config_.use_mkldnn_) {
...
...
@@ -435,12 +439,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
      }
      feeds_[idx] = op;
      feed_names_[op->Output("Out")[0]] = idx;
+      idx2feeds_[idx] = op->Output("Out")[0];
    } else if (op->Type() == "fetch") {
      int idx = boost::get<int>(op->GetAttr("col"));
      if (fetches_.size() <= static_cast<size_t>(idx)) {
        fetches_.resize(idx + 1);
      }
      fetches_[idx] = op;
+      idx2fetches_[idx] = op->Input("X")[0];
    }
  }
}
...
...
@@ -453,6 +459,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
  var->GetMutable<framework::FeedFetchList>();
}

std::vector<std::string> AnalysisPredictor::GetInputNames() {
  std::vector<std::string> input_names;
  for (auto &item : idx2feeds_) {
    input_names.push_back(item.second);
  }
  return input_names;
}

std::vector<std::string> AnalysisPredictor::GetOutputNames() {
  std::vector<std::string> output_names;
  for (auto &item : idx2fetches_) {
    output_names.push_back(item.second);
  }
  return output_names;
}

std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
    const std::string &name) {
  PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
...
...
@@ -460,6 +482,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = true;
  res->SetName(name);
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
  } else {
    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
  }
  return res;
}
...
...
@@ -470,6 +499,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
      new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
  res->input_or_output_ = false;
  res->SetName(name);
  if (platform::is_cpu_place(place_)) {
    res->SetPlace(PaddlePlace::kCPU);
  } else {
    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
  }
  return res;
}
...
...
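A short, hedged sketch of how the new name accessors and tensor getters are meant to be used together (predictor creation is assumed to follow the usual CreatePaddlePredictor path, which is not shown in this diff):

  auto predictor = paddle::CreatePaddlePredictor(config);  // assumed factory call
  std::vector<std::string> in_names = predictor->GetInputNames();
  std::vector<std::string> out_names = predictor->GetOutputNames();
  for (const auto &n : in_names) LOG(INFO) << "input: " << n;
  for (const auto &n : out_names) LOG(INFO) << "output: " << n;
  auto input_t = predictor->GetInputTensor(in_names[0]);  // ZeroCopyTensor bound to place_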
paddle/fluid/inference/api/analysis_predictor.h
...
...
@@ -15,12 +15,14 @@
#pragma once
#include <algorithm>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/details/reset_tensor_array.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/string/printf.h"
#ifdef PADDLE_WITH_TESTING
...
...
@@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor {
           std::vector<PaddleTensor> *output_data,
           int batch_size = -1) override;

  std::vector<std::string> GetInputNames();
  std::vector<std::string> GetOutputNames();

  std::unique_ptr<ZeroCopyTensor> GetInputTensor(
      const std::string &name) override;
  std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
...
...
@@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor {
  std::shared_ptr<framework::ProgramDesc> inference_program_;
  std::vector<framework::OpDesc *> feeds_;
  std::map<std::string, size_t> feed_names_;
  // Sorted according to the idx.
  std::map<size_t, std::string> idx2feeds_;
  std::vector<framework::OpDesc *> fetches_;
  std::map<size_t, std::string> idx2fetches_;
  // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
  // concurrency problems, wrong results and memory leak, so cache them.
  std::vector<framework::LoDTensor> feed_tensors_;
...
...
paddle/fluid/inference/api/api_impl.cc
...
...
@@ -131,6 +131,9 @@ NativePaddlePredictor::~NativePaddlePredictor() {
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
                                std::vector<PaddleTensor> *output_data,
                                int batch_size) {
  if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) {
    paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
  }
  VLOG(3) << "Predictor::predict";
  Timer timer;
  timer.tic();
...
...
paddle/fluid/inference/api/details/zero_copy_tensor.cc
...
...
@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
...
...
@@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
  return res;
}

template <typename T>
void ZeroCopyTensor::copy_from_cpu(const T *data) {
  EAGER_GET_TENSOR;
  PADDLE_ENFORCE_GE(
      tensor->numel(), 0,
      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
      "function before copy data from cpu.");
  size_t ele_size = tensor->numel() * sizeof(T);

  if (place_ == PaddlePlace::kCPU) {
    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
    std::memcpy(static_cast<void *>(t_data), data, ele_size);
  } else {
#ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    platform::CUDAPlace gpu_place(device_);
    auto *t_data = tensor->mutable_data<T>(gpu_place);
    auto *dev_ctx =
        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));

    memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
                 data, ele_size, dev_ctx->stream());
#else
    PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
  }
}

template <typename T>
void ZeroCopyTensor::copy_to_cpu(T *data) {
  EAGER_GET_TENSOR;
  auto ele_num = tensor->numel();
  auto *t_data = tensor->data<T>();
  auto t_place = tensor->place();

  if (platform::is_cpu_place(t_place)) {
    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
  } else {
#ifdef PADDLE_WITH_CUDA
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
    auto *dev_ctx =
        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
                 t_data, ele_num * sizeof(T), dev_ctx->stream());
#else
    PADDLE_THROW("Not compile with CUDA, should not reach here.");
#endif
  }
}

template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);

template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
                                            int *size) const;
template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
...
...
@@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const {
  return tensor;
}

- std::vector<int64_t> ZeroCopyTensor::shape() const {
std::vector<int> ZeroCopyTensor::shape() const {
  EAGER_GET_TENSOR;
  PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
- return framework::vectorize(tensor->dims());
  return framework::vectorize2int(tensor->dims());
}

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
...
...
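Putting the copy helpers together, a typical zero-copy round trip looks roughly like the sketch below; the 1x3x224x224 shape and the ZeroCopyRun() call are illustrative assumptions, not part of this diff:

  std::vector<float> input(1 * 3 * 224 * 224, 0.f);
  auto in_t = predictor->GetInputTensor(predictor->GetInputNames()[0]);
  in_t->Reshape({1, 3, 224, 224});    // required before copy_from_cpu
  in_t->copy_from_cpu(input.data());  // memcpy on CPU, async cudaMemcpy on GPU

  predictor->ZeroCopyRun();           // assumed execution entry point

  auto out_t = predictor->GetOutputTensor(predictor->GetOutputNames()[0]);
  std::vector<int> out_shape = out_t->shape();  // shape() now returns std::vector<int>
  int numel = 1;
  for (int d : out_shape) numel *= d;
  std::vector<float> output(numel);
  out_t->copy_to_cpu(output.data());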
paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
...
...
@@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
void *ZeroCopyTensor::FindTensor() const { return nullptr; }

- std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; }
std::vector<int> ZeroCopyTensor::shape() const { return {}; }

void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
...
...
paddle/fluid/inference/api/helper.h
...
...
@@ -50,6 +50,11 @@ class Timer {
  }
};

static int GetUniqueId() {
  static int id = 0;
  return id++;
}

static void split(const std::string &str, char sep,
                  std::vector<std::string> *pieces) {
  pieces->clear();
...
...
paddle/fluid/inference/api/paddle_analysis_config.h
...
...
@@ -135,7 +135,8 @@ struct AnalysisConfig {
   */
  void EnableTensorRtEngine(int workspace_size = 1 << 20,
                            int max_batch_size = 1, int min_subgraph_size = 3,
-                           Precision precision = Precision::kFloat32);
                            Precision precision = Precision::kFloat32,
                            bool use_static = true);
  /** A boolean state telling whether the TensorRT engine is used.
   */
  bool tensorrt_engine_enabled() const { return use_tensorrt_; }
...
...
@@ -233,6 +234,7 @@ struct AnalysisConfig {
  // subgraph, 3 as default value.
  int tensorrt_min_subgraph_size_{3};
  Precision tensorrt_precision_mode_;
  bool trt_use_static_engine_;
  // memory reuse related.
  bool enable_memory_optim_{false};
...
...
paddle/fluid/inference/api/paddle_api.h
...
...
@@ -160,11 +160,21 @@ class ZeroCopyTensor {
  template <typename T>
  T* data(PaddlePlace* place, int* size) const;
- std::vector<int64_t> shape() const;

  template <typename T>
  void copy_from_cpu(const T* data);

  template <typename T>
  void copy_to_cpu(T* data);

  std::vector<int> shape() const;

  void SetLoD(const std::vector<std::vector<size_t>>& x);
  std::vector<std::vector<size_t>> lod() const;
  const std::string& name() const { return name_; }

  void SetPlace(PaddlePlace place, int device = -1) {
    place_ = place;
    device_ = device;
  }

 protected:
  explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
...
...
@@ -179,6 +189,8 @@ class ZeroCopyTensor {
  // The corresponding tensor pointer inside Paddle workspace is cached for
  // performance.
  mutable void* tensor_{nullptr};
  PaddlePlace place_;
  int device_;
};
/** A simple Inference API for Paddle.
...
...
@@ -200,6 +212,14 @@ class PaddlePredictor {
                   std::vector<PaddleTensor>* output_data,
                   int batch_size = -1) = 0;

  /** \brief Get input names of the model
   */
  virtual std::vector<std::string> GetInputNames() { return {}; }

  /** \brief Get output names of the model
   */
  virtual std::vector<std::string> GetOutputNames() { return {}; }
/** \brief Get a mutable tensor directly.
*
* NOTE Only works in AnalysisPredictor.
...
...
paddle/fluid/inference/engine.h
...
...
@@ -49,11 +49,6 @@ class EngineBase {
  // Execute the engine, that will run the inference network.
  virtual void Execute(int batch_size) = 0;

- // Return the IO buffer that allocated in engine. One can read/write directly
- // on the buffer. If the buffer's buffer is nullptr, one can also allocate
- // memory and maintain it outside the engine.
- virtual Buffer &buffer(const std::string &name) = 0;

  virtual ~EngineBase() {}
};  // class EngineBase
...
...
paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
...
...
@@ -18,21 +18,6 @@ namespace paddle {
namespace inference {
namespace tensorrt {

- bool to_skip_merging_optimize(TensorRTEngine* engine,
-                               const std::vector<int>& filters,
-                               const std::vector<int>& strides,
-                               const std::vector<int>& paddings,
-                               std::string input_name) {
-   if (engine->itensor_quote_num[input_name] > 0) {
-     return true;
-   }
-   if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
-       strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
-     engine->itensor_quote_num[input_name] += 1;
-   return false;
- }

template <typename RegistFunc, typename SetDilationFunc>
void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
                   const framework::Scope& scope, bool test_mode,
...
...
@@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
  weight_tensor->Resize(Y_t->dims());
  TensorCopySync((*Y_t), cpu_place, weight_tensor.get());

- auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
  auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);

  PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
  const int n_output = weight_tensor->dims()[0];
...
...
@@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
  layer->getOutput(0)->setName(output_name.c_str());
  engine->SetITensor(output_name, layer->getOutput(0));

- if (test_mode ||
-     to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
-                              op_desc.Input("Input").front())) {
  if (test_mode) {
    engine->DeclareOutput(output_name);
  }
}
...
...
paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
...
...
@@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter {
    if (CheckDims(dims_x, dims_y)) {
      // The two input tensor should have the same dims
      VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
      nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
          engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
          *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
...
...
@@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
"ElementWisePluginLayer"
              ;
      plugin::ElementWisePlugin* plugin =
-         new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
          new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
      plugin->AddInput(X);
      plugin->AddInput(Y);
      nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
...
paddle/fluid/inference/tensorrt/convert/fc_op.cc
...
...
@@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter {
                          Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                  static_cast<void*>(weight_data),
-                                 Y_t->memory_size() / sizeof(float)};
                                  static_cast<size_t>(Y_t->numel())};
    TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
                                      static_cast<void*>(tmp->data<float>()),
-                                     Y_t->memory_size() / sizeof(float));
                                      static_cast<size_t>(Y_t->numel()));

    weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
    tmp_weight.dims = weight.dims;
...
...
paddle/fluid/inference/tensorrt/convert/op_converter.h
...
...
@@ -16,9 +16,12 @@ limitations under the License. */
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"
...
...
@@ -26,6 +29,37 @@ namespace paddle {
namespace inference {
namespace tensorrt {

using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;

namespace {  // NOLINT

TRT_DT FluidDataType2TRT(FluidDT type) {
  switch (type) {
    case FluidDT::VarType_Type_FP32:
      return TRT_DT::kFLOAT;
    case FluidDT::VarType_Type_INT32:
      return TRT_DT::kINT32;
    default:
      return TRT_DT::kINT32;
  }
  PADDLE_THROW("unkown type");
  return TRT_DT::kINT32;
}

nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
  PADDLE_ENFORCE_GT(shape.size(), 1UL,
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");
  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
  if (shape.size() == 4UL)
    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
  return nvinfer1::DimsCHW(shape[1], 1, 1);
}

}  // namespace // NOLINT
/*
* Convert Op from Fluid to TensorRT Engine.
*/
...
...
@@ -110,6 +144,34 @@ class OpConverter {
    }
  }

  // The scope here should be inited with the parameter vars.
  void ConvertBlockToTRTEngine(
      framework::BlockDesc* block_desc, const framework::Scope& scope,
      const std::vector<std::string>& inputs,
      const std::unordered_set<std::string>& parameters,
      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
    engine->InitNetwork();
    for (auto& input : inputs) {
      if (parameters.count(input)) continue;
      auto* var = block_desc->FindVar(input);
      PADDLE_ENFORCE(var, "no variable called %s", input);
      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                        "TensorRT engine only takes LoDTensor as input");
      auto var_shape = var->GetShape();
      engine->DeclareInput(
          input, FluidDataType2TRT(
                     var->Proto()->type().lod_tensor().tensor().data_type()),
          Vec2TRT_Dims(var_shape));
    }
    framework::proto::BlockDesc* block_proto = block_desc->Proto();
    ConvertBlock(*block_proto, parameters, scope, engine);
    for (auto& output : outputs) {
      engine->DeclareOutput(output);
    }
    engine->FreezeNetwork();
  }

  void SetEngine(TensorRTEngine* engine) { engine_ = engine; }

  virtual ~OpConverter() {}
...
...
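ConvertBlockToTRTEngine above is the entry point a subgraph pass would call for each fused block; a hedged call-site sketch follows (the variable names and the Singleton access pattern are assumptions based on how converters are used elsewhere in this codebase):

  // Illustrative only: convert one fused block into a TensorRT network.
  std::unordered_set<std::string> param_set(params.begin(), params.end());
  inference::Singleton<inference::tensorrt::OpConverter>::Global()
      .ConvertBlockToTRTEngine(&block_desc, *scope,
                               std::vector<std::string>{"subgraph_input"},   // hypothetical
                               param_set,
                               std::vector<std::string>{"subgraph_output"},  // hypothetical
                               trt_engine);
  // After the call the engine's network has been built and frozen.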
paddle/fluid/inference/tensorrt/convert/prelu_op.cc
...
...
@@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter {
    PADDLE_ENFORCE_NOT_NULL(alpha_var);
    auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();

-   platform::CUDAPlace place;
-   std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
    platform::CPUPlace cpu_place;
    std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
        new framework::LoDTensor());
-   alpha_tensor_device->Resize(alpha_tensor->dims());
-   TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
-   float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
    alpha_tensor_temp->Resize(alpha_tensor->dims());
    TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
    float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);

-   // Transform alpha to TensorRTEngine::Weight
-   TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
-                                   static_cast<void*>(alpha_data),
-                                   alpha_tensor_device->numel());
-   plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
    plugin::PReluPlugin* plugin =
        new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
    nvinfer1::IPluginLayer* layer =
        engine_->AddPlugin(&input, input_num, plugin);
    // keep alpha tensor to avoid release it's memory
    engine_->weight_map[op_desc.Input("Alpha")[0]] =
-       std::move(alpha_tensor_device);
        std::move(alpha_tensor_temp);

    std::string layer_name = "prelu (Output: ";
    auto output_name = op_desc.Output("Out")[0];
...
...
paddle/fluid/inference/tensorrt/convert/ut_helper.h
...
...
@@ -19,7 +19,9 @@ limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
...
...
@@ -79,7 +81,8 @@ class TRTConvertValidation {
        if_add_batch_(if_add_batch),
        max_batch_size_(max_batch_size) {
    PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-   engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
    engine_.reset(
        new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
    engine_->InitNetwork();
  }
...
...
@@ -114,13 +117,12 @@ class TRTConvertValidation {
  }

  void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
-   platform::CUDAPlace place;
-   platform::CUDADeviceContext ctx(place);
    platform::CUDADeviceContext ctx(place_);

    auto* x = scope_.Var(name);
    auto* x_tensor = x->GetMutable<framework::LoDTensor>();
    x_tensor->Resize(framework::make_ddim(dim_vec));
-   RandomizeTensor(x_tensor, place, ctx);
    RandomizeTensor(x_tensor, place_, ctx);
  }

  // Declare a variable in a fluid Scope.
  void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
...
...
@@ -146,19 +148,6 @@ class TRTConvertValidation {
    // Declare outputs.
    op_desc_.reset(new framework::OpDesc(desc, nullptr));
-   // Set Inputs.
-   for (const auto& input : op_desc_->InputArgumentNames()) {
-     if (parameters_.count(input)) continue;
-     auto* var = scope_.FindVar(input);
-     PADDLE_ENFORCE(var);
-     auto tensor = var->GetMutable<framework::LoDTensor>();
-     engine_->SetInputFromGPU(
-         input, static_cast<void*>(tensor->data<void>()),
-         sizeof(float) *
-             analysis::AccuDims(tensor->dims(), tensor->dims().size()));
-   }
  }
// We use the set 'neglected_output' here, because some Ops like batch norm,
...
...
@@ -168,43 +157,71 @@ class TRTConvertValidation {
          std::unordered_set<std::string> neglected_output = {}) {
    // Execute Fluid Op
    PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
-   platform::CUDAPlace place;
-   platform::CUDADeviceContext ctx(place);
-   op_->Run(scope_, place);
-   // Execute TRT.
-   engine_->Execute(batch_size);
-   cudaStreamSynchronize(engine_->stream());
    platform::CUDADeviceContext ctx(place_);
    op_->Run(scope_, place_);

    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
    const size_t output_space_size = 3000;
    std::vector<std::string> input_output_names;

    // Note: we need filter the parameter
    for (const auto& input : op_desc_->InputArgumentNames()) {
      if (parameters_.count(input)) continue;
      input_output_names.push_back(input);
    }

    // Collect the fluid outputs.
    std::vector<std::vector<float>> fluid_outs;
    for (const auto& output : op_desc_->OutputArgumentNames()) {
      if (neglected_output.count(output)) continue;
      input_output_names.push_back(output);
      std::vector<float> fluid_out;
-     std::vector<float> trt_out(output_space_size);
-     engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
-     cudaStreamSynchronize(engine_->stream());
      auto* var = scope_.FindVar(output);
-     auto tensor = var->GetMutable<framework::LoDTensor>();
      auto* tensor = var->GetMutable<framework::LoDTensor>();
      framework::TensorToVector(*tensor, ctx, &fluid_out);
      fluid_outs.push_back(fluid_out);
    }

    // Bind input and output for TRT.
    const int num_bindings = input_output_names.size();
    std::vector<void*> buffers(num_bindings);

    for (const std::string& name : input_output_names) {
      auto* var = scope_.FindVar(name);
      auto* tensor = var->GetMutable<framework::LoDTensor>();
      const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
      buffers[bind_index] =
          static_cast<void*>(tensor->mutable_data<float>(place_));
    }

    // Execute TRT.
    engine_->Execute(batch_size, &buffers, stream_);

-   size_t fluid_out_size = fluid_out.size();
    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
    int index = 0;
    for (const auto& output : op_desc_->OutputArgumentNames()) {
      if (neglected_output.count(output)) continue;
      std::vector<float> trt_out;
      auto* var = scope_.FindVar(output);
      auto* tensor = var->GetMutable<framework::LoDTensor>();
      framework::TensorToVector(*tensor, ctx, &trt_out);

      size_t fluid_out_size = fluid_outs[index].size();
      if (if_add_batch_ == true) {
        fluid_out_size =
            batch_size * (framework::product(tensor->dims()) / max_batch_size_);
      }
-     // Compare two output
-     ASSERT_FALSE(fluid_out.empty());
      for (size_t i = 0; i < fluid_out_size; i++) {
        // Loose the threshold for CI in different machine model.
-       EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
        EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
      }
      index += 1;
    }
  }

  framework::Scope& scope() { return scope_; }

 private:
  platform::CUDAPlace place_;
  std::unique_ptr<TensorRTEngine> engine_;
  cudaStream_t stream_;
  std::unique_ptr<framework::OperatorBase> op_;
...
...
paddle/fluid/inference/tensorrt/engine.cc
...
...
@@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
  PADDLE_ENFORCE(false, "not implemented");
}

- void TensorRTEngine::Execute(int batch_size) {
void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
                             cudaStream_t stream) {
  freshDeviceId();
  batch_size_ = batch_size;
- std::vector<void *> buffers;
- for (auto &buf : buffers_) {
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
-   PADDLE_ENFORCE_GT(buf.max_size, 0);
-   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-   buffers.push_back(buf.buffer);
- }
- infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
- cudaStreamSynchronize(stream_);
  infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr);
  cudaStreamSynchronize(stream);
  SetRuntimeBatch(batch_size);
}

- TensorRTEngine::~TensorRTEngine() {
-   cudaStreamSynchronize(stream_);
-   // clean buffer
-   for (auto &buf : buffers_) {
-     if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
-       PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
-       buf.buffer = nullptr;
-       buf.max_size = 0;
-     }
-   }
- }
void TensorRTEngine::FreezeNetwork() {
- VLOG(3) << "TRT to freeze network";
  freshDeviceId();
  VLOG(3) << "TRT to freeze network";
  PADDLE_ENFORCE(infer_builder_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  PADDLE_ENFORCE(infer_network_ != nullptr,
...
...
@@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");

  infer_context_.reset(infer_engine_->createExecutionContext());
- // allocate GPU buffers.
- buffers_.resize(buffer_sizes_.size());
- for (auto &item : buffer_sizes_) {
-   // The output buffers are not set in the network building phrase, need to
-   // infer from the TesorRT network.
-   if (item.second == 0) {
-     auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
-     auto dims = infer_engine_->getBindingDimensions(slot_offset);
-     item.second = kDataTypeSize[static_cast<int>(
-                       infer_engine_->getBindingDataType(slot_offset))] *
-                   analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
-     PADDLE_ENFORCE_GT(item.second, 0);
-   }
-   auto &buf = buffer(item.first);
-   buf.max_size = item.second * max_batch_;
-   CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-   PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
-   buf.size = 0;
-   PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
-   buf.device = DeviceType::GPU;
- }
}

nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
...
...
@@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
  buffer_sizes_[name] = 0;
}

- void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
-   return buffer(name).buffer;
- }
-
- void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
-                                     size_t max_size) {
-   // determine data size
-   auto *output = TensorRTEngine::GetITensor(name);
-   nvinfer1::Dims dims = output->getDimensions();
-   auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-   size_t dst_size = dim_size * runtime_batch_ *
-                     kDataTypeSize[static_cast<int>(output->getType())];
-   auto it = buffer_sizes_.find(name);
-   PADDLE_ENFORCE(it != buffer_sizes_.end());
-   PADDLE_ENFORCE_GT(it->second, 0);
-   PADDLE_ENFORCE_LE(dst_size, it->second);
-   PADDLE_ENFORCE_GE(max_size, dst_size);
-   auto &buf = buffer(name);
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-   PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                     cudaMemcpyDeviceToDevice, stream_),
-                     0);
- }
-
- void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
-                                     size_t max_size) {
-   // determine data size
-   auto *output = TensorRTEngine::GetITensor(name);
-   nvinfer1::Dims dims = output->getDimensions();
-   auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-   size_t dst_size = dim_size * runtime_batch_ *
-                     kDataTypeSize[static_cast<int>(output->getType())];
-   auto it = buffer_sizes_.find(name);
-   PADDLE_ENFORCE(it != buffer_sizes_.end());
-   PADDLE_ENFORCE_GT(it->second, 0);
-   PADDLE_ENFORCE_LE(dst_size, it->second);
-   PADDLE_ENFORCE_GE(max_size, dst_size);
-   auto &buf = buffer(name);
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                        cudaMemcpyDeviceToHost, stream_));
- }
-
- Buffer &TensorRTEngine::buffer(const std::string &name) {
-   PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
-   auto it = buffer_sizes_.find(name);
-   PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
-                  name);
-   auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
-   return buffers_[slot_offset];
- }
-
- void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
-                                      size_t size) {
-   auto &buf = buffer(name);
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-   PADDLE_ENFORCE_NOT_NULL(data);
-   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-   buf.size = size;
-   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                        cudaMemcpyHostToDevice, stream_));
- }
-
- void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
-                                      size_t size) {
-   auto &buf = buffer(name);
-   buf.size = size;
-   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                        cudaMemcpyDeviceToDevice, stream_));
- }

void TensorRTEngine::SetITensor(const std::string &name,
                                nvinfer1::ITensor *tensor) {
  PADDLE_ENFORCE(tensor != nullptr);
...
...
@@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }

- void TensorRTEngine::freshDeviceId() {
-   int count;
-   cudaGetDeviceCount(&count);
-   PADDLE_ENFORCE_LT(device_, count);
-   cudaSetDevice(device_);
- }

nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
    nvinfer1::ITensor *const *inputs, int num_inputs,
    plugin::PluginTensorRT *plugin) {
...
@@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
return
infer_network_
.
get
()
->
addPluginExt
(
inputs
,
num_inputs
,
*
plugin
);
}
void
TensorRTEngine
::
freshDeviceId
()
{
int
count
;
cudaGetDeviceCount
(
&
count
);
PADDLE_ENFORCE_LT
(
device_id_
,
count
);
cudaSetDevice
(
device_id_
);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
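With buffer bookkeeping removed from the engine, callers now own the device buffers and hand them to Execute together with a stream. A hedged sketch of the new calling convention (binding names and pointers are illustrative, and the pointers must already be device memory):

  std::vector<void *> buffers(engine->engine()->getNbBindings(), nullptr);
  int in_idx = engine->engine()->getBindingIndex("input");    // hypothetical binding name
  int out_idx = engine->engine()->getBindingIndex("output");  // hypothetical binding name
  buffers[in_idx] = input_gpu_ptr;     // caller-owned GPU memory
  buffers[out_idx] = output_gpu_ptr;

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  engine->Execute(batch_size, &buffers, stream);  // enqueues and synchronizes on `stream`
  cudaStreamDestroy(stream);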
paddle/fluid/inference/tensorrt/engine.h
...
...
@@ -23,6 +23,7 @@ limitations under the License. */
#include "paddle/fluid/inference/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
#include "paddle/fluid/inference/utils/singleton.h"
...
...
@@ -37,7 +38,9 @@ class TRTInt8Calibrator;
 * There are two alternative ways to use it, one is to build from a paddle
 * protobuf model, another way is to manully construct the network.
 */
- class TensorRTEngine : public EngineBase {
class TensorRTEngine {
  using DescType = ::paddle::framework::proto::BlockDesc;

 public:
  // Weight is model parameter.
  class Weight {
...
...
@@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase {
    nvinfer1::Weights w_;
  };

- TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
-                int device = 0, bool enable_int8 = false,
-                TRTInt8Calibrator* calibrator = nullptr,
  TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false,
                 TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
                 nvinfer1::ILogger& logger = NaiveLogger::Global())
      : max_batch_(max_batch),
        max_workspace_(max_workspace),
-       stream_(stream),
-       device_(device),
        enable_int8_(enable_int8),
        calibrator_(calibrator),
        device_id_(device_id),
        logger_(logger) {}

- virtual ~TensorRTEngine();
  ~TensorRTEngine() {}

  // TODO(Superjomn) implement it later when graph segmentation is supported.
- void Build(const DescType& paddle_model) override;
  void Build(const DescType& paddle_model);

- void Execute(int batch_size) override;
  void Execute(int batch_size, std::vector<void*>* buffers,
               cudaStream_t stream);

  // Initialize the inference network, so that TensorRT layers can add to this
  // network.
  void InitNetwork() {
    freshDeviceId();
    infer_builder_.reset(createInferBuilder(&logger_));
    infer_network_.reset(infer_builder_->createNetwork());
  }
...
...
@@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase {
  // Check if the ITensor has been declared
  bool HasDeclared(const std::string& name);

- // GPU memory address for an ITensor with specific name. One can operate on
- // these memory directly for acceleration, for example, output the converted
- // data directly to the buffer to save data copy overhead.
- // NOTE this should be used after calling `FreezeNetwork`.
- Buffer& buffer(const std::string& name) override;
-
- cudaStream_t stream() { return stream_; }
-
- // Fill an input from CPU memory with name and size.
- void SetInputFromCPU(const std::string& name, const void* data, size_t size);
- // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
- // accessed directly. Fill an input from GPU memory with name and size.
- void SetInputFromGPU(const std::string& name, const void* data, size_t size);
- // Get an output called name, the output of tensorrt is in GPU, so this method
- // Return the output's GPU memory address without copy.
- void* GetOutputInGPU(const std::string& name);
- // Copy data into dst inside the GPU device.
- void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
- // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
- // to CPU.
- void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
  // Fill an ITensor into map itensor_map_.
  void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
  // Get an ITensor called name.
  nvinfer1::ITensor* GetITensor(const std::string& name);

  nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
  nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }

  nvinfer1::IHostMemory* Serialize() {
    PADDLE_ENFORCE(infer_engine_ != nullptr,
                   "You should build engine first and then serialize");
    ihost_memory_.reset(infer_engine_->serialize());
    return ihost_memory_.get();
  }

  void Deserialize(const std::string& engine_serialized_data) {
    freshDeviceId();
    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
    infer_engine_.reset(runtime->deserializeCudaEngine(
        engine_serialized_data.c_str(), engine_serialized_data.size(),
        &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
    PADDLE_ENFORCE(infer_engine_ != nullptr,
                   "build cuda engine failed when deserialize engine info.!");
    infer_context_.reset(infer_engine_->createExecutionContext());
  }

  void SetRuntimeBatch(size_t batch_size);
  int GetRuntimeBatch();
- int GetDevice() { return device_; }
  int GetDeviceId() { return device_id_; }

  nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                    int num_inputs, plugin::PluginTensorRT*);
...
...
@@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase {
  std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
      weight_map;

- // TODO(NHZLX)
- // In the normal case, the paddle-trt exists bug when runing the googlenet.
- // When there are more than two convolutions of 1 * 1 with the same input, the
- // paddle-tensorrt will do the merging optimization, which fuse those conv
- // into one conv, and then trigger bug. So, We should use strategy to avoid
- // this
- // optimization for the time being. This bug will be fixed in the future.
- std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
-     itensor_quote_num;

 private:
  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
  // ensure that the thread is associated with the correct device by calling
  // freshDeviceId().
  void freshDeviceId();

  // the max batch size
  int max_batch_;
  // the runtime batch size
...
...
@@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase {
  // the max memory size the engine uses
  int max_workspace_;

- cudaStream_t stream_;
- // The specific GPU id that the TensorRTEngine bounded to.
- int device_;

  bool enable_int8_;
  TRTInt8Calibrator* calibrator_;
  // batch size of the current data, will be updated each Executation.
  int batch_size_{-1};

  int device_id_;
  nvinfer1::ILogger& logger_;

- std::vector<Buffer> buffers_;
  // max data size for the buffers.
  std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
  std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
...
@@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase {
infer_ptr
<
nvinfer1
::
INetworkDefinition
>
infer_network_
;
infer_ptr
<
nvinfer1
::
ICudaEngine
>
infer_engine_
;
infer_ptr
<
nvinfer1
::
IExecutionContext
>
infer_context_
;
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
// freshDeviceId().
void
freshDeviceId
();
infer_ptr
<
nvinfer1
::
IHostMemory
>
ihost_memory_
;
};
// class TensorRTEngine
// Add an layer__ into engine__ with args ARGS.
// For example:
// TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
//
// Reference
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
...
...
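Serialize and Deserialize are what back the static-engine option: a built ICudaEngine can be dumped once and reloaded on later runs instead of being rebuilt. A hedged round-trip sketch (file I/O is omitted; constructor arguments are illustrative):

  nvinfer1::IHostMemory *blob = engine->Serialize();  // engine must be frozen first
  std::string serialized(static_cast<const char *>(blob->data()), blob->size());
  // ... persist `serialized` somewhere ...

  TensorRTEngine reloaded(max_batch, workspace_size, /*enable_int8=*/false,
                          /*calibrator=*/nullptr, /*device_id=*/0);
  reloaded.Deserialize(serialized);  // plugin layers resolved via PluginFactoryTensorRT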
paddle/fluid/inference/tensorrt/helper.h
...
...
@@ -17,6 +17,9 @@
#include <NvInfer.h>
#include <cuda.h>
#include <glog/logging.h>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/platform/dynload/tensorrt.h"
#include "paddle/fluid/platform/enforce.h"
...
...
@@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger {
  ~NaiveLogger() override {}
};

class NaiveProfiler : public nvinfer1::IProfiler {
 public:
  typedef std::pair<std::string, float> Record;
  std::vector<Record> mProfile;

  virtual void reportLayerTime(const char* layerName, float ms) {
    auto record =
        std::find_if(mProfile.begin(), mProfile.end(),
                     [&](const Record& r) { return r.first == layerName; });
    if (record == mProfile.end())
      mProfile.push_back(std::make_pair(layerName, ms));
    else
      record->second += ms;
  }

  void printLayerTimes() {
    float totalTime = 0;
    for (size_t i = 0; i < mProfile.size(); i++) {
      printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
             mProfile[i].second);
      totalTime += mProfile[i].second;
    }
    printf("Time over all layers: %4.3f\n", totalTime);
  }
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
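NaiveProfiler follows the stock nvinfer1::IProfiler contract, so it can be attached to an execution context to collect per-layer times. A hedged sketch, assuming synchronous execute() so that TensorRT reports layer timings:

  NaiveProfiler profiler;
  nvinfer1::IExecutionContext *ctx = engine->engine()->createExecutionContext();
  ctx->setProfiler(&profiler);
  ctx->execute(batch_size, buffers.data());  // synchronous run
  profiler.printLayerTimes();
  ctx->destroy();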
paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
nv_library(tensorrt_plugin
-   SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
    SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
    prelu_op_plugin.cu trt_plugin_factory.cc
    avg_pool_op_plugin.cu
    DEPS enforce tensorrt_engine prelu)
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
...
...
@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/pooling.h"
namespace paddle {
...
...
@@ -20,6 +21,12 @@ namespace inference {
namespace tensorrt {
namespace plugin {

AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer,
                                              size_t length) {
  return new AvgPoolPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize);

nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
    int index, const nvinfer1::Dims* inputDims, int nbInputs) {
  assert(nbInputs == 1);
...
...
paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
...
...
@@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT {
 protected:
  size_t getSerializationSize() override {
-   return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
-          SerializedSize(strides_) + SerializedSize(paddings_) +
-          SerializedSize(input_shape_) + getBaseSerializationSize();
    return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) +
           SerializedSize(ksize_) + SerializedSize(strides_) +
           SerializedSize(paddings_) + SerializedSize(input_shape_) +
           SerializedSize(output_shape_) + getBaseSerializationSize();
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt.
  // It should not be called by users.
  void serialize(void* buffer) override {
    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, ceil_mode_);
    SerializeValue(&buffer, ksize_);
    SerializeValue(&buffer, strides_);
    SerializeValue(&buffer, paddings_);
    SerializeValue(&buffer, input_shape_);
    SerializeValue(&buffer, output_shape_);
  }

 public:
  AvgPoolPlugin() {}
  AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
                std::vector<int> strides, std::vector<int> paddings,
                std::vector<int> input_shape)
...
...
@@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT {
    DeserializeValue(&serialData, &serialLength, &strides_);
    DeserializeValue(&serialData, &serialLength, &paddings_);
    DeserializeValue(&serialData, &serialLength, &input_shape_);
    DeserializeValue(&serialData, &serialLength, &output_shape_);
  }

  AvgPoolPlugin* clone() const override {
...
...
@@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT {
                             input_shape_);
  }

- const char* getPluginType() const override { return "avg_pool"; }
  const char* getPluginType() const override { return "avg_pool_plugin"; }
  int getNbOutputs() const override { return 1; }
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                     int nbInputDims) override;
...
...
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
...
...
@@ -14,12 +14,19 @@ limitations under the License. */
#include <glog/logging.h>
#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer,
                                                      size_t length) {
  return new ElementWisePlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);

namespace details {

template <typename T>
...
...
@@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
  const float* y = reinterpret_cast<const float*>(inputs[1]);
  float* out = reinterpret_cast<float*>(outputs[0]);

- if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
  if (type_ == "add") {
    details::ElementWise(details::Add<float>(), x, y, out, batch_size,
                         prev_size_, midd_size_, post_size_, stream);
- } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
  } else if (type_ == "mul") {
    details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
                         prev_size_, midd_size_, post_size_, stream);
  } else {
...
paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
...
...
@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
...
@@ -24,9 +25,8 @@ namespace plugin {
class ElementWisePlugin : public PluginTensorRT {
 public:
- ElementWisePlugin(nvinfer1::ElementWiseOperation type,
-                   nvinfer1::Dims const& dims_x, nvinfer1::Dims const& dims_y,
-                   int axis)
  ElementWisePlugin(std::string type, nvinfer1::Dims const& dims_x,
                    nvinfer1::Dims const& dims_y, int axis)
      : type_(type),
        dims_x_(dims_x),
        dims_y_(dims_y),
...
...
@@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT {
  ElementWisePlugin(void const* serial_data, size_t serial_length) {
    deserializeBase(serial_data, serial_length);
    const char* elementwise_type;
    DeserializeValue(&serial_data, &serial_length, &elementwise_type);
    type_ = std::string(elementwise_type);
    DeserializeValue(&serial_data, &serial_length, &axis_);
    DeserializeValue(&serial_data, &serial_length, &dims_x_);
    DeserializeValue(&serial_data, &serial_length, &dims_y_);
...
...
@@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT {
    return nullptr;
  }

- const char* getPluginType() const override { return "elementwise"; }
  const char* getPluginType() const override { return "elementwise_plugin"; }

  nvinfer1::Dims getOutputDimensions(int index,
                                     const nvinfer1::Dims* input_dims,
...
@@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT {
protected:
size_t
getSerializationSize
()
override
{
return
SerializedSize
(
axis_
)
+
SerializedSize
(
dims_x_
)
+
SerializedSize
(
dims_y_
)
+
getBaseSerializationSize
();
return
SerializedSize
(
getPluginType
())
+
SerializedSize
(
axis_
)
+
SerializedSize
(
dims_x_
)
+
SerializedSize
(
dims_y_
)
+
getBaseSerializationSize
();
}
void
serialize
(
void
*
buffer
)
override
{
SerializeValue
(
&
buffer
,
getPluginType
());
serializeBase
(
buffer
);
SerializeValue
(
&
buffer
,
type_
.
c_str
());
SerializeValue
(
&
buffer
,
axis_
);
SerializeValue
(
&
buffer
,
dims_x_
);
SerializeValue
(
&
buffer
,
dims_y_
);
}
nvinfer1
::
ElementWiseOperation
type_
;
std
::
string
type_
;
nvinfer1
::
Dims
dims_x_
;
nvinfer1
::
Dims
dims_y_
;
int
axis_
;
...
...
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
...
...
@@ -17,6 +17,7 @@
#include <vector>
#include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle {
...
...
@@ -24,6 +25,17 @@ namespace inference {
namespace tensorrt {
namespace plugin {

PReluPlugin* CreatePreluPluginDeserialize(const void* buffer, size_t length) {
  return new PReluPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize);

int PReluPlugin::initialize() {
  cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
  cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
             cudaMemcpyHostToDevice);
}

nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
                                                const nvinfer1::Dims* inputDims,
                                                int nbInputs) {
...
...
@@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
  // input dims is CHW.
  const auto& input_dims = this->getInputDims(0);
  const float* input = reinterpret_cast<const float*>(inputs[0]);
- const float* alpha = reinterpret_cast<const float*>(alpha_.get().values);
  // const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
  const float* alpha = p_gpu_weight_;
  float* output = reinterpret_cast<float**>(outputs)[0];

  std::vector<int> input_shape;
...
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
...
...
@@ -14,7 +14,12 @@
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
...
@@ -24,39 +29,51 @@ namespace tensorrt {
namespace plugin {

class PReluPlugin : public PluginTensorRT {
- TensorRTEngine::Weight alpha_;
  std::vector<float> weight_;
  float* p_gpu_weight_;
  std::string mode_;

 protected:
  size_t getSerializationSize() override {
-   // return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
-   return 0;
    return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
           SerializedSize(weight_) + SerializedSize(getPluginType());
  }

  // TRT will call this func when we need to serialize the configuration of
  // tensorrt.
  // It should not be called by users.
  void serialize(void* buffer) override {
-   // serializeBase(buffer);
-   // SerializeValue(&buffer, alpha_);
-   // SerializeValue(&buffer, mode_);
    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, weight_);
    SerializeValue(&buffer, mode_.c_str());
  }

 public:
- PReluPlugin(TensorRTEngine::Weight const& alpha, std::string const& mode)
-     : alpha_(alpha), mode_(mode) {}
  PReluPlugin(const float* weight, const int weight_num,
              std::string const& mode)
      : mode_(mode) {
    weight_.resize(weight_num);
    std::copy(weight, weight + weight_num, weight_.data());
  }

  // It was used for tensorrt deserialization.
  // It should not be called by users.
  PReluPlugin(void const* serialData, size_t serialLength) {
-   // deserializeBase(serialData, serialLength);
-   // DeserializeValue(&serialData, &serialLength, &alpha_);
-   // DeserializeValue(&serialData, &serialLength, &mode_);
    deserializeBase(serialData, serialLength);
    DeserializeValue(&serialData, &serialLength, &weight_);
    const char* prelu_mode;
    DeserializeValue(&serialData, &serialLength, &prelu_mode);
    mode_ = std::string(prelu_mode);
  }

  ~PReluPlugin() { cudaFree(p_gpu_weight_); }
  int initialize() override;

- PReluPlugin* clone() const override { return new PReluPlugin(alpha_, mode_); }
  PReluPlugin* clone() const override {
    return new PReluPlugin(weight_.data(), weight_.size(), mode_);
  }

- const char* getPluginType() const override { return "prelu"; }
  const char* getPluginType() const override { return "prelu_plugin"; }
  int getNbOutputs() const override { return 1; }
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                     int nbInputDims) override;
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
...
...
@@ -15,12 +15,18 @@
#include <cuda_fp16.h>
#include <algorithm>
#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
  return new SplitPlugin(buffer, length);
}
REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);

// copied from operators::math::SplitFunctor
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
...
...
paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
...
...
@@ -15,6 +15,7 @@
#pragma once
#include <thrust/device_vector.h>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
...
...
@@ -25,6 +26,7 @@ namespace plugin {
class SplitPlugin : public PluginTensorRT {
 public:
  SplitPlugin() {}
  SplitPlugin(int axis, std::vector<int> const& output_lengths)
      : axis_(axis), same_shape_(true), output_length_(output_lengths) {}
...
...
@@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
    return new SplitPlugin(axis_, output_length_);
  }

- const char* getPluginType() const override { return "split"; }
  const char* getPluginType() const override { return "split_plugin"; }
  int getNbOutputs() const override { return output_length_.size(); }
  nvinfer1::Dims getOutputDimensions(int index,
                                     const nvinfer1::Dims* input_dims,
...
...
@@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT {
 protected:
  size_t getSerializationSize() override {
-   return SerializedSize(axis_) + SerializedSize(output_length_) +
-          getBaseSerializationSize();
    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
           SerializedSize(output_length_) + getBaseSerializationSize();
  }

  void serialize(void* buffer) override {
    SerializeValue(&buffer, getPluginType());
    serializeBase(buffer);
    SerializeValue(&buffer, axis_);
    SerializeValue(&buffer, output_length_);
);
...
...
paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
...
...
@@ -17,9 +17,10 @@
#include <NvInfer.h>
#include <cstring>
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/
serialize
.h"
#include "paddle/fluid/inference/tensorrt/plugin/
trt_plugin_utils
.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
...
...
@@ -30,6 +31,13 @@ namespace inference {
namespace tensorrt {
namespace plugin {

class PluginTensorRT;

typedef std::function<PluginTensorRT*(const void*, size_t)>
    PluginDeserializeFunc;
typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;

class PluginTensorRT : public nvinfer1::IPluginExt {
 public:
  PluginTensorRT() {}
...
...
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
                                                    const void* serial_data,
                                                    size_t serial_length) {
  const char* plugin_type;
  DeserializeValue(&serial_data, &serial_length, &plugin_type);

  PADDLE_ENFORCE(Has(plugin_type),
                 "trt plugin type %s does not exists, check it.", plugin_type);
  auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
  owned_plugins_.emplace_back(plugin);

  return plugin;
}

bool PluginFactoryTensorRT::RegisterPlugin(
    const std::string& op_name, PluginDeserializeFunc deserialize_func) {
  if (Has(op_name)) return false;
  auto ret = plugin_registry_.emplace(op_name, deserialize_func);
  return ret.second;
}

void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle
paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <NvInfer.h>
#include <cstring>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

class PluginFactoryTensorRT : public nvinfer1::IPluginFactory,
                              public DeleteHelper {
 public:
  // Deserialization method
  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
                               size_t serial_length) override;

  bool RegisterPlugin(const std::string& op_name,
                      PluginDeserializeFunc deserialize_func);

  bool Has(const std::string& op_name) {
    return plugin_registry_.find(op_name) != plugin_registry_.end();
  }

  void DestroyPlugins();

 protected:
  std::unordered_map<std::string, PluginDeserializeFunc> plugin_registry_;
  std::list<std::unique_ptr<PluginTensorRT>> owned_plugins_;
};

class TrtPluginRegistrar {
 public:
  TrtPluginRegistrar(const std::string& name,
                     PluginDeserializeFunc deserialize_func) {
    inference::Singleton<PluginFactoryTensorRT>::Global().RegisterPlugin(
        name, deserialize_func);
  }
};
#define REGISTER_TRT_PLUGIN(name, deserialize_func) \
REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func)
#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \
static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \
trt_plugin_registrar##ctr __attribute__((unused)) = \
paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
name, deserialize_func)
}
// namespace plugin
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
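A hypothetical usage sketch (not part of this commit): a concrete plugin would expose a PluginDeserializeFunc and register it through the REGISTER_TRT_PLUGIN macro defined above, so that PluginFactoryTensorRT::createPlugin can rebuild it from a serialized engine. The plugin name "my_plugin" and the function DeserializeMyPlugin below are assumptions for illustration only.

// Sketch: register a deserializer for a plugin type named "my_plugin".
// DeserializeMyPlugin is assumed to be defined elsewhere and to return a
// PluginTensorRT subclass reconstructed from (serial_data, serial_length).
namespace paddle {
namespace inference {
namespace tensorrt {
namespace plugin {

PluginTensorRT* DeserializeMyPlugin(const void* serial_data,
                                    size_t serial_length);

REGISTER_TRT_PLUGIN("my_plugin", DeserializeMyPlugin);

}  // namespace plugin
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle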
paddle/fluid/inference/tensorrt/plugin/serialize.h → paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h

@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #pragma once
 #include <cstring>
 #include <string>
 #include <type_traits>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
...
@@ -24,6 +24,13 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+// Some trt base classes lack of the destructor.
+// We use a assisted class to fix this.
+struct DeleteHelper {
+ protected:
+  virtual ~DeleteHelper() {}
+};
+
 template <typename T>
 inline void SerializeValue(void** buffer, T const& value);
...
paddle/fluid/inference/tensorrt/test_engine.cc

@@ -17,6 +17,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/enforce.h"
...
@@ -27,19 +29,34 @@ namespace tensorrt {
 class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(10, 1 << 10, stream_);
+    ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
+
+    engine_ = new TensorRTEngine(10, 1 << 10);
     engine_->InitNetwork();
   }
 
   void TearDown() override {
+    if (engine_) {
       delete engine_;
-      cudaStreamDestroy(stream_);
+      engine_ = nullptr;
+    }
+  }
+
+  void PrepareInputOutput(const std::vector<float> &input,
+                          std::vector<int> output_shape) {
+    TensorFromVector(input, *ctx_, &input_);
+    output_.Resize(framework::make_ddim(output_shape));
+  }
+
+  void GetOutput(std::vector<float> *output) {
+    TensorToVector(output_, *ctx_, output);
   }
 
  protected:
-  TensorRTEngine *engine_;
-  cudaStream_t stream_;
+  framework::Tensor input_;
+  framework::Tensor output_;
+  TensorRTEngine *engine_;
+  platform::CUDADeviceContext *ctx_;
 };
 
 TEST_F(TensorRTEngineTest, add_layer) {
...
@@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) {
   float raw_weight[size] = {2.};  // Weight in CPU memory.
   float raw_bias[size] = {3.};
 
+  std::vector<void *> buffers(2);  // TRT binded inputs
+
   LOG(INFO) << "create weights";
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::DimsCHW{1, 1, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
                                         weight.get(), bias.get());
   PADDLE_ENFORCE(fc_layer != nullptr);
...
@@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) {
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
   // fill in real data
-  float x_v = 1234;
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           1 * sizeof(float));
+  std::vector<float> x_v = {1234};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {1});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
   LOG(INFO) << "to execute";
-  engine_->Execute(1);
+  engine_->Execute(1, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float y_cpu;
-  engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
+  GetOutput(&y_cpu);
 
   LOG(INFO) << "to checkout output";
-  ASSERT_EQ(y_cpu, x_v * 2 + 3);
+  ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
 }
 
 TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
...
@@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
   float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
   float raw_bias[2] = {1.3, 2.4};
+  std::vector<void *> buffers(2);  // TRT binded inputs
 
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::DimsCHW{1, 2, 1});
   auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
                                         weight.get(), bias.get());
   PADDLE_ENFORCE(fc_layer != nullptr);
...
@@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
-  float x_v[2] = {1.0, 2.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           2 * sizeof(float));
-  engine_->Execute(1);
+  // fill in real data
+  std::vector<float> x_v = {1.0, 2.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {2});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(1, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float y_cpu[2] = {-1., -1.};
+  GetOutput(&y_cpu);
 
   auto dims = engine_->GetITensor("y")->getDimensions();
   ASSERT_EQ(dims.nbDims, 3);
   ASSERT_EQ(dims.d[0], 2);
   ASSERT_EQ(dims.d[1], 1);
-  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
 
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
 }
...
@@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   // Weight in CPU memory.
   float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
   float raw_bias[1] = {0};
+  std::vector<void *> buffers(2);  // TRT binded inputs
 
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::Dims3{1, 3, 3});
   auto *conv_layer =
       TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
                            weight.get(), bias.get());
   PADDLE_ENFORCE(conv_layer != nullptr);
...
@@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
-  float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-                   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           18 * sizeof(float));
-  engine_->Execute(2);
+  // fill in real data
+  std::vector<float> x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {18});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(2, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float *y_cpu = new float[18];
-  engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
+  GetOutput(&y_cpu);
 
   ASSERT_EQ(y_cpu[0], 4.0);
   ASSERT_EQ(y_cpu[1], 6.0);
 }
 
 TEST_F(TensorRTEngineTest, test_pool2d) {
   // Weight in CPU memory.
   auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::Dims3{1, 2, 2});
+  std::vector<void *> buffers(2);  // TRT binded inputs
 
   nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
-  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
-                                          *const_cast<nvinfer1::ITensor *>(x),
-                                          pool_t, nvinfer1::DimsHW{2, 2});
+  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
+                                          nvinfer1::DimsHW{2, 2});
 
   PADDLE_ENFORCE(pool_layer != nullptr);
   pool_layer->setStride(nvinfer1::DimsHW{1, 1});
...
@@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
-  float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void *>(&x_v),
-                           8 * sizeof(float));
-  engine_->Execute(2);
+  // fill in real data
+  std::vector<float> x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {2});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(2, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float *y_cpu = new float[2];
-  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
+  GetOutput(&y_cpu);
 
   ASSERT_EQ(y_cpu[0], 2.0);
   ASSERT_EQ(y_cpu[1], 5.0);
...
paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc

@@ -366,15 +366,17 @@ TEST(Analyzer_rnn1, ZeroCopyMultiThread) {
 #define NEW_TENSOR(name__) \
   auto name__##_tensor = predictor->GetInputTensor(#name__);
 
-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
+
   double total_time_of_threads{0};
   std::vector<std::thread> threads;
 
   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       NEW_TENSOR(data_lod_attention);
       NEW_TENSOR(cell_init);
       NEW_TENSOR(data);
...
paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc

@@ -266,15 +266,17 @@ TEST(Analyzer_seq_pool1, zerocopy_profile_threads) {
   SetConfig(&config);
   config.SwitchUseFeedFetchOps(false);
 
-  auto base_predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
+  for (int tid = 1; tid < FLAGS_num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
+
   double total_time_of_threads{0};
   std::vector<std::thread> threads;
 
   for (int tid = 0; tid < FLAGS_num_threads; tid++) {
     threads.emplace_back([&, tid] {
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = base_predictor->Clone();
+      auto &predictor = predictors[tid];
       std::vector<std::unique_ptr<ZeroCopyTensor>> inputs;
       PrepareZeroCopyInputs(predictor, &inputs);
       auto output_tensor = predictor->GetOutputTensor(out_var_name);
...
paddle/fluid/inference/tests/api/tester_helper.h

@@ -17,8 +17,10 @@
 #include <gtest/gtest.h>
 
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <thread>  // NOLINT
+#include <unordered_map>
 #include <vector>
 #ifdef WITH_GPERFTOOLS
 #include <gperftools/profiler.h>
...
@@ -252,7 +254,11 @@ void TestMultiThreadPrediction(
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
-  auto main_predictor = CreateTestPredictor(config, use_analysis);
+  std::vector<std::unique_ptr<PaddlePredictor>> predictors;
+  predictors.emplace_back(CreateTestPredictor(config, use_analysis));
+  for (int tid = 1; tid < num_threads; tid++) {
+    predictors.emplace_back(predictors.front()->Clone());
+  }
 
   size_t total_time{0};
   for (int tid = 0; tid < num_threads; ++tid) {
...
@@ -260,9 +266,7 @@ void TestMultiThreadPrediction(
       // Each thread should have local inputs and outputs.
       // The inputs of each thread are all the same.
       std::vector<PaddleTensor> outputs_tid;
-      // To ensure the thread binding correctly,
-      // please clone inside the threadpool.
-      auto predictor = main_predictor->Clone();
+      auto &predictor = predictors[tid];
 #ifdef PADDLE_WITH_MKLDNN
       if (use_analysis) {
         static_cast<AnalysisPredictor *>(predictor.get())
...
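The three test changes above share one pattern: predictors are created and cloned once on the main thread, and each worker thread then picks its own instance by thread id instead of calling Clone() inside the thread body. A condensed sketch of that pattern, using only names that appear in the diffs above:

// Clone all predictors up front, then hand each thread its own instance.
std::vector<std::unique_ptr<PaddlePredictor>> predictors;
predictors.emplace_back(CreatePaddlePredictor<AnalysisConfig>(config));
for (int tid = 1; tid < num_threads; ++tid) {
  predictors.emplace_back(predictors.front()->Clone());
}

std::vector<std::thread> threads;
for (int tid = 0; tid < num_threads; ++tid) {
  threads.emplace_back([&, tid] {
    auto &predictor = predictors[tid];  // thread-local predictor, no Clone() here
    // ... feed inputs and run this predictor as the tests above do ...
  });
}
for (auto &t : threads) t.join();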
paddle/fluid/inference/tests/api/trt_models_tester.cc

@@ -54,7 +54,8 @@ void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
   if (use_gpu) {
     config->EnableUseGpu(100, 0);
     if (use_tensorrt) {
-      config->EnableTensorRtEngine(1 << 10, batch_size);
+      config->EnableTensorRtEngine(1 << 10, batch_size, 3,
+                                   AnalysisConfig::Precision::kFloat32, false);
       config->pass_builder()->DeletePass("conv_bn_fuse_pass");
       config->pass_builder()->DeletePass("fc_fuse_pass");
       config->pass_builder()->TurnOnDebug();
...
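For reference, a hedged reading of the extended EnableTensorRtEngine call used above; the comments are descriptive interpretations of each argument and are not taken verbatim from this diff.

// config is an AnalysisConfig*, as in SetConfig above.
config->EnableTensorRtEngine(1 << 10,                              // workspace size
                             batch_size,                           // max batch size
                             3,                                    // min subgraph size to offload to TRT
                             AnalysisConfig::Precision::kFloat32,  // inference precision
                             false);                               // whether to reuse a static (serialized) engine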
paddle/fluid/memory/allocation/allocator.cc

@@ -26,20 +26,17 @@ Allocator::~Allocator() {}
 bool Allocator::IsAllocThreadSafe() const { return false; }
 
 AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) {
-  VLOG(2) << "Alloc allocation on " << typeid(*this).name();
   auto ptr = AllocateImpl(size, attr);
   ptr->RegisterAllocatorChain(this);
-  VLOG(2) << "Alloc success";
   return AllocationPtr(ptr);
 }
 
 void Allocator::FreeImpl(Allocation* allocation) {
-  auto* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopAllocator();
   allocator->Free(allocation);
 }
 
 void Allocator::Free(Allocation* allocation) {
-  VLOG(2) << "Free allocation on " << typeid(*this).name();
   allocation->PopAllocator();
   FreeImpl(allocation);
 }
...
@@ -47,7 +44,7 @@ void Allocator::Free(Allocation* allocation) {
 const char* BadAlloc::what() const noexcept { return msg_.c_str(); }
 
 void AllocationDeleter::operator()(Allocation* allocation) const {
-  auto* allocator = allocation->TopAllocator();
+  Allocator* allocator = allocation->TopAllocator();
   allocator->Free(allocation);
 }
...
paddle/fluid/memory/allocation/allocator.h

@@ -16,7 +16,7 @@
 #include <memory>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/small_stack.h"
+#include "paddle/fluid/framework/inlined_stack.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
...
@@ -82,7 +82,7 @@ class Allocation {
   std::vector<Allocator*> GetAllocatorChain() const {
     std::vector<Allocator*> allocators;
     for (size_t i = 0; i < allocator_chain_.size(); ++i) {
-      allocators[i] = allocator_chain_[i];
+      allocators.push_back(allocator_chain_[i]);
     }
     return allocators;
   }
...
@@ -100,7 +100,7 @@ class Allocation {
   void* ptr_;
   size_t size_;
   platform::Place place_;
-  framework::SmallStack<Allocator*, 8> allocator_chain_;
+  framework::InlinedStack<Allocator*, 8> allocator_chain_;
 
   friend class Allocator;
   friend class AllocationDeleter;
...
paddle/fluid/memory/allocation/legacy_allocator.cc

@@ -36,6 +36,8 @@ DEFINE_bool(init_allocated_mem, false,
             "that initializing the allocated memory with a small value "
             "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_double(initial_gpu_memory_in_mb);
+DECLARE_double(reallocate_gpu_memory_in_mb);
 DECLARE_bool(benchmark);
 
 namespace paddle {
...
@@ -69,7 +71,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
   std::call_once(init_flag, []() {
     a = new detail::BuddyAllocator(
         std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
+        platform::CpuMinChunkSize(), platform::CpuMaxChunkSize(),
+        platform::CpuMaxChunkSize());
   });
 
   return a;
...
@@ -131,40 +134,53 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
 }
 
 #ifdef PADDLE_WITH_CUDA
-BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
-  static std::once_flag init_flag;
-  static detail::BuddyAllocator **a_arr = nullptr;
-  static std::vector<int> devices;
-
-  std::call_once(init_flag, [gpu_id]() {
-    devices = platform::GetSelectedDevices();
-    int gpu_num = devices.size();
-    allocation::GPUMemMonitor.Initialize(devices.size());
-
-    a_arr = new BuddyAllocator *[gpu_num];
-    for (size_t i = 0; i < devices.size(); ++i) {
-      int dev_id = devices[i];
-      a_arr[i] = nullptr;
-      platform::SetDeviceId(dev_id);
-      a_arr[i] = new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(
-              new detail::GPUAllocator(dev_id)),
-          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
-
-      VLOG(10) << "\n\nNOTE: each GPU device use "
-               << FLAGS_fraction_of_gpu_memory_to_use * 100
-               << "% of GPU memory.\n"
-               << "You can set GFlags environment variable '"
-               << "FLAGS_fraction_of_gpu_memory_to_use"
-               << "' to change the fraction of GPU usage.\n\n";
-    }
-  });
+class GPUBuddyAllocatorList {
+ public:
+  GPUBuddyAllocatorList()
+      : allocators_(platform::GetCUDADeviceCount()),
+        flags_(platform::GetCUDADeviceCount()) {
+    allocation::GPUMemMonitor.Initialize(allocators_.size());
+  }
+
+  BuddyAllocator *Get(size_t dev_id) {
+    PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id);
+    std::call_once(flags_[dev_id], [this, dev_id] {
+      platform::SetDeviceId(dev_id);
+      size_t first_size = platform::GpuFirstAllocateChunkSize();
+      size_t re_size = platform::GpuReAllocateChunkSize();
+      allocators_[dev_id] = new BuddyAllocator(
+          std::unique_ptr<detail::SystemAllocator>(
+              new detail::GPUAllocator(dev_id)),
+          platform::GpuMinChunkSize(), first_size, re_size);
+      VLOG(2) << "\n\nNOTE: each GPU device use "
+              << string::HumanReadableSize(first_size) << "(initial chunk) "
+              << string::HumanReadableSize(re_size) << "(reallocate chunk) "
+              << "% of GPU memory.\n"
+              << "You can set GFlags environment variable '"
+              << "FLAGS_fraction_of_gpu_memory_to_use"
+              << "' or "
+                 "'FLAGS_initial_gpu_memory_in_mb/"
+                 "FLAGS_reallocate_gpu_memory_in_mb' to change the fraction "
+                 "of GPU usage.\n\n";
+      VLOG(2) << "Currently, FLAGS_fraction_of_gpu_memory_to_use="
+              << FLAGS_fraction_of_gpu_memory_to_use << ", "
+              << "FLAGS_initial_gpu_memory_in_mb="
+              << FLAGS_initial_gpu_memory_in_mb << ", "
+              << "FLAGS_reallocate_gpu_memory_in_mb="
+              << FLAGS_reallocate_gpu_memory_in_mb;
+    });
+    return allocators_[dev_id];
+  }
+
+ private:
+  std::vector<BuddyAllocator *> allocators_;
+  std::vector<std::once_flag> flags_;
+};
 
+BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
+  static GPUBuddyAllocatorList allocators;
   platform::SetDeviceId(gpu_id);
-  auto pos = std::distance(devices.begin(),
-                           std::find(devices.begin(), devices.end(), gpu_id));
-  return a_arr[pos];
+  return allocators.Get(gpu_id);
 }
 #endif
...
@@ -183,7 +199,7 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 #ifdef PADDLE_WITH_CUDA
   auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto *ptr = buddy_allocator->Alloc(size);
-  if (ptr == nullptr) {
+  if (ptr == nullptr && size > 0) {
     int cur_dev = platform::GetCurrentDeviceId();
     platform::SetDeviceId(place.device);
     size_t avail, total;
...
@@ -234,6 +250,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::CUDAPinnedAllocator),
                             platform::CUDAPinnedMinChunkSize(),
+                            platform::CUDAPinnedMaxChunkSize(),
                             platform::CUDAPinnedMaxChunkSize());
   });
...
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.cc

@@ -14,16 +14,90 @@
 #include "paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h"
 #include <algorithm>
+#include <cctype>
+#include <fstream>
+#include <limits>
+#include <sstream>
+#include <string>
 #include "paddle/fluid/platform/lock_guard_ptr.h"
 
 DEFINE_double(tolerant_times, 2,
               "Tolerant memory size times of buffered_allocator");
+DEFINE_string(division_plan_path, "", "Division plan file path");
 
 namespace paddle {
 namespace memory {
 namespace allocation {
 
+std::string TrimStringAndToLowerCase(const std::string &str) {
+  auto not_space = [](char ch) { return std::isspace(ch) == 0; };
+  auto first_idx = static_cast<size_t>(
+      std::find_if(str.begin(), str.end(), not_space) - str.begin());
+  auto last_idx = static_cast<size_t>(
+      std::find_if(str.rbegin(), str.rend(), not_space) - str.rbegin());
+  if (first_idx == str.size() || last_idx == str.size()) return "";
+
+  last_idx = str.size() - 1 - last_idx;
+  auto ret = str.substr(first_idx, last_idx - first_idx);
+  std::for_each(ret.begin(), ret.end(),
+                [](char &ch) { ch = std::tolower(ch); });
+  return ret;
+}
+
+static size_t ParseStringToBytes(const std::string &str) {
+  std::string ret = str;
+  if (ret.back() == 'b') {
+    ret.pop_back();
+  }
+
+  PADDLE_ENFORCE(!ret.empty(), "Wrong format: %s", str);
+  size_t multiples = 1;
+  switch (ret.back()) {
+    case 'g':
+      multiples *= (static_cast<size_t>(1) << 30);
+      break;
+    case 'm':
+      multiples *= (static_cast<size_t>(1) << 20);
+      break;
+    case 'k':
+      multiples *= (static_cast<size_t>(1) << 10);
+      break;
+    default:
+      break;
+  }
+
+  if (multiples != 1) ret.pop_back();
+  ret = TrimStringAndToLowerCase(ret);
+  double ret_val = 0.0;
+  std::stringstream ss(ret);
+  PADDLE_ENFORCE((ss >> ret_val).good(), "Wrong format %s", str);
+  return static_cast<size_t>(ret_val * multiples);
+}
+
+static std::string GetDebugStringOfPlan(const std::vector<size_t> &plan) {
+  std::string ret("[");
+  for (auto sz : plan) {
+    ret += string::HumanReadableSize(sz);
+    ret += ", ";
+  }
+  return ret + "]";
+}
+
+static std::vector<size_t> ReadDivisionPlanFromFile(
+    const std::string &filepath) {
+  std::ifstream is(filepath.c_str());
+  PADDLE_ENFORCE(is.good(), "File not exist");
+  std::string str;
+  std::vector<size_t> plan;
+  while (std::getline(is, str).good()) {
+    str = TrimStringAndToLowerCase(str);
+    if (str.empty()) break;
+    plan.push_back(ParseStringToBytes(str));
+  }
+  return plan;
+}
+
 static void CheckAndModifyMemoryDivisionPlan(
     std::vector<size_t> *division_plan) {
   // Check whether the division plan is strictly sorted
...
@@ -50,10 +124,21 @@ static void CheckAndModifyMemoryDivisionPlan(
 }
 
 static std::vector<size_t> GetDefaultDivisionPlan() {
+  if (!FLAGS_division_plan_path.empty()) {
+    return ReadDivisionPlanFromFile(FLAGS_division_plan_path);
+  }
+
+  constexpr size_t kMaxLogSize = 30;
+
   std::vector<size_t> plan;
+  for (size_t i = 12; i <= kMaxLogSize; ++i) {
+    plan.push_back(static_cast<size_t>(1) << i);
+  }
+  /*
   for (size_t i = 0; i < sizeof(size_t) * 8; ++i) {
     plan.push_back(static_cast<size_t>(1) << i);
   }
+  */
   return plan;
 }
...
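A hedged reading of the size strings accepted by ParseStringToBytes above (a trailing 'b' is stripped first, then an optional k/m/g suffix selects a binary multiplier): "4kb" and "4k" both denote 4 * 2^10 bytes, "1.5m" denotes 1.5 * 2^20 bytes, and "2g" denotes 2 * 2^30 bytes. A division plan file consumed by ReadDivisionPlanFromFile and selected through --division_plan_path is then just one such size per line, for example a hypothetical file containing:

4k
64k
1m
16m
256m
1g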
@@ -78,27 +163,32 @@ MultiBinBufferedAllocator::MultiBinBufferedAllocator(
     : underlying_allocator_(std::move(underlying_allocator)),
       division_plan_(division_plan) {
   CheckAndModifyMemoryDivisionPlan(&division_plan_);
-  allocations_.resize(division_plan_.size());
-  mtx_.resize(division_plan_.size());
+  allocations_.resize(division_plan_.size() - 1);
+  mtx_.resize(division_plan_.size() - 1);
   if (underlying_allocator_->IsAllocThreadSafe()) {
     for (auto &mtx : mtx_) {
       mtx.reset(new std::mutex());
     }
   }
 
   VLOG(1) << "Division plan is: " << GetDebugStringOfPlan(division_plan_);
   VLOG(1) << "FLAGS_tolerant_times = " << FLAGS_tolerant_times;
 }
 
 void MultiBinBufferedAllocator::FreeImpl(Allocation *allocation) {
   auto bin_index = FindDivisionPlanBinIndex(division_plan_, allocation->size());
-  {
+  if (bin_index < allocations_.size()) {
     platform::LockGuardPtr<std::mutex> guard(mtx_[bin_index]);
     allocations_[bin_index].emplace(allocation->size(),
                                     AllocationPtr(allocation));
+  } else {
+    underlying_allocator_->Free(allocation);
   }
 }
 
-void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
+// bin_index is not used currently.
+// Maybe we can design more flexible FreeCache strategy based on bin_index
+size_t MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
+  size_t accumulated_size = 0;
   // FIXME(zjl): free the largest first when there is no extra
   for (size_t i = allocations_.size() - 1; i != static_cast<size_t>(-1); --i) {
...
@@ -110,33 +200,53 @@ void MultiBinBufferedAllocator::FreeCache(size_t size, size_t bin_index) {
       underlying_allocator_->Free(it->second.release());
       allocations_[i].erase(it--);
       if (accumulated_size >= size) {
-        return;
+        return accumulated_size;
       }
     } while (!allocations_[i].empty());
   }
+  return accumulated_size;
 }
 
 Allocation *MultiBinBufferedAllocator::AllocateImpl(size_t size, Attr attr) {
   auto bin_index = FindDivisionPlanBinIndex(division_plan_, size);
   auto upper_size = TolerantUpperSize(size);
 
-  for (; upper_size >= division_plan_[bin_index]; ++bin_index) {
+  // if (bin_index >= allocations_.size()) {
+  //   VLOG(2) << "Allocate " << size << " from underlying directly";
+  // }
+  for (; bin_index < allocations_.size() &&
+         upper_size >= division_plan_[bin_index];
+       ++bin_index) {
     auto &allocation = allocations_[bin_index];
     platform::LockGuardPtr<std::mutex> lock(mtx_[bin_index]);
     auto it = allocation.lower_bound(size);
-    if (it != allocation.end() && it->second->size() < upper_size) {
+    if (it != allocation.end() && it->second->size() <= upper_size) {
+      size_t sz = it->second->size();
       auto ret = std::move(it->second);
       allocation.erase(it);
+      VLOG(3) << "Allocate " << sz << "(required " << size
+              << ") from cache directly";
       return ret.release();
     }
   }
 
+  size_t retry_time = 1;
+  while (true) {
     try {
-      return underlying_allocator_->Allocate(size, attr).release();
+      auto ret = underlying_allocator_->Allocate(size, attr).release();
+      VLOG(2) << "Allocate " << size << " from underlying directly";
+      return ret;
     } catch (BadAlloc &) {
-      VLOG(2) << "BadAlloc raises, try to free " << size << " caches";
-      FreeCache(size, bin_index);
-      return underlying_allocator_->Allocate(size, attr).release();
+      VLOG(1) << retry_time << "-th BadAlloc raises, try to free " << size
+              << " bytes caches";
+      // size_t actual_free_size = FreeCache(size, bin_index);
+      size_t actual_free_size = FreeCache(-1UL, bin_index);
+      VLOG(1) << retry_time << "-th free " << actual_free_size
+              << " bytes caches";
+      if (actual_free_size == 0) throw;
     }
+    ++retry_time;
+  }
 }
...
paddle/fluid/memory/allocation/multi_bin_buffered_allocator.h

@@ -41,7 +41,7 @@ class MultiBinBufferedAllocator : public Allocator {
   void FreeImpl(Allocation *allocation) override;
 
  private:
-  void FreeCache(size_t size, size_t bin_index);
+  size_t FreeCache(size_t size, size_t bin_index);
 
   std::shared_ptr<Allocator> underlying_allocator_;
   std::vector<std::multimap<size_t, AllocationPtr>> allocations_;
...
paddle/fluid/memory/detail/buddy_allocator.cc

@@ -25,9 +25,11 @@ namespace detail {
 BuddyAllocator::BuddyAllocator(
     std::unique_ptr<SystemAllocator> system_allocator, size_t min_chunk_size,
-    size_t max_chunk_size)
+    size_t first_allocate_chunk_size, size_t reallocate_chunk_size)
     : min_chunk_size_(min_chunk_size),
-      max_chunk_size_(max_chunk_size),
+      first_allocate_chunk_size_(first_allocate_chunk_size),
+      reallocate_chunk_size_(reallocate_chunk_size),
+      max_chunk_size_(first_allocate_chunk_size),
       cache_(system_allocator->UseGpu()),
       system_allocator_(std::move(system_allocator)) {}
...
@@ -36,9 +38,10 @@ BuddyAllocator::~BuddyAllocator() {
              "have actually been freed";
   while (!pool_.empty()) {
     auto block = static_cast<MemoryBlock *>(std::get<2>(*pool_.begin()));
-    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+    auto desc = cache_.load(block);
+    VLOG(10) << "Free from block (" << block << ", " << desc.size << ")";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, desc.size, desc.index);
     cache_.invalidate(block);
     pool_.erase(pool_.begin());
   }
...
@@ -63,7 +66,7 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // if the allocation is huge, send directly to the system allocator
   if (size > max_chunk_size_) {
     VLOG(10) << "Allocate from system allocator.";
-    return SystemAlloc(size);
+    return SystemAlloc(size, false);
   }
 
   // query and allocate from the existing chunk
...
@@ -72,9 +75,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) {
   // refill the pool if failure
   if (it == pool_.end()) {
     it = RefillPool();
-    // if still failure, fail fatally
+    // if still failure, try to allocate from SystemAllocator
     if (it == pool_.end()) {
-      return nullptr;
+      return SystemAlloc(size, false);
     }
   } else {
     VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
...
@@ -98,7 +101,7 @@ void BuddyAllocator::Free(void* p) {
   VLOG(10) << "Free from address " << block;
 
-  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
+  if (block->type(cache_) == MemoryBlock::UNMANAGED_HUGE_CHUNK) {
     VLOG(10) << "Free directly from system allocator";
     system_allocator_->Free(block, block->total_size(cache_),
                             block->index(cache_));
...
@@ -168,9 +171,12 @@ void BuddyAllocator::Free(void* p) {
 size_t BuddyAllocator::Used() { return total_used_; }
 size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; }
-size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; }
+size_t BuddyAllocator::GetMaxChunkSize() {
+  std::lock_guard<std::mutex> lock(mutex_);
+  return max_chunk_size_;
+}
 
-void* BuddyAllocator::SystemAlloc(size_t size) {
+void* BuddyAllocator::SystemAlloc(size_t size, bool is_managed) {
   size_t index = 0;
   void* p = system_allocator_->Alloc(&index, size);
...
@@ -178,25 +184,23 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
   if (p == nullptr) return nullptr;
 
-  static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index,
-                                     size, nullptr, nullptr);
+  static_cast<MemoryBlock*>(p)->init(
+      &cache_,
+      is_managed ? MemoryBlock::MANAGED_HUGE_CHUNK
+                 : MemoryBlock::UNMANAGED_HUGE_CHUNK,
+      index, size, nullptr, nullptr);
 
   return static_cast<MemoryBlock*>(p)->data();
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
 #ifdef PADDLE_WITH_CUDA
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
-      // Compute the maximum allocation size for the first allocation.
-      max_chunk_size_ = platform::GpuMaxChunkSize();
     }
+    if (total_used_ + total_free_ > 0) {
+      max_chunk_size_ = reallocate_chunk_size_;
+    }
 #endif
 
   // Allocate a new maximum sized block
   size_t index = 0;
-  void* p = system_allocator_->Alloc(&index, max_chunk_size_);
+  size_t chunk_size = max_chunk_size_;
+  void* p = system_allocator_->Alloc(&index, chunk_size);
 
   if (p == nullptr) return pool_.end();
...
@@ -204,7 +208,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
           << " from system allocator";
 
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
-                                     max_chunk_size_, nullptr, nullptr);
+                                     chunk_size, nullptr, nullptr);
 
   // gpu fallback allocation
   if (system_allocator_->UseGpu() &&
...
@@ -212,10 +216,10 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
     fallback_alloc_count_++;
   }
 
-  total_free_ += max_chunk_size_;
+  total_free_ += chunk_size;
 
   // dump the block into pool
-  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+  return pool_.insert(IndexSizeAddress(index, chunk_size, p)).first;
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
...
@@ -271,27 +275,24 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
 void BuddyAllocator::CleanIdleFallBackAlloc() {
   // If fallback allocation does not exist, return directly
-  if (!fallback_alloc_count_) return;
+  if (!fallback_alloc_count_ || !system_allocator_->UseGpu()) return;
 
   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
     MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
 
-    // If no GPU fallback allocator, return
-    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+    auto desc = cache_.load(block);
+    if (desc.index == 0) {
       return;
     }
 
     VLOG(10) << "Return block " << block << " to fallback allocator.";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, desc.size, block->index(cache_));
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
 
-    total_free_ -= max_chunk_size_;
+    total_free_ -= desc.size;
     fallback_alloc_count_--;
 
     // If no fall allocation exists, return directly
...
@@ -315,19 +316,21 @@ void BuddyAllocator::CleanIdleNormalAlloc() {
   if (!shall_free_alloc()) return;
 
   for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
     MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
 
+    auto desc = cache_.load(block);
+    if (desc.type != MemoryBlock::MANAGED_HUGE_CHUNK) {
+      return;
+    }
+
     VLOG(10) << "Return block " << block << " to base allocator.";
 
-    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    system_allocator_->Free(block, desc.size, desc.index);
     cache_.invalidate(block);
 
     pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
 
-    total_free_ -= max_chunk_size_;
+    total_free_ -= desc.size;
 
     if (!shall_free_alloc()) return;
   }
...
paddle/fluid/memory/detail/buddy_allocator.h

@@ -34,7 +34,8 @@ namespace detail {
 class BuddyAllocator {
  public:
   BuddyAllocator(std::unique_ptr<SystemAllocator> system_allocator,
-                 size_t min_chunk_size, size_t max_chunk_size);
+                 size_t min_chunk_size, size_t first_allocate_chunk_size,
+                 size_t reallocate_chunk_size);
 
   ~BuddyAllocator();
...
@@ -57,7 +58,7 @@ class BuddyAllocator {
   using PoolSet = std::set<IndexSizeAddress>;
 
   /*! \brief Allocate fixed-size memory from system */
-  void* SystemAlloc(size_t size);
+  void* SystemAlloc(size_t size, bool is_managed = true);
 
   /*! \brief If existing chunks are not suitable, refill pool */
   PoolSet::iterator RefillPool();
...
@@ -87,7 +88,11 @@ class BuddyAllocator {
   size_t total_free_ = 0;   // the total size of free memory
 
   size_t min_chunk_size_;   // the minimum size of each chunk
-  size_t max_chunk_size_;   // the maximum size of each chunk
+
+  size_t first_allocate_chunk_size_;
+  size_t reallocate_chunk_size_;
+
+  size_t max_chunk_size_;
 
  private:
  /**
...
paddle/fluid/memory/detail/memory_block.h

@@ -29,7 +29,8 @@ struct MemoryBlock {
   enum Type {
     FREE_CHUNK,            // memory is free and idle
     ARENA_CHUNK,           // memory is being occupied
-    HUGE_CHUNK,            // memory is out of management
+    MANAGED_HUGE_CHUNK,    // memory is huge and managed by allocator
+    UNMANAGED_HUGE_CHUNK,  // memory is huge and out of management
     INVALID_CHUNK          // memory is invalid
   };
...
paddle/fluid/operators/benchmark/op_tester.cc

@@ -42,8 +42,8 @@ void OpTester::Init(const OpTesterConfig &config) {
   // Initialize the OpDesc
   if (op_desc_info.Has(config_.op_type)) {
     type_ = config_.op_type;
-    op_desc_.SetType(config_.op_type);
 
+    CreateOpDesc();
     CreateInputVarDesc();
     CreateOutputVarDesc();
   } else {
...
@@ -131,6 +131,40 @@ std::vector<std::string> OpTester::GetOpProtoOutputNames() {
   return output_names;
 }
 
+std::unordered_map<std::string, framework::proto::AttrType>
+OpTester::GetOpProtoAttrNames() {
+  std::unordered_map<std::string, framework::proto::AttrType> attr_types;
+  const framework::proto::OpProto &proto =
+      framework::OpInfoMap::Instance().Get(type_).Proto();
+  const std::vector<std::string> skipped_attrs = {
+      framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
+      framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(),
+      framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(),
+      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()};
+  for (int i = 0; i != proto.attrs_size(); ++i) {
+    const auto &attr = proto.attrs(i);
+    if (!Has(skipped_attrs, attr.name())) {
+      VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type();
+      attr_types[attr.name()] = attr.type();
+    }
+  }
+  return attr_types;
+}
+
+framework::proto::VarType::Type OpTester::TransToVarType(std::string str) {
+  if (str == "int32") {
+    return framework::proto::VarType::INT32;
+  } else if (str == "int64") {
+    return framework::proto::VarType::INT64;
+  } else if (str == "fp32") {
+    return framework::proto::VarType::FP32;
+  } else if (str == "fp64") {
+    return framework::proto::VarType::FP64;
+  } else {
+    PADDLE_THROW("Unsupported dtype %s.", str.c_str());
+  }
+}
+
 void OpTester::CreateInputVarDesc() {
   std::vector<std::string> input_names = GetOpProtoInputNames();
   for (auto &name : input_names) {
...
@@ -145,11 +179,11 @@ void OpTester::CreateInputVarDesc() {
     // Need to support more type
     var->SetType(framework::proto::VarType::LOD_TENSOR);
     var->SetPersistable(false);
-    var->SetDataType(framework::proto::VarType::FP32);
+    var->SetDataType(TransToVarType(input->dtype));
     var->SetShape(input->dims);
 
     op_desc_.SetInput(name, {var_name});
-    input_lods_[var_name] = input->lod;
+    inputs_[var_name] = *input;
   }
 }
...
@@ -167,6 +201,49 @@ void OpTester::CreateOutputVarDesc() {
   }
 }
 
+void OpTester::CreateOpDesc() {
+  op_desc_.SetType(config_.op_type);
+  std::unordered_map<std::string, framework::proto::AttrType> attr_types =
+      GetOpProtoAttrNames();
+  for (auto item : config_.attrs) {
+    const std::string &name = item.first;
+    if (attr_types.find(name) == attr_types.end()) {
+      LOG(FATAL) << "Operator " << type_ << " do not have attr " << name;
+    }
+
+    const std::string &value_str = item.second;
+    const framework::proto::AttrType &type = attr_types[name];
+    switch (type) {
+      case framework::proto::AttrType::BOOLEAN:
+        break;
+      case framework::proto::AttrType::INT: {
+        int value = StringTo<int>(value_str);
+        op_desc_.SetAttr(name, {value});
+      } break;
+      case framework::proto::AttrType::FLOAT: {
+        float value = StringTo<float>(value_str);
+        op_desc_.SetAttr(name, {value});
+      } break;
+      case framework::proto::AttrType::STRING: {
+        op_desc_.SetAttr(name, {value_str});
+      } break;
+      case framework::proto::AttrType::BOOLEANS:
+      case framework::proto::AttrType::INTS:
+      case framework::proto::AttrType::FLOATS:
+      case framework::proto::AttrType::STRINGS:
+        LOG(FATAL) << "Not supported yet.";
+        break;
+      case framework::proto::AttrType::LONG: {
+        int64_t value = StringTo<int64_t>(value_str);
+        op_desc_.SetAttr(name, value);
+      } break;
+      case framework::proto::AttrType::LONGS:
+      default:
+        PADDLE_THROW("Unsupport attr type %d", type);
+    }
+  }
+}
+
 framework::VarDesc *OpTester::Var(const std::string &name) {
   auto it = vars_.find(name);
   if (it != vars_.end()) {
...
@@ -179,24 +256,41 @@ framework::VarDesc *OpTester::Var(const std::string &name) {
 template <typename T>
 void OpTester::SetupTensor(framework::LoDTensor *tensor,
-                           const std::vector<int64_t> &shape, T lower, T upper) {
+                           const std::vector<int64_t> &shape, T lower, T upper,
+                           const std::string &initializer) {
   static unsigned int seed = 100;
   std::mt19937 rng(seed++);
   std::uniform_real_distribution<double> uniform_dist(0, 1);
 
   T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
-  if (platform::is_cpu_place(place_)) {
-    for (int i = 0; i < tensor->numel(); ++i) {
-      ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
-    }
-  } else {
-    framework::LoDTensor cpu_tensor;
-    T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
-                                            platform::CPUPlace());
+
+  framework::LoDTensor cpu_tensor;
+  T *cpu_ptr = nullptr;
+  if (!platform::is_cpu_place(place_)) {
+    cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
+                                         platform::CPUPlace());
+  } else {
+    cpu_ptr = ptr;
+  }
+
+  if (initializer == "random") {
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
+    }
+  } else if (initializer == "natural") {
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = lower + i;
+    }
+  } else if (initializer == "zeros") {
+    for (int i = 0; i < cpu_tensor.numel(); ++i) {
+      cpu_ptr[i] = 0;
+    }
+  } else {
+    PADDLE_THROW("Unsupported initializer %s.", initializer.c_str());
+  }
+
+  if (!platform::is_cpu_place(place_)) {
     TensorCopySync(cpu_tensor, place_, tensor);
   }
 }
...
@@ -219,7 +313,7 @@ void OpTester::CreateVariables(framework::Scope *scope) {
     }
   }
 
-  for (auto &item : input_lods_) {
+  for (auto &item : inputs_) {
     // Allocate memory for input tensor
     auto &var_name = item.first;
     VLOG(3) << "Allocate memory for tensor " << var_name;
...
@@ -229,11 +323,23 @@ void OpTester::CreateVariables(framework::Scope *scope) {
     auto *var = scope->Var(var_name);
     auto *tensor = var->GetMutable<framework::LoDTensor>();
+    const auto &data_type = var_desc->GetDataType();
+    if (data_type == framework::proto::VarType::INT32) {
+      SetupTensor<int>(tensor, shape, 0, 1, item.second.initializer);
+    } else if (data_type == framework::proto::VarType::INT64) {
+      SetupTensor<int64_t>(tensor, shape, 0, 1, item.second.initializer);
+    } else if (data_type == framework::proto::VarType::FP32) {
       SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
-                         static_cast<float>(1.0));
+                         static_cast<float>(1.0), item.second.initializer);
+    } else if (data_type == framework::proto::VarType::FP64) {
+      SetupTensor<double>(tensor, shape, static_cast<double>(0.0),
+                          static_cast<double>(1.0), item.second.initializer);
+    } else {
+      PADDLE_THROW("Unsupported dtype %d.", data_type);
+    }
 
     VLOG(3) << "Set lod for tensor " << var_name;
-    std::vector<std::vector<size_t>> &lod_vec = item.second;
+    std::vector<std::vector<size_t>> &lod_vec = item.second.lod;
     framework::LoD lod;
     for (size_t i = 0; i < lod_vec.size(); ++i) {
       lod.push_back(lod_vec[i]);
...
@@ -261,7 +367,16 @@ std::string OpTester::DebugString() {
       ss << GenSpaces(count) << "type: LOD_TENSOR\n";
       ss << GenSpaces(count++) << "lod_tensor {\n";
       ss << GenSpaces(count++) << "tensor {\n";
+      const auto &data_type = var->GetDataType();
+      if (data_type == framework::proto::VarType::INT32) {
+        ss << GenSpaces(count) << "data_type: INT32\n";
+      } else if (data_type == framework::proto::VarType::INT64) {
+        ss << GenSpaces(count) << "data_type: INT64\n";
+      } else if (data_type == framework::proto::VarType::FP32) {
        ss << GenSpaces(count) << "data_type: FP32\n";
+      } else if (data_type == framework::proto::VarType::FP64) {
+        ss << GenSpaces(count) << "data_type: FP64\n";
+      }
       std::vector<int64_t> shape = var->GetShape();
       for (auto d : shape) {
         ss << GenSpaces(count) << "dims: " << d << "\n";
...
@@ -288,6 +403,63 @@ std::string OpTester::DebugString() {
     ss << GenSpaces(--count) << "}\n";
   }
   ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
+  for (auto &name : op_desc_.AttrNames()) {
+    ss << GenSpaces(count++) << "attrs {\n";
+    const auto &attr_type = op_desc_.GetAttrType(name);
+    const auto &attr = op_desc_.GetAttr(name);
+    ss << GenSpaces(count) << "name: \"" << name << "\"\n";
+    switch (attr_type) {
+      case framework::proto::AttrType::BOOLEAN: {
+        ss << GenSpaces(count) << "type: BOOLEAN\n";
+        ss << GenSpaces(count) << "b: " << boost::get<bool>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::INT: {
+        ss << GenSpaces(count) << "type: INT\n";
+        ss << GenSpaces(count) << "i: " << boost::get<int>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::FLOAT: {
+        ss << GenSpaces(count) << "type: FLOAT\n";
+        ss << GenSpaces(count) << "f: " << boost::get<float>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::STRING: {
+        ss << GenSpaces(count) << "type: STRING\n";
+        ss << GenSpaces(count) << "s: \"" << boost::get<std::string>(attr)
+           << "\"\n";
+      } break;
+      case framework::proto::AttrType::BOOLEANS: {
+        ss << GenSpaces(count) << "type: BOOLEANS\n";
+        ss << GenSpaces(count) << "bools: "
+           << "\n";
+      } break;
+      case framework::proto::AttrType::INTS: {
+        ss << GenSpaces(count) << "type: INTS\n";
+        ss << GenSpaces(count) << "ints: "
+           << "\n";
+      } break;
+      case framework::proto::AttrType::FLOATS: {
+        ss << GenSpaces(count) << "type: FLOATS\n";
+        ss << GenSpaces(count) << "floats: "
+           << "\n";
+      } break;
+      case framework::proto::AttrType::STRINGS: {
+        ss << GenSpaces(count) << "type: STRINGS\n";
+        ss << GenSpaces(count) << "strings: "
+           << "\n";
+      } break;
+      case framework::proto::AttrType::LONG: {
+        ss << GenSpaces(count) << "type: LONG\n";
+        ss << GenSpaces(count) << "l: " << boost::get<int64_t>(attr) << "\n";
+      } break;
+      case framework::proto::AttrType::LONGS: {
+        ss << GenSpaces(count) << "type: LONGS\n";
+        ss << GenSpaces(count) << "longs: "
+           << "\n";
+      } break;
+      default:
+        PADDLE_THROW("Unsupport attr type %d", attr_type);
+    }
+    ss << GenSpaces(--count) << "}\n";
+  }
   ss << GenSpaces(--count) << "}\n";
   return ss.str();
 }
...
@@ -299,6 +471,7 @@ TEST(op_tester, base) {
                                              FLAGS_op_config_list.c_str());
     std::vector<OpTesterConfig> op_configs;
     while (!fin.eof()) {
+      VLOG(4) << "Reading config " << op_configs.size() << "...";
      OpTesterConfig config;
      bool result = config.Init(fin);
      if (result) {
...
paddle/fluid/operators/benchmark/op_tester.h

@@ -14,7 +14,9 @@ limitations under the License. */
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/op_desc.h"
...
@@ -39,16 +41,21 @@ class OpTester {
  private:
   std::vector<std::string> GetOpProtoInputNames();
   std::vector<std::string> GetOpProtoOutputNames();
+  std::unordered_map<std::string, framework::proto::AttrType>
+  GetOpProtoAttrNames();
+
+  framework::proto::VarType::Type TransToVarType(std::string str);
 
   void CreateInputVarDesc();
   void CreateOutputVarDesc();
+  void CreateOpDesc();
 
   framework::VarDesc *Var(const std::string &name);
   void CreateVariables(framework::Scope *scope);
 
   template <typename T>
   void SetupTensor(framework::LoDTensor *input,
-                   const std::vector<int64_t> &shape, T lower, T upper);
+                   const std::vector<int64_t> &shape, T lower, T upper,
+                   const std::string &initializer);
 
   void RunImpl();
...
@@ -57,7 +64,7 @@ class OpTester {
   std::string type_;
   framework::OpDesc op_desc_;
   std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
-  std::unordered_map<std::string, std::vector<std::vector<size_t>>> input_lods_;
+  std::unordered_map<std::string, OpInputConfig> inputs_;
   std::unique_ptr<framework::OperatorBase> op_;
   platform::Place place_;
   std::unique_ptr<framework::Scope> scope_;
...
paddle/fluid/operators/benchmark/op_tester_config.cc

@@ -14,7 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/benchmark/op_tester_config.h"
 #include <fstream>
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
...
@@ -40,6 +39,62 @@ static void EraseEndSep(std::string* str,
   }
 }
 
+OpInputConfig::OpInputConfig(std::istream& is) {
+  std::string sep;
+  is >> sep;
+  if (sep == kStartSeparator) {
+    while (sep != kEndSeparator) {
+      is >> sep;
+      if (sep == "name" || sep == "name:") {
+        is >> name;
+        EraseEndSep(&name);
+      } else if (sep == "dtype" || sep == "dtype:") {
+        ParseDType(is);
+      } else if (sep == "initializer" || sep == "initializer:") {
+        ParseInitializer(is);
+      } else if (sep == "dims" || sep == "dims:") {
+        ParseDims(is);
+      } else if (sep == "lod" || sep == "lod:") {
+        ParseLoD(is);
+      }
+    }
+  }
+}
+
+void OpInputConfig::ParseDType(std::istream& is) {
+  std::string dtype_str;
+  is >> dtype_str;
+  EraseEndSep(&dtype_str);
+  if (dtype_str == "int32" || dtype_str == "int") {
+    dtype = "int32";
+  } else if (dtype_str == "int64" || dtype_str == "long") {
+    dtype = "int64";
+  } else if (dtype_str == "fp32" || dtype_str == "float") {
+    dtype = "fp32";
+  } else if (dtype_str == "fp64" || dtype_str == "double") {
+    dtype = "fp64";
+  } else {
+    PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str());
+  }
+  VLOG(4) << "dtype of input " << name << " is: " << dtype;
+}
+
+void OpInputConfig::ParseInitializer(std::istream& is) {
+  std::string initializer_str;
+  is >> initializer_str;
+  EraseEndSep(&initializer_str);
+
+  const std::vector<std::string> supported_initializers = {"random", "natural",
+                                                           "zeros"};
+  if (!Has(supported_initializers, initializer_str)) {
+    PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str());
+  }
+
+  initializer = initializer_str;
+  VLOG(4) << "initializer of input " << name << " is: " << initializer;
+}
+
 void OpInputConfig::ParseDims(std::istream& is) {
   std::string dims_str;
   is >> dims_str;
...
@@ -84,7 +139,7 @@ void OpInputConfig::ParseLoD(std::istream& is) {
         number += lod_str[i];
         ++i;
       }
-      level.push_back(atoi(number.c_str()));
+      level.push_back(StringTo<size_t>(number));
     }
     lod.push_back(level);
   } else if (lod_str[i] == '}') {
...
@@ -93,24 +148,6 @@ void OpInputConfig::ParseLoD(std::istream& is) {
   }
 }
 
-OpInputConfig::OpInputConfig(std::istream& is) {
-  std::string sep;
-  is >> sep;
-  if (sep == kStartSeparator) {
-    while (sep != kEndSeparator) {
-      is >> sep;
-      if (sep == "name" || sep == "name:") {
-        is >> name;
-        EraseEndSep(&name);
-      } else if (sep == "dims" || sep == "dims:") {
-        ParseDims(is);
-      } else if (sep == "lod" || sep == "lod:") {
-        ParseLoD(is);
-      }
-    }
-  }
-}
-
 OpTesterConfig::OpTesterConfig(const std::string& filename) {
   std::ifstream fin(filename, std::ios::in | std::ios::binary);
   PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
...
@@ -167,6 +204,7 @@ bool OpTesterConfig::ParseAttrs(std::istream& is) {
   is >> value;
   EraseEndSep(&key, ":");
   EraseEndSep(&value);
+  VLOG(4) << "attrs: " << key << ", " << value;
 
   attrs[key] = value;
 }
...
paddle/fluid/operators/benchmark/op_tester_config.h

@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <istream>
+#include <sstream>
 #include <string>
 #include <unordered_map>
 #include <vector>
...
@@ -27,10 +28,14 @@ struct OpInputConfig {
   OpInputConfig() {}
   explicit OpInputConfig(std::istream& is);
 
+  void ParseDType(std::istream& is);
+  void ParseInitializer(std::istream& is);
   void ParseDims(std::istream& is);
   void ParseLoD(std::istream& is);
 
   std::string name;
+  std::string dtype{"fp32"};  // int32/int, int64/long, fp32/float, fp64/double
+  std::string initializer{"random"};  // random, natural
   std::vector<int64_t> dims;
   std::vector<std::vector<size_t>> lod;
 };
...
@@ -55,6 +60,23 @@ struct OpTesterConfig {
   double runtime{0.0};
 };
 
+static bool Has(const std::vector<std::string>& vec, const std::string& item) {
+  for (size_t i = 0; i < vec.size(); ++i) {
+    if (vec[i] == item) {
+      return true;
+    }
+  }
+  return false;
+}
+
+template <typename T>
+T StringTo(const std::string& str) {
+  std::istringstream is(str);
+  T value;
+  is >> value;
+  return value;
+}
+
 }  // namespace benchmark
 }  // namespace operators
 }  // namespace paddle
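A short usage sketch for the two helpers added above; the values are illustrative only.

// Has() does a linear membership test; StringTo<T>() parses via std::istringstream.
const std::vector<std::string> supported = {"random", "natural", "zeros"};
bool known = Has(supported, "zeros");     // true
int repeat = StringTo<int>("16");         // 16
double ratio = StringTo<double>("0.5");   // 0.5
size_t level = StringTo<size_t>("128");   // the same helper ParseLoD now uses instead of atoi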
paddle/fluid/operators/cast_op.cc

@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/cast_op.h"
+#include <memory>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
...
@@ -30,7 +31,8 @@ class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 Cast Operator.
 
 This Operator casts the input tensor to another data type and
-returns tha Output Tensor.
+returns the Output Tensor. It's meaningless if the output dtype equals
+the input dtype, but it's fine if you do so.
 
 )DOC");
   }
...
paddle/fluid/operators/detection/CMakeLists.txt

@@ -33,11 +33,14 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
 detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
+detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 
 if(WITH_GPU)
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub)
+  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub)
 else()
   detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
+  detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc)
 endif()
 
 detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
...
paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
namespace paddle {
namespace operators {

using LoDTensor = framework::LoDTensor;

class BoxDecoderAndAssignOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(
        ctx->HasInput("PriorBox"),
        "Input(PriorBox) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("PriorBoxVar"),
        "Input(PriorBoxVar) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("TargetBox"),
        "Input(TargetBox) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasInput("BoxScore"),
        "Input(BoxScore) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("DecodeBox"),
        "Output(DecodeBox) of BoxDecoderAndAssignOp should not be null.");
    PADDLE_ENFORCE(
        ctx->HasOutput("OutputAssignBox"),
        "Output(OutputAssignBox) of BoxDecoderAndAssignOp should not be null.");

    auto prior_box_dims = ctx->GetInputDim("PriorBox");
    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
    auto target_box_dims = ctx->GetInputDim("TargetBox");
    auto box_score_dims = ctx->GetInputDim("BoxScore");

    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
                      "The rank of Input of PriorBox must be 2");
    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
    PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 1,
                      "The rank of Input of PriorBoxVar must be 1");
    PADDLE_ENFORCE_EQ(prior_box_var_dims[0], 4,
                      "The shape of PriorBoxVar is [4]");
    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
                      "The rank of Input of TargetBox must be 2");
    PADDLE_ENFORCE_EQ(box_score_dims.size(), 2,
                      "The rank of Input of BoxScore must be 2");
    PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0],
                      "The first dim of prior_box and target_box is roi nums "
                      "and should be same!");
    PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0],
                      "The first dim of prior_box and box_score is roi nums "
                      "and should be same!");
    PADDLE_ENFORCE_EQ(
        target_box_dims[1], box_score_dims[1] * prior_box_dims[1],
        "The shape of target_box is [N, classnum * 4], The shape "
        "of box_score is [N, classnum], The shape of prior_box "
        "is [N, 4]");

    ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0],
                                                         target_box_dims[1]}));
    ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox");
    ctx->SetOutputDim(
        "OutputAssignBox",
        framework::make_ddim({prior_box_dims[0], prior_box_dims[1]}));
    ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox");
  }
};

class BoxDecoderAndAssignOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput(
        "PriorBox",
        "(Tensor, default Tensor<float>) "
        "Box list PriorBox is a 2-D Tensor with shape [N, 4] which holds N "
        "boxes and each box is represented as [xmin, ymin, xmax, ymax], "
        "[xmin, ymin] is the left top coordinate of the anchor box, "
        "if the input is image feature map, they are close to the origin "
        "of the coordinate system. [xmax, ymax] is the right bottom "
        "coordinate of the anchor box.");
    AddInput("PriorBoxVar",
             "(Tensor, default Tensor<float>, optional) "
             "PriorBoxVar is a 2-D Tensor with shape [N, 4] which holds N "
             "group of variance. PriorBoxVar will set all elements to 1 by "
             "default.")
        .AsDispensable();
    AddInput("TargetBox",
             "(LoDTensor or Tensor) "
             "This input can be a 2-D LoDTensor with shape "
             "[N, classnum*4]. It holds N targets for N boxes.");
    AddInput("BoxScore",
             "(LoDTensor or Tensor) "
             "This input can be a 2-D LoDTensor with shape "
             "[N, classnum], each box is represented as [classnum] which is "
             "the classification probabilities.");
    AddAttr<float>("box_clip",
                   "(float, default 4.135, np.log(1000. / 16.)) "
                   "clip box to prevent overflowing")
        .SetDefault(4.135f);
    AddOutput("DecodeBox",
              "(LoDTensor or Tensor) "
              "the output tensor of op with shape [N, classnum * 4] "
              "representing the result of N target boxes decoded with "
              "M Prior boxes and variances for each class.");
    AddOutput("OutputAssignBox",
              "(LoDTensor or Tensor) "
              "the output tensor of op with shape [N, 4] "
              "representing the result of N target boxes decoded with "
              "M Prior boxes and variances with the best non-background class "
              "by BoxScore.");
    AddComment(R"DOC(

Bounding Box Coder.

Decode the target bounding box with the prior_box information.

The Decoding schema is described below:

    $$
    ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2}
    $$
    $$
    oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2}
    $$
    $$
    ow = \\exp (pwv \\times tw) \\times pw + \\frac{tw}{2}
    $$
    $$
    oh = \\exp (phv \\times th) \\times ph + \\frac{th}{2}
    $$

where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width
and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the
prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`,
`phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the
decoded coordinates, width and height in decode_box.

decode_box is obtained after box decode, then assigning schema is described below:

For each prior_box, use the best non-background class's decoded values to
update the prior_box locations and get output_assign_box. So, the shape of
output_assign_box is the same as PriorBox.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(box_decoder_and_assign, ops::BoxDecoderAndAssignOp,
                  ops::BoxDecoderAndAssignOpMaker,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
    box_decoder_and_assign,
    ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, float>,
    ops::BoxDecoderAndAssignKernel<paddle::platform::CPUDeviceContext, double>);
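For reviewers, here is a minimal standalone sketch (written for this review, not part of the patch) of the per-box arithmetic that the DOC block above describes and that both the CPU and CUDA kernels below implement. Prior boxes are [xmin, ymin, xmax, ymax], the variance has four entries, and dw/dh are clipped to box_clip (default 4.135, roughly log(1000/16)) so exp() cannot overflow.

#include <algorithm>
#include <cmath>

// Decode one target box against one prior box, mirroring DecodeBoxKernel.
void DecodeOneBox(const float prior[4], const float var[4],
                  const float target[4], float box_clip, float out[4]) {
  float pw = prior[2] - prior[0] + 1.f;          // prior width
  float ph = prior[3] - prior[1] + 1.f;          // prior height
  float px = prior[0] + pw / 2.f;                // prior center x
  float py = prior[1] + ph / 2.f;                // prior center y
  float dw = std::min(var[2] * target[2], box_clip);
  float dh = std::min(var[3] * target[3], box_clip);
  float ox = var[0] * target[0] * pw + px;
  float oy = var[1] * target[1] * ph + py;
  float ow = std::exp(dw) * pw;
  float oh = std::exp(dh) * ph;
  out[0] = ox - ow / 2.f;
  out[1] = oy - oh / 2.f;
  out[2] = ox + ow / 2.f - 1.f;
  out[3] = oy + oh / 2.f - 1.f;
}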
paddle/fluid/operators/detection/box_decoder_and_assign_op.cu
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {

template <typename T>
__global__ void DecodeBoxKernel(const T* prior_box_data,
                                const T* prior_box_var_data,
                                const T* target_box_data, const int roi_num,
                                const int class_num, const T box_clip,
                                T* output_box_data) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < roi_num * class_num) {
    int i = idx / class_num;
    int j = idx % class_num;
    T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
    T prior_box_height =
        prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
    T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
    T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;

    int offset = i * class_num * 4 + j * 4;
    T dw = prior_box_var_data[2] * target_box_data[offset + 2];
    T dh = prior_box_var_data[3] * target_box_data[offset + 3];
    if (dw > box_clip) {
      dw = box_clip;
    }
    if (dh > box_clip) {
      dh = box_clip;
    }
    T target_box_center_x = 0, target_box_center_y = 0;
    T target_box_width = 0, target_box_height = 0;
    target_box_center_x =
        prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
        prior_box_center_x;
    target_box_center_y =
        prior_box_var_data[1] * target_box_data[offset + 1] * prior_box_height +
        prior_box_center_y;
    target_box_width = expf(dw) * prior_box_width;
    target_box_height = expf(dh) * prior_box_height;

    output_box_data[offset] = target_box_center_x - target_box_width / 2;
    output_box_data[offset + 1] = target_box_center_y - target_box_height / 2;
    output_box_data[offset + 2] =
        target_box_center_x + target_box_width / 2 - 1;
    output_box_data[offset + 3] =
        target_box_center_y + target_box_height / 2 - 1;
  }
}

template <typename T>
__global__ void AssignBoxKernel(const T* prior_box_data,
                                const T* box_score_data, T* output_box_data,
                                const int roi_num, const int class_num,
                                T* output_assign_box_data) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < roi_num) {
    int i = idx;
    T max_score = -1;
    int max_j = -1;
    for (int j = 0; j < class_num; ++j) {
      T score = box_score_data[i * class_num + j];
      if (score > max_score && j > 0) {
        max_score = score;
        max_j = j;
      }
    }
    if (max_j > 0) {
      for (int pno = 0; pno < 4; pno++) {
        output_assign_box_data[i * 4 + pno] =
            output_box_data[i * class_num * 4 + max_j * 4 + pno];
      }
    } else {
      for (int pno = 0; pno < 4; pno++) {
        output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
      }
    }
  }
}

template <typename DeviceContext, typename T>
class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
                   "This kernel only runs on GPU device.");
    auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
    auto* output_box = context.Output<framework::Tensor>("DecodeBox");
    auto* output_assign_box =
        context.Output<framework::Tensor>("OutputAssignBox");

    auto roi_num = target_box->dims()[0];
    auto class_num = box_score->dims()[1];
    auto* target_box_data = target_box->data<T>();
    auto* prior_box_data = prior_box->data<T>();
    auto* prior_box_var_data = prior_box_var->data<T>();
    auto* box_score_data = box_score->data<T>();
    output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
    output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
    T* output_box_data = output_box->data<T>();
    T* output_assign_box_data = output_assign_box->data<T>();

    int block = 512;
    int grid = (roi_num * class_num + block - 1) / block;
    auto& device_ctx = context.cuda_device_context();

    const T box_clip = context.Attr<T>("box_clip");

    DecodeBoxKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
        prior_box_data, prior_box_var_data, target_box_data, roi_num,
        class_num, box_clip, output_box_data);

    context.device_context().Wait();
    int assign_grid = (roi_num + block - 1) / block;
    AssignBoxKernel<T><<<assign_grid, block, 0, device_ctx.stream()>>>(
        prior_box_data, box_score_data, output_box_data, roi_num, class_num,
        output_assign_box_data);
    context.device_context().Wait();
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    box_decoder_and_assign,
    ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
                                       float>,
    ops::BoxDecoderAndAssignCUDAKernel<paddle::platform::CUDADeviceContext,
                                       double>);
paddle/fluid/operators/detection/box_decoder_and_assign_op.h
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class BoxDecoderAndAssignKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* prior_box = context.Input<framework::LoDTensor>("PriorBox");
    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto* box_score = context.Input<framework::LoDTensor>("BoxScore");
    auto* output_box = context.Output<framework::Tensor>("DecodeBox");
    auto* output_assign_box =
        context.Output<framework::Tensor>("OutputAssignBox");
    int roi_num = target_box->dims()[0];
    int class_num = box_score->dims()[1];
    auto* target_box_data = target_box->data<T>();
    auto* prior_box_data = prior_box->data<T>();
    auto* prior_box_var_data = prior_box_var->data<T>();
    auto* box_score_data = box_score->data<T>();
    output_box->mutable_data<T>({roi_num, class_num * 4}, context.GetPlace());
    output_assign_box->mutable_data<T>({roi_num, 4}, context.GetPlace());
    T* output_box_data = output_box->data<T>();
    T* output_assign_box_data = output_assign_box->data<T>();
    const T bbox_clip = context.Attr<T>("box_clip");

    for (int i = 0; i < roi_num; ++i) {
      T prior_box_width =
          prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1;
      T prior_box_height =
          prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1;
      T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2;
      T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2;
      for (int j = 0; j < class_num; ++j) {
        int64_t offset = i * class_num * 4 + j * 4;
        T dw = std::min(prior_box_var_data[2] * target_box_data[offset + 2],
                        bbox_clip);
        T dh = std::min(prior_box_var_data[3] * target_box_data[offset + 3],
                        bbox_clip);
        T target_box_center_x = 0, target_box_center_y = 0;
        T target_box_width = 0, target_box_height = 0;
        target_box_center_x =
            prior_box_var_data[0] * target_box_data[offset] * prior_box_width +
            prior_box_center_x;
        target_box_center_y = prior_box_var_data[1] *
                                  target_box_data[offset + 1] *
                                  prior_box_height +
                              prior_box_center_y;
        target_box_width = std::exp(dw) * prior_box_width;
        target_box_height = std::exp(dh) * prior_box_height;

        output_box_data[offset] = target_box_center_x - target_box_width / 2;
        output_box_data[offset + 1] =
            target_box_center_y - target_box_height / 2;
        output_box_data[offset + 2] =
            target_box_center_x + target_box_width / 2 - 1;
        output_box_data[offset + 3] =
            target_box_center_y + target_box_height / 2 - 1;
      }

      T max_score = -1;
      int max_j = -1;
      for (int j = 0; j < class_num; ++j) {
        T score = box_score_data[i * class_num + j];
        if (score > max_score && j > 0) {
          max_score = score;
          max_j = j;
        }
      }

      if (max_j > 0) {
        for (int pno = 0; pno < 4; pno++) {
          output_assign_box_data[i * 4 + pno] =
              output_box_data[i * class_num * 4 + max_j * 4 + pno];
        }
      } else {
        for (int pno = 0; pno < 4; pno++) {
          output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno];
        }
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
namespace paddle {
namespace operators {

class DistributeFpnProposalsOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("FpnRois"),
                   "Input(FpnRois) shouldn't be null");
    PADDLE_ENFORCE_GE(
        ctx->Outputs("MultiFpnRois").size(), 1UL,
        "Outputs(MultiFpnRois) of DistributeOp should not be empty");
    size_t min_level = static_cast<size_t>(ctx->Attrs().Get<int>("min_level"));
    size_t max_level = static_cast<size_t>(ctx->Attrs().Get<int>("max_level"));
    PADDLE_ENFORCE_GE(max_level, min_level,
                      "max_level must not lower than min_level");
    // Set the output shape
    size_t num_out_rois = max_level - min_level + 1;
    std::vector<framework::DDim> outs_dims;
    outs_dims.reserve(num_out_rois);
    for (size_t i = 0; i < num_out_rois; ++i) {
      framework::DDim out_dim = {-1, 4};
      outs_dims.push_back(out_dim);
    }
    ctx->SetOutputsDim("MultiFpnRois", outs_dims);
    ctx->SetOutputDim("RestoreIndex", {1, -1});
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois"));
    return framework::OpKernelType(data_type, platform::CPUPlace());
  }
};

class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)");
    AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator")
        .AsDuplicable();
    AddOutput("RestoreIndex",
              "(Tensor) An array of positive number which is "
              "used to restore the order of FpnRois");
    AddAttr<int>("min_level",
                 "The lowest level of FPN layer where the"
                 " proposals come from");
    AddAttr<int>("max_level",
                 "The highest level of FPN layer where the"
                 " proposals come from");
    AddAttr<int>("refer_level",
                 "The referring level of FPN layer with"
                 " specified scale");
    AddAttr<int>("refer_scale",
                 "The referring scale of FPN layer with"
                 " specified level");
    AddComment(R"DOC(
This operator distribute all proposals into different fpn level,
with respect to scale of the proposals, the referring scale and
the referring level. Besides, to restore the order of proposals,
we return an array which indicate the original index of rois in
current proposals.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp,
                  ops::DistributeFpnProposalsOpMaker,
                  paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals,
                       ops::DistributeFpnProposalsOpKernel<float>,
                       ops::DistributeFpnProposalsOpKernel<double>);
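A small standalone sketch of the level-assignment rule this operator applies (written for this review, not part of the patch): every RoI is routed to floor(log2(sqrt(area) / refer_scale)) + refer_level, clipped to [min_level, max_level], which is exactly what both the CPU and GPU kernels below compute per box.

#include <algorithm>
#include <cmath>

// roi is [xmin, ymin, xmax, ymax]; non-normalized boxes use the
// (w + 1) * (h + 1) area, matching BBoxArea()/RoIArea() below.
int TargetFpnLevel(const float roi[4], int min_level, int max_level,
                   int refer_level, int refer_scale) {
  float w = roi[2] - roi[0] + 1.f;
  float h = roi[3] - roi[1] + 1.f;
  float scale = std::sqrt(w * h);
  int lvl = static_cast<int>(
      std::floor(std::log2(scale / refer_scale) + refer_level));
  return std::min(max_level, std::max(lvl, min_level));
}
// For example, with min_level=2, max_level=5, refer_level=4, refer_scale=224,
// a roughly 112x112 RoI lands on level 3.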
paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/memory/allocation/allocator.h>
#include "cub/cub.cuh"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
#include "paddle/fluid/operators/gather.cu.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {

using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;

#define CUDA_1D_KERNEL_LOOP(i, n)                              \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)

int const BBoxSize = 4;

struct RangeInitFunctor {
  int start_;
  int delta_;
  int* out_;
  __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; }
};

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}

static inline void TransLoD(const int* length_lod, const int lod_size,
                            int* offset_lod) {
  int offset = 0;
  for (int i = 0; i < lod_size; ++i) {
    offset_lod[i] = offset;
    offset += length_lod[i];
  }
}

template <typename T>
static __device__ inline T RoIArea(const T* box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are is invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
    return static_cast<T>(0.);
  } else {
    const T w = box[2] - box[0];
    const T h = box[3] - box[1];
    if (normalized) {
      return w * h;
    } else {
      // If coordinate values are not within range [0, 1].
      return (w + 1) * (h + 1);
    }
  }
}

template <class T>
static __global__ void GPUDistFpnProposalsHelper(
    const int nthreads, const T* rois, const int lod_size,
    const int refer_level, const int refer_scale, const int max_level,
    const int min_level, int* roi_batch_id_data, int* sub_lod_list,
    int* target_lvls) {
  CUDA_1D_KERNEL_LOOP(i, nthreads) {
    const T* offset_roi = rois + i * BBoxSize;
    int roi_batch_ind = roi_batch_id_data[i];
    // get the target level of current rois
    T roi_area = RoIArea(offset_roi, false);
    T roi_scale = sqrt(roi_area);
    int tgt_lvl = floor(log2(roi_scale / refer_scale) + refer_level);
    tgt_lvl = min(max_level, max(tgt_lvl, min_level));
    target_lvls[i] = tgt_lvl;
    // compute number of rois in the same batch and same target level
    platform::CudaAtomicAdd(sub_lod_list + tgt_lvl * lod_size + roi_batch_ind,
                            1);
  }
}

template <typename DeviceContext, typename T>
class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* fpn_rois = ctx.Input<paddle::framework::LoDTensor>("FpnRois");

    auto multi_fpn_rois = ctx.MultiOutput<LoDTensor>("MultiFpnRois");
    auto* restore_index = ctx.Output<Tensor>("RestoreIndex");

    const int min_level = ctx.Attr<int>("min_level");
    const int max_level = ctx.Attr<int>("max_level");
    const int refer_level = ctx.Attr<int>("refer_level");
    const int refer_scale = ctx.Attr<int>("refer_scale");
    int num_level = max_level - min_level + 1;

    // check that the fpn_rois is not empty
    PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
                      "DistributeFpnProposalsOp need 1 level of LoD");

    auto fpn_rois_lod = fpn_rois->lod().back();
    int lod_size = fpn_rois_lod.size() - 1;
    int roi_num = fpn_rois_lod[lod_size];

    auto& dev_ctx = ctx.template device_context<DeviceContext>();

    // get batch id by lod in CPU
    Tensor roi_batch_id_list;
    roi_batch_id_list.Resize({roi_num});
    int* roi_batch_id_data =
        roi_batch_id_list.mutable_data<int>(platform::CPUPlace());
    for (int n = 0; n < lod_size; ++n) {
      for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) {
        roi_batch_id_data[i] = n;
      }
    }
    // copy batch id list to GPU
    Tensor roi_batch_id_list_gpu;
    framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(),
                              &roi_batch_id_list_gpu);

    Tensor sub_lod_list;
    sub_lod_list.Resize({num_level, lod_size});
    int* sub_lod_list_data = sub_lod_list.mutable_data<int>(dev_ctx.GetPlace());
    Tensor target_lvls;
    target_lvls.Resize({roi_num});
    int* target_lvls_data = target_lvls.mutable_data<int>(dev_ctx.GetPlace());

    int blocks = NumBlocks(roi_num);
    int threads = kNumCUDAThreads;

    // get target levels and sub_lod list
    GPUDistFpnProposalsHelper<T><<<blocks, threads>>>(
        roi_num, fpn_rois->data<T>(), lod_size, refer_level, refer_scale,
        max_level, min_level, roi_batch_id_list_gpu.data<int>(),
        sub_lod_list_data, target_lvls_data);

    Tensor index_in_t;
    int* idx_in = index_in_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
    platform::ForRange<platform::CUDADeviceContext> for_range(dev_ctx, roi_num);
    for_range(RangeInitFunctor{0, 1, idx_in});

    Tensor keys_out_t;
    int* keys_out = keys_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());
    Tensor index_out_t;
    int* idx_out = index_out_t.mutable_data<int>({roi_num}, dev_ctx.GetPlace());

    // Determine temporary device storage requirements
    size_t temp_storage_bytes = 0;
    cub::DeviceRadixSort::SortPairsDescending<int, int>(
        nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in,
        idx_out, roi_num);
    // Allocate temporary storage
    auto place = boost::get<platform::CUDAPlace>(dev_ctx.GetPlace());
    auto d_temp_storage = memory::Alloc(place, temp_storage_bytes,
                                        memory::Allocator::kScratchpad);

    // Run sorting operation
    // sort target level to get corresponding index
    cub::DeviceRadixSort::SortPairsDescending<int, int>(
        d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out,
        idx_in, idx_out, roi_num);

    int* restore_idx_data =
        restore_index->mutable_data<int>({roi_num, 1}, dev_ctx.GetPlace());
    // sort current index to get restore index
    cub::DeviceRadixSort::SortPairsDescending<int, int>(
        d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in,
        restore_idx_data, roi_num);

    Tensor offset_lod;
    int* offset_lod_data =
        offset_lod.mutable_data<int>({lod_size + 1}, dev_ctx.GetPlace());
    for (int i = 0; i < num_level; ++i) {
      Tensor sub_lod = sub_lod_list.Slice(i, i + 1);
      int* sub_lod_data = sub_lod.data<int>();
      // transfer length-based lod to offset-based lod
      TransLoD(sub_lod_data, lod_size + 1, offset_lod_data);
      int sub_rois_num = offset_lod_data[lod_size];
      Tensor sub_idx = index_out_t.Slice(0, sub_rois_num);

      multi_fpn_rois[i]->mutable_data<T>({sub_rois_num, kBoxDim},
                                         dev_ctx.GetPlace());

      GPUGather<T>(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]);
      framework::LoD lod;
      std::vector<size_t> offset;
      memory::Copy(platform::CPUPlace(), offset.data(), place, offset_lod_data,
                   sizeof(int) * (lod_size + 1), 0);
      lod.emplace_back(offset);
      multi_fpn_rois[i]->set_lod(lod);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
    distribute_fpn_proposals,
    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
                                           float>,
    ops::GPUDistributeFpnProposalsOpKernel<paddle::platform::CUDADeviceContext,
                                           double>);
paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <cmath>
#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/gather.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {

const int kBoxDim = 4;

template <typename T>
static inline T BBoxArea(const T* box, bool normalized) {
  if (box[2] < box[0] || box[3] < box[1]) {
    // If coordinate values are is invalid
    // (e.g. xmax < xmin or ymax < ymin), return 0.
    return static_cast<T>(0.);
  } else {
    const T w = box[2] - box[0];
    const T h = box[3] - box[1];
    if (normalized) {
      return w * h;
    } else {
      // If coordinate values are not within range [0, 1].
      return (w + 1) * (h + 1);
    }
  }
}

template <typename T>
class DistributeFpnProposalsOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* fpn_rois = context.Input<paddle::framework::LoDTensor>("FpnRois");

    auto multi_fpn_rois =
        context.MultiOutput<paddle::framework::LoDTensor>("MultiFpnRois");

    auto* restore_index =
        context.Output<paddle::framework::Tensor>("RestoreIndex");

    const int min_level = context.Attr<int>("min_level");
    const int max_level = context.Attr<int>("max_level");
    const int refer_level = context.Attr<int>("refer_level");
    const int refer_scale = context.Attr<int>("refer_scale");
    const int num_level = max_level - min_level + 1;

    // check that the fpn_rois is not empty
    PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL,
                      "DistributeFpnProposalsOp need 1 level of LoD");

    auto fpn_rois_lod = fpn_rois->lod().back();
    int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1];
    std::vector<int> target_level;
    // std::vector<int> target_level(fpn_rois_num, -1);
    // record the number of rois in each level
    std::vector<int> num_rois_level(num_level, 0);
    std::vector<int> num_rois_level_integral(num_level + 1, 0);
    for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
      Tensor fpn_rois_slice =
          fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
      const T* rois_data = fpn_rois_slice.data<T>();
      for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
        // get the target level of current rois
        T roi_scale = std::sqrt(BBoxArea(rois_data, false));
        int tgt_lvl =
            std::floor(std::log2(roi_scale / refer_scale) + refer_level);
        tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level));
        target_level.push_back(tgt_lvl);
        num_rois_level[tgt_lvl - min_level]++;
        rois_data += kBoxDim;
      }
    }
    // define the output rois
    // pointer which point to each level fpn rois
    std::vector<T*> multi_fpn_rois_data(num_level);
    // lod0 which will record the offset information of each level rois
    std::vector<std::vector<size_t>> multi_fpn_rois_lod0;
    for (int i = 0; i < num_level; ++i) {
      // allocate memory for each level rois
      multi_fpn_rois[i]->mutable_data<T>({num_rois_level[i], kBoxDim},
                                         context.GetPlace());
      multi_fpn_rois_data[i] = multi_fpn_rois[i]->data<T>();
      std::vector<size_t> lod0(1, 0);
      multi_fpn_rois_lod0.push_back(lod0);
      // statistic start point for each level rois
      num_rois_level_integral[i + 1] =
          num_rois_level_integral[i] + num_rois_level[i];
    }
    restore_index->mutable_data<int>({1, fpn_rois_num}, context.GetPlace());
    int* restore_index_data = restore_index->data<int>();
    std::vector<int> restore_index_inter(fpn_rois_num, -1);
    // distribute the rois into different fpn level by target level
    for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) {
      Tensor fpn_rois_slice =
          fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]);
      const T* rois_data = fpn_rois_slice.data<T>();
      size_t cur_offset = fpn_rois_lod[i];
      // std::vector<size_t > lod_offset[num_level];
      for (int j = 0; j < num_level; j++) {
        multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]);
      }
      for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) {
        int lvl = target_level[cur_offset + j];
        memcpy(multi_fpn_rois_data[lvl - min_level], rois_data,
               kBoxDim * sizeof(T));
        multi_fpn_rois_data[lvl - min_level] += kBoxDim;
        int index_in_shuffle = num_rois_level_integral[lvl - min_level] +
                               multi_fpn_rois_lod0[lvl - min_level][i + 1];
        restore_index_inter[index_in_shuffle] = cur_offset + j;
        multi_fpn_rois_lod0[lvl - min_level][i + 1]++;
        rois_data += kBoxDim;
      }
    }
    for (int i = 0; i < fpn_rois_num; ++i) {
      restore_index_data[restore_index_inter[i]] = i;
    }
    // merge lod information into LoDTensor
    for (int i = 0; i < num_level; ++i) {
      framework::LoD lod;
      lod.emplace_back(multi_fpn_rois_lod0[i]);
      multi_fpn_rois[i]->set_lod(lod);
    }
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h
...
...
@@ -22,7 +22,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle {
namespace operators {
...
...
@@ -47,7 +46,7 @@ struct EmbeddingVSumFunctor {
    auto* output = output_t->mutable_data<T>(context.GetPlace());
    PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, "The LoD[0] could NOT be empty");

    jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
                                  out_width, jit::SeqPoolType::kSum);
...
...
@@ -83,11 +82,11 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
        FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims());
    const auto& ids_lod = ids_t->lod();
    // in run time, the LoD of ids must be 1
    PADDLE_ENFORCE(ids_lod.size(), 1u,
                   "The LoD level of Input(Ids) must be 1");
    PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
    PADDLE_ENFORCE(ids_lod.size(), 1UL,
                   "The LoD level of Input(Ids) must be 1");
    int64_t batch_size = ids_lod[0].size() - 1;
    // in run time, the shape from Ids -> output
    // should be [seq_length, 1] -> [batch_size, embedding_size]
    // should be [seq_length, 1] -> [batch_size, last_dim]
    output_t->Resize({batch_size, last_dim});

    if (combiner_type == "sum") {
...
...
@@ -125,7 +124,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
      auto* ids_data = ids->data<int64_t>();
      int64_t ids_num = ids->numel();
      auto lod = ids->lod()[0];
      int64_t row_width = d_output->dims()[1];
      int64_t out_width = d_output->dims()[1];

      framework::Vector<int64_t>* new_rows = d_table->mutable_rows();
      new_rows->resize(ids_num);
...
@@ -136,15 +135,13 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
T
*
d_table_data
=
d_table_value
->
mutable_data
<
T
>
(
context
.
GetPlace
());
const
T
*
d_output_data
=
d_output
->
data
<
T
>
();
auto
blas
=
math
::
GetBlas
<
platform
::
CPUDeviceContext
,
T
>
(
context
);
auto
vbroadcast
=
jit
::
Get
<
jit
::
kVBroadcast
,
jit
::
VBroadcastTuples
<
T
>
,
platform
::
CPUPlace
>
(
out_width
);
for
(
int
i
=
0
;
i
<
static_cast
<
int
>
(
lod
.
size
())
-
1
;
++
i
)
{
int64_t
h
=
static_cast
<
int64_t
>
(
lod
[
i
+
1
]
-
lod
[
i
]);
int64_t
in_offset
=
lod
[
i
]
*
row_width
;
const
T
*
out_pos
=
d_output_data
+
i
*
row_width
;
T
*
in_pos
=
d_table_data
+
in_offset
;
for
(
int
r
=
0
;
r
!=
h
;
++
r
)
{
blas
.
VCOPY
(
row_width
,
out_pos
,
in_pos
+
r
*
row_width
);
}
const
T
*
src
=
d_output_data
+
i
*
out_width
;
T
*
dst
=
d_table_data
+
lod
[
i
]
*
out_width
;
vbroadcast
(
src
,
dst
,
h
,
out_width
);
}
}
else
{
LOG
(
ERROR
)
<<
"Dense is not supported in fused_embedding_seq_pool_op now"
;
...
...
paddle/fluid/operators/jit/benchmark.cc
...
...
@@ -474,6 +474,23 @@ void BenchCRFDecodingKernel() {
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void BenchVBroadcastKernel() {
  for (int64_t w : {1, 16, 64, 100, 256}) {
    Tensor x;
    x.Resize({w});
    RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
    const T* x_data = x.data<T>();
    for (int h : TestSizes()) {
      Tensor y;
      y.Resize({h * w});
      T* y_data = y.mutable_data<T>(PlaceType());
      BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
          w, x_data, y_data, static_cast<int64_t>(h), w);
    }
  }
}

using T = float;
using CPUPlace = paddle::platform::CPUPlace;
...
...
@@ -498,6 +515,7 @@ BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
BENCH_FP32_CPU(kVCopy) { BenchXYNKernel<jit::kVCopy, T, CPUPlace>(); }

// lstm and peephole
BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
...
...
@@ -535,6 +553,11 @@ BENCH_FP32_CPU(kCRFDecoding) {
  BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
}

// vbroadcast function
BENCH_FP32_CPU(kVBroadcast) {
  BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
}
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// Options:
...
...
paddle/fluid/operators/jit/gen/CMakeLists.txt
...
...
@@ -33,3 +33,4 @@ USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum)
USE_JITKERNEL_GEN(kEmbSeqPool)
USE_JITKERNEL_GEN(kSgd)
USE_JITKERNEL_GEN(kVBroadcast)
paddle/fluid/operators/jit/gen/vbroadcast.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
#include <memory>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {

void VBroadcastJitCode::genCode() {
  preCode();
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 16;
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  const size_t block_size = sizeof(float) * block;
  std::vector<int> groups(num_groups, max_num_regs);
  int rest_num_regs = num_block % max_num_regs;
  if (rest_num_regs > 0) {
    groups.push_back(rest_num_regs);
  }

  // protect param_h
  mov(reg_height, param_h);
  Label l_next_h;
  xor_(reg_h_i, reg_h_i);
  mov(reg_ptr_dst_i, param_dst);
  L(l_next_h);
  {
    mov(reg_ptr_src_i, param_src);
    for (int num_regs : groups) {
      size_t w_offset = 0;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]);
        w_offset += block_size;
      }
      add(reg_ptr_src_i, num_regs * block_size);

      w_offset = 0;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i));
        w_offset += block_size;
      }
      add(reg_ptr_dst_i, num_regs * block_size);
    }  // end of groups
    inc(reg_h_i);
    cmp(reg_h_i, reg_height);
    jl(l_next_h, T_NEAR);
  }  // end of l_next_h

  postCode();
}

class VBroadcastCreator : public JitCodeCreator<int64_t> {
 public:
  bool UseMe(const int64_t& w) const override {
    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
  }
  size_t CodeSize(const int64_t& w) const override {
    return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
  }
  std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
    PADDLE_ENFORCE_GT(w, 0);
    return make_unique<VBroadcastJitCode>(w, CodeSize(w));
  }
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle

namespace gen = paddle::operators::jit::gen;

REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
paddle/fluid/operators/jit/gen/vbroadcast.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {

class VBroadcastJitCode : public JitCode {
 public:
  explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024,
                             void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), w_(w) {
    this->genCode();
  }

  DECLARE_JIT_CODE(VBroadcastJitCode);
  void genCode() override;

 private:
  int w_;
  reg64_t param_src{abi_param1};
  reg64_t param_dst{abi_param2};
  reg64_t param_h{abi_param3};
  reg64_t param_w{abi_param4};

  reg64_t reg_height{r9};
  reg64_t reg_h_i{r10};
  reg64_t reg_ptr_src_i{r11};
  reg64_t reg_ptr_dst_i{r12};
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/jit/helper.cc
...
...
@@ -36,6 +36,8 @@ const char* to_string(KernelType kt) {
    ONE_CASE(kVScal);
    ONE_CASE(kVAddBias);
    ONE_CASE(kVRelu);
    ONE_CASE(kVBroadcast);
    ONE_CASE(kVCopy);
    ONE_CASE(kVIdentity);
    ONE_CASE(kVExp);
    ONE_CASE(kVSquare);
...
...
paddle/fluid/operators/jit/kernel_base.h
...
...
@@ -41,6 +41,8 @@ typedef enum {
  kVAdd,
  kVAddBias,
  kVAddRelu,
  kVBroadcast,
  kVCopy,
  kVExp,
  kVIdentity,
  kVMul,
...
@@ -133,6 +135,13 @@ struct GRUTuples {
typedef
void
(
*
func_type
)(
gru_t
*
,
const
gru_attr_t
*
);
};
template
<
typename
T
>
struct
VBroadcastTuples
{
typedef
T
data_type
;
typedef
int64_t
attr_type
;
typedef
void
(
*
func_type
)(
const
T
*
,
T
*
,
int64_t
,
int64_t
);
};
typedef
struct
seq_pool_attr_s
{
int
h
,
w
;
// h should always be the first one
SeqPoolType
type
;
...
...
paddle/fluid/operators/jit/kernel_key.cc
...
...
@@ -24,6 +24,11 @@ size_t JitCodeKey<int>(const int& d) {
  return d;
}

template <>
size_t JitCodeKey<int64_t>(const int64_t& d) {
  return d;
}

// TODO(TJ): refine and benchmark JitCodeKey generatation
constexpr int act_type_shift = 3;  // suppot 2^3 act types
static inline int act_type_convert(KernelType type) {
...
...
paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
...
...
@@ -9,9 +9,11 @@ USE_JITKERNEL_MORE(kVAdd, mkl)
USE_JITKERNEL_MORE(kVScal, mkl)
USE_JITKERNEL_MORE(kVExp, mkl)
USE_JITKERNEL_MORE(kVSquare, mkl)
USE_JITKERNEL_MORE(kVCopy, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl)
USE_JITKERNEL_MORE(kSoftmax, mkl)
USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
USE_JITKERNEL_MORE(kSgd, mkl)
USE_JITKERNEL_MORE(kVBroadcast, mkl)
paddle/fluid/operators/jit/more/mkl/mkl.cc
...
...
@@ -154,6 +154,21 @@ bool VSquareKernel<float>::UseMe(const int& d) const {
  return d > 7;
}

template <>
bool VCopyKernel<float>::UseMe(const int& d) const {
  return d > 15;
}

template <>
bool VBroadcastKernel<float>::UseMe(const int64_t& d) const {
  return d > 127;
}

template <>
bool VBroadcastKernel<double>::UseMe(const int64_t& attr) const {
  return true;
}

template <>
bool VSigmoidKernel<float>::UseMe(const int& d) const {
  return d > 7;
...
...
@@ -223,6 +238,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp);
AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
AWALYS_USE_ME_WITH_DOUBLE(VTanh);
AWALYS_USE_ME_WITH_DOUBLE(VSquare);
AWALYS_USE_ME_WITH_DOUBLE(VCopy);
AWALYS_USE_ME_WITH_DOUBLE(Softmax);
#undef AWALYS_USE_ME_WITH_DOUBLE
...
...
@@ -244,6 +260,8 @@ REGISTER_MKL_KERNEL(kVAdd, VAdd);
REGISTER_MKL_KERNEL(kVScal, VScal);
REGISTER_MKL_KERNEL(kVExp, VExp);
REGISTER_MKL_KERNEL(kVSquare, VSquare);
REGISTER_MKL_KERNEL(kVCopy, VCopy);
REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast);
REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
REGISTER_MKL_KERNEL(kVTanh, VTanh);
REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
...
...
paddle/fluid/operators/jit/more/mkl/mkl.h
...
...
@@ -50,6 +50,13 @@ void VCopy(const T* x, T* y, int n);
template <typename T>
void VAXPY(T a, const T* x, T* y, int n);

template <typename T>
void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
  for (int64_t h = 0; h < y_h; ++h) {
    VCopy(x, y + h * x_len, x_len);
  }
}

template <typename T>
void VSigmoid(const T* x, T* y, int n) {
  const T min = SIGMOID_THRESHOLD_MIN;
...
...
@@ -192,6 +199,7 @@ DECLARE_MKL_KERNEL(VExp, XYNTuples);
DECLARE_MKL_KERNEL(VSigmoid, XYNTuples);
DECLARE_MKL_KERNEL(VTanh, XYNTuples);
DECLARE_MKL_KERNEL(VSquare, XYNTuples);
DECLARE_MKL_KERNEL(VCopy, XYNTuples);

DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
...
...
@@ -201,6 +209,8 @@ DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
DECLARE_MKL_KERNEL(Sgd, SgdTuples);

DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples);

#undef DECLARE_MKL_KERNEL

}  // namespace mkl
...
...
paddle/fluid/operators/jit/refer/CMakeLists.txt
...
...
@@ -13,6 +13,7 @@ USE_JITKERNEL_REFER(kVAddRelu)
USE_JITKERNEL_REFER(kVSub)
USE_JITKERNEL_REFER(kVScal)
USE_JITKERNEL_REFER(kVAddBias)
USE_JITKERNEL_REFER(kVCopy)
USE_JITKERNEL_REFER(kVRelu)
USE_JITKERNEL_REFER(kVIdentity)
USE_JITKERNEL_REFER(kVExp)
...
...
@@ -34,3 +35,4 @@ USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kSoftmax)
USE_JITKERNEL_REFER(kEmbSeqPool)
USE_JITKERNEL_REFER(kSgd)
USE_JITKERNEL_REFER(kVBroadcast)
paddle/fluid/operators/jit/refer/refer.cc
...
...
@@ -30,6 +30,7 @@ REGISTER_REFER_KERNEL(kVScal, VScal);
REGISTER_REFER_KERNEL(kVAddBias, VAddBias);
REGISTER_REFER_KERNEL(kVRelu, VRelu);
REGISTER_REFER_KERNEL(kVCopy, VCopy);
REGISTER_REFER_KERNEL(kVIdentity, VIdentity);
REGISTER_REFER_KERNEL(kVSquare, VSquare);
REGISTER_REFER_KERNEL(kVExp, VExp);
...
...
@@ -61,4 +62,6 @@ REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
REGISTER_REFER_KERNEL(kSgd, Sgd);
REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast);
#undef REGISTER_REFER_KERNEL
paddle/fluid/operators/jit/refer/refer.h
...
...
@@ -70,6 +70,20 @@ void VAddBias(const T* a, const T* x, T* y, int n) {
  }
}

template <typename T>
void VCopy(const T* x, T* y, int n) {
  std::memcpy(y, x, n * sizeof(T));
}

// x shape: (x_len)
// y shape: (h, x_len)
template <typename T>
void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
  for (int64_t h = 0; h < y_h; ++h) {
    VCopy(x, y + h * x_len, x_len);
  }
}

template <typename T>
void VRelu(const T* x, T* y, int n) {
  for (int i = 0; i < n; ++i) {
...
@@ -500,6 +514,7 @@ DECLARE_REFER_KERNEL(VExp, XYNTuples);
DECLARE_REFER_KERNEL
(
VSigmoid
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VTanh
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VSquare
,
XYNTuples
);
DECLARE_REFER_KERNEL
(
VCopy
,
XYNTuples
);
// lstm_t*, const lstm_attr_t*
DECLARE_REFER_KERNEL
(
LSTMCtHt
,
LSTMTuples
);
...
...
@@ -528,6 +543,8 @@ DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
DECLARE_REFER_KERNEL(Sgd, SgdTuples);

DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples);

#undef DECLARE_REFER_KERNEL

}  // namespace refer
...
...
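To make the semantics of the new kVBroadcast kernel concrete, here is a tiny standalone sketch (written for this review, not part of the patch): y, of shape (h, x_len), is simply x repeated h times row by row. The refer and mkl implementations above do exactly this via VCopy, and the JIT code does the same with vmovups over YMM registers.

#include <cstring>
#include <vector>

// Reference behavior of VBroadcast: copy x into each of the y_h rows of y.
template <typename T>
void VBroadcastRef(const T* x, T* y, int64_t y_h, int64_t x_len) {
  for (int64_t h = 0; h < y_h; ++h) {
    std::memcpy(y + h * x_len, x, x_len * sizeof(T));
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f};
  std::vector<float> y(2 * x.size());
  VBroadcastRef(x.data(), y.data(), 2, static_cast<int64_t>(x.size()));
  // y is now {1, 2, 3, 1, 2, 3}.
  return 0;
}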
paddle/fluid/operators/jit/test.cc
...
...
@@ -26,8 +26,8 @@ limitations under the License. */
DEFINE_double(acc, 1e-5, "Test accuracy threshold.");

template <typename T>
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
               const T upper = static_cast<T>(20.f)) {
void RandomVec(const int n, T* a, const T lower = static_cast<T>(-2.f),
               const T upper = static_cast<T>(2.f)) {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> uniform_dist(0, 1);
...
...
@@ -157,6 +157,26 @@ struct TestFuncWithRefer<jit::XRNTuples<T>, std::vector<T>, T> {
  }
};

template <typename T>
struct TestFuncWithRefer<jit::VBroadcastTuples<T>, std::vector<T>,
                         std::vector<T>, int64_t,
                         typename jit::VBroadcastTuples<T>::attr_type> {
  void operator()(const typename jit::VBroadcastTuples<T>::func_type tgt,
                  const std::vector<T>& x, const std::vector<T>& yref,
                  int64_t h,
                  const typename jit::VBroadcastTuples<T>::attr_type& attr) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(x.size(), static_cast<size_t>(attr));
    EXPECT_EQ(yref.size(), x.size() * h);
    std::vector<T> y(yref.size());
    const T* x_data = x.data();
    const T* yref_data = yref.data();
    T* y_data = y.data();
    tgt(x_data, y_data, h, attr);
    ExpectEQ<T>(y_data, yref_data, yref.size());
  }
};

template <typename T>
struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
  void operator()(const typename jit::XYNTuples<T>::func_type tgt,
...
...
@@ -514,7 +534,7 @@ void TestKernelXRNTuples() {
    auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
    EXPECT_TRUE(ref != nullptr);
    std::vector<T> x(d);
    RandomVec<T>(d, x.data(), -2.f, 2.f);
    RandomVec<T>(d, x.data());
    T ref_res;
    ref(x.data(), &ref_res, d);
    TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
...
@@ -532,7 +552,7 @@ void TestKernelXYNTuples() {
std
::
vector
<
T
>
x
(
d
),
yref
(
d
);
std
::
vector
<
T
>
xinp
(
d
);
// inplace test
RandomVec
<
T
>
(
d
,
x
.
data
()
,
-
2.
f
,
2.
f
);
RandomVec
<
T
>
(
d
,
x
.
data
());
std
::
copy
(
x
.
begin
(),
x
.
end
(),
xinp
.
begin
());
const
T
*
x_data
=
x
.
data
();
...
...
@@ -566,7 +586,7 @@ void TestKernelLSTMTuples() {
EXPECT_TRUE
(
ref
!=
nullptr
);
std
::
vector
<
T
>
xsrc
(
4
*
d
),
wp
(
3
*
d
),
ct_1
(
d
);
std
::
vector
<
T
>
ct_ref
(
d
),
ht_ref
(
d
),
checked
(
2
*
d
);
RandomVec
<
T
>
(
4
*
d
,
xsrc
.
data
()
,
-
2.
f
,
2.
f
);
RandomVec
<
T
>
(
4
*
d
,
xsrc
.
data
());
RandomVec
<
T
>
(
3
*
d
,
wp
.
data
(),
-
1.
f
,
1.
f
);
RandomVec
<
T
>
(
d
,
ct_1
.
data
(),
-
1.
f
,
1.
f
);
// x could be changed after compute, so copy to save src
...
...
@@ -614,8 +634,8 @@ void TestKernelGRUTuples() {
    auto ref = jit::GetRefer<KT, jit::GRUTuples<T>>();
    EXPECT_TRUE(ref != nullptr);
    std::vector<T> xsrc(3 * d), ht_1(d), ht_ref(d);
    RandomVec<T>(3 * d, xsrc.data(), -2.f, 2.f);
    RandomVec<T>(d, ht_1.data(), -2.f, 2.f);
    RandomVec<T>(3 * d, xsrc.data());
    RandomVec<T>(d, ht_1.data());
    // x could be changed after compute, so copy to save src
    std::vector<T> x(xsrc.size());
    std::copy(xsrc.begin(), xsrc.end(), x.begin());
...
...
@@ -651,7 +671,7 @@ void TestKernelSeqPoolTuples() {
      auto ref = jit::GetRefer<KT, jit::SeqPoolTuples<T>>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> x(h * w), yref(w);
      RandomVec<T>(h * w, x.data(), -2.f, 2.f);
      RandomVec<T>(h * w, x.data());
      const T* x_data = x.data();
      T* yref_data = yref.data();
      ref(x_data, yref_data, &attr);
...
...
@@ -676,8 +696,8 @@ void TestKernelMatMulTuples() {
        auto ref = jit::GetRefer<KT, jit::MatMulTuples<T>>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> a(m * k), b(k * n), c(m * n);
        RandomVec<T>(m * k, a.data(), -2.f, 2.f);
        RandomVec<T>(k * n, b.data(), -2.f, 2.f);
        RandomVec<T>(m * k, a.data());
        RandomVec<T>(k * n, b.data());
        const T* a_data = a.data();
        const T* b_data = b.data();
        T* c_data = c.data();
...
...
@@ -699,7 +719,7 @@ void TestKernelSoftmaxTuples() {
      auto ref = jit::GetRefer<KT, jit::SoftmaxTuples<T>>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> x(bs * n), y(bs * n);
      RandomVec<T>(bs * n, x.data(), -2.f, 2.f);
      RandomVec<T>(bs * n, x.data());
      const T* x_data = x.data();
      T* y_data = y.data();
...
@@ -726,7 +746,7 @@ void TestKernelEmbSeqPoolTuples() {
test_sizes
.
erase
(
std
::
remove
(
test_sizes
.
begin
(),
test_sizes
.
end
(),
1000
));
for
(
int
tbl_w
:
test_sizes
)
{
std
::
vector
<
T
>
table
(
tbl_h
*
tbl_w
);
RandomVec
<
T
>
(
tbl_h
*
tbl_w
,
table
.
data
()
,
-
2.
f
,
2.
f
);
RandomVec
<
T
>
(
tbl_h
*
tbl_w
,
table
.
data
());
const
T
*
table_data
=
table
.
data
();
for
(
auto
type
:
pool_types
)
{
for
(
int
idx_w
:
{
1
,
2
,
10
,
16
})
{
...
...
@@ -772,14 +792,14 @@ void TestKernelSgdTuples() {
    for (int grad_w : TestSizes()) {
      std::vector<T> param(param_h * grad_w);
      std::vector<T> param_out(param_h * grad_w);
      RandomVec<T>(param_h * grad_w, param.data(), -2.f, 2.f);
      RandomVec<T>(param_h * grad_w, param.data());
      const T* param_data = param.data();
      T* out_data = param_out.data();
      for (int rows_size = 1; rows_size <= param_h; ++rows_size) {
        std::vector<T> grad(rows_size * grad_w);
        std::vector<int64_t> rows =
            UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
        RandomVec<T>(rows_size * grad_w, grad.data(), -2.f, 2.f);
        RandomVec<T>(rows_size * grad_w, grad.data());
        const int64_t* rows_data = rows.data();
        const T* grad_data = grad.data();
        auto ref = jit::GetRefer<KT, jit::SgdTuples<T>>();
...
...
@@ -815,8 +835,8 @@ void TestKernelNCHW16CMulNCTuples() {
      int sz = n * c * h * w;
      std::vector<T> x(sz), y(n * c), zref(sz);
      std::vector<T> ztgt(sz), zjit(sz);
      RandomVec<T>(sz, x.data(), -2.f, 2.f);
      RandomVec<T>(n * c, y.data(), -2.f, 2.f);
      RandomVec<T>(sz, x.data());
      RandomVec<T>(n * c, y.data());

      const T* x_data = x.data();
      const T* y_data = y.data();
...
...
@@ -873,11 +893,11 @@ void TestKernelLayerNormTuples() {
        int sz = left * right;
        std::vector<T> x(sz), mean(left), var(left), scale(right), bias(right),
            outref(sz);
        RandomVec<T>(sz, x.data(), -2.f, 2.f);
        RandomVec<T>(left, mean.data(), -2.f, 2.f);
        RandomVec<T>(left, var.data(), -2.f, 2.f);
        RandomVec<T>(right, scale.data(), -2.f, 2.f);
        RandomVec<T>(right, bias.data(), -2.f, 2.f);
        RandomVec<T>(sz, x.data());
        RandomVec<T>(left, mean.data());
        RandomVec<T>(left, var.data());
        RandomVec<T>(right, scale.data());
        RandomVec<T>(right, bias.data());

        const T* scale_data = scale.data();
        const T* bias_data = bias.data();
...
@@ -903,7 +923,7 @@ void TestKernelCRFDecodingTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  constexpr int state_trans_base_idx = 2;
  auto test_sizes = TestSizes();
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 1000));
  test_sizes.erase(std::remove(test_sizes.begin(), test_sizes.end(), 2000));
  for (int seq_len : {1, 11, 17, 50}) {
    for (int tag_num : test_sizes) {
      auto ref = jit::GetRefer<KT, jit::CRFDecodingTuples<T>>();
...
...
@@ -912,8 +932,8 @@ void TestKernelCRFDecodingTuples() {
      int w_sz = (tag_num + state_trans_base_idx) * tag_num;
      std::vector<T> x(x_sz), w(w_sz), alpharef(x_sz);
      std::vector<int> trackref(x_sz);
      RandomVec<T>(x_sz, x.data(), -2.f, 2.f);
      RandomVec<T>(w_sz, w.data(), -2.f, 2.f);
      RandomVec<T>(x_sz, x.data());
      RandomVec<T>(w_sz, w.data());
      ref(seq_len, (const T*)x.data(), (const T*)w.data(), alpharef.data(),
          trackref.data(), tag_num);
...
...
@@ -926,6 +946,27 @@ void TestKernelCRFDecodingTuples() {
  }
}

template <jit::KernelType KT, typename T, typename PlaceType>
void TestKernelVBroadcastTuples() {
  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
  for (int w : TestSizes()) {
    std::vector<T> x(w);
    RandomVec<T>(w, x.data());
    const T* x_data = x.data();
    for (int64_t h : {1, 2, 6}) {
      auto ref = jit::GetRefer<KT, jit::VBroadcastTuples<T>>();
      EXPECT_TRUE(ref != nullptr);
      std::vector<T> y(w * h);
      T* y_data = y.data();
      ref(x_data, y_data, h, w);

      TestAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType, std::vector<T>,
                   std::vector<T>, int64_t>(static_cast<int64_t>(w), x, y, h,
                                            static_cast<int64_t>(w));
    }
  }
}
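The new VBroadcast test above compares each implementation against a reference obtained through jit::GetRefer and called as ref(x_data, y_data, h, w). As a rough mental model (an assumption drawn only from that call signature, not from the Paddle kernel source), the reference tiles a length-w vector h times. A minimal standalone sketch in C++:

#include <cstdint>
#include <vector>

// Hypothetical illustration; not the Paddle jit kernel itself.
template <typename T>
void VBroadcastRef(const T* x, T* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i) {    // repeat the length-w row h times
    for (int64_t j = 0; j < w; ++j) {  // copy the w input values
      y[i * w + j] = x[j];
    }
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f};
  std::vector<float> y(2 * x.size());
  VBroadcastRef(x.data(), y.data(), 2, static_cast<int64_t>(x.size()));
  // y is now {1, 2, 3, 1, 2, 3}
}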
#define TEST_CPU_KERNEL(test_tuple, kernel_type) \
TEST(JITKernel, kernel_type) { \
TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
...
...
@@ -949,6 +990,7 @@ TEST_CPU_KERNEL(XYNTuples, kVSquare);
TEST_CPU_KERNEL(XYNTuples, kVExp);
TEST_CPU_KERNEL(XYNTuples, kVSigmoid);
TEST_CPU_KERNEL(XYNTuples, kVTanh);
TEST_CPU_KERNEL(XYNTuples, kVCopy);

TEST_CPU_KERNEL(LSTMTuples, kLSTMCtHt);
TEST_CPU_KERNEL(LSTMTuples, kLSTMC1H1);
...
...
@@ -966,6 +1008,7 @@ TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool);
TEST_CPU_KERNEL(SgdTuples, kSgd);
TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm);
TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding);
TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast);

TEST(JITKernel_key, lstm) {
  jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);
...
...
paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/requantize_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {

using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
using platform::to_void_cast;
using Tensor = framework::Tensor;
using framework::DataLayout;
using mkldnn::stream;
using platform::GetMKLDNNFormat;

template <typename T>
class ReQuantOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<Tensor>("Input");
    auto scale_in = ctx.Attr<float>("Scale_in");
    auto scale_out = ctx.Attr<float>("Scale_out");
    auto* output = ctx.Output<Tensor>("Output");
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& engine = dev_ctx.GetEngine();

    std::vector<primitive> pipeline;
    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
    mkldnn::memory::data_type src_dt =
        paddle::framework::ToMKLDNNDataType(input->type());
    mkldnn::memory::data_type dst_dt = src_dt;  // TODO(Xiaoli) support
                                                // requantize from different
                                                // data type (e.g., s8 to u8)
    mkldnn::memory::format src_fmt = memory::format::nhwc;
    mkldnn::memory::format dst_fmt = memory::format::nhwc;

    const T* input_data = input->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    float scale_shift = scale_out / scale_in;

    mkldnn::primitive_attr attri;
    int mask = 0;
    attri.set_output_scales(mask, {scale_shift});

    auto src_md = platform::MKLDNNMemDesc({src_tz}, src_dt, src_fmt);
    auto src_pd = mkldnn::memory::primitive_desc(src_md, engine);
    auto src_memory =
        std::make_shared<mkldnn::memory>(src_pd, to_void_cast<T>(input_data));
    std::shared_ptr<primitive::at> src_memory_p =
        std::shared_ptr<primitive::at>(new primitive::at(*src_memory));

    auto dst_md = platform::MKLDNNMemDesc({dst_tz}, dst_dt, dst_fmt);
    auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
    auto dst_memory = mkldnn::memory(dst_pd, to_void_cast<T>(output_data));

    auto reorder_pd = std::shared_ptr<reorder::primitive_desc>(
        new reorder::primitive_desc(src_pd, dst_pd, attri));
    auto reorder_p = std::shared_ptr<reorder>(
        new reorder(*reorder_pd, *src_memory_p, dst_memory));
    pipeline.push_back(*reorder_p);
    stream(stream::kind::eager).submit(pipeline).wait();

    output->set_layout(DataLayout::kMKLDNN);
    output->set_format(GetMKLDNNFormat(dst_memory));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OP_KERNEL(requantize, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::ReQuantOpKernel<int8_t>,
                   ops::ReQuantOpKernel<uint8_t>);
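For intuition about what the reorder with set_output_scales computes: each INT8 value quantized under Scale_in is multiplied by scale_shift = scale_out / scale_in and stored back as INT8. A minimal standalone sketch of that arithmetic, assuming simple round-and-saturate behaviour (illustration only, not the MKL-DNN execution path):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustration only: rescale INT8 data quantized with scale_in so that it is
// expressed under scale_out, which is what the reorder's output scale does.
std::vector<int8_t> RequantizeRef(const std::vector<int8_t>& in, float scale_in,
                                  float scale_out) {
  const float scale_shift = scale_out / scale_in;
  std::vector<int8_t> out(in.size());
  for (std::size_t i = 0; i < in.size(); ++i) {
    float v = std::round(in[i] * scale_shift);  // rescale
    v = std::max(-128.f, std::min(127.f, v));   // saturate to the int8 range
    out[i] = static_cast<int8_t>(v);
  }
  return out;
}

int main() {
  // With Scale_in = 2 and Scale_out = 1 every stored value is halved.
  auto out = RequantizeRef({100, -50, 7}, 2.f, 1.f);  // -> {50, -25, 4}
}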
paddle/fluid/operators/recurrent_op.cc
...
...
@@ -157,11 +157,13 @@ class RecurrentBase : public framework::OperatorBase {
                          const std::vector<std::string> &src_vars,
                          framework::Scope *dst_scope,
                          const std::vector<std::string> &dst_vars,
                          Callback callback) {
                          Callback callback, bool is_backward = false) {
    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
    for (size_t i = 0; i < dst_vars.size(); ++i) {
      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
                   is_backward);
    }
  }
...
...
@@ -173,11 +175,13 @@ class RecurrentBase : public framework::OperatorBase {
                          const std::vector<std::string> &src_vars,
                          const framework::Scope &dst_scope,
                          const std::vector<std::string> &dst_vars,
                          Callback callback) {
                          Callback callback, bool is_backward = false) {
    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
    for (size_t i = 0; i < dst_vars.size(); ++i) {
      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback,
                   is_backward);
    }
  }
...
...
@@ -194,9 +198,13 @@ class RecurrentBase : public framework::OperatorBase {
  static void AccessTensor(const framework::Scope &src_scope,
                           const std::string &src_var_name,
                           framework::Scope *dst_scope,
                           const std::string &dst_var_name, Callback callback) {
                           const std::string &dst_var_name, Callback callback,
                           bool is_backward = false) {
    auto *src_var = src_scope.FindVar(src_var_name);
    PADDLE_ENFORCE(src_var != nullptr);
    if (is_backward && src_var == nullptr) {
      return;
    }
    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
    auto &src_tensor = src_var->Get<framework::LoDTensor>();
    auto *dst_var = dst_scope->Var(dst_var_name);
...
...
@@ -208,12 +216,16 @@ class RecurrentBase : public framework::OperatorBase {
  static void AccessTensor(const framework::Scope &src_scope,
                           const std::string &src_var_name,
                           const framework::Scope &dst_scope,
                           const std::string &dst_var_name, Callback callback) {
                           const std::string &dst_var_name, Callback callback,
                           bool is_backward = false) {
    auto *dst_var = dst_scope.FindVar(dst_var_name);
    if (is_backward && dst_var == nullptr) {
      return;
    }
    auto *src_var = src_scope.FindVar(src_var_name);
    PADDLE_ENFORCE(src_var != nullptr);
    PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name);
    auto &src_tensor = src_var->Get<framework::LoDTensor>();
    auto *dst_var = dst_scope.FindVar(dst_var_name);
    PADDLE_ENFORCE(dst_var != nullptr);
    PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name);
    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
    callback(src_tensor, dst_tensor);
  }
...
...
@@ -345,7 +357,8 @@ class RecurrentGradOp : public RecurrentBase {
          auto dims = framework::vectorize(inside->dims());
          dims.erase(dims.begin());
          inside->Resize(framework::make_ddim(dims));
        });
        },
        true /*is_backward*/);
    auto og_set = List2Set(Inputs(kOutputGrads));

    if (VLOG_IS_ON(10)) {
...
...
@@ -454,7 +467,8 @@ class RecurrentGradOp : public RecurrentBase {
            auto dst = outside->Slice(seq_offset, seq_offset + 1);
            framework::TensorCopy(inside, place, dev_ctx, &dst);
          });
          },
          true /*is_backward*/);
      VLOG(5) << "Link outside gradient finished ";

      if (step_id + 1 == seq_len) {  // at_end
...
...
@@ -467,7 +481,8 @@ class RecurrentGradOp : public RecurrentBase {
              outside->Resize(inside.dims());
              outside->mutable_data(place, inside.type());
              framework::TensorCopy(inside, place, dev_ctx, outside);
            });
            },
            true /*is_backward*/);
        VLOG(5) << "Link initialize state gradient finished ";
      }
      scopes.Next();
...
...
@@ -608,10 +623,8 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
    std::vector<std::string> input{kInputs, kInitialStates};
    std::vector<std::string> output{kOutputs};
    for (auto &s : input) {
      // NOTE(zcd): In some case, some of kInputs doesn't have gradient.
      PADDLE_ENFORCE(ctx->HasInputs(s));
      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
                     "Cannot find the gradient variable %s",
                     framework::GradVarName(s));
    }
    for (auto &s : output) {
      PADDLE_ENFORCE(ctx->HasInputs(s));
...
...
paddle/fluid/operators/requantize_op.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/requantize_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {

framework::OpKernelType ReQuantOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library_ = framework::LibraryType::kMKLDNN;
  framework::DataLayout layout_ = framework::DataLayout::kMKLDNN;

  return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
                                 ctx.GetPlace(), layout_, library_);
}

void ReQuantOpMaker::Make() {
  AddInput("Input", "input data");
  AddOutput("Output", "output data");
  AddAttr<float>("Scale_in", "scale in data").SetDefault({1.0f});
  AddAttr<float>("Scale_out", "scale out data").SetDefault({1.0f});
  AddComment(
      R"DOC(This op will re-quantize data from INT8 with scale_in to INT8 with scale_out)DOC");
}

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
paddle/fluid/operators/requantize_op.h
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {

using framework::OpKernelType;
using framework::Tensor;

class ReQuantOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    ctx->SetOutputDim("Output", ctx->GetInputDim("Input"));
    ctx->ShareLoD("Input", /*->*/ "Output");
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override;
};

class ReQuantOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override;
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/reshape_op.cc
...
...
@@ -56,6 +56,9 @@ class ReshapeOp : public framework::OperatorWithKernel {
  static framework::DDim ValidateShape(const std::vector<int> shape,
                                       const framework::DDim& in_dims) {
    const int64_t in_size = framework::product(in_dims);
    auto in_dims_vec = framework::vectorize(in_dims);
    bool all_positive = std::all_of(in_dims_vec.cbegin(), in_dims_vec.cend(),
                                    [](int64_t i) { return i > 0; });
    // only one dimension can be set to -1, whose size will be automatically
    // infered.
    const int64_t unk_dim_val = -1;
...
...
@@ -88,7 +91,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
    }

    if (unk_dim_idx != -1) {
      if (in_size > 0) {
      if (all_positive) {
        // in_size < 0 and is un-determinate in compile time, skip the check,
        // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8],
        // capacity = -24, in_size = -8, output_shape[0] = 0
...
...
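The all_positive check above matters because the single -1 entry of the target shape can only be inferred when the input capacity is fully known at compile time. A small standalone sketch of that inference (hypothetical helper for illustration, not the operator code):

#include <cassert>
#include <cstdint>
#include <vector>

// Infer the one -1 entry of a target shape when all input dims are positive.
std::vector<int64_t> InferReshape(const std::vector<int64_t>& in_dims,
                                  std::vector<int64_t> shape) {
  int64_t in_size = 1, known = 1;
  int unk_idx = -1;
  for (auto d : in_dims) in_size *= d;
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      unk_idx = static_cast<int>(i);
    } else {
      known *= shape[i];
    }
  }
  if (unk_idx >= 0) {
    assert(in_size % known == 0);  // capacity must divide evenly
    shape[unk_idx] = in_size / known;
  }
  return shape;
}

int main() {
  // A [2, 8] tensor reshaped with {-1, 4} becomes {4, 4}.
  auto out = InferReshape({2, 8}, {-1, 4});
  assert(out[0] == 4 && out[1] == 4);
}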
paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
...
...
@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Ys", "A list of outputs").AsDuplicable();
    AddAttr<std::string>("subgraph", "the subgraph.");
    AddAttr<std::string>("calibration_data", "the calibration data for int8");
    AddAttr<std::string>(
        "engine_serialized_data",
        "the serialized data contains the all info of the ICUDAEngine");
    AddAttr<std::string>(
        "engine_key",
        "The engine_key here is used to distinguish different TRT Engines");
...
...
paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
...
...
@@ -16,8 +16,10 @@
#ifdef PADDLE_WITH_CUDA
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/executor.h"
...
...
@@ -31,37 +33,6 @@ namespace paddle {
namespace operators {

using FluidDT = framework::proto::VarType_Type;
using TRT_DT = nvinfer1::DataType;

namespace {  // NOLINT

TRT_DT FluidDataType2TRT(FluidDT type) {
  switch (type) {
    case FluidDT::VarType_Type_FP32:
      return TRT_DT::kFLOAT;
    case FluidDT::VarType_Type_INT32:
      return TRT_DT::kINT32;
    default:
      return TRT_DT::kINT32;
  }
  PADDLE_THROW("unkown type");
  return TRT_DT::kINT32;
}

nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
  PADDLE_ENFORCE_GT(shape.size(), 1UL,
                    "TensorRT' tensor input requires at least 2 dimensions");
  PADDLE_ENFORCE_LE(shape.size(), 4UL,
                    "TensorRT' tensor input requires at most 4 dimensions");
  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
  if (shape.size() == 4UL)
    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
  return nvinfer1::DimsCHW(shape[1], 1, 1);
}

}  // namespace // NOLINT

using inference::Singleton;
using inference::tensorrt::TensorRTEngine;
using inference::tensorrt::TRTInt8Calibrator;
...
...
@@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
  bool enable_int8_;
  std::string calibration_data_;
  std::string engine_key_;
  std::string engine_serialized_data_;
  bool calibration_mode_;

 public:
...
...
@@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
    enable_int8_ = Attr<bool>("enable_int8");
    calibration_data_ = Attr<std::string>("calibration_data");
    engine_key_ = Attr<std::string>("engine_key");
    engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
    auto params = Attr<std::vector<std::string>>("parameters");
    for (const auto& param : params) {
...
...
@@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
      RunCalibration(scope, dev_place);
      return;
    }
    RunTrt(scope, dev_place);
    auto* trt_engine = GetEngine(scope, dev_place);
    RunTrt(scope, dev_place, trt_engine);
  }

  void RunCalibration(const framework::Scope& scope,
...
...
@@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
    LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_
                         << " is running calibration trt int8... ";
    int runtime_batch = 1;
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& dev_ctx = *pool.Get(dev_place);
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx).stream();
    if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
      TRTCalibratorEngine* calib_res =
          Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
...
...
@@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
          calib_buffers, runtime_batch, engine_key_, dev_place));
      calib_res->thr_.reset(new std::thread([&]() {
        calib_res->engine_.reset(new TensorRTEngine(
            max_batch_size_, workspace_size_, stream,
            boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_,
            calib_res->calib_.get()));
            max_batch_size_, workspace_size_, enable_int8_,
            calib_res->calib_.get(),
            boost::get<platform::CUDAPlace>(dev_place).device));
        VLOG(3) << "start the calib trt engine thread";
        Prepare(scope, dev_place, calib_res->engine_.get());
        PrepareTRTEngine(scope, calib_res->engine_.get());
      }));
    }
...
...
@@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase {
    RunNativeImpl(scope, dev_place);
  }

  void RunTrt(const framework::Scope& scope,
              const platform::Place& dev_place) const {
  void RunTrt(const framework::Scope& scope, const platform::Place& dev_place,
              TensorRTEngine* engine) const {
    int runtime_batch = 1;
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& dev_ctx = *pool.Get(dev_place);
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx).stream();
    if (trt_engine_.get() == nullptr) {
      trt_engine_.reset(
          new TensorRTEngine(max_batch_size_, workspace_size_, stream,
                             boost::get<platform::CUDAPlace>(dev_place).device,
                             enable_int8_, calibrator_.get()));
      Prepare(scope, dev_place, trt_engine_.get());
    }
    auto* engine = trt_engine_.get();
    PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");

    std::vector<std::string> output_maps =
        Attr<std::vector<std::string>>("output_name_mapping");

    // Convert input tensor from fluid to engine.
    int num_inputs = 0;
    for (const auto& x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      num_inputs += 1;
    }
    const int num_bindings = num_inputs + Outputs("Ys").size();
    std::vector<void*> buffers(num_bindings);

    // Bind input tensor to TRT.
    for (const auto& x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      // convert input and copy to TRT engine's buffer
...
...
@@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase {
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
      auto t_shape = framework::vectorize(t.dims());
      runtime_batch = t_shape[0];
      if (platform::is_cpu_place(t.place())) {
        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
                                t.memory_size());
      } else {
        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
                                t.memory_size());
      }
    }
    cudaStreamSynchronize(stream);
    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
    // Execute the engine.
    engine->Execute(runtime_batch);
      const int bind_index = engine->engine()->getBindingIndex(x.c_str());
      PADDLE_ENFORCE(bind_index < num_bindings,
                     "The bind index should be less than num_bindings");
      buffers[bind_index] = static_cast<void*>(t.data<float>());
    }

    // Convert output tensor from engine to fluid
    // Bind output tensor to TRT.
    int output_index = 0;
    VLOG(4) << "TensorRT Engine Op Outputs:";
    for (const auto& y : Outputs("Ys")) {
      VLOG(4) << y;
      // convert output and copy to fluid.
      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
      auto dims = trt_t->getDimensions();
      const int bind_index =
          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
      auto dims = engine->engine()->getBindingDimensions(bind_index);
      // Use the output ITensor's dims to reshape the Fluid Tensor.
      // The ITensor doesn't contain the batch size dim.
      std::vector<int> ddim;
...
...
@@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase {
      for (int i = 0; i < dims.nbDims; i++) {
        ddim.push_back(dims.d[i]);
      }
      auto* fluid_v = scope.FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
      fluid_t->Resize(framework::make_ddim(ddim));

      // TODO(Superjomn) change this float to dtype size.
      auto size =
          inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
      engine->GetOutputInGPU(
          output_maps[output_index],
          fluid_t->mutable_data<float>(platform::CUDAPlace(
              boost::get<platform::CUDAPlace>(dev_place).device)),
          size * sizeof(float));
      PADDLE_ENFORCE(bind_index < num_bindings,
                     "The bind index should be less than num_bindings");
      buffers[bind_index] = static_cast<void*>(fluid_t->mutable_data<float>(
          boost::get<platform::CUDAPlace>(dev_place)));

      output_index += 1;
    }

    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
    // Execute the engine.
    engine->Execute(runtime_batch, &buffers, stream);
    cudaStreamSynchronize(stream);
  }

  void Prepare(const framework::Scope& scope, const platform::Place& dev_place,
  TensorRTEngine* GetEngine(const framework::Scope& scope,
                            const platform::Place& dev_place) const {
    if (!trt_engine_) {
      trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
          max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
          boost::get<platform::CUDAPlace>(dev_place).device));
      if (!engine_serialized_data_.empty()) {
        trt_engine_->Deserialize(engine_serialized_data_);
      } else {
        PrepareTRTEngine(scope, trt_engine_.get());
      }
    }
    return trt_engine_.get();
  }

  void PrepareTRTEngine(const framework::Scope& scope,
                        TensorRTEngine* engine) const {
    LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
                 "kernel etc). This process may cost a lot of time.";
    framework::proto::BlockDesc block_desc;
    block_desc.ParseFromString(Attr<std::string>("subgraph"));
    framework::proto::BlockDesc block_proto;
    block_proto.ParseFromString(Attr<std::string>("subgraph"));
    framework::BlockDesc block_desc(nullptr, &block_proto);

    std::vector<std::string> output_maps =
    std::vector<std::string> inputs = Inputs("Xs");
    std::vector<std::string> outputs =
        Attr<std::vector<std::string>>("output_name_mapping");

    engine->InitNetwork();
    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
    VLOG(4) << "parsed var size " << block.AllVars().size();
    // Add inputs
    VLOG(4) << "declare inputs";
    for (auto& input : Inputs("Xs")) {
      if (param_names_.count(input)) continue;
      VLOG(4) << "declare input " << input;
      auto& t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
      auto t_shape = framework::vectorize(t.dims());
      auto* var = block.FindVar(input);
      // TensorRT engine need to create parameters. The parameter's description
      // should be set in
      PADDLE_ENFORCE(var, "no variable called %s", input);
      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                        "TensorRT engine only takes LoDTensor as input");
      engine->DeclareInput(
          input, FluidDataType2TRT(
                     var->Proto()->type().lod_tensor().tensor().data_type()),
          Vec2TRT_Dims(t_shape));
    }
    inference::Singleton<inference::tensorrt::OpConverter>::Global()
        .ConvertBlock(block_desc, param_names_, scope, engine);
    // Add outputs
    for (auto& output : output_maps) {
      engine->DeclareOutput(output);
    }
    engine->FreezeNetwork();
        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
                                 outputs, engine);
  }
};
...
...
paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
...
...
@@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) {
  engine_op_desc.SetAttr("output_name_mapping",
                         std::vector<std::string>({"z0"}));
  engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));

  LOG(INFO) << "create engine op";
  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
...
...
@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
  engine_op_desc.SetAttr("output_name_mapping",
                         std::vector<std::string>({"z3"}));
  engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));

  auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
...
...
paddle/fluid/platform/gpu_info.cc
...
...
@@ -38,6 +38,22 @@ DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
"additional trunks of the same size will be requested from gpu "
"until the gpu has no memory left for another trunk."
);
DEFINE_double
(
initial_gpu_memory_in_mb
,
-
1.0
,
"GPU memory chunk size in MB."
"Allocator would allocate FLAGS_initial_gpu_memory_in_mb size "
"chunk first and reallocate FLAGS_reallocate_gpu_memory_in_mb size "
"chunk when the first chunk is not enough. This flag has higher priority "
"than FLAGS_fraction_of_gpu_memory_to_use. Disable when less than 0."
);
DEFINE_double
(
reallocate_gpu_memory_in_mb
,
-
1.0
,
"GPU memory chunk size in MB."
"If FLAGS_initial_gpu_memory_in_mb is set and "
"FLAGS_reallocate_gpu_memory_in_mb "
"is less than 0, it would be replaced by "
"FLAGS_initial_gpu_memory_in_mb. Disable "
"when FLAGS_initial_gpu_memory_in_mb is less than 0."
);
DEFINE_bool
(
enable_cublas_tensor_op_math
,
false
,
"The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
...
...
@@ -211,13 +227,54 @@ size_t GpuMaxChunkSize() {
  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
                                          (total - reserving));

  PADDLE_ENFORCE_LE(allocating, available,
                    "Insufficient GPU memory to allocation.");

  return allocating;
}

size_t GpuFirstAllocateChunkSize() {
  if (FLAGS_initial_gpu_memory_in_mb <= 0) {
    return GpuMaxChunkSize();
  }

  size_t total = 0;
  size_t available = 0;

  GpuMemoryUsage(&available, &total);
  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
           << total / 1024 / 1024 << "M";

  size_t initial_mem =
      static_cast<size_t>(FLAGS_initial_gpu_memory_in_mb * (1 << 20));
  PADDLE_ENFORCE_LE(initial_mem, available,
                    "Insufficient GPU memory to allocation.");
  return initial_mem;
}

size_t GpuReAllocateChunkSize() {
  if (FLAGS_initial_gpu_memory_in_mb <= 0) {
    return GpuMaxChunkSize();
  }

  double reallocate_mem = FLAGS_reallocate_gpu_memory_in_mb;
  if (reallocate_mem < 0) {
    PADDLE_ENFORCE(FLAGS_initial_gpu_memory_in_mb > 0,
                   "FLAGS_init_gpu_memory_to_use_mb must be larger than 0");
    reallocate_mem = FLAGS_initial_gpu_memory_in_mb;
  }

  size_t total = 0;
  size_t available = 0;
  GpuMemoryUsage(&available, &total);
  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
           << total / 1024 / 1024 << "M";

  size_t realloc_mem = static_cast<size_t>(reallocate_mem * (1 << 20));
  PADDLE_ENFORCE_LE(realloc_mem, available,
                    "Insufficient GPU memory to allocation.");
  return realloc_mem;
}

void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind, cudaStream_t stream) {
  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
...
...
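Taken together, the two new flags appear to take precedence over FLAGS_fraction_of_gpu_memory_to_use when choosing the buddy allocator's chunk sizes, with the reallocation size falling back to the initial size when unset. A rough standalone sketch of that precedence (an assumed simplification of GpuFirstAllocateChunkSize/GpuReAllocateChunkSize; the real functions also verify available memory):

#include <cstddef>
#include <cstdio>

// Hypothetical stand-ins for the gflags values, for illustration only.
double fraction_of_gpu_memory_to_use = 0.92;
double initial_gpu_memory_in_mb = 500.0;    // first chunk: 500 MB
double reallocate_gpu_memory_in_mb = -1.0;  // unset -> falls back to initial

std::size_t FirstChunkBytes(std::size_t free_bytes) {
  if (initial_gpu_memory_in_mb <= 0) {  // flag disabled: use the fraction
    return static_cast<std::size_t>(fraction_of_gpu_memory_to_use * free_bytes);
  }
  return static_cast<std::size_t>(initial_gpu_memory_in_mb) << 20;
}

std::size_t ReallocChunkBytes(std::size_t free_bytes) {
  if (initial_gpu_memory_in_mb <= 0) {
    return static_cast<std::size_t>(fraction_of_gpu_memory_to_use * free_bytes);
  }
  double mb = reallocate_gpu_memory_in_mb > 0 ? reallocate_gpu_memory_in_mb
                                              : initial_gpu_memory_in_mb;
  return static_cast<std::size_t>(mb) << 20;
}

int main() {
  std::size_t free_bytes = std::size_t(8) << 30;  // assume 8 GB free
  std::printf("first chunk: %zu MB\n", FirstChunkBytes(free_bytes) >> 20);
  std::printf("realloc chunk: %zu MB\n", ReallocChunkBytes(free_bytes) >> 20);
}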
paddle/fluid/platform/gpu_info.h
...
...
@@ -66,6 +66,12 @@ size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize();

//! Get init chunk size for GPU buddy allocator.
size_t GpuFirstAllocateChunkSize();

//! Get reallocate chunk size for GPU buddy allocator.
size_t GpuReAllocateChunkSize();

//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
                    enum cudaMemcpyKind kind, cudaStream_t stream);
...
...
paddle/fluid/platform/temporary_allocator.cc
...
...
@@ -77,6 +77,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) {
  }
  VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr()
           << " size: " << temp_allocation->size();
  alloc::AllocationDeleter()(temp_allocation);
}

size_t TemporaryAllocator::TemporaryAllocationQueueSize() {
...
...
paddle/fluid/pybind/inference_api.cc
...
...
@@ -221,7 +221,8 @@ void BindAnalysisConfig(py::module *m) {
      .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
           py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
           py::arg("min_subgraph_size") = 3,
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
           py::arg("use_static") = true)
      .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
      .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
           py::arg("x") = true)
...
...
python/paddle/fluid/__init__.py
...
...
@@ -159,6 +159,7 @@ def __bootstrap__():
    if core.is_compiled_with_cuda():
        read_env_flags += [
            'initial_gpu_memory_in_mb', 'reallocate_gpu_memory_in_mb',
            'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
            'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
            'cudnn_exhaustive_search', 'memory_optimize_debug',
            'selected_gpus',
...
...
python/paddle/fluid/imperative/layer_object_helper.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import copy
import six
from ..framework import Parameter, _in_imperative_mode
from ..param_attr import ParamAttr
from .. import core
from six.moves import zip
from ..layer_helper_base import LayerHelperBase


class LayerObjectHelper(LayerHelperBase):
    def __init__(self, name):
        super(LayerObjectHelper, self).__init__(name, layer_type=name)

    def append_op(self,
                  type=None,
                  inputs=None,
                  outputs=None,
                  attrs=None,
                  stop_gradient=None):
        """append an operator for this layer object.

           Args:
               type: operator type
               inputs: input variable of the operator
               dtype: data type of this parameter
               is_bias: if this is a bias parameter
               default_initializer: set the default initializer for this parameter

        Returns created parameter Variable.
        """
        return self.main_program.current_block().append_op(
            type=type,
            inputs=inputs,
            outputs=outputs,
            attrs=attrs,
            stop_gradient=stop_gradient)

    def _multiple_input(self, inputs_in):
        inputs = inputs_in
        ret = []
        if isinstance(inputs, (list, tuple)):
            for inp in inputs:
                ret.append(self.to_variable(inp))
        else:
            ret.append(self.to_variable(inputs))
        return ret

    # TODO: make it public when we need it
    def _input(self, inputs_in):
        inputs = self._multiple_input(inputs_in)
        if len(inputs) != 1:
            raise "{0} layer only takes one input".format(self.layer_type)
        return inputs[0]

    def _multiple_param_attr(self, length, param_attr_in=None):
        param_attr = param_attr_in
        if isinstance(param_attr, ParamAttr):
            param_attr = [param_attr]

        if len(param_attr) != 1 and len(param_attr) != length:
            raise ValueError("parameter number mismatch")
        elif len(param_attr) == 1 and length != 1:
            tmp = [None] * length
            for i in six.moves.range(length):
                tmp[i] = copy.deepcopy(param_attr[0])
            param_attr = tmp
        return param_attr

    def iter_inputs_and_params(self, inputs_in, param_attr_in=None):
        """Access all inputs and params one by one

           Args:
               inputs_in: inputs to be iter
               param_attr_in: param_attr to be iter

        Returns input, param_attr
        """
        inputs = inputs_in if (inputs_in is not None) else []
        inputs = self._multiple_input(inputs)
        param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
        for ipt, param_attr in zip(inputs, param_attrs):
            yield ipt, param_attr

    def input_dtype(self, inputs_in):
        """Get input data type

           Args:
               inputs_in: inputs wanted know the data type

        Returns dtype of the input
        """
        inputs = self._multiple_input(inputs_in)
        dtype = None
        for each in inputs:
            if dtype is None:
                dtype = each.dtype
            elif dtype != each.dtype:
                raise ValueError("Data Type mismatch: %d to %d" %
                                 (dtype, each.dtype))
        return dtype

    def get_parameter(self, name):
        """Get parameter specifically

           Args:
               name: parameter's name

        Returns target parameter
        """
        param = self.main_program.global_block().var(name)
        if not isinstance(param, Parameter):
            raise ValueError("no Parameter name %s found" % name)
        return param

    def append_bias_op(self, input_var, dim_start=1, dim_end=None,
                       bias_attr=None):
        """Append bias operator and return its output. If the user does not set bias_attr, append_bias_op will return input_var

           Args:
               input_var: the input variable. The len(input_var.shape) is
               larger or equal than 2.
               dim_start:
               dim_end: the shape of the bias will be
               bias_attr: the bias_attr of it

        Return the Variable of after append bias op
        """
        size = list(input_var.shape[dim_start:dim_end])
        bias_attr = bias_attr
        if not bias_attr:
            return input_var

        b = self.create_parameter(
            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
        tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
        self.append_op(
            type='elementwise_add',
            inputs={'X': [input_var],
                    'Y': [b]},
            outputs={'Out': [tmp]},
            attrs={'axis': dim_start})
        return tmp

    # TODO: this should not be called anymore after all activation func move to Layers
    def append_activation(self, input_var, act=None, use_cudnn=None,
                          use_mkl_dnn=None):
        """Append activation

           Args:
               input_var: the input variable. The len(input_var.shape) is
               larger or equal than 2.
               act: activation type
               use_mkl_dnn: if use mkldnn
               use_cudnn: if use cudnn

        Return the Variable of after append activation
        """
        act = act
        if act is None:
            return input_var
        if isinstance(act, six.string_types):
            act = {'type': act}
        else:
            raise TypeError(str(act) + " should be unicode or str")

        if (use_cudnn is not None) and use_cudnn:
            act['use_cudnn'] = use_cudnn
        if (use_mkl_dnn is not None) and use_mkl_dnn:
            act['use_mkldnn'] = use_mkl_dnn
        act_type = act.pop('type')

        tmp = input_var
        # NOTE(dzhwinter): some activation support inplace compution.
        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
        if not _in_imperative_mode() and core.IsInplace(act_type):
            tmp = input_var
        else:
            tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
        self.append_op(
            type=act_type,
            inputs={"X": [input_var]},
            outputs={"Out": [tmp]},
            attrs=act)
        return tmp

    def is_instance(self, param, cls):
        """Check if the input parameter is instance of input class

           Args:
               param: parameter to be check
               cls: class of the parameter

        Return result of the check (True or False)
        """
        param = param
        if not isinstance(param, cls):
            raise TypeError("The input {0} parameter of method {1} must be {2}",
                            param, self.layer_type, cls.__name__)
python/paddle/fluid/imperative/layers.py
...
...
@@ -19,8 +19,8 @@ import numpy as np
import collections
from .. import unique_name
from paddle.fluid import core
from .layer_object_helper import LayerObjectHelper
from paddle.fluid import framework
from paddle.fluid.imperative import base

__all__ = ['Layer', 'PyLayer']
...
...
@@ -44,6 +44,8 @@ class Layer(core.Layer):
        self._parameters = collections.OrderedDict()
        self._sub_layers = collections.OrderedDict()
        self._helper = LayerObjectHelper(self._full_name)

    def full_name(self):
        """Full name for this layers.
...
...
@@ -53,6 +55,51 @@ class Layer(core.Layer):
"""
return
self
.
_full_name
def
create_parameter
(
self
,
attr
,
shape
,
dtype
,
is_bias
=
False
,
default_initializer
=
None
):
"""Create parameters for this layers.
Args:
attr: [ParamAttr] should be the parameter attribute for this parameter
shape: shape of the paramter
dtype: data type of this parameter
is_bias: if this is a bias parameter
default_initializer: set the default initializer for this parameter
Returns created parameter Variable.
"""
return
self
.
_helper
.
create_parameter
(
attr
,
shape
,
dtype
,
is_bias
,
default_initializer
)
# TODO: Add more parameter list when we need them
def
create_variable
(
self
,
name
=
None
,
persistable
=
None
,
dtype
=
None
,
type
=
core
.
VarDesc
.
VarType
.
LOD_TENSOR
):
"""Create Variable for this layers.
Args:
name: name of the variable
persistable: if set this variable persistable
dtype: data type of data in the variable
type: type of the variable
Returns created Variable.
"""
if
name
is
not
None
:
var_name
=
"."
.
join
([
self
.
_full_name
,
name
])
else
:
var_name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
_full_name
,
"_generated_var"
]))
return
self
.
_helper
.
main_program
.
current_block
().
create_var
(
name
=
var_name
,
persistable
=
persistable
,
dtype
=
dtype
,
type
=
type
)
def
parameters
(
self
,
include_sublayers
=
True
):
"""Returns a list of Parameters from current and sub-layers.
...
...
python/paddle/fluid/imperative/nn.py
...
...
@@ -41,21 +41,12 @@ class Conv2D(layers.Layer):
                 bias_attr=None,
                 dtype=core.VarDesc.VarType.FP32):
        assert param_attr is not False, "param_attr should not be False here."
        super(Conv2D, self).__init__(name_scope, dtype=dtype)

        # TODO(minqiyang): Move this to the top.
        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(
            self.full_name(),
            param_attr=param_attr,
            bias_attr=bias_attr,
            dtype=dtype,
            act=act)
        super(Conv2D, self).__init__(name_scope)
        self._groups = groups
        self._stride = utils.convert_to_list(stride, 2, 'stride')
        self._padding = utils.convert_to_list(padding, 2, 'padding')
        self._dilation = utils.convert_to_list(dilation, 2, 'dilation')
        self._act = act
        if not isinstance(use_cudnn, bool):
            raise ValueError("use_cudnn should be True or False")
        self._use_cudnn = use_cudnn
...
...
@@ -80,28 +71,28 @@ class Conv2D(layers.Layer):
            std = (2.0 / filter_elem_num)**0.5
            return Normal(0.0, std, 0)

        self._filter_param = self._helper.create_parameter(
            attr=self._helper.param_attr,
        self._filter_param = self.create_parameter(
            attr=param_attr,
            shape=filter_shape,
            dtype=self._dtype,
            default_initializer=_get_default_param_initializer())

        if self._use_cudnn:
            self._helper.create_variable(
            self.create_variable(
                name="kCUDNNFwdAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self._helper.create_variable(
            self.create_variable(
                name="kCUDNNBwdDataAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            self._helper.create_variable(
            self.create_variable(
                name="kCUDNNBwdFilterAlgoCache",
                persistable=True,
                type=core.VarDesc.VarType.RAW)

        self._bias_param = self._helper.create_parameter(
            attr=self._helper.bias_attr,
        self._bias_param = self.create_parameter(
            attr=bias_attr,
            shape=[num_filters],
            dtype=self._dtype,
            is_bias=True)
...
...
@@ -137,7 +128,7 @@ class Conv2D(layers.Layer):
            attrs={'axis': 1})

        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(pre_act)
        return self._helper.append_activation(pre_act, act=self._act)


class Pool2D(layers.Layer):
...
...
@@ -167,9 +158,6 @@ class Pool2D(layers.Layer):
        super(Pool2D, self).__init__(name_scope, dtype=dtype)

        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(self.full_name(), dtype=dtype)

        self._pool_type = pool_type
        self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
        self._pool_padding = utils.convert_to_list(pool_padding, 2,
...
...
@@ -216,28 +204,25 @@ class FC(layers.Layer):
        self._size = size
        self._num_flatten_dims = num_flatten_dims
        self._dtype = dtype
        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(
            self.full_name(),
            param_attr=param_attr,
            bias_attr=bias_attr,
            act=act)
        self._param_attr = param_attr
        self._bias_attr = param_attr
        self._act = act

    def _build_once(self, input):
        input_shape = input.shape
        param_shape = [
            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
        ] + [self._size]
        self._w = self._helper.create_parameter(
            attr=self._helper.param_attr,
        self._w = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=False)

        if self._helper.bias_attr:
        if self._param_attr:
            size = list([self._size])
            self._b = self._helper.create_parameter(
                attr=self._helper.bias_attr,
            self._b = self.create_parameter(
                attr=self._param_attr,
                shape=size,
                dtype=self._dtype,
                is_bias=True)
...
...
@@ -275,7 +260,7 @@ class FC(layers.Layer):
        else:
            pre_activation = pre_bias
        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(pre_activation)
        return self._helper.append_activation(pre_activation, act=self._act)


class BatchNorm(layers.Layer):
...
...
@@ -297,16 +282,12 @@ class BatchNorm(layers.Layer):
                 fuse_with_relu=False,
                 use_global_stats=False):
        super(BatchNorm, self).__init__(name_scope)
        self._param_attr = param_attr
        self._param_attr = bias_attr
        self._act = act

        assert bias_attr is not False, "bias_attr should not be False in batch_norm."

        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(
            self.full_name(),
            param_attr=param_attr,
            bias_attr=bias_attr,
            act=act)

        if dtype == core.VarDesc.VarType.FP16:
            self._dtype = core.VarDesc.VarType.FP32
        else:
...
...
@@ -315,23 +296,23 @@ class BatchNorm(layers.Layer):
        param_shape = [num_channels]

        # create parameter
        self._scale = self._helper.create_parameter(
            attr=self._helper.param_attr,
        self._scale = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            default_initializer=Constant(1.0))
        if use_global_stats and self._helper.param_attr.learning_rate == 0.:
        if use_global_stats and self._param_attr.learning_rate == 0.:
            self._scale._stop_gradient = True

        self._bias = self._helper.create_parameter(
            attr=self._helper.bias_attr,
        self._bias = self.create_parameter(
            attr=self._param_attr,
            shape=param_shape,
            dtype=self._dtype,
            is_bias=True)
        if use_global_stats and self._helper.bias_attr.learning_rate == 0.:
        if use_global_stats and self._param_attr.learning_rate == 0.:
            self._bias._stop_gradient = True

        self._mean = self._helper.create_parameter(
        self._mean = self.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
...
...
@@ -341,7 +322,7 @@ class BatchNorm(layers.Layer):
            dtype=self._dtype)
        self._mean._stop_gradient = True

        self._variance = self._helper.create_parameter(
        self._variance = self.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
...
...
@@ -401,7 +382,7 @@ class BatchNorm(layers.Layer):
            })

        # Currently, we don't support inplace in imperative mode
        return self._helper.append_activation(batch_norm_out)
        return self._helper.append_activation(batch_norm_out, self._act)


class Embedding(layers.Layer):
...
...
@@ -466,9 +447,7 @@ class Embedding(layers.Layer):
        if self._remote_prefetch:
            assert self._is_sparse is True and self._is_distributed is False

        from ..layer_helper import LayerHelper
        self._helper = LayerHelper(self.full_name(), param_attr=param_attr)
        self._w = self._helper.create_parameter(
        self._w = self.create_parameter(
            attr=self._param_attr,
            shape=self._size,
            dtype=self._dtype,
...
...
python/paddle/fluid/initializer.py
...
...
@@ -19,7 +19,6 @@ import numpy as np
from .wrapped_decorator import signature_safe_contextmanager
from .core import VarDesc
from . import unique_name
from .imperative import base as imperative_base

__all__ = [
    'Constant', 'Uniform', 'Normal', 'TruncatedNormal', 'Xavier', 'Bilinear',
...
...
@@ -166,7 +165,7 @@ class ConstantInitializer(Initializer):
                'force_cpu': self._force_cpu or force_init_on_cpu()
            },
            stop_gradient=True)

        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -246,7 +245,7 @@ class UniformInitializer(Initializer):
                attrs={"in_dtype": out_var.dtype,
                       "out_dtype": var.dtype})

        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -325,7 +324,7 @@ class NormalInitializer(Initializer):
                outputs={"Out": var},
                attrs={"in_dtype": out_var.dtype,
                       "out_dtype": var.dtype})

        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -404,7 +403,7 @@ class TruncatedNormalInitializer(Initializer):
                outputs={"Out": var},
                attrs={"in_dtype": out_var.dtype,
                       "out_dtype": var.dtype})

        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -510,7 +509,7 @@ class XavierInitializer(Initializer):
"seed"
:
self
.
_seed
},
stop_gradient
=
True
)
if
not
imperative_base
.
enabled
():
if
not
framework
.
_in_imperative_mode
():
var
.
op
=
op
return
op
...
...
@@ -611,7 +610,7 @@ class MSRAInitializer(Initializer):
"seed"
:
self
.
_seed
},
stop_gradient
=
True
)
if
not
imperative_base
.
enabled
():
if
not
framework
.
_in_imperative_mode
():
var
.
op
=
op
return
op
...
...
@@ -710,7 +709,7 @@ class BilinearInitializer(Initializer):
                'shape': list(shape),
                value_name: values
            })
        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
@@ -769,7 +768,7 @@ class NumpyArrayInitializer(Initializer):
                value_name: values
            },
            stop_gradient=True)
        if not imperative_base.enabled():
        if not framework._in_imperative_mode():
            var.op = op
        return op
...
...
python/paddle/fluid/layer_helper.py
...
...
@@ -15,45 +15,29 @@
from __future__ import print_function

import copy
import itertools
import six
import sys
import numpy as np

from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating, _in_imperative_mode
from .framework import Parameter, dtype_is_floating, _in_imperative_mode
from . import unique_name
from paddle.fluid.imperative import base as imperative_base
from paddle.fluid.initializer import Constant, Xavier
from .param_attr import ParamAttr, WeightNormParamAttr
from .param_attr import ParamAttr
from . import core
from six.moves import zip
from .layer_helper_base import LayerHelperBase


class LayerHelper(object):
class LayerHelper(LayerHelperBase):
    def __init__(self, layer_type, **kwargs):
        self.kwargs = kwargs
        self.layer_type = layer_type
        name = self.kwargs.get('name', None)
        # TODO(panyx0718, minqiyang): imperative mode
        # can not use both `layer_type` and `name`. Deprecate LayerHelper
        # and write a Helper for imperative mode.
        if name is None:
            self.kwargs['name'] = unique_name.generate(self.layer_type)
            self.kwargs['name'] = unique_name.generate(layer_type)

    @property
    def name(self):
        return self.kwargs['name']

    @property
    def main_program(self):
        return default_main_program()

    @property
    def startup_program(self):
        return default_startup_program()

    def to_variable(self, x):
        return imperative_base.to_variable(x, self.main_program.current_block())

        super(LayerHelper, self).__init__(
            self.kwargs['name'], layer_type=layer_type)

    def append_op(self, *args, **kwargs):
        return self.main_program.current_block().append_op(*args, **kwargs)
...
...
@@ -82,6 +66,7 @@ class LayerHelper(object):
    def bias_attr(self):
        return ParamAttr._to_attr(self.kwargs.get('bias_attr', None))

    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of param_attr
    def multiple_param_attr(self, length):
        param_attr = self.param_attr
        if isinstance(param_attr, ParamAttr):
...
...
@@ -113,297 +98,13 @@ class LayerHelper(object):
(
dtype
,
each
.
dtype
))
return
dtype
def
_create_weight_normalize
(
self
,
attr
,
shape
,
dtype
):
from
.layers
import
elementwise_mul
,
elementwise_div
,
reshape
# Remove these ops when LayerHelper and layers support indicating
# program and block.
def
__norm_op
(
x
,
out
=
None
,
p
=
2
,
dim
=
None
,
keep_dim
=
False
,
block
=
self
.
startup_program
.
global_block
()):
if
out
is
None
:
out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_norm'
])),
dtype
=
dtype
,
persistable
=
False
)
abs_out
=
block
.
create_var
(
name
=
unique_name
.
generate
(
"."
.
join
(
[
self
.
name
,
'weight_norm_abs'
])),
dtype
=
dtype
,
persistable
=
False
)
block
.
append_op
(
type
=
'abs'
,
inputs
=
{
'X'
:
x
},
outputs
=
{
                            outputs={'Out': abs_out})
            pow_out = block.create_var(
                name=unique_name.generate(".".join([self.name, 'weight_norm_pow'])),
                dtype=dtype, persistable=False)
            block.append_op(type='pow', inputs={'X': abs_out}, outputs={'Out': pow_out},
                            attrs={'factor': float(p)})
            sum_out = block.create_var(
                name=unique_name.generate(".".join([self.name, 'weight_norm_sum'])),
                dtype=dtype, persistable=False)
            block.append_op(type='reduce_sum', inputs={'X': pow_out}, outputs={'Out': sum_out},
                            attrs={'dim': dim, 'keep_dim': keep_dim,
                                   'reduce_all': True if dim is None else False})
            block.append_op(type='pow', inputs={'X': sum_out}, outputs={'Out': out},
                            attrs={'factor': 1. / p})
            return out

        def __reshape_op(x, shape, out=None,
                         block=self.startup_program.global_block()):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate(".".join([self.name, 'weight_norm_reshape'])),
                    dtype=dtype, persistable=False)
            block.append_op(type='reshape', inputs={'X': x}, outputs={'Out': out},
                            attrs={'shape': shape})
            return out

        def __transpose_op(x, axis, out=None,
                           block=self.startup_program.global_block()):
            if out is None:
                out = block.create_var(
                    name=unique_name.generate(".".join([self.name, 'weight_norm_transpose'])),
                    dtype=dtype, persistable=False)
            block.append_op(type='transpose', inputs={'X': x}, outputs={'Out': out},
                            attrs={'axis': axis})
            return out

        def __norm_except_dim(x, out=None, dim=None,
                              block=self.startup_program.global_block()):
            """Computes the norm over all dimensions except dim"""
            if out is None:
                out = block.create_var(
                    name=unique_name.generate(".".join([self.name, 'weight_norm_norm'])),
                    dtype=dtype, persistable=False)
            if dim is None:
                __norm_op(x, out, dim=dim, block=block)
            elif dim == 0:
                out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
                reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
                norm = __norm_op(reshape, dim=1, block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            elif dim == len(x.shape) - 1:
                out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
                reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
                norm = __norm_op(reshape, dim=0, block=block)
                __reshape_op(norm, out=out, shape=out_shape, block=block)
            else:
                perm = list(range(len(x.shape)))
                perm[0], perm[dim] = dim, 0
                transpose = __transpose_op(x, perm, block=block)
                norm = __norm_op(transpose, dim=0, block=block)
                __transpose_op(norm, perm, out=out, block=block)
            return out

        def __weight_normalize(g, v, dim):
            """Calculations for weight normalization"""
            norm = __norm_except_dim(v, dim=dim, block=self.main_program.current_block())
            scale = elementwise_div(x=g, y=norm)
            # The shapes of g and norm are the same.
            # Currently, elementwise_mul only support broadcast when the shape
            # of y is a subset of the shape of x. Thus, we reshape y to squeeze
            # to achive the subset.
            w = elementwise_mul(
                x=v,
                y=scale if dim is None else reshape(x=scale, shape=[v.shape[dim]]),
                axis=-1 if dim is None else dim)
            # To serialize the original parameter for inference, maybe a
            # parameter rather than a variable should be returned.
            return w

        g_param_attr = copy.deepcopy(attr)
        g_param_attr.name = attr.name + '_g'
        g_param_shape = [1] * len(shape)
        if attr.dim is not None:
            g_param_shape[attr.dim] = shape[attr.dim]
        v_param_attr = copy.deepcopy(attr)
        v_param_attr.name = attr.name + '_v'
        v_param_shape = shape
        # Add to startup_program to initialize g and v.
        # Try to reconstruct the initializer of w by initializing g and v.
        # Set the initializers of g and v as below, then the distribution
        # of w is the same as initializing w with the given initializer.
        # For Data-Dependent Initialization, please compute the init-values
        # of g and v in external and then feed the values to g and v by
        # executing an extra program.
        g_param = self.startup_program.global_block().create_parameter(
            dtype=dtype, shape=g_param_shape,
            **g_param_attr._to_kwargs(with_initializer=False))
        v_param = self.startup_program.global_block().create_parameter(
            dtype=dtype, shape=v_param_shape,
            **v_param_attr._to_kwargs(with_initializer=True))
        __norm_except_dim(x=v_param, out=g_param, dim=attr.dim,
                          block=self.startup_program.global_block())
        # Add weight normalization to main_program
        g_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=g_param_shape, **g_param_attr._to_kwargs())
        v_param = self.main_program.global_block().create_parameter(
            dtype=dtype, shape=v_param_shape, **v_param_attr._to_kwargs())
        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
        return w_param
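The helpers above only assemble `abs`/`pow`/`reduce_sum`/`pow` ops into the program; the arithmetic they end up computing is the weight-normalization reparameterization w = g * v / ||v|| from the paper linked below. A minimal NumPy sketch of that arithmetic (illustrative only, not Paddle code; function names here are hypothetical):

    import numpy as np

    def norm_except_dim(v, dim=None, p=2):
        # p-norm of v over every axis except `dim`, keeping that axis.
        if dim is None:
            return np.power(np.sum(np.abs(v) ** p), 1.0 / p)
        axes = tuple(i for i in range(v.ndim) if i != dim)
        return np.power(np.sum(np.abs(v) ** p, axis=axes, keepdims=True), 1.0 / p)

    def weight_normalize(g, v, dim=None):
        # w = g * v / ||v||, the reparameterization from arxiv.org/abs/1602.07868
        return v * (g / norm_except_dim(v, dim))

    v = np.random.randn(3, 4).astype('float32')
    g = norm_except_dim(v, dim=0)       # startup init: g <- ||v||, so w == v initially
    w = weight_normalize(g, v, dim=0)
    assert np.allclose(w, v, atol=1e-6)

This mirrors why the startup program above runs `__norm_except_dim(x=v_param, out=g_param, ...)`: with g initialized to the norm of v, the normalized weight starts out equal to the originally initialized weight.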
    def create_parameter(self, attr, shape, dtype, is_bias=False,
                         default_initializer=None):
        # Deepcopy the attr so that parameters can be shared in program
        attr = copy.deepcopy(attr)
        assert isinstance(attr, ParamAttr)
        suffix = 'b' if is_bias else 'w'
        if attr.name is None:
            attr.name = unique_name.generate(".".join([self.name, suffix]))

        if default_initializer is None and attr.initializer is None:
            if isinstance(dtype, core.VarDesc.VarType):
                if dtype != core.VarDesc.VarType.FP32 and \
                        dtype != core.VarDesc.VarType.FP64 and \
                        dtype != core.VarDesc.VarType.FP16:
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            else:
                if not (dtype.startswith("float") or dtype == "double"):
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            if is_bias:
                attr._set_default_bias_initializer()
            else:
                attr._set_default_param_initializer()
        else:
            attr._set_default_initializer(default_initializer)

        # If weight normalization is set, insert extra parameters and ops.
        # Refer to https://arxiv.org/pdf/1602.07868.pdf
        if isinstance(attr, WeightNormParamAttr):
            param = self._create_weight_normalize(attr, shape, dtype)
            WeightNormParamAttr.params_with_weight_norm.append(param)
            return param
        if _in_imperative_mode():
            # In imperative mode, we want the returned parameter to be
            # initialized so that it can be used imperatively.
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
        else:
            self.startup_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs())
    def get_parameter(self, name):
        param = self.main_program.global_block().var(name)
        if not isinstance(param, Parameter):
            raise ValueError("no Parameter name %s found" % name)
        return param

    def create_variable_for_type_inference(self, dtype, stop_gradient=False):
        """Create a temporary variable that should be type inferred layer.

        Note:
            The default type will be set to LOD_TENSOR. However, when
            the var is used as operator output, its type will be updated
            based on operator's `VarTypeInference` implementation in
            infer_var_type.
        """
        return self.main_program.current_block().create_var(
            name=unique_name.generate(".".join([self.name, 'tmp'])),
            dtype=dtype,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=stop_gradient)

    def create_variable(self, *args, **kwargs):
        return self.main_program.current_block().create_var(*args, **kwargs)

    def create_global_variable(self, persistable=False, *args, **kwargs):
        """
        create global variable, note that there is no initializer for this global variable.
        Args:
            persistable(bool): True if it is a checkpoint value.
            *args: See create_var's documentation
            **kwargs: See create_var's documentation

        Returns(Variable): the created variable.
        """
        return self.main_program.global_block().create_var(
            *args, persistable=persistable, **kwargs)

    def create_or_get_global_variable(self, name, *args, **kwargs):
        """
        Creates a global variable if not exists and returns the variable and
        a boolean flag which is true when it is a new variable.
        """
        if self.main_program.global_block().has_var(name):
            return self.main_program.global_block().var(name), False
        else:
            return self.create_global_variable(name=name, *args, **kwargs), True

    def set_variable_initializer(self, var, initializer):
        assert isinstance(var, Variable)
        if imperative_base.enabled():
            initializer(var, var.block)
        else:
            self.startup_program.global_block().create_var(
                name=var.name,
                type=var.type,
                dtype=var.dtype,
                shape=var.shape,
                persistable=True,
                initializer=initializer)
    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of bias_attr
    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
        """
        Append bias operator and return its output. If the user does not set
        ...
        """
...
@@ -434,6 +135,7 @@ class LayerHelper(object):
            attrs={'axis': dim_start})
        return tmp

    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of act
    def append_activation(self, input_var):
        act = self.kwargs.get('act', None)
        if act is None:
            ...
...
@@ -448,10 +150,11 @@ class LayerHelper(object):
        if 'use_mkldnn' in self.kwargs:
            act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
        act_type = act.pop('type')
        tmp = input_var
        # NOTE(dzhwinter): some activation support inplace compution.
        # NOTE(minqiyang): currently, we don't support inplace in imperative mode
-       if not imperative_base.enabled() and core.IsInplace(act_type):
+       if not _in_imperative_mode() and core.IsInplace(act_type):
            tmp = input_var
        else:
            tmp = self.create_variable_for_type_inference(dtype=input_var.dtype)
...
@@ -462,6 +165,7 @@ class LayerHelper(object):
            attrs=act)
        return tmp

    #TODO (jiabin): should we remove this since it has never be used
    def _get_default_initializer(self, dtype):
        if dtype is None or dtype_is_floating(dtype) is True:
            return Xavier()
...
@@ -469,6 +173,7 @@ class LayerHelper(object):
        # For integer and boolean types, initialize with all zeros
        return Constant()

    #TODO (jiabin): reconstruct this in LayerObjHelper and avoid dependency of kwargs
    def is_instance(self, param_name, cls):
        param = self.kwargs.get(param_name, None)
        if not isinstance(param, cls):
            ...
python/paddle/fluid/layer_helper_base.py  (new file, 0 → 100644)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import copy
import numpy as np

from .framework import Variable, default_main_program, default_startup_program, _in_imperative_mode, _current_expected_place
from . import unique_name
from .param_attr import ParamAttr, WeightNormParamAttr
from . import core


class LayerHelperBase(object):
    def __init__(self, name, layer_type):
        self._layer_type = layer_type
        self._name = name

    @property
    def name(self):
        return self._name

    @property
    def layer_type(self):
        return self._layer_type

    @property
    def main_program(self):
        return default_main_program()

    @property
    def startup_program(self):
        return default_startup_program()

    def to_variable(self, value, block=None):
        """convert value to variable

        Args:
            value: value to be convert
            block: the block of the variable

        Return Variable construct from value
        """
        if isinstance(value, np.ndarray):
            assert _in_imperative_mode(
            ), "to_variable could only be called in imperative mode"

            if not block:
                block = default_main_program().current_block()
            py_var = Variable(
                block,
                type=core.VarDesc.VarType.LOD_TENSOR,
                name=None,
                shape=value.shape,
                dtype=value.dtype)
            var = py_var._ivar.value()
            tensor = var.get_tensor()
            tensor.set(value, _current_expected_place())
            return py_var
        elif isinstance(value, Variable):
            return value
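A minimal usage sketch of `to_variable` (illustrative only; it assumes the new `layer_helper_base` module introduced by this commit and an imperative guard, and is not taken from the Paddle test suite):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.layer_helper_base import LayerHelperBase

    with fluid.imperative.guard():               # to_variable asserts imperative mode
        helper = LayerHelperBase('demo', 'demo')
        x = np.ones([2, 2], dtype='float32')
        var = helper.to_variable(x)              # numpy.ndarray -> LoDTensor-backed Variable
        same = helper.to_variable(var)           # an existing Variable is passed through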
    def _create_weight_normalize(self, attr, shape, dtype):
        from .layers import elementwise_mul, elementwise_div, reshape

        # Remove these ops when LayerHelper and layers support indicating
        # program and block.
        #
        # The body of this method -- the nested __norm_op, __reshape_op,
        # __transpose_op, __norm_except_dim and __weight_normalize helpers, and the
        # creation of the g/v parameters in startup_program and main_program -- is
        # moved here verbatim from LayerHelper._create_weight_normalize, reconstructed
        # in full in the layer_helper.py section above. It ends with:

        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
        return w_param
    # TODO: hide the func after we move the layers to Layers
    def create_parameter(self, attr, shape, dtype, is_bias=False,
                         default_initializer=None):
        """Create parameters for this layers.

        Args:
            attr: [ParamAttr] should be the parameter attribute for this parameter
            shape: shape of the paramter
            dtype: data type of this parameter
            is_bias: if this is a bias parameter
            default_initializer: set the default initializer for this parameter

        Returns created parameter Variable.
        """
        # Deepcopy the attr so that parameters can be shared in program
        attr = copy.deepcopy(attr)
        if attr is None:
            attr = ParamAttr._to_attr(attr)
        assert isinstance(attr, ParamAttr)
        suffix = 'b' if is_bias else 'w'
        if attr.name is None:
            attr.name = unique_name.generate(".".join([self.name, suffix]))

        if default_initializer is None and attr.initializer is None:
            if isinstance(dtype, core.VarDesc.VarType):
                if dtype != core.VarDesc.VarType.FP32 and \
                        dtype != core.VarDesc.VarType.FP64 and \
                        dtype != core.VarDesc.VarType.FP16:
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            else:
                if not (dtype.startswith("float") or dtype == "double"):
                    raise TypeError(
                        "Can not create parameter with default initializer when dtype is not float type. Set default_initializer to fit the parameter dtype!"
                    )
            if is_bias:
                attr._set_default_bias_initializer()
            else:
                attr._set_default_param_initializer()
        else:
            attr._set_default_initializer(default_initializer)

        # If weight normalization is set, insert extra parameters and ops.
        # Refer to https://arxiv.org/pdf/1602.07868.pdf
        if isinstance(attr, WeightNormParamAttr):
            param = self._create_weight_normalize(attr, shape, dtype)
            WeightNormParamAttr.params_with_weight_norm.append(param)
            return param
        if _in_imperative_mode():
            # In imperative mode, we want the returned parameter to be
            # initialized so that it can be used imperatively.
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
        else:
            self.startup_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs(with_initializer=True))
            return self.main_program.global_block().create_parameter(
                dtype=dtype, shape=shape, **attr._to_kwargs())
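To make the weight-normalization branch above concrete, here is a small pure-Python illustration of the extra parameter shapes it creates (the numeric values are illustrative; the shape logic follows `_create_weight_normalize`):

    # For a conceptual weight w of shape [784, 10] with WeightNormParamAttr(dim=1):
    shape = [784, 10]
    dim = 1

    g_param_shape = [1] * len(shape)
    if dim is not None:
        g_param_shape[dim] = shape[dim]
    v_param_shape = shape

    print(g_param_shape, v_param_shape)   # [1, 10] [784, 10]
    # g holds one norm per slice along `dim`, v keeps the full direction tensor,
    # and w = g * v / ||v|| is re-assembled in main_program on every use.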
    def create_variable_for_type_inference(self, dtype, stop_gradient=False):
        """Create a temporary variable that should be type inferred layer.

        Note:
            The default type will be set to LOD_TENSOR. However, when
            the var is used as operator output, its type will be updated
            based on operator's `VarTypeInference` implementation in
            infer_var_type.
        """
        return self.main_program.current_block().create_var(
            name=unique_name.generate(".".join([self.name, 'tmp'])),
            dtype=dtype,
            type=core.VarDesc.VarType.LOD_TENSOR,
            persistable=False,
            stop_gradient=stop_gradient)

    def create_variable(self, *args, **kwargs):
        """Create Variable for this layers.
        Returns created Variable.
        """
        return self.main_program.current_block().create_var(*args, **kwargs)

    def create_global_variable(self, persistable=False, *args, **kwargs):
        """
        create global variable, note that there is no initializer for this global variable.
        Args:
            persistable(bool): True if it is a checkpoint value.
            *args: See create_var's documentation
            **kwargs: See create_var's documentation

        Returns(Variable): the created variable.
        """
        return self.main_program.global_block().create_var(
            *args, persistable=persistable, **kwargs)

    def create_or_get_global_variable(self, name, *args, **kwargs):
        """
        Creates a global variable if not exists and returns the variable and
        a boolean flag which is true when it is a new variable.
        """
        if self.main_program.global_block().has_var(name):
            return self.main_program.global_block().var(name), False
        else:
            return self.create_global_variable(name=name, *args, **kwargs), True

    def set_variable_initializer(self, var, initializer):
        """Set target Variable's initializer

        Args:
            var: target Variable
            initializer: initializer to use
        """
        assert isinstance(var, Variable)
        if _in_imperative_mode():
            initializer(var, var.block)
        else:
            self.startup_program.global_block().create_var(
                name=var.name,
                type=var.type,
                dtype=var.dtype,
                shape=var.shape,
                persistable=True,
                initializer=initializer)
python/paddle/fluid/layers/control_flow.py
...
@@ -848,7 +848,7 @@ def create_array(dtype):

 @templatedoc()
-def less_than(x, y, force_cpu=None, cond=None, **ignored):
+def less_than(x, y, force_cpu=None, cond=None):
     """
     ${comment}
...
@@ -1800,7 +1800,7 @@ def reorder_lod_tensor_by_rank(x, rank_table):
     return out


-def is_empty(x, cond=None, **ignored):
+def is_empty(x, cond=None):
     """
     Test whether a Variable is empty.
...
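Dropping the `**ignored` catch-all means stray keyword arguments now raise a `TypeError` instead of being silently swallowed. A short usage sketch of the two layers with their remaining signatures (illustrative, not part of the commit):

    import paddle.fluid as fluid

    a = fluid.layers.data(name='a', shape=[1], dtype='int64')
    b = fluid.layers.data(name='b', shape=[1], dtype='int64')
    lt = fluid.layers.less_than(x=a, y=b)   # elementwise a < b
    empty = fluid.layers.is_empty(x=a)      # scalar bool: whether a holds no elements
    # fluid.layers.less_than(x=a, y=b, foo=1) would now raise TypeError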
python/paddle/fluid/layers/detection.py
...
@@ -51,6 +51,8 @@ __all__ = [
     'yolov3_loss',
     'box_clip',
     'multiclass_nms',
     'distribute_fpn_proposals',
     'box_decoder_and_assign',
 ]
...
@@ -2221,3 +2223,138 @@ def multiclass_nms(bboxes,
     output.stop_gradient = True
     return output


def distribute_fpn_proposals(fpn_rois, min_level, max_level, refer_level,
                             refer_scale, name=None):
    """
    In Feature Pyramid Networks (FPN) models, it is needed to distribute all
    proposals into different FPN level, with respect to scale of the proposals,
    the referring scale and the referring level. Besides, to restore the order
    of proposals, we return an array which indicates the original index of rois
    in current proposals. To compute FPN level for each roi, the formula is
    given as follows:

    .. math::

        roi\_scale &= \sqrt{BBoxArea(fpn\_roi)}

        level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level)

    where BBoxArea is a function to compute the area of each roi.

    Args:
        fpn_rois(variable): The input fpn_rois, the second dimension is 4.
        min_level(int): The lowest level of FPN layer where the proposals come
                        from.
        max_level(int): The highest level of FPN layer where the proposals
                        come from.
        refer_level(int): The referring level of FPN layer with specified scale.
        refer_scale(int): The referring scale of FPN layer with specified level.
        name(str|None): The name of this operator.

    Returns:
        tuple:
               A tuple(multi_rois, restore_ind) is returned. The multi_rois is
               a list of segmented tensor variables. The restore_ind is a 2D
               Tensor with shape [N, 1], N is the number of total rois. It is
               used to restore the order of fpn_rois.

    Examples:
        .. code-block:: python

            fpn_rois = fluid.layers.data(
                name='data', shape=[4], dtype='float32', lod_level=1)
            multi_rois, restore_ind = fluid.layers.distribute_fpn_proposals(
                fpn_rois=fpn_rois,
                min_level=2,
                max_level=5,
                refer_level=4,
                refer_scale=224)
    """
    helper = LayerHelper('distribute_fpn_proposals', **locals())
    dtype = helper.input_dtype()
    num_lvl = max_level - min_level + 1
    multi_rois = [
        helper.create_variable_for_type_inference(dtype) for i in range(num_lvl)
    ]
    restore_ind = helper.create_variable_for_type_inference(dtype='int32')
    helper.append_op(
        type='distribute_fpn_proposals',
        inputs={'FpnRois': fpn_rois},
        outputs={'MultiFpnRois': multi_rois,
                 'RestoreIndex': restore_ind},
        attrs={
            'min_level': min_level,
            'max_level': max_level,
            'refer_level': refer_level,
            'refer_scale': refer_scale
        })
    return multi_rois, restore_ind
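A small NumPy check of the level-assignment formula in the docstring above, written to match the reference implementation used in the new unit test further below (illustrative values; `fpn_level` is a hypothetical helper, not a Paddle API):

    import numpy as np

    def fpn_level(rois, refer_scale=224, refer_level=4, min_level=2, max_level=5):
        # rois: [N, 4] boxes given as (x1, y1, x2, y2)
        w = rois[:, 2] - rois[:, 0] + 1
        h = rois[:, 3] - rois[:, 1] + 1
        roi_scale = np.sqrt(w * h)
        lvl = np.floor(refer_level + np.log2(roi_scale / refer_scale + 1e-6))
        return np.clip(lvl, min_level, max_level)

    rois = np.array([[0., 0., 111., 111.],    # ~112x112: one octave below 224 -> level 3
                     [0., 0., 223., 223.],    # ~224x224: the referring scale   -> level 4
                     [0., 0., 447., 447.]])   # ~448x448: one octave above      -> level 5
    print(fpn_level(rois))                    # [3. 4. 5.]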
@templatedoc()
def box_decoder_and_assign(prior_box, prior_box_var, target_box, box_score,
                           box_clip, name=None):
    """
    ${comment}
    Args:
        prior_box(${prior_box_type}): ${prior_box_comment}
        prior_box_var(${prior_box_var_type}): ${prior_box_var_comment}
        target_box(${target_box_type}): ${target_box_comment}
        box_score(${box_score_type}): ${box_score_comment}
        box_clip(${box_clip_type}): ${box_clip_comment}
        name(str|None): The name of this operator
    Returns:
        decode_box(Variable), output_assign_box(Variable):
            two variables:
            - decode_box(${decode_box_type}): ${decode_box_comment}
            - output_assign_box(${output_assign_box_type}): ${output_assign_box_comment}
    Examples:
        .. code-block:: python

            pb = fluid.layers.data(
                name='prior_box', shape=[20, 4], dtype='float32')
            pbv = fluid.layers.data(
                name='prior_box_var', shape=[1, 4], dtype='float32')
            loc = fluid.layers.data(
                name='target_box', shape=[20, 4*81], dtype='float32')
            scores = fluid.layers.data(
                name='scores', shape=[20, 81], dtype='float32')
            decoded_box, output_assign_box = fluid.layers.box_decoder_and_assign(
                pb, pbv, loc, scores, 4.135)
    """
    helper = LayerHelper("box_decoder_and_assign", **locals())
    decoded_box = helper.create_variable_for_type_inference(dtype=prior_box.dtype)
    output_assign_box = helper.create_variable_for_type_inference(dtype=prior_box.dtype)

    helper.append_op(
        type="box_decoder_and_assign",
        inputs={
            "PriorBox": prior_box,
            "PriorBoxVar": prior_box_var,
            "TargetBox": target_box,
            "BoxScore": box_score
        },
        attrs={"box_clip": box_clip},
        outputs={
            "DecodeBox": decoded_box,
            "OutputAssignBox": output_assign_box
        })
    return decoded_box, output_assign_box
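For orientation, here is a worked single-box example of the standard R-CNN decoding that this op performs; the numbers are made up, and the full NumPy reference lives in test_box_decoder_and_assign_op.py further below:

    import numpy as np

    prior = np.array([10., 10., 49., 29.])         # x1, y1, x2, y2 -> w = 40, h = 20
    var = np.array([0.1, 0.1, 0.2, 0.2])           # prior_box_var weights
    delta = np.array([1.0, 0.5, 0.0, 0.0])         # raw target_box offsets
    box_clip = 4.135

    dx, dy, dw, dh = delta * var
    dw, dh = min(dw, box_clip), min(dh, box_clip)  # clip before exp()
    w, h = 40., 20.
    cx, cy = 10. + 0.5 * w, 10. + 0.5 * h          # 30, 20
    px, py = dx * w + cx, dy * h + cy              # 34, 21
    pw, ph = np.exp(dw) * w, np.exp(dh) * h        # 40, 20
    decoded = [px - 0.5 * pw, py - 0.5 * ph,
               px + 0.5 * pw - 1, py + 0.5 * ph - 1]
    print(decoded)                                 # [14.0, 11.0, 53.0, 30.0]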
python/paddle/fluid/layers/nn.py
...
@@ -4833,11 +4833,6 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
     """

     def __check_input(x, y):
-        if len(y.shape) > len(x.shape):
-            raise ValueError(
-                "Invalid inputs for matmul. "
-                "x's rank should be always greater than or equal to y'rank.")
         x_shape = list(x.shape)
         y_shape = list(y.shape)
         if len(x_shape) == 1:
...
@@ -4853,10 +4848,11 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None):
         if x_shape[-1] != y_shape[-2]:
             raise ValueError("Invalid inputs for matmul.")

-        if len(y_shape) > 2:
+        if len(y_shape) > 2 and len(x_shape) > 2:
             for i, dim_x in enumerate(x_shape[:-2]):
                 if dim_x != y_shape[i]:
-                    raise ValueError("Invalid inputs for matmul.")
+                    raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" %
+                                     (x.shape, y.shape))

     __check_input(x, y)
...
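A plain-Python restatement of the revised shape rule, covering only the two hunks shown (a sketch, not the Paddle source; it omits the rank-1 handling elided above):

    def check_matmul_shapes(x_shape, y_shape):
        if x_shape[-1] != y_shape[-2]:
            raise ValueError("Invalid inputs for matmul.")
        # Batch dims are compared only when both operands carry batch dims.
        if len(y_shape) > 2 and len(x_shape) > 2:
            for i, dim_x in enumerate(x_shape[:-2]):
                if dim_x != y_shape[i]:
                    raise ValueError("Invalid inputs for matmul. x(%s), y(%s)" %
                                     (x_shape, y_shape))

    check_matmul_shapes([4, 3], [8, 3, 5])      # now accepted: rank-2 x against a batched y
    check_matmul_shapes([8, 4, 3], [8, 3, 5])   # batched on both sides: batch dims must match
    # check_matmul_shapes([8, 4, 3], [7, 3, 5]) would raise: mismatched batch dimension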
python/paddle/fluid/layers/tensor.py
...
@@ -142,7 +142,8 @@ def create_global_var(shape,
 def cast(x, dtype):
     """
     This layer takes in the Variable :attr:`x` with :attr:`x.dtype` and casts
-    it to the output with :attr:`dtype`.
+    it to the output with :attr:`dtype`. It's meaningless if the output
+    dtype equals the input dtype, but it's fine if you do so.

     Args:
         x (Variable): The input Variable for casting.
...
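A short illustration of the same-dtype cast that the reworded docstring now explicitly allows (illustrative only):

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[3], dtype='float32')
    y = fluid.layers.cast(x, 'int64')      # real conversion
    z = fluid.layers.cast(x, 'float32')    # no-op conversion, harmless after this change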
python/paddle/fluid/optimizer.py
...
@@ -379,7 +379,7 @@ class Optimizer(object):
         self._dtype = loss.dtype
         program = loss.block.program
         optimize_ops = []
-        if imperative_base.enabled():
+        if framework._in_imperative_mode():
             if parameter_list is not None:
                 parameters = parameter_list
             else:
...
python/paddle/fluid/parallel_executor.py
...
@@ -106,13 +106,18 @@ class ParallelExecutor(object):
             else framework.default_main_program()
         self._compiled_program = compiler.CompiledProgram(main_program)
+        if share_vars_from:
+            assert isinstance(
+                share_vars_from, ParallelExecutor
+            ), "The share_vars_from should be ParallelExecutor."
         self._compiled_program.with_data_parallel(
             loss_name=loss_name,
             build_strategy=build_strategy,
             exec_strategy=exec_strategy,
-            share_vars_from=share_vars_from)
+            share_vars_from=share_vars_from._compiled_program
+            if share_vars_from else None)
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
-        self._executor = executor.Executor(self._place)
+        self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)

     def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True):
...
@@ -180,7 +185,7 @@ class ParallelExecutor(object):
             loss = pe.run(feed=feeder.feed(cur_batch),
                           fetch_list=[avg_cost.name]))
         """
-        return self._executor.run(program=self._compiled_program,
+        return self._exe.run(program=self._compiled_program,
                              scope=self._scope,
                              feed=feed,
                              fetch_list=fetch_list,
...
python/paddle/fluid/tests/test_detection.py
...
@@ -504,5 +504,21 @@ class TestMulticlassNMS(unittest.TestCase):
             self.assertIsNotNone(output)


class TestDistributeFpnProposals(unittest.TestCase):
    def test_distribute_fpn_proposals(self):
        program = Program()
        with program_guard(program):
            fpn_rois = fluid.layers.data(
                name='data', shape=[4], dtype='float32', lod_level=1)
            multi_rois, restore_ind = layers.distribute_fpn_proposals(
                fpn_rois=fpn_rois,
                min_level=2,
                max_level=5,
                refer_level=4,
                refer_scale=224)
            self.assertIsNotNone(multi_rois)
            self.assertIsNotNone(restore_ind)


 if __name__ == '__main__':
     unittest.main()
python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py
...
@@ -70,3 +70,17 @@ def check_if_mkldnn_primitives_exist_in_bwd(test_case, op_type, x, out,
             fetch_list=['x@GRAD', 'out'])
         __assert_close(x_grad, out[0], 'x@GRAD')


def format_reorder(out, size):
    in_n = size[0]
    out_h = size[2]
    out_w = size[3]
    out_c = size[1]
    out_tmp = np.zeros((in_n, out_h, out_w, out_c))
    for n in range(in_n):
        for i in range(out_h):
            for j in range(out_w):
                for m in range(out_c):
                    out_tmp[n, i, j, m] = out[n, m, i, j]
    return out_tmp.reshape(in_n, out_c, out_h, out_w)
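The quadruple loop in `format_reorder` is an NCHW-to-NHWC permutation followed by a flat reshape back to the NCHW extents. A vectorized NumPy equivalent, shown only as a sketch to clarify what the loop computes (not part of the commit):

    import numpy as np

    def format_reorder_fast(out, size):
        in_n, out_c, out_h, out_w = size
        return np.transpose(out, (0, 2, 3, 1)).reshape(in_n, out_c, out_h, out_w)

    x = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)
    slow = np.zeros((2, 4, 5, 3))
    for n in range(2):
        for i in range(4):
            for j in range(5):
                for m in range(3):
                    slow[n, i, j, m] = x[n, m, i, j]
    assert np.array_equal(format_reorder_fast(x, [2, 3, 4, 5]), slow.reshape(2, 3, 4, 5))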
python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py
...
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.op_test import OpTest
 from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp
+from mkldnn_op_test import format_reorder


 def conv2d_forward_refer(input, filter, group, conv_param):
...
@@ -29,20 +30,6 @@ def conv2d_forward_refer(input, filter, group, conv_param):
     return format_reorder(out, size)


-def format_reorder(out, size):
-    in_n = size[0]
-    out_h = size[2]
-    out_w = size[3]
-    out_c = size[1]
-    out_tmp = np.zeros((in_n, out_h, out_w, out_c))
-    for n in range(in_n):
-        for i in range(out_h):
-            for j in range(out_w):
-                for m in range(out_c):
-                    out_tmp[n, i, j, m] = out[n, m, i, j]
-    return out_tmp.reshape(in_n, out_c, out_h, out_w)


 class TestConv2dInt8Op(TestConv2dOp):
     def setUp(self):
         self.op_type = "conv2d"
...
python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py  (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import OpTest
from mkldnn_op_test import format_reorder


class TestReQuantizeOp(OpTest):
    def setUp(self):
        self.op_type = 'requantize'
        self.scale_in = 2.0
        self.scale_out = 1.5
        self.input_size = [1, 1, 5, 5]
        self.data_type = 'int8'
        self.set_scale()
        self.set_data_type()

        scale_shift = self.scale_out / self.scale_in

        if self.data_type == 'int8':
            input = (np.random.randint(0, 100, self.input_size) - 50).astype(self.data_type)
            output_tmp = np.round(input.astype('float32') * scale_shift).astype('int8')
        else:
            input = (np.random.randint(0, 100, self.input_size)).astype(self.data_type)
            output_tmp = np.round(input.astype('float32') * scale_shift).astype('uint8')

        output = format_reorder(output_tmp, self.input_size)

        self.inputs = {'Input': OpTest.np_dtype_to_fluid_dtype(input)}
        self.outputs = {'Output': output}
        self.attrs = {'Scale_in': self.scale_in, 'Scale_out': self.scale_out}

    def test_check_output(self):
        self.check_output()

    def set_scale(self):
        pass

    def set_data_type(OpTest):
        pass


# --------------------test requantize with s8 input--------------------


class TestReQuantizeOp1(TestReQuantizeOp):
    def set_scale(self):
        self.scale_in = 1.5
        self.scale_out = 1.5


class TestReQuantizeOp2(TestReQuantizeOp):
    def set_scale(self):
        self.scale_in = 0.1
        self.scale_out = 0.2


# --------------------test requantize with u8 input--------------------


class TestReQuantizeOp3(TestReQuantizeOp1):
    def set_data_type(self):
        self.data_type = 'uint8'


class TestReQuantizeOp4(TestReQuantizeOp2):
    def set_data_type(self):
        self.data_type = 'uint8'


if __name__ == '__main__':
    unittest.main()
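The requantization checked by the test above rescales each stored value by scale_out / scale_in and rounds back to the narrow integer type; a tiny numeric illustration of that arithmetic (values are made up):

    import numpy as np

    scale_in, scale_out = 2.0, 1.5
    scale_shift = scale_out / scale_in            # 0.75
    x = np.array([-50, -1, 0, 40, 99], dtype='int8')
    y = np.round(x.astype('float32') * scale_shift).astype('int8')
    print(y)                                      # [-38  -1   0  30  74]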
python/paddle/fluid/tests/unittests/test_base_layer.py
...
@@ -16,27 +16,17 @@ import unittest
 import numpy as np

 import paddle.fluid as fluid
-from paddle.fluid.layer_helper import LayerHelper


 class L1(fluid.imperative.Layer):
     def __init__(self, prefix):
         super(L1, self).__init__(prefix)
-        self._helper = LayerHelper(
-            self.full_name(),
-            param_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=0.1)))
-        self.w1 = self._helper.create_parameter(
-            attr=self._helper.param_attr,
-            shape=[2, 2],
-            dtype='float32',
-            is_bias=False)
-        self.w2 = self._helper.create_parameter(
-            attr=self._helper.param_attr,
-            shape=[2, 2],
-            dtype='float32',
-            is_bias=False)
+        self._param_attr = fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.1))
+        self.w1 = self.create_parameter(
+            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)
+        self.w2 = self.create_parameter(
+            attr=self._param_attr, shape=[2, 2], dtype='float32', is_bias=False)

     def forward(self):
         return self.w1 + self.w2
...
@@ -67,8 +57,8 @@ class TestBaseLayer(unittest.TestCase):
         with fluid.imperative.guard():
             l = L1('test_one_level')
             ret = l()
-            self.assertEqual(l.w1.name, "test_one_level/L1_0_0.w_0")
-            self.assertEqual(l.w2.name, "test_one_level/L1_0_0.w_1")
+            self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0")
+            self.assertEqual(l.w2.name, "test_one_level/L1_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))

     def test_three_level(self):
...
@@ -76,12 +66,12 @@ class TestBaseLayer(unittest.TestCase):
             l = L3('test_three_level')
             names = [p.name for p in l.parameters()]
             ret = l()
-            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0_0.w_0")
-            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0_0.w_1")
-            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1_0.w_0")
-            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1_0.w_1")
-            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0_0.w_0")
-            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0_0.w_1")
+            self.assertEqual(names[0], "test_three_level/L3_0/L2_0/L1_0.w_0")
+            self.assertEqual(names[1], "test_three_level/L3_0/L2_0/L1_0.w_1")
+            self.assertEqual(names[2], "test_three_level/L3_0/L2_0/L1_1.w_0")
+            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1.w_1")
+            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0.w_0")
+            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0.w_1")
             self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
...
python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py  (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
import sys
import math
from op_test import OpTest


def box_decoder_and_assign(deltas, weights, boxes, box_score, box_clip):
    boxes = boxes.astype(deltas.dtype, copy=False)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights
    wx, wy, ww, wh = weights
    dx = deltas[:, 0::4] * wx
    dy = deltas[:, 1::4] * wy
    dw = deltas[:, 2::4] * ww
    dh = deltas[:, 3::4] * wh
    # Prevent sending too large values into np.exp()
    dw = np.minimum(dw, box_clip)
    dh = np.minimum(dh, box_clip)
    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]
    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
    # x2 (note: "- 1" is correct; don't be fooled by the asymmetry)
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1
    # y2 (note: "- 1" is correct; don't be fooled by the asymmetry)
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1

    output_assign_box = []
    for ino in range(len(pred_boxes)):
        rank = np.argsort(-box_score[ino])
        maxidx = rank[0]
        if maxidx == 0:
            maxidx = rank[1]
        beg_pos = maxidx * 4
        end_pos = maxidx * 4 + 4
        output_assign_box.append(pred_boxes[ino, beg_pos:end_pos])
    output_assign_box = np.array(output_assign_box)

    return pred_boxes, output_assign_box


class TestBoxDecoderAndAssignOpWithLoD(OpTest):
    def test_check_output(self):
        self.check_output()

    def setUp(self):
        self.op_type = "box_decoder_and_assign"
        lod = [[4, 8, 8]]
        num_classes = 10
        prior_box = np.random.random((20, 4)).astype('float32')
        prior_box_var = np.array([0.1, 0.1, 0.2, 0.2], dtype=np.float32)
        target_box = np.random.random((20, 4 * num_classes)).astype('float32')
        box_score = np.random.random((20, num_classes)).astype('float32')
        box_clip = 4.135
        output_box, output_assign_box = box_decoder_and_assign(
            target_box, prior_box_var, prior_box, box_score, box_clip)

        self.inputs = {
            'PriorBox': (prior_box, lod),
            'PriorBoxVar': prior_box_var,
            'TargetBox': (target_box, lod),
            'BoxScore': (box_score, lod),
        }
        self.attrs = {'box_clip': box_clip}
        self.outputs = {
            'DecodeBox': output_box,
            'OutputAssignBox': output_assign_box
        }


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py  (new file, 0 → 100644)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_dist_base import TestDistBase


class TestDistMnistNCCL2(TestDistBase):
    def _setup_config(self):
        self._sync_mode = True
        self._use_reduce = False
        self._use_reader_alloc = False
        self._nccl2_mode = True

    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
            self.check_with_place(
                "dist_mnist.py",
                delta=1,
                need_envs={
                    "FLAGS_enable_parallel_graph": "1",
                    "FLAGS_sync_nccl_allreduce": "1"
                })


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py  (new file, 0 → 100644)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
import math
import sys
from op_test import OpTest


class TestDistributeFPNProposalsOp(OpTest):
    def set_data(self):
        self.init_test_case()
        self.make_rois()
        self.rois_fpn, self.rois_idx_restore = self.calc_rois_distribute()
        self.inputs = {'FpnRois': (self.rois[:, 1:5], self.rois_lod)}
        self.attrs = {
            'max_level': self.roi_max_level,
            'min_level': self.roi_min_level,
            'refer_scale': self.canonical_scale,
            'refer_level': self.canonical_level
        }
        output = [('out%d' % i, self.rois_fpn[i])
                  for i in range(len(self.rois_fpn))]
        self.outputs = {
            'MultiFpnRois': output,
            'RestoreIndex': self.rois_idx_restore
        }

    def init_test_case(self):
        self.roi_max_level = 5
        self.roi_min_level = 2
        self.canonical_scale = 224
        self.canonical_level = 4
        self.images_shape = [512, 512]

    def boxes_area(self, boxes):
        w = (boxes[:, 2] - boxes[:, 0] + 1)
        h = (boxes[:, 3] - boxes[:, 1] + 1)
        areas = w * h
        assert np.all(areas >= 0), 'Negative areas founds'
        return areas

    def map_rois_to_fpn_levels(self, rois, lvl_min, lvl_max):
        s = np.sqrt(self.boxes_area(rois))
        s0 = self.canonical_scale
        lvl0 = self.canonical_level
        target_lvls = np.floor(lvl0 + np.log2(s / s0 + 1e-6))
        target_lvls = np.clip(target_lvls, lvl_min, lvl_max)
        return target_lvls

    def get_sub_lod(self, sub_lvl):
        sub_lod = []
        max_batch_id = sub_lvl[-1]
        for i in range(max_batch_id.astype(np.int32) + 1):
            sub_lod.append(np.where(sub_lvl == i)[0].size)
        return sub_lod

    def add_multilevel_roi(self, rois, target_lvls, lvl_min, lvl_max):
        rois_idx_order = np.empty((0, ))
        rois_fpn = []
        for lvl in range(lvl_min, lvl_max + 1):
            idx_lvl = np.where(target_lvls == lvl)[0]
            if len(idx_lvl) == 0:
                rois_fpn.append((np.empty(shape=(0, 4)), [[0, 0]]))
                continue
            sub_lod = self.get_sub_lod(rois[idx_lvl, 0])
            rois_fpn.append((rois[idx_lvl, 1:], [sub_lod]))
            rois_idx_order = np.concatenate((rois_idx_order, idx_lvl))
        rois_idx_restore = np.argsort(rois_idx_order).astype(np.int32, copy=False)
        return rois_fpn, rois_idx_restore

    def calc_rois_distribute(self):
        lvl_min = self.roi_min_level
        lvl_max = self.roi_max_level
        target_lvls = self.map_rois_to_fpn_levels(self.rois[:, 1:5], lvl_min, lvl_max)
        rois_fpn, rois_idx_restore = self.add_multilevel_roi(
            self.rois, target_lvls, lvl_min, lvl_max)
        return rois_fpn, rois_idx_restore

    def make_rois(self):
        self.rois_lod = [[100, 200]]
        rois = []
        lod = self.rois_lod[0]
        bno = 0
        for roi_num in lod:
            for i in range(roi_num):
                xywh = np.random.rand(4)
                xy1 = xywh[0:2] * 20
                wh = xywh[2:4] * (self.images_shape - xy1)
                xy2 = xy1 + wh
                roi = [bno, xy1[0], xy1[1], xy2[0], xy2[1]]
                rois.append(roi)
            bno += 1
        self.rois = np.array(rois).astype("float32")

    def setUp(self):
        self.op_type = "distribute_fpn_proposals"
        self.set_data()

    def test_check_output(self):
        self.check_output()
浏览文件 @
2c4fcaa6
...
...
@@ -53,11 +53,15 @@ class MLP(fluid.imperative.Layer):
super
(
MLP
,
self
).
__init__
(
name_scope
)
self
.
_fc1
=
FC
(
self
.
full_name
(),
3
,
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)),
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
self
.
_fc2
=
FC
(
self
.
full_name
(),
4
,
fluid
.
ParamAttr
(
param_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)),
bias_attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)))
def
forward
(
self
,
inputs
):
...
...
@@ -74,41 +78,37 @@ class SimpleRNNCell(fluid.imperative.Layer):
self
.
step_input_size
=
step_input_size
self
.
hidden_size
=
hidden_size
self
.
output_size
=
output_size
self
.
_dype
=
core
.
VarDesc
.
VarType
.
FP32
from
paddle.fluid.layer_helper
import
LayerHelper
self
.
_helper
=
LayerHelper
(
'SimpleRNNCell'
,
act
=
"tanh"
,
param_attr
=
param_attr
)
self
.
_dtype
=
core
.
VarDesc
.
VarType
.
FP32
self
.
param_attr
=
param_attr
def
_build_once
(
self
,
inputs
,
pre_hidden
):
i2h_param_shape
=
[
self
.
step_input_size
,
self
.
hidden_size
]
h2h_param_shape
=
[
self
.
hidden_size
,
self
.
hidden_size
]
h2o_param_shape
=
[
self
.
output_size
,
self
.
hidden_size
]
self
.
_i2h_w
=
self
.
_helper
.
create_parameter
(
attr
=
self
.
_helper
.
param_attr
,
self
.
_i2h_w
=
self
.
create_parameter
(
attr
=
self
.
param_attr
,
shape
=
i2h_param_shape
,
dtype
=
self
.
_dtype
,
is_bias
=
False
)
self
.
_h2h_w
=
self
.
_helper
.
create_parameter
(
attr
=
self
.
_helper
.
param_attr
,
self
.
_h2h_w
=
self
.
create_parameter
(
attr
=
self
.
param_attr
,
shape
=
h2h_param_shape
,
dtype
=
self
.
_dtype
,
is_bias
=
False
)
self
.
_h2o_w
=
self
.
_helper
.
create_parameter
(
attr
=
self
.
_helper
.
param_attr
,
self
.
_h2o_w
=
self
.
create_parameter
(
attr
=
self
.
param_attr
,
shape
=
h2o_param_shape
,
dtype
=
self
.
_dtype
,
is_bias
=
False
)
def
forward
(
self
,
input
,
pre_hidden
):
tmp_i2h
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
tmp_h2h
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
hidden
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dype
)
out
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dype
)
softmax_out
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
reduce_out
=
self
.
_helper
.
create_variable_for_type_inference
(
self
.
_dtype
)
tmp_i2h
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
tmp_h2h
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
hidden
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
out
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
softmax_out
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
reduce_out
=
self
.
create_variable
(
dtype
=
self
.
_dtype
)
self
.
_helper
.
append_op
(
type
=
"mul"
,
inputs
=
{
"X"
:
input
,
...
...
@@ -132,7 +132,7 @@ class SimpleRNNCell(fluid.imperative.Layer):
outputs
=
{
'Out'
:
hidden
},
attrs
=
{
'axis'
:
-
1
,
'use_mkldnn'
:
False
})
hidden
=
self
.
_helper
.
append_activation
(
hidden
)
hidden
=
self
.
_helper
.
append_activation
(
hidden
,
act
=
'tanh'
)
self
.
_helper
.
append_op
(
type
=
"mul"
,
...
...
@@ -174,7 +174,7 @@ class SimpleRNN(fluid.imperative.Layer):
outs
=
list
()
pre_hiddens
=
list
()
init_hidden
=
fluid
.
layers
.
tensor
.
create_parameter
(
init_hidden
=
self
.
create_parameter
(
attr
=
fluid
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
Constant
(
value
=
0.1
)),
shape
=
[
1
,
3
],
...
...
@@ -337,10 +337,10 @@ class TestImperative(unittest.TestCase):
self
.
assertTrue
(
np
.
allclose
(
dy_grad
,
static_grad
))
params
=
mlp
.
parameters
(
True
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0
_0
.w_0"
,
params
[
0
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0
_0
.b_0"
,
params
[
1
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1
_0
.w_0"
,
params
[
2
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1
_0
.b_0"
,
params
[
3
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0.w_0"
,
params
[
0
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_0.b_0"
,
params
[
1
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1.w_0"
,
params
[
2
].
name
)
self
.
assertEqual
(
"mlp/MLP_0/FC_1.b_0"
,
params
[
3
].
name
)
self
.
assertEqual
(
len
(
params
),
4
)
sublayers
=
mlp
.
sublayers
(
True
)
...
...
python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
...
@@ -78,7 +78,7 @@ class SimpleImgConvPool(fluid.imperative.Layer):


 class MNIST(fluid.imperative.Layer):
-    def __init__(self, name_scope, param_attr=None, bias_attr=None):
+    def __init__(self, name_scope):
         super(MNIST, self).__init__(name_scope)

         self._simple_img_conv_pool_1 = SimpleImgConvPool(
...
python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
...
@@ -41,19 +41,17 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
         self._dropout = dropout
         self._input = None
         self._num_steps = num_steps
-        from paddle.fluid.layer_helper import LayerHelper
-        self._helper = LayerHelper('SimpleLSTMRNN', act="tanh")
+        self.cell_array = []
+        self.hidden_array = []

     def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
         self.weight_1_arr = []
         self.weight_2_arr = []
         self.bias_arr = []
-        self.hidden_array = []
-        self.cell_array = []
         self.mask_array = []

         for i in range(self._num_layers):
-            weight_1 = self._helper.create_parameter(
+            weight_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
                         low=-self._init_scale, high=self._init_scale)),
...
@@ -62,7 +60,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 default_initializer=fluid.initializer.UniformInitializer(
                     low=-self._init_scale, high=self._init_scale))
             self.weight_1_arr.append(weight_1)
-            bias_1 = self._helper.create_parameter(
+            bias_1 = self.create_parameter(
                 attr=fluid.ParamAttr(
                     initializer=fluid.initializer.UniformInitializer(
                         low=-self._init_scale, high=self._init_scale)),
...
@@ -71,6 +69,11 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
                 default_initializer=fluid.initializer.Constant(0.0))
             self.bias_arr.append(bias_1)

+    def forward(self, input_embedding, init_hidden=None, init_cell=None):
+        self.cell_array = []
+        self.hidden_array = []
+
         for i in range(self._num_layers):
             pre_hidden = fluid.layers.slice(
                 init_hidden, axes=[0], starts=[i], ends=[i + 1])
             pre_cell = fluid.layers.slice(
...
@@ -82,7 +85,6 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
             self.hidden_array.append(pre_hidden)
             self.cell_array.append(pre_cell)

-    def forward(self, input_embedding, init_hidden=None, init_cell=None):
         res = []
         for index in range(self._num_steps):
             self._input = fluid.layers.slice(
...
@@ -145,8 +147,6 @@ class PtbModel(fluid.imperative.Layer):
         self.num_layers = num_layers
         self.num_steps = num_steps
         self.dropout = dropout
-        from paddle.fluid.layer_helper import LayerHelper
-        self._helper = LayerHelper('PtbModel', act="tanh")
         self.simple_lstm_rnn = SimpleLSTMRNN(
             self.full_name(),
             hidden_size,
...
@@ -163,13 +163,13 @@ class PtbModel(fluid.imperative.Layer):
                 name='embedding_para',
                 initializer=fluid.initializer.UniformInitializer(
                     low=-init_scale, high=init_scale)))
-        self.softmax_weight = self._helper.create_parameter(
+        self.softmax_weight = self.create_parameter(
             attr=fluid.ParamAttr(),
             shape=[self.hidden_size, self.vocab_size],
             dtype="float32",
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))
-        self.softmax_bias = self._helper.create_parameter(
+        self.softmax_bias = self.create_parameter(
             attr=fluid.ParamAttr(),
             shape=[self.vocab_size],
             dtype="float32",
...
@@ -180,7 +180,6 @@ class PtbModel(fluid.imperative.Layer):
         pass

     def forward(self, input, label, init_hidden, init_cell):
         init_h = fluid.layers.reshape(
             init_hidden, shape=[self.num_layers, -1, self.hidden_size])
...