diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 2f31b182af7293488719e41a92b2ea78709bda02..c8c25086db1d58089604555b6c46dac0e6a1251e 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -99,6 +99,10 @@ struct Argument {
  private:                            \
   unique_ptr_t field__##_;
 
+  // Each predictor has a unique id.
+  // For now, this attr helps us get the right trt_engine for each
+  // trt_engine_op for each predictor when using trt.
+  DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int);
   // Model path
   DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string);
   // Model specified with program and parameters files.
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 59107f28080dceb0a58e17d42281db5f3773de56..9fa85f3762361e2916af6faa820241f5fe7e772f 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -217,6 +217,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
   return "";
 }
 
+static std::string GetTrtEngineSerializedPath(const std::string &model_root,
+                                              const std::string &engine_key) {
+  return model_root + "/trt_serialized_" + engine_key;
+}
+
+static std::string GetTrtEngineSerializedData(
+    const std::string &model_opt_cache_dir, const std::string &engine_key) {
+  std::string trt_serialized_path =
+      GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
+  if (FileExists(trt_serialized_path)) {
+    VLOG(3) << "Trt serialized file: " << trt_serialized_path
+            << " is found here";
+    std::ifstream infile(trt_serialized_path, std::ios::in);
+    std::stringstream buffer;
+    buffer << infile.rdbuf();
+    std::string trt_engine_serialized_data(buffer.str());
+    return trt_engine_serialized_data;
+  }
+  return "";
+}
+
+static void SaveTrtEngineSerializedDataToFile(
+    const std::string &trt_serialized_path,
+    const std::string &engine_serialized_data) {
+  std::ofstream outfile(trt_serialized_path);
+  outfile << engine_serialized_data;
+  outfile.close();
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 7476c199cfd073ec0962fa9a48f24750a6484bb5..6fe779524feb2870ff398ca17c4f69427e4a61af 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -81,6 +81,7 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set(
           "model_opt_cache_dir",
           new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      pass->Set("predictor_id", new int(argument->predictor_id()));
     }
 
     pre_pass = pass_name;
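Taken together, the three helpers above implement a simple file cache for serialized engines, keyed by engine_key under the model_opt_cache_dir. A minimal sketch of the intended round trip, assuming only the declarations above; the directory, key, and payload are hypothetical values for illustration:

    #include <string>
    #include "paddle/fluid/inference/analysis/helper.h"

    void TrtCacheRoundTrip() {
      namespace analysis = paddle::inference::analysis;
      const std::string cache_dir = "/tmp/model_opt_cache";  // hypothetical
      const std::string engine_key = "1234567890";           // hypothetical
      // First run: persist the serialized engine next to the optimized model.
      analysis::SaveTrtEngineSerializedDataToFile(
          analysis::GetTrtEngineSerializedPath(cache_dir, engine_key),
          "serialized-engine-bytes");
      // Later runs: an empty string signals a cache miss.
      std::string cached =
          analysis::GetTrtEngineSerializedData(cache_dir, engine_key);
    }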
#include "paddle/fluid/string/pretty_log.h" @@ -83,7 +85,8 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( } std::string GenerateEngineKey(const std::set &engine_inputs, - const std::set &engine_outputs) { + const std::set &engine_outputs, + const std::string &predictor_id) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -91,6 +94,7 @@ std::string GenerateEngineKey(const std::set &engine_inputs, for (auto name : engine_outputs) { engine_hash_key += name; } + engine_hash_key += predictor_id; auto engine_key = std::to_string(std::hash()(engine_hash_key)); return engine_key; } @@ -205,8 +209,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "parameters", params); auto enable_int8 = Get("enable_int8"); - auto engine_key = - GenerateEngineKey(input_names_with_id, output_names_with_id); + int predictor_id = Get("predictor_id"); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(predictor_id)); // Get "" when there is no cached calibration table data. std::string calibration_data = GetTrtCalibTableData( @@ -215,10 +220,53 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); + SetAttr(op_desc->Proto(), "engine_serialized_data", std::string("")); + SetAttr(op_desc->Proto(), "engine_serialized_data_path", + GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), + engine_key)); + + std::unique_ptr calibrator; + if (enable_int8 && calibration_data.size() != 0) { + calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); + } - if (!(enable_int8 && calibration_data.size() == 0)) { + // When in int8 mode and calibration_mode, the program just produce the + // calibration table data. + bool calibration_mode = (enable_int8 && calibration_data.size() == 0); + if (!calibration_mode) { std::copy(params.begin(), params.end(), std::back_inserter(*repetitive_params)); + std::string trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); + + tensorrt::TensorRTEngine *trt_engine = + inference::Singleton::Global().Create( + Get("max_batch_size"), Get("workspace_size"), enable_int8, + calibrator.get(), engine_key); + if (trt_engine_serialized_data.size() == 0) { + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). 
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index da2e9803f0467f2b83d79cdd06d4317d41630b04..7149f16b360465bee03a77b6542d0433953e484a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -342,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
       config_.static_memory_optim_force_update_);
   argument_.SetModelFromMemory(config_.model_from_memory_);
   // Analyze inference_program
+  argument_.SetPredictorID(predictor_id_);
   if (!config_.model_dir().empty()) {
     argument_.SetModelDir(config_.model_dir());
   } else {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index 014df4ee8b6d86232212736c43a9aff32ffee011..732ea8061b6f0ca39e848d33f91dbbe4c203cc8f 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -21,6 +21,7 @@
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
 #ifdef PADDLE_WITH_TESTING
@@ -43,7 +44,9 @@ using framework::NaiveExecutor;
  */
 class AnalysisPredictor : public PaddlePredictor {
  public:
-  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {}
+  explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {
+    predictor_id_ = inference::GetUniqueId();
+  }
   ~AnalysisPredictor();
 
   bool Init(const std::shared_ptr<framework::Scope> &parent_scope,
@@ -143,6 +146,7 @@ class AnalysisPredictor : public PaddlePredictor {
   const size_t max_shape_collect_count_{1000};
   int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
   std::vector<std::vector<std::pair<std::string, std::vector<int>>>> batch_var_shapes_;
+  int predictor_id_;
 
  private:
   // Some status here that help to determine the status inside the predictor.
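The id comes from inference::GetUniqueId(), added in api/helper.h below. That counter is a plain static int, which is only safe when predictors are constructed from a single thread; a possible thread-safe variant (an alternative sketch, not what this patch does) would be:

    #include <atomic>

    // Alternative id generator using an atomic counter, in case predictors
    // can be created concurrently. The patch itself uses a plain static int.
    static int GetUniqueId() {
      static std::atomic<int> id{0};
      return id++;
    }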
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index b92781e4f2c612cbb39fcaa7c80b6051a67215fd..ec3bef42fd91cea04a656dd38a4e5c45c1a76476 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -50,6 +50,11 @@ class Timer {
   }
 };
 
+static int GetUniqueId() {
+  static int id = 0;
+  return id++;
+}
+
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
   pieces->clear();
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index ab50758c824e26b96b8813c9c4df99f28e0dd25b..8484daaa128189f70a9171f9b8bcb4fdddf54abf 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -143,6 +143,7 @@ class OpConverter {
     }
   }
 
+  // The scope here should be inited with the parameter vars.
   void ConvertBlockToTRTEngine(
       framework::BlockDesc* block_desc, const framework::Scope& scope,
       const std::vector<std::string>& inputs,
@@ -151,18 +152,16 @@ class OpConverter {
     engine->InitNetwork();
     for (auto& input : inputs) {
       if (parameters.count(input)) continue;
-      auto& t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
-      auto t_shape = framework::vectorize(t.dims());
-
       auto* var = block_desc->FindVar(input);
       PADDLE_ENFORCE(var, "no variable called %s", input);
       PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                         "TensorRT engine only takes LoDTensor as input");
+      auto var_shape = var->GetShape();
+
       engine->DeclareInput(
           input, FluidDataType2TRT(
                      var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(t_shape));
+          Vec2TRT_Dims(var_shape));
     }
     framework::proto::BlockDesc* block_proto = block_desc->Proto();
     ConvertBlock(*block_proto, parameters, scope, engine);
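The ConvertBlockToTRTEngine change above is what allows building the engine at analysis time: input shapes are read from the compile-time VarDesc in the BlockDesc instead of LoDTensors that would only exist in the scope after data is fed. A small sketch of that lookup, assuming the framework::BlockDesc API; the variable name is hypothetical:

    #include <cstdint>
    #include <vector>
    #include "paddle/fluid/framework/block_desc.h"

    // Read the declared shape of an input var straight from the block proto,
    // with no tensor in any scope required. "x" is a hypothetical name.
    std::vector<int64_t> DeclaredShape(paddle::framework::BlockDesc *block) {
      auto *var = block->FindVar("x");
      return var != nullptr ? var->GetShape() : std::vector<int64_t>{};
    }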
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index e1005e9b0335a51bb6a69db8efff70ab2e60576d..cc378f4abdb970099105f7d132ace8df89b68419 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -104,6 +104,34 @@ class TensorRTEngine {
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
+
+  nvinfer1::IHostMemory* Serialize() {
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "You should build the engine first and then serialize it.");
+    ihost_memory_.reset(infer_engine_->serialize());
+    return ihost_memory_.get();
+  }
+
+  void Deserialize(const std::string& engine_serialized_data) {
+    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+    infer_engine_.reset(
+        runtime->deserializeCudaEngine(engine_serialized_data.c_str(),
+                                       engine_serialized_data.size(), nullptr));
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "building cuda engine failed when deserializing engine info");
+    infer_context_.reset(infer_engine_->createExecutionContext());
+  }
+
+  void Deserialize(const nvinfer1::IHostMemory* engine_serialized_data) {
+    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+    infer_engine_.reset(runtime->deserializeCudaEngine(
+        engine_serialized_data->data(), engine_serialized_data->size(),
+        nullptr));
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "building cuda engine failed when deserializing engine info");
+    infer_context_.reset(infer_engine_->createExecutionContext());
+  }
+
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
   nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
@@ -154,11 +182,11 @@ class TensorRTEngine {
   infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
   infer_ptr<nvinfer1::IExecutionContext> infer_context_;
+  infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
 };  // class TensorRTEngine
 
 // Add a layer__ into engine__ with args ARGS.
 // For example:
-//   TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
 //
 // Reference
 // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
@@ -170,6 +198,43 @@ class TensorRTEngine {
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
   engine__->network()->add##layer__(ARGS);
 
+/*
+ * Helper to control the TensorRT engine's creation and deletion.
+ */
+class TRTEngineManager {
+ public:
+  bool HasEngine(const std::string& name) const {
+    if (engines_.count(name) == 0) return false;
+    return engines_.at(name).get() != nullptr;
+  }
+
+  // Get an engine called `name`.
+  TensorRTEngine* Get(const std::string& name) const {
+    return engines_.at(name).get();
+  }
+
+  // Create an engine called `name`; any existing engine under the same
+  // name is replaced.
+  TensorRTEngine* Create(int max_batch, int max_workspace, bool enable_int8,
+                         TRTInt8Calibrator* calibrator,
+                         const std::string& engine_name) {
+    std::unique_lock<std::mutex> lk(mut_);
+    auto* p =
+        new TensorRTEngine(max_batch, max_workspace, enable_int8, calibrator);
+    engines_[engine_name].reset(p);
+    return p;
+  }
+
+  void DeleteALL() {
+    for (auto& item : engines_) {
+      item.second.reset(nullptr);
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
+  std::mutex mut_;
+};
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index 784290fa44f46820736ffef125351437b039f2b4..0975a66ec6fb03be9a4e4be38002fcd760753740 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -191,9 +191,8 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
   std::vector<void *> buffers(2);  // TRT binded inputs
   nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
 
-  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
-                                          *const_cast<nvinfer1::ITensor *>(x),
-                                          pool_t, nvinfer1::DimsHW{2, 2});
+  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
+                                          nvinfer1::DimsHW{2, 2});
 
   PADDLE_ENFORCE(pool_layer != nullptr);
   pool_layer->setStride(nvinfer1::DimsHW{1, 1});
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index 031335009b692f9d1f73070c88e8e79d852cbe36..a8c86de9f9a1aea9ecdedd750757ec7d25cdf2f3 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
     AddAttr<std::string>("calibration_data", "the calibration data for int8");
+    AddAttr<std::string>(
+        "engine_serialized_data",
+        "the serialized data that contains all the info of the ICudaEngine");
     AddAttr<std::string>(
         "engine_key",
         "The engine_key here is used to distinguish different TRT Engines");
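TRTEngineManager ties the analysis pass to the runtime op: the pass Creates an engine under the generated key, and TensorRTEngineOp (next diff) fetches it by the same key instead of rebuilding. A minimal sketch of that handshake, assuming the Singleton utility used elsewhere in this patch; sizes and key are hypothetical:

    #include <string>
    #include "paddle/fluid/inference/tensorrt/engine.h"
    #include "paddle/fluid/inference/utils/singleton.h"

    namespace trt = paddle::inference::tensorrt;
    using paddle::inference::Singleton;

    void EngineHandshake() {
      const std::string key = "engine_key_0";  // hypothetical
      // Analysis side: build (or deserialize into) an engine under `key`.
      trt::TensorRTEngine *engine =
          Singleton<trt::TRTEngineManager>::Global().Create(
              /*max_batch=*/1, /*max_workspace=*/1 << 20,
              /*enable_int8=*/false, /*calibrator=*/nullptr, key);
      // Runtime side: the op looks the prebuilt engine up by the same key.
      if (Singleton<trt::TRTEngineManager>::Global().HasEngine(key)) {
        engine = Singleton<trt::TRTEngineManager>::Global().Get(key);
      }
      (void)engine;
    }

Note that the mutex only guards Create; concurrent Get/HasEngine during creation would still race.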
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index dcc046648a0c4843ef538c51cb41931f2c78e289..ab6f403ced6332a0ee7c05b760cdc1a0af9db09b 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -41,13 +41,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
  private:
   std::vector<std::string> input_names_;
   std::unordered_set<std::string> param_names_;
-  mutable std::unique_ptr<TensorRTEngine> trt_engine_;
+  mutable TensorRTEngine *trt_engine_;
   int max_batch_size_;
   int workspace_size_;
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
   bool enable_int8_;
   std::string calibration_data_;
   std::string engine_key_;
+  std::string engine_serialized_data_;
   bool calibration_mode_;
 
  public:
@@ -62,6 +63,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
     enable_int8_ = Attr<bool>("enable_int8");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");
+    engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
+    trt_engine_ = nullptr;
 
     auto params = Attr<std::vector<std::string>>("parameters");
     for (const auto &param : params) {
@@ -78,7 +81,12 @@ class TensorRTEngineOp : public framework::OperatorBase {
     // we will create an engine here.
     if (!calibration_mode_) {
-      // trt_engine_.reset();
+      if (inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+              .HasEngine(engine_key_)) {
+        trt_engine_ = inference::Singleton<
+                          inference::tensorrt::TRTEngineManager>::Global()
+                          .Get(engine_key_);
+      }
     }
   }
 
@@ -99,7 +107,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
       RunCalibration(scope, dev_place);
       return;
     }
-    auto trt_engine = GetEngine(scope, dev_place);
+    auto *trt_engine = GetEngine(scope, dev_place);
     RunTrt(scope, dev_place, trt_engine);
   }
 
@@ -158,7 +166,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
-    // auto *engine = trt_engine_.get();
     PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
 
     std::vector<std::string> output_maps =
@@ -192,8 +199,9 @@ class TensorRTEngineOp : public framework::OperatorBase {
     int output_index = 0;
     VLOG(4) << "TensorRT Engine Op Outputs:";
     for (const auto &y : Outputs("Ys")) {
-      nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
-      auto dims = trt_t->getDimensions();
+      const int bind_index =
+          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
+      auto dims = engine->engine()->getBindingDimensions(bind_index);
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       // The ITensor doesn't contain the batch size dim.
       std::vector<int> ddim;
@@ -206,8 +214,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
       auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
       fluid_t->Resize(framework::make_ddim(ddim));
 
-      const int bind_index =
-          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
       PADDLE_ENFORCE(bind_index < num_bindings,
                      "The bind index should be less than num_bindings");
       buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data<float>(
@@ -224,16 +230,14 @@ class TensorRTEngineOp : public framework::OperatorBase {
 
   TensorRTEngine *GetEngine(const framework::Scope &scope,
                             const platform::Place &dev_place) const {
-    if (trt_engine_.get() == nullptr) {
-      trt_engine_.reset(new TensorRTEngine(max_batch_size_, workspace_size_,
-                                           enable_int8_, calibrator_.get()));
-      if (true) {
-        PrepareTRTEngine(scope, trt_engine_.get());
-      } else {
-        // create static engine
-      }
+    if (trt_engine_ == nullptr) {
+      trt_engine_ =
+          inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
+              .Create(max_batch_size_, workspace_size_, enable_int8_,
+                      calibrator_.get(), engine_key_);
+      PrepareTRTEngine(scope, trt_engine_);
     }
-    return trt_engine_.get();
+    return trt_engine_;
   }
 
   void PrepareTRTEngine(const framework::Scope &scope,
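The output handling above switches from querying the network definition (engine->GetITensor) to querying the built ICudaEngine's bindings. This matters for deserialized engines, which carry no INetworkDefinition to inspect. The underlying TensorRT calls in isolation:

    #include <NvInfer.h>

    // Recover an output's dims from a built (possibly deserialized) engine.
    // With implicit-batch engines the returned dims exclude the batch dim.
    nvinfer1::Dims OutputDims(nvinfer1::ICudaEngine *engine, const char *name) {
      const int bind_index = engine->getBindingIndex(name);
      return engine->getBindingDimensions(bind_index);
    }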
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 5a3d9d2c1a3e8111acbad2ddcf4f5469a3a99751..e7ad2f4fe0c654d8928f5793c1ad8052ab766fb5 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z0"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
+  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z3"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
+  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
 
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);