diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index fa927a7da225f7527297e71b0f4913fb19196fe1..f5a51b7c3bc4e77864451362fe4b3dbf6643da29 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -907,6 +907,15 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
     return false;
   }
 
+#ifdef PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled()) {
+    inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
+        predictor_id_;
+    VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: "
+            << inference::tensorrt::TensorRTEngine::predictor_id_per_thread;
+  }
+#endif
+
   // Run the inference program
   // if share variables, we need not create variables
   executor_->Run();
@@ -1630,6 +1639,16 @@ bool AnalysisPredictor::ZeroCopyRun() {
     MkldnnPreSet(shape_vector);
   }
 #endif
+
+#ifdef PADDLE_WITH_TENSORRT
+  if (config_.tensorrt_engine_enabled()) {
+    inference::tensorrt::TensorRTEngine::predictor_id_per_thread =
+        predictor_id_;
+    VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: "
+            << inference::tensorrt::TensorRTEngine::predictor_id_per_thread;
+  }
+#endif
+
   executor_->Run();
 
   if (config_.shape_range_info_collected()) {
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 745f4163cfd5d4ec79e453e8d42f422ad65f8979..ebce2508c04325a0eb0c0ff6f1072b7c58c0a62d 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -30,6 +30,9 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+int TensorRTEngine::runtime_batch_ = 1;
+thread_local int TensorRTEngine::predictor_id_per_thread = -1;
+
 void TensorRTEngine::Weight::SetDataType(phi::DataType type) {
   nvinfer1::DataType nv_type = nvinfer1::DataType::kFLOAT;
   switch (type) {
@@ -59,8 +62,6 @@ void TensorRTEngine::Weight::SetDataType(phi::DataType type) {
   w_.type = nv_type;
 }
 
-int TensorRTEngine::runtime_batch_ = 1;
-
 void TensorRTEngine::InitNetwork() {
   freshDeviceId();
   infer_builder_.reset(createInferBuilder(&logger_));
@@ -680,8 +681,9 @@ void TensorRTEngine::GetEngineInfo() {
   LOG(INFO) << "====== engine info ======";
   std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
       infer_engine_->createEngineInspector());
-  auto infer_context = context();
-  infer_inspector->setExecutionContext(infer_context);
+  auto infer_context = infer_ptr<nvinfer1::IExecutionContext>(
+      infer_engine_->createExecutionContextWithoutDeviceMemory());
+  infer_inspector->setExecutionContext(infer_context.get());
   LOG(INFO) << infer_inspector->getEngineInformation(
       nvinfer1::LayerInformationFormat::kONELINE);
   LOG(INFO) << "====== engine info end ======";
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index 8aa8c98cdc5e155c4cc498ac3d5f29abccccf6fa..861a2aa8dfbd7d20536eb2ef031316d032ae1437 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -177,6 +177,7 @@ class TRTInt8Calibrator;
 class TensorRTEngine {
   using DescType = ::paddle::framework::proto::BlockDesc;
   using ShapeMapType = std::map<std::string, std::vector<int>>;
+  using PredictorID = int;
 
  public:
   // Weight is model parameter.
@@ -286,9 +287,17 @@ class TensorRTEngine {
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
 
   nvinfer1::IExecutionContext* context() {
+#ifndef PADDLE_WITH_TESTING
+    PADDLE_ENFORCE_GT(
+        predictor_id_per_thread,
+        -1,
+        platform::errors::InvalidArgument(
+            "thread local var predictor_id_per_thread must be "
+            "initialized to >= 0, but now predictor_id_per_thread = %d",
+            predictor_id_per_thread));
+#endif
     std::unique_lock<std::mutex> lock(mutex_);
-    const std::thread::id tid = std::this_thread::get_id();
-    if (infer_context_.find(tid) == infer_context_.end()) {
+    if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) {
       PADDLE_ENFORCE_NOT_NULL(
           infer_engine_,
           platform::errors::InvalidArgument(
@@ -296,24 +305,34 @@ class TensorRTEngine {
       // We may see trt warning: Profile 0 has been chosen by another
       // IExecutionContext...
       // It's ok. We will set it later.
-      infer_context_[tid].reset(infer_engine_->createExecutionContext());
+      infer_context_[predictor_id_per_thread].reset(
+          infer_engine_->createExecutionContext());
       if (with_dynamic_shape_) {
         // need new profile if it's not the first
         if (cur_profile_num_ > 0) {
-          infer_context_[tid]->setOptimizationProfile(cur_profile_num_);
+          infer_context_[predictor_id_per_thread]->setOptimizationProfile(
+              cur_profile_num_);
         }
-        profile_index_[tid] = cur_profile_num_;
+        profile_index_[predictor_id_per_thread] = cur_profile_num_;
         ++cur_profile_num_;
       }
     }
-    return infer_context_[tid].get();
+    return infer_context_[predictor_id_per_thread].get();
   }
 
   int GetProfileIndex() {
     if (max_profile_num_ > 1) {
+#ifndef PADDLE_WITH_TESTING
+      PADDLE_ENFORCE_GT(
+          predictor_id_per_thread,
+          -1,
+          platform::errors::InvalidArgument(
+              "thread local var predictor_id_per_thread must be "
+              "initialized to >= 0, but now predictor_id_per_thread = %d",
+              predictor_id_per_thread));
+#endif
       std::unique_lock<std::mutex> lock(mutex_);
-      const std::thread::id tid = std::this_thread::get_id();
-      return profile_index_[tid];
+      return profile_index_[predictor_id_per_thread];
     } else {
       return 0;
     }
@@ -326,14 +345,22 @@ class TensorRTEngine {
   int GetNbBindings() { return binding_num_; }
 
   void ResetContext() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    const std::thread::id tid = std::this_thread::get_id();
     PADDLE_ENFORCE_NOT_NULL(
         infer_engine_,
         platform::errors::InvalidArgument(
            "You should build engine first and then set the context."));
-    infer_context_[tid].reset(nullptr);
-    infer_context_.erase(tid);
+#ifndef PADDLE_WITH_TESTING
+    PADDLE_ENFORCE_GT(
+        predictor_id_per_thread,
+        -1,
+        platform::errors::InvalidArgument(
+            "thread local var predictor_id_per_thread must be "
+            "initialized to >= 0, but now predictor_id_per_thread = %d",
+            predictor_id_per_thread));
+#endif
+    std::unique_lock<std::mutex> lock(mutex_);
+    infer_context_[predictor_id_per_thread].reset(nullptr);
+    infer_context_.erase(predictor_id_per_thread);
   }
 
   nvinfer1::IHostMemory* Serialize() {
@@ -686,7 +713,7 @@ class TensorRTEngine {
   int device_id_;
   int max_profile_num_{1};
   int cur_profile_num_{0};
-  std::unordered_map<std::thread::id, int> profile_index_;
+  std::unordered_map<PredictorID, int> profile_index_;
   ShapeMapType min_input_shape_;
   ShapeMapType max_input_shape_;
   ShapeMapType optim_input_shape_;
@@ -723,7 +750,7 @@ class TensorRTEngine {
   infer_ptr<nvinfer1::IBuilder> infer_builder_;
   infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
-  std::unordered_map<std::thread::id, infer_ptr<nvinfer1::IExecutionContext>>
+  std::unordered_map<PredictorID, infer_ptr<nvinfer1::IExecutionContext>>
       infer_context_;
   infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
   std::unordered_map<std::string, float> quant_dynamic_range_;
@@ -741,6 +768,9 @@ class TensorRTEngine {
 #endif
   std::mutex mutex_;
   bool use_inspector_;
+
+ public:
+  thread_local static int predictor_id_per_thread;
 };  // class TensorRTEngine
 
 // Add a layer__ into engine__ with args ARGS.
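
Note: the following is a minimal standalone sketch of the pattern this patch adopts, not Paddle's actual code; the names Engine, FakeContext, and run are hypothetical stand-ins. It illustrates how each predictor publishes its id into a thread_local variable before executing, and how the engine keys its execution-context map on that id instead of std::this_thread::get_id(), so contexts belong to predictors rather than to whichever thread happens to call in.

// Standalone illustration (C++14); illustrative names only.
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>

struct FakeContext {  // stand-in for nvinfer1::IExecutionContext
  explicit FakeContext(int id) : owner(id) {}
  int owner;
};

class Engine {
 public:
  // Set by the caller (the "predictor") before running, analogous to
  // TensorRTEngine::predictor_id_per_thread in the patch.
  thread_local static int predictor_id_per_thread;

  FakeContext* context() {
    std::unique_lock<std::mutex> lock(mutex_);
    auto it = contexts_.find(predictor_id_per_thread);
    if (it == contexts_.end()) {
      // Lazily create one context per predictor id.
      it = contexts_
               .emplace(predictor_id_per_thread,
                        std::make_unique<FakeContext>(predictor_id_per_thread))
               .first;
    }
    return it->second.get();
  }

 private:
  std::mutex mutex_;
  std::unordered_map<int, std::unique_ptr<FakeContext>> contexts_;
};

thread_local int Engine::predictor_id_per_thread = -1;  // -1 means "not set"

int main() {
  Engine engine;
  auto run = [&engine](int predictor_id) {
    // Mirrors what AnalysisPredictor::Run()/ZeroCopyRun() now do before
    // executing: publish the predictor id for this thread.
    Engine::predictor_id_per_thread = predictor_id;
    std::cout << "predictor " << predictor_id << " uses context owned by "
              << engine.context()->owner << "\n";
  };
  std::thread t1(run, 0), t2(run, 1);
  t1.join();
  t2.join();
  return 0;
}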