Remove tools/infer_prune_patches (#53596)

a37ef769 · Kim Yann · GitHub · 7e9c87c5 · 7e9c87c5 · 7e9c87c5
12 changed files
--- a/tools/infer_prune_patches/analysis_predictor.cc.patch
+++ b/tools/infer_prune_patches/analysis_predictor.cc.patch
-diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
-index 0645af611b..6b05a7fffb 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
-+++ b/paddle/fluid/inference/api/analysis_predictor.cc
-@@ -1923,7 +1923,7 @@ USE_TRT_CONVERTER(shuffle_channel);
- USE_TRT_CONVERTER(swish);
- USE_TRT_CONVERTER(group_norm);
- USE_TRT_CONVERTER(instance_norm);
-USE_TRT_CONVERTER(layer_norm);
-+//USE_TRT_CONVERTER(layer_norm);
- USE_TRT_CONVERTER(gelu);
- USE_TRT_CONVERTER(multihead_matmul);
- USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
-@@ -1933,13 +1933,13 @@ USE_TRT_CONVERTER(scale);
- USE_TRT_CONVERTER(stack);
- USE_TRT_CONVERTER(clip);
- USE_TRT_CONVERTER(gather);
-USE_TRT_CONVERTER(anchor_generator);
-+//USE_TRT_CONVERTER(anchor_generator);
- USE_TRT_CONVERTER(yolo_box);
- USE_TRT_CONVERTER(yolo_box_head);
- USE_TRT_CONVERTER(arg_max);
-USE_TRT_CONVERTER(roi_align);
-USE_TRT_CONVERTER(affine_channel);
-USE_TRT_CONVERTER(multiclass_nms);
-+//USE_TRT_CONVERTER(roi_align);
-+//USE_TRT_CONVERTER(affine_channel);
-+//USE_TRT_CONVERTER(multiclass_nms);
- USE_TRT_CONVERTER(multiclass_nms3);
- USE_TRT_CONVERTER(nearest_interp);
- USE_TRT_CONVERTER(nearest_interp_v2);
--- a/tools/infer_prune_patches/analyzer.cc.patch
+++ b/tools/infer_prune_patches/analyzer.cc.patch
-diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
-index be7d6ab868..498e09cb4d 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
-+++ b/paddle/fluid/inference/analysis/analyzer.cc
-@@ -32,6 +32,9 @@ void Analyzer::RunAnalysis(Argument *argument) {
-                         "analsis_passes is not valid in the argument."));
-   const bool disable_logs = argument->disable_logs();
-   for (auto &pass : argument->analysis_passes()) {
-+    if (pass == "ir_params_sync_among_devices_pass") {
-+      continue;
-+    }
-     if (!disable_logs) {
-       string::PrettyLogH1("--- Running analysis [%s]", pass);
-     }
--- a/tools/infer_prune_patches/device_context.cc.patch
+++ b/tools/infer_prune_patches/device_context.cc.patch
-diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
-index 904e4854ba..4f8c955d8c 100644
--- a/paddle/fluid/platform/device_context.cc
-+++ b/paddle/fluid/platform/device_context.cc
-@@ -466,15 +466,15 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
-   place_ = place;
-   CUDADeviceGuard guard(place_.device);
-   stream_.reset(new stream::CUDAStream(place, priority, flag));
-  InitEigenContext();
-  InitCuBlasContext();
-  InitCuDNNContext();
-+  //InitEigenContext();
-+  //InitCuBlasContext();
-+  //InitCuDNNContext();
- #ifndef PADDLE_WITH_HIP
- #if CUDA_VERSION >= 11060
-  InitCuBlasLtContext();
-+  //InitCuBlasLtContext();
- #endif
-  InitCuSparseContext();
-  InitCuSolverContext();
-+  //InitCuSparseContext();
-+  //InitCuSolverContext();
- #endif
- }
- 
-@@ -506,14 +506,14 @@ void CUDAContext::SetStream(gpuStream_t stream) {
- 
- CUDAContext::~CUDAContext() {
-   CUDADeviceGuard guard(place_.device);
-  DestoryCuDNNContext();
-  DestoryCuBlasContext();
-+  //DestoryCuDNNContext();
-+  //DestoryCuBlasContext();
- #ifndef PADDLE_WITH_HIP
- #if CUDA_VERSION >= 11060
-  InitCuBlasLtContext();
-+  //InitCuBlasLtContext();
- #endif
-  DestoryCuSparseContext();
-  DestoryCuSolverContext();
-+  //DestoryCuSparseContext();
-+  //DestoryCuSolverContext();
- #endif
- }
- 
--- a/tools/infer_prune_patches/jitcode.h.patch
+++ b/tools/infer_prune_patches/jitcode.h.patch
-diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
-index d71497275d..cb56e1d949 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
-+++ b/paddle/fluid/operators/jit/gen/jitcode.h
-@@ -102,8 +102,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
-     }
-     ret();
-   }
-  void L(const char* label) { Xbyak::CodeGenerator::L(label); }
-  void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }  // NOLINT
-+  void L(const char* label) {  }
-+  void L(Xbyak::Label& label) {  }  // NOLINT
-   // Enhanced vector extension
-   Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
-                                     int offt,
--- a/tools/infer_prune_patches/op_registry.h.patch
+++ b/tools/infer_prune_patches/op_registry.h.patch
-diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
-index d38efbff31..f5bef776d6 100644
--- a/paddle/fluid/framework/op_registry.h
-+++ b/paddle/fluid/framework/op_registry.h
-@@ -186,9 +186,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
-         library_type,
-         customized_type_value,
- 
-        [op_type](const framework::ExecutionContext& ctx) {
-+        [](const framework::ExecutionContext& ctx) {
-           KERNEL_TYPE().Compute(ctx);
-          CheckKernelLaunch<PlaceType>(op_type);
-         });
-     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
-     OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
-@@ -257,15 +256,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-   void operator()(const char* op_type,
-                   const char* library_type,
-                   int customized_type_value) const {
-    RegisterKernelClass<PlaceType, T>(
-        op_type,
-        library_type,
-        customized_type_value,
-
-        [op_type](const framework::ExecutionContext& ctx) {
-          Functor()(ctx);
-          CheckKernelLaunch<PlaceType>(op_type);
-        });
-+    RegisterKernelClass<PlaceType, T>(op_type, library_type,
-+                                      customized_type_value, Functor());
- 
-     constexpr auto size =
-         std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
-@@ -296,7 +288,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-     VarTypeInference
-     InferShapeBase
- */
-#define REGISTER_OPERATOR(op_type, op_class, ...)                        \
-+#define REGISTER_OPERATOR__(op_type, op_class, ...)                        \
-   STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
-       __reg_op__##op_type,                                               \
-       "REGISTER_OPERATOR must be called in global namespace");           \
-@@ -307,15 +299,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-     return 0;                                                            \
-   }
- 
-#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, ...) \
-+#define REGISTER_OPERATOR(op_type, op_class, ...)
-+
-+#define REGISTER_OP_WITHOUT_GRADIENT__(op_type, op_class, ...) \
-   REGISTER_OPERATOR(op_type, op_class, __VA_ARGS__, \
-         paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,   \
-         paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>)
- 
-+#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, ...)
- /**
-  * Macro to register OperatorKernel.
-  */
- #define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type,             \
-+                                            place_class, customized_name,      \
-+                                            customized_type_value, ...)
-+
-+#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE__(op_type, library_type,             \
-                                             place_class, customized_name,      \
-                                             customized_type_value, ...)        \
-   STATIC_ASSERT_GLOBAL_NAMESPACE(                                              \
-@@ -332,18 +331,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-     return 0;                                                                  \
-   }
- 
-#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)   \
-  REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(                                \
-+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)
-+
-+#define REGISTER_OP_KERNEL__(op_type, library_type, place_class, ...)   \
-+  REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE__(                                \
-       op_type, library_type, place_class, DEFAULT_TYPE,               \
-       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
-       __VA_ARGS__)
- 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-+#define REGISTER_OP_CUDA_KERNEL__(op_type, ...) \
-+  REGISTER_OP_KERNEL__(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
-+
- #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
-   REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
-#else
-#define REGISTER_OP_CUDA_KERNEL(op_type, ...)
-#endif
-+
-+#define REGISTER_OP_CPU_KERNEL__(op_type, ...) \
-+  REGISTER_OP_KERNEL__(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
- 
- #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
-   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
-@@ -361,6 +364,11 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-   REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
- 
- #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class,  \
-+                              customized_name,                     \
-+                              customized_type_value,               \
-+			      ...)
-+
-+#define REGISTER_OP_KERNEL_EX__(op_type, library_type, place_class,  \
-                               customized_name,                     \
-                               customized_type_value,               \
-                               ...)                                 \
-@@ -378,8 +386,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-     return 0;                                                                  \
-   }
- 
-#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...)                 \
-  REGISTER_OP_KERNEL_EX(                                              \
-+#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...)
-+
-+#define REGISTER_OP_CUDA_KERNEL_FUNCTOR__(op_type, ...)                 \
-+  REGISTER_OP_KERNEL_EX__(                                              \
-       op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE,     \
-       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
-       __VA_ARGS__)
-@@ -396,12 +406,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
-       __VA_ARGS__)
- 
-#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...)                  \
-  REGISTER_OP_KERNEL_EX(                                              \
-      op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE,       \
-      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
-      __VA_ARGS__)
-
- #define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...)                  \
-   REGISTER_OP_KERNEL_EX(                                              \
-       op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE,       \
-@@ -413,7 +417,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-  * we will use and tell the compiler to
-  * link them into target.
-  */
-#define USE_OP_ITSELF(op_type)                             \
-+#define USE_OP_ITSELF(op_type)
-+
-+#define USE_OP_ITSELF__(op_type)                             \
-   STATIC_ASSERT_GLOBAL_NAMESPACE(                          \
-       __use_op_itself_##op_type,                           \
-       "USE_OP_ITSELF must be called in global namespace"); \
-@@ -421,6 +427,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-   UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
- 
- #define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type,                     \
-+                                              LIBRARY_TYPE,                \
-+                                              customized_name)
-+
-+#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE__(op_type,                     \
-                                               LIBRARY_TYPE,                \
-                                               customized_name)             \
-   STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-@@ -431,33 +441,58 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
-   UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
-       TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
- 
-#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
-  USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, DEFAULT_TYPE)
-+#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type,                \
-+                                              LIBRARY_TYPE,           \
-+                                              customized_name)
-+
-+#define USE_OP_DEVICE_KERNEL__(op_type, LIBRARY_TYPE) \
-+  USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE__(op_type, LIBRARY_TYPE, DEFAULT_TYPE)
-+
-+#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE)
- 
- // TODO(fengjiayi): The following macros
- // seems ugly, do we have better method?
- 
-#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
-+#ifndef PADDLE_WITH_CUDA
- #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
-+#define USE_OP_KERNEL__(op_type) USE_OP_DEVICE_KERNEL__(op_type, CPU)
- #else
- #define USE_OP_KERNEL(op_type)        \
-   USE_OP_DEVICE_KERNEL(op_type, CPU); \
-   USE_OP_DEVICE_KERNEL(op_type, CUDA)
-+
-+#define USE_OP_KERNEL__(op_type)        \
-+  USE_OP_DEVICE_KERNEL__(op_type, CPU); \
-+  USE_OP_DEVICE_KERNEL__(op_type, CUDA)
- #endif
- 
- #define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
- 
-+#define USE_NO_KERNEL_OP__(op_type) USE_OP_ITSELF__(op_type);
-+
- #define USE_CPU_ONLY_OP(op_type) \
-   USE_OP_ITSELF(op_type);        \
-   USE_OP_DEVICE_KERNEL(op_type, CPU);
- 
-+#define USE_CPU_ONLY_OP__(op_type) \
-+  USE_OP_ITSELF__(op_type);        \
-+  USE_OP_DEVICE_KERNEL__(op_type, CPU);
-+
- #define USE_CUDA_ONLY_OP(op_type) \
-   USE_OP_ITSELF(op_type);         \
-   USE_OP_DEVICE_KERNEL(op_type, CUDA)
- 
-+#define USE_CUDA_ONLY_OP__(op_type) \
-+  USE_OP_ITSELF__(op_type);         \
-+  USE_OP_DEVICE_KERNEL__(op_type, CUDA)
-+
- #define USE_OP(op_type)   \
-   USE_OP_ITSELF(op_type); \
-   USE_OP_KERNEL(op_type)
-+
-+#define USE_OP__(op_type)   \
-+  USE_OP_ITSELF__(op_type); \
-+  USE_OP_KERNEL__(op_type)
- // clang-format on
- 
- }  // namespace framework
--- a/tools/infer_prune_patches/paddle_analysis_config.h.patch
+++ b/tools/infer_prune_patches/paddle_analysis_config.h.patch
-diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
-index d6a0b643c2..511844b482 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
-+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
-@@ -46,6 +46,7 @@
- namespace paddle {
- 
- class AnalysisPredictor;
-+class TensorRTPredictor;
- struct MkldnnQuantizerConfig;
- 
- struct LiteNNAdapterConfig {
-@@ -700,6 +701,8 @@ struct PD_INFER_DECL AnalysisConfig {
- 
-   friend class ::paddle::AnalysisPredictor;
- 
-+  friend class ::paddle::TensorRTPredictor;
-+
-   ///
-   /// \brief Get a pass builder for customize the passes in IR analysis phase.
-   /// NOTE: Just for developer, not an official API, easy to be broken.
--- a/tools/infer_prune_patches/paddle_api.h.patch
+++ b/tools/infer_prune_patches/paddle_api.h.patch
-diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
-index b28370fb82..aaf20a28b0 100644
--- a/paddle/fluid/inference/api/paddle_api.h
-+++ b/paddle/fluid/inference/api/paddle_api.h
-@@ -194,6 +194,7 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor {
- 
-  private:
-   friend class AnalysisPredictor;
-+  friend class TensorRTPredictor;
-   friend class ONNXRuntimePredictor;
-   explicit ZeroCopyTensor(void* scope, const void* device_contexts)
-       : paddle_infer::Tensor{scope, device_contexts} {}
--- a/tools/infer_prune_patches/paddle_inference_api.h.patch
+++ b/tools/infer_prune_patches/paddle_inference_api.h.patch
-diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
-index 35b90bfa54..ba8220d06a 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
-+++ b/paddle/fluid/inference/api/paddle_inference_api.h
-@@ -41,6 +41,11 @@ limitations under the License. */
- /// \since 2.0.0-beta
- ///
- 
-+namespace paddle {
-+std::unique_ptr<PaddlePredictor> CreateTensorRTPredictor(
-+    const AnalysisConfig& config);
-+}
-+
- namespace paddle_infer {
- 
- using PrecisionType = paddle::AnalysisConfig::Precision;
--- a/tools/infer_prune_patches/phi_cmake.patch
+++ b/tools/infer_prune_patches/phi_cmake.patch
-diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
-index 9715fd7704..44109e1081 100644
--- a/paddle/phi/CMakeLists.txt
-+++ b/paddle/phi/CMakeLists.txt
-@@ -18,7 +18,7 @@ add_subdirectory(infermeta)
- # phi operator definitions
- add_subdirectory(ops)
- # phi tools
-add_subdirectory(tools)
-+#add_subdirectory(tools)
- # phi tests
- add_subdirectory(tests)
- # phi capi
--- a/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
+++ b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
-diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
-index 7a9c5b889d..c847a5d523 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
-+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
-@@ -418,6 +418,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
-       graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
-       graph->Has(framework::ir::kMultiheadMatmulPass));
- 
-+  std::unordered_set<std::string> param_set(params.begin(), params.end());
-   if (use_static_engine) {
-     trt_engine_serialized_data = GetTrtEngineSerializedData(
-         Get<std::string>("model_opt_cache_dir"), engine_key);
-@@ -427,6 +428,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
-       LOG(INFO) << "Load TRT Optimized Info from "
-                 << GetTrtEngineSerializedPath(
-                        Get<std::string>("model_opt_cache_dir"), engine_key);
-+      const auto* root_scope{param_scope()};
-+      for (;root_scope->parent();) {
-+        root_scope = root_scope->parent();
-+      }
-+      for (const auto& name: param_set) {
-+        LOG(INFO) << " ===== Clear param: " << name;
-+        root_scope->FindLocalVar(name)->Clear();
-+      }
-+      for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount();
-+          ++dev_id) {
-+        memory::Release(platform::CUDAPlace(dev_id));
-+      }
-+      memory::Release(platform::CPUPlace());
-       return;
-     }
-   }
-@@ -439,7 +453,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
- 
-   auto *scope = param_scope();
-   framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
-  std::unordered_set<std::string> param_set(params.begin(), params.end());
-   inference::Singleton<inference::tensorrt::OpConverter>::Global()
-       .ConvertBlockToTRTEngine(
-           &block_desc_temp,
-@@ -449,6 +462,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
-           output_mapping,
-           trt_engine);
- 
-+  const auto* root_scope{scope};
-+  for (;root_scope->parent();) {
-+    root_scope = root_scope->parent();
-+  }
-+  VLOG(4) << "root_scope->LocalVarNames().size: " << root_scope->LocalVarNames().size();
-+  for (const auto& name: param_set) {
-+    VLOG(4) << "  ===== Clear param: " << name;
-+    root_scope->FindLocalVar(name)->Clear();
-+  }
-+  for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount();
-+       ++dev_id) {
-+    memory::Release(platform::CUDAPlace(dev_id));
-+  }
-+  memory::Release(platform::CPUPlace());
-+
-   if (use_static_engine) {
-     nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
-     trt_engine_serialized_data =
-@@ -462,6 +490,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
-               << GetTrtEngineSerializedPath(
-                      Get<std::string>("model_opt_cache_dir"), engine_key);
-   }
-+  trt_engine_serialized_data.clear();
-+  trt_engine_serialized_data.shrink_to_fit();
- }
- 
- }  // namespace analysis
--- a/tools/infer_prune_patches/thread_local_allocator.cc.patch
+++ b/tools/infer_prune_patches/thread_local_allocator.cc.patch
-diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
-index 875e57cfd4..b111ada3ab 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
-+++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
-@@ -13,19 +13,62 @@
- // limitations under the License.
- 
- #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
-+#include "paddle/fluid/platform/cuda_device_guard.h"
- 
- namespace paddle {
- namespace memory {
- namespace allocation {
- 
-+const int MALLOC_ALIGN = 64;
-+
-+#define CUDA_CALL(func)                                      \
-+  {                                                          \
-+    auto e = (func);                                         \
-+    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
-+        << "CUDA: " << cudaGetErrorString(e);                \
-+  }
-+
-+void* DirectAllocator::Alloc(size_t unaligned_size) {
-+  if (platform::is_cpu_place(place_)) {
-+    size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
-+    char* p = static_cast<char*>(std::malloc(offset + unaligned_size));
-+    // Memory checking
-+    CHECK(p) << "Error occurred in malloc period: available space is not enough "
-+                "for mallocing "
-+            << unaligned_size << " bytes.";
-+    // Byte alignment
-+    void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) &
-+                                      (~(MALLOC_ALIGN - 1)));
-+    static_cast<void**>(r)[-1] = p;
-+    return r;
-+  } else if (platform::is_gpu_place(place_)) {
-+    int dev_id = place_.GetDeviceId();
-+    platform::CUDADeviceGuard guard(dev_id);
-+    void* ptr{};
-+    CUDA_CALL(cudaMalloc(&ptr, unaligned_size));
-+    return ptr;
-+  }
-+  return nullptr;
-+}
-+
-+void DirectAllocator::Free(void* ptr) {
-+  if (platform::is_cpu_place(place_)) {
-+    if (ptr) {
-+      std::free(static_cast<void**>(ptr)[-1]);
-+    } 
-+  } else if (platform::is_gpu_place(place_)) {
-+    int dev_id = place_.GetDeviceId();
-+    platform::CUDADeviceGuard guard(dev_id);
-+    CUDA_CALL(cudaFree(ptr));
-+  }
-+}
-+
-+
-+
- ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
-     : place_(p) {
-   if (platform::is_gpu_place(place_)) {
-    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
-        std::unique_ptr<memory::detail::SystemAllocator>(
-            new memory::detail::GPUAllocator(place_.device)),
-        platform::GpuMinChunkSize(),
-        platform::GpuMaxChunkSize()));
-+    direct_allocator_.reset(new DirectAllocator{place_});
-   } else {
-     PADDLE_THROW(platform::errors::Unavailable(
-         "Thread local allocator only supports CUDAPlace now."));
-@@ -61,7 +104,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
- 
- ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
-   VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
-  void* ptr = buddy_allocator_->Alloc(size);
-+  void* ptr = direct_allocator_->Alloc(size);
-   auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
-   tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
-   return tl_allocation;
-@@ -69,12 +112,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
- 
- void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
-   VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
-  buddy_allocator_->Free(allocation->ptr());
-+  direct_allocator_->Free(allocation->ptr());
-   delete allocation;
- }
- 
- uint64_t ThreadLocalAllocatorImpl::ReleaseImpl() {
-  return buddy_allocator_->Release();
-+  return direct_allocator_->Release();
- }
- 
- }  // namespace allocation
--- a/tools/infer_prune_patches/thread_local_allocator.h.patch
+++ b/tools/infer_prune_patches/thread_local_allocator.h.patch
-diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h
-index 654fb3fe7b..44c5dbf87f 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.h
-+++ b/paddle/fluid/memory/allocation/thread_local_allocator.h
-@@ -26,6 +26,16 @@ namespace paddle {
- namespace memory {
- namespace allocation {
- 
-+class DirectAllocator {
-+public:
-+  DirectAllocator(const platform::Place& place) : place_{place} {}
-+  void* Alloc(size_t unaligned_size);
-+  void Free(void* ptr);
-+  uint64_t Release() { return 0;}
-+private:
-+  platform::Place place_;
-+};
-+
- class ThreadLocalAllocatorImpl;
- 
- class ThreadLocalAllocation : public Allocation {
-@@ -55,7 +65,7 @@ class ThreadLocalAllocatorImpl
-   uint64_t ReleaseImpl();
- 
-  private:
-  std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
-+  std::unique_ptr<DirectAllocator> direct_allocator_;
-   platform::Place place_;
- };
-