Fix prune tool (#43849)

* update patches * test=document_fix; add patch file

Fix prune tool (#43849)
* update patches * test=document_fix; add patch file
7d14613d · Shang Zhizhou · GitHub · c4a52b83 · 7d14613d · 7d14613d
5 changed files
--- a/tools/infer_prune_patches/jitcode.h.patch
+++ b/tools/infer_prune_patches/jitcode.h.patch
 diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
-index 23650c8efc..24466e4327 100644
+index d71497275d..cb56e1d949 100644
 --- a/paddle/fluid/operators/jit/gen/jitcode.h
 +++ b/paddle/fluid/operators/jit/gen/jitcode.h
-@@ -97,8 +97,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
+@@ -102,8 +102,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
     }
     ret();
   }
@@ -11,5 +11,5 @@ index 23650c8efc..24466e4327 100644
 +  void L(const char* label) {  }
 +  void L(Xbyak::Label& label) {  }  // NOLINT
   // Enhanced vector extension
-   Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
-                                     bool bcast = false) {
+   Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
+                                     int offt,
--- a/tools/infer_prune_patches/op_registry.h.patch
+++ b/tools/infer_prune_patches/op_registry.h.patch
 diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
-index a1f07f9f25..179df3b981 100644
+index d38efbff31..f5bef776d6 100644
 --- a/paddle/fluid/framework/op_registry.h
 +++ b/paddle/fluid/framework/op_registry.h
-@@ -178,9 +178,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
-     RegisterKernelClass<PlaceType, T>(
-         op_type, library_type, customized_type_value,
+@@ -186,9 +186,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
+         library_type,
+         customized_type_value,
 
 -        [op_type](const framework::ExecutionContext& ctx) {
 +        [](const framework::ExecutionContext& ctx) {
@@ -13,12 +13,14 @@ index a1f07f9f25..179df3b981 100644
         });
     constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
     OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
-@@ -240,13 +239,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
- 
-   void operator()(const char* op_type, const char* library_type,
+@@ -257,15 +256,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
+   void operator()(const char* op_type,
+                   const char* library_type,
                   int customized_type_value) const {
 -    RegisterKernelClass<PlaceType, T>(
-        op_type, library_type, customized_type_value,
+-        op_type,
+-        library_type,
+-        customized_type_value,
 -
 -        [op_type](const framework::ExecutionContext& ctx) {
 -          Functor()(ctx);
@@ -29,7 +31,7 @@ index a1f07f9f25..179df3b981 100644
 
     constexpr auto size =
         std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
-@@ -275,7 +269,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+@@ -296,7 +288,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
     VarTypeInference
     InferShapeBase
 */
@@ -38,7 +40,7 @@ index a1f07f9f25..179df3b981 100644
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
       __reg_op__##op_type,                                               \
       "REGISTER_OPERATOR must be called in global namespace");           \
-@@ -286,15 +280,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+@@ -307,15 +299,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
     return 0;                                                            \
   }
 
@@ -62,7 +64,7 @@ index a1f07f9f25..179df3b981 100644
                                             place_class, customized_name,      \
                                             customized_type_value, ...)        \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                              \
-@@ -311,18 +312,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+@@ -332,18 +331,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
     return 0;                                                                  \
   }
 
@@ -91,7 +93,7 @@ index a1f07f9f25..179df3b981 100644
 
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
-@@ -340,6 +345,11 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+@@ -361,6 +364,11 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
   REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
 
 #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class,  \
@@ -103,7 +105,7 @@ index a1f07f9f25..179df3b981 100644
                               customized_name,                     \
                               customized_type_value,               \
                               ...)                                 \
-@@ -357,8 +367,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+@@ -378,8 +386,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
     return 0;                                                                  \
   }
 
@@ -116,7 +118,7 @@ index a1f07f9f25..179df3b981 100644
       op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE,     \
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)
-@@ -375,12 +387,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+@@ -396,12 +406,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)
 
@@ -129,7 +131,7 @@ index a1f07f9f25..179df3b981 100644
 #define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...)                  \
   REGISTER_OP_KERNEL_EX(                                              \
       op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE,       \
-@@ -392,7 +398,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+@@ -413,7 +417,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
  * we will use and tell the compiler to
  * link them into target.
  */
@@ -140,7 +142,7 @@ index a1f07f9f25..179df3b981 100644
   STATIC_ASSERT_GLOBAL_NAMESPACE(                          \
       __use_op_itself_##op_type,                           \
       "USE_OP_ITSELF must be called in global namespace"); \
-@@ -400,6 +408,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+@@ -421,6 +427,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
   UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
 
 #define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type,                     \
@@ -151,7 +153,7 @@ index a1f07f9f25..179df3b981 100644
                                               LIBRARY_TYPE,                \
                                               customized_name)             \
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-@@ -410,33 +422,58 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
+@@ -431,33 +441,58 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
   UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
       TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
 

--- a/tools/infer_prune_patches/phi_cmake.patch
+++ b/tools/infer_prune_patches/phi_cmake.patch
 diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
-index 58ad42ddd1..8ffdafcf0d 100644
+index 9715fd7704..44109e1081 100644
 --- a/paddle/phi/CMakeLists.txt
 +++ b/paddle/phi/CMakeLists.txt
 @@ -18,7 +18,7 @@ add_subdirectory(infermeta)
@@ -10,4 +10,4 @@ index 58ad42ddd1..8ffdafcf0d 100644
 +#add_subdirectory(tools)
 # phi tests
 add_subdirectory(tests)
- 
+ # phi capi
--- a/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
+++ b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
 diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
-index 394ce7799e..8edbef50be 100644
+index 7a9c5b889d..c847a5d523 100644
 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
 +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
-@@ -390,6 +390,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+@@ -418,6 +418,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
       graph->Has(framework::ir::kMultiheadMatmulPass));
 
@@ -10,7 +10,7 @@ index 394ce7799e..8edbef50be 100644
   if (use_static_engine) {
     trt_engine_serialized_data = GetTrtEngineSerializedData(
         Get<std::string>("model_opt_cache_dir"), engine_key);
-@@ -399,6 +400,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+@@ -427,6 +428,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       LOG(INFO) << "Load TRT Optimized Info from "
                 << GetTrtEngineSerializedPath(
                        Get<std::string>("model_opt_cache_dir"), engine_key);
@@ -30,16 +30,18 @@ index 394ce7799e..8edbef50be 100644
       return;
     }
   }
-@@ -411,12 +425,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+@@ -439,7 +453,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
 
   auto *scope = param_scope();
   framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
 -  std::unordered_set<std::string> param_set(params.begin(), params.end());
   inference::Singleton<inference::tensorrt::OpConverter>::Global()
       .ConvertBlockToTRTEngine(
-           &block_desc_temp, *scope,
-           std::vector<std::string>(input_names.begin(), input_names.end()),
-           param_set, output_mapping, trt_engine);
+           &block_desc_temp,
+@@ -449,6 +462,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+           output_mapping,
+           trt_engine);
+ 
 +  const auto* root_scope{scope};
 +  for (;root_scope->parent();) {
 +    root_scope = root_scope->parent();
@@ -54,10 +56,11 @@ index 394ce7799e..8edbef50be 100644
 +    memory::Release(platform::CUDAPlace(dev_id));
 +  }
 +  memory::Release(platform::CPUPlace());
- 
+
   if (use_static_engine) {
     nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
-@@ -431,6 +458,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+     trt_engine_serialized_data =
+@@ -462,6 +490,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
               << GetTrtEngineSerializedPath(
                      Get<std::string>("model_opt_cache_dir"), engine_key);
   }

--- a/tools/infer_prune_patches/thread_local_allocator.cc.patch
+++ b/tools/infer_prune_patches/thread_local_allocator.cc.patch
 diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
-index f125670a59..f858a30301 100644
+index 875e57cfd4..b111ada3ab 100644
 --- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
 +++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
-@@ -13,18 +13,62 @@
+@@ -13,19 +13,62 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
@@ -64,12 +64,13 @@ index f125670a59..f858a30301 100644
 -    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
 -        std::unique_ptr<memory::detail::SystemAllocator>(
 -            new memory::detail::GPUAllocator(place_.device)),
-        platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
+-        platform::GpuMinChunkSize(),
+-        platform::GpuMaxChunkSize()));
 +    direct_allocator_.reset(new DirectAllocator{place_});
   } else {
     PADDLE_THROW(platform::errors::Unavailable(
         "Thread local allocator only supports CUDAPlace now."));
-@@ -59,7 +103,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
+@@ -61,7 +104,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
 
 ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
   VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
@@ -78,7 +79,7 @@ index f125670a59..f858a30301 100644
   auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
   tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
   return tl_allocation;
-@@ -67,12 +111,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
+@@ -69,12 +112,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
 
 void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
   VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;