diff --git a/tools/infer_prune_patches/jitcode.h.patch b/tools/infer_prune_patches/jitcode.h.patch
index 9022b459db51c9b1f1036fc8cda2bb58ebb08352..a123939a4955b7a23ce412cd90571de3c5a195c5 100644
--- a/tools/infer_prune_patches/jitcode.h.patch
+++ b/tools/infer_prune_patches/jitcode.h.patch
@@ -1,15 +1,15 @@
 diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
-index 23650c8efc..24466e4327 100644
+index d71497275d..cb56e1d949 100644
 --- a/paddle/fluid/operators/jit/gen/jitcode.h
 +++ b/paddle/fluid/operators/jit/gen/jitcode.h
-@@ -97,8 +97,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
+@@ -102,8 +102,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
     }
     ret();
   }
 -  void L(const char* label) { Xbyak::CodeGenerator::L(label); }
 -  void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }  // NOLINT
-+  void L(const char* label) { }
-+  void L(Xbyak::Label& label) { }  // NOLINT
++  void L(const char* label) { }
++  void L(Xbyak::Label& label) { }  // NOLINT
   // Enhanced vector extension
-  Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
-                                    bool bcast = false) {
+  Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
+                                    int offt,
diff --git a/tools/infer_prune_patches/op_registry.h.patch b/tools/infer_prune_patches/op_registry.h.patch
index a1d2a66347cc4c6c0199dbf4bedb2ca0cd487fe1..fb41c98de50f456d638f4af067f5da9971b2ed3b 100644
--- a/tools/infer_prune_patches/op_registry.h.patch
+++ b/tools/infer_prune_patches/op_registry.h.patch
@@ -1,10 +1,10 @@
 diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
-index a1f07f9f25..179df3b981 100644
+index d38efbff31..f5bef776d6 100644
 --- a/paddle/fluid/framework/op_registry.h
 +++ b/paddle/fluid/framework/op_registry.h
-@@ -178,9 +178,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelType...> {
-     RegisterKernelClass<PlaceType, T>(
-         op_type, library_type, customized_type_value,
+@@ -186,9 +186,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelType...> {
+         library_type,
+         customized_type_value,
 -        [op_type](const framework::ExecutionContext& ctx) {
 +        [](const framework::ExecutionContext& ctx) {
@@ -13,12 +13,14 @@ index a1f07f9f25..179df3b981 100644
          });
    constexpr auto size = std::tuple_size<std::tuple<KernelType...>>::value;
    OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelType...>
-@@ -240,13 +239,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
      RegisterKernelClass<PlaceType, Functor>(
--        op_type, library_type, customized_type_value,
+-        op_type,
+-        library_type,
+-        customized_type_value,
 -
 -        [op_type](const framework::ExecutionContext& ctx) {
 -          Functor()(ctx);
@@ -29,7 +31,7 @@ index a1f07f9f25..179df3b981 100644
    constexpr auto size = std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
-@@ -275,7 +269,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
diff --git a/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
--- a/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
+++ b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
      (graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
       graph->Has(framework::ir::kMultiheadMatmulPass));
@@ -10,7 +10,7 @@ index 394ce7799e..8edbef50be 100644
      if (use_static_engine) {
        trt_engine_serialized_data = GetTrtEngineSerializedData(
            Get<std::string>("model_opt_cache_dir"), engine_key);
-@@ -399,6 +400,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+@@ -427,6 +428,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
          LOG(INFO) << "Load TRT Optimized Info from "
                    << GetTrtEngineSerializedPath(
                           Get<std::string>("model_opt_cache_dir"), engine_key);
@@ -30,16 +30,18 @@ index 394ce7799e..8edbef50be 100644
          return;
        }
      }
-@@ -411,12 +425,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+@@ -439,7 +453,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
    auto *scope = param_scope();
    framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
 -  std::unordered_set<std::string> param_set(params.begin(), params.end());
   inference::Singleton<inference::tensorrt::OpConverter>::Global()
       .ConvertBlockToTRTEngine(
-          &block_desc_temp, *scope,
-          std::vector<std::string>(input_names.begin(), input_names.end()),
-          param_set, output_mapping, trt_engine);
+          &block_desc_temp,
+@@ -449,6 +462,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+          output_mapping,
+          trt_engine);
+
 +  const auto* root_scope{scope};
 +  for (;root_scope->parent();) {
 +    root_scope = root_scope->parent();
 +  }
 +  for (auto& name : param_set) {
 +    root_scope->FindVar(name)->Clear();
 +  }
 +  for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
 +       ++dev_id) {
 +    memory::Release(platform::CUDAPlace(dev_id));
 +  }
 +  memory::Release(platform::CPUPlace());
-
++  if (use_static_engine) {
     nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
     trt_engine_serialized_data =
-@@ -431,6 +458,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+@@ -462,6 +490,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
               << GetTrtEngineSerializedPath(
                      Get<std::string>("model_opt_cache_dir"), engine_key);
      }
diff --git a/tools/infer_prune_patches/thread_local_allocator.cc.patch b/tools/infer_prune_patches/thread_local_allocator.cc.patch
index 6a4486aae9457cd20bbbabfbeaf0a3bed37ff422..b95480ad91dc77ff994b364e983866fe8f1001a7 100644
--- a/tools/infer_prune_patches/thread_local_allocator.cc.patch
+++ b/tools/infer_prune_patches/thread_local_allocator.cc.patch
@@ -1,8 +1,8 @@
 diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
-index f125670a59..f858a30301 100644
+index 875e57cfd4..b111ada3ab 100644
 --- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
 +++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
-@@ -13,18 +13,62 @@
+@@ -13,19 +13,62 @@
  // limitations under the License.

  #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
@@ -64,12 +64,13 @@ index f125670a59..f858a30301 100644
 -    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
 -        std::unique_ptr<memory::detail::SystemAllocator>(
 -            new memory::detail::GPUAllocator(place_.device)),
--        platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
+-        platform::GpuMinChunkSize(),
+-        platform::GpuMaxChunkSize()));
 +    direct_allocator_.reset(new DirectAllocator{place_});
    } else {
      PADDLE_THROW(platform::errors::Unavailable(
          "Thread local allocator only supports CUDAPlace now."));
-@@ -59,7 +103,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
+@@ -61,7 +104,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
 ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
    VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
 -  auto ptr = buddy_allocator_->Alloc(size);
 +  auto ptr = direct_allocator_->Alloc(size);
    auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
    tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
    return tl_allocation;
-@@ -67,12 +111,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
+@@ -69,12 +112,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
 void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
    VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;