未验证 提交 7d14613d 编写于 作者: S Shang Zhizhou 提交者: GitHub

Fix prune tool (#43849)

* update patches

* test=document_fix; add patch file
上级 c4a52b83
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index 23650c8efc..24466e4327 100644 index d71497275d..cb56e1d949 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h --- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -97,8 +97,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator { @@ -102,8 +102,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
} }
ret(); ret();
} }
- void L(const char* label) { Xbyak::CodeGenerator::L(label); } - void L(const char* label) { Xbyak::CodeGenerator::L(label); }
- void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } // NOLINT - void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } // NOLINT
+ void L(const char* label) { } + void L(const char* label) { }
+ void L(Xbyak::Label& label) { } // NOLINT + void L(Xbyak::Label& label) { } // NOLINT
// Enhanced vector extension // Enhanced vector extension
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt, Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
bool bcast = false) { int offt,
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index a1f07f9f25..179df3b981 100644 index d38efbff31..f5bef776d6 100644
--- a/paddle/fluid/framework/op_registry.h --- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h
@@ -178,9 +178,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> { @@ -186,9 +186,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
RegisterKernelClass<PlaceType, T>( library_type,
op_type, library_type, customized_type_value, customized_type_value,
- [op_type](const framework::ExecutionContext& ctx) { - [op_type](const framework::ExecutionContext& ctx) {
+ [](const framework::ExecutionContext& ctx) { + [](const framework::ExecutionContext& ctx) {
...@@ -13,12 +13,14 @@ index a1f07f9f25..179df3b981 100644 ...@@ -13,12 +13,14 @@ index a1f07f9f25..179df3b981 100644
}); });
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value; constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...> OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
@@ -240,13 +239,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -257,15 +256,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
void operator()(const char* op_type,
void operator()(const char* op_type, const char* library_type, const char* library_type,
int customized_type_value) const { int customized_type_value) const {
- RegisterKernelClass<PlaceType, T>( - RegisterKernelClass<PlaceType, T>(
- op_type, library_type, customized_type_value, - op_type,
- library_type,
- customized_type_value,
- -
- [op_type](const framework::ExecutionContext& ctx) { - [op_type](const framework::ExecutionContext& ctx) {
- Functor()(ctx); - Functor()(ctx);
...@@ -29,7 +31,7 @@ index a1f07f9f25..179df3b981 100644 ...@@ -29,7 +31,7 @@ index a1f07f9f25..179df3b981 100644
constexpr auto size = constexpr auto size =
std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value; std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
@@ -275,7 +269,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -296,7 +288,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
VarTypeInference VarTypeInference
InferShapeBase InferShapeBase
*/ */
...@@ -38,7 +40,7 @@ index a1f07f9f25..179df3b981 100644 ...@@ -38,7 +40,7 @@ index a1f07f9f25..179df3b981 100644
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, \ __reg_op__##op_type, \
"REGISTER_OPERATOR must be called in global namespace"); \ "REGISTER_OPERATOR must be called in global namespace"); \
@@ -286,15 +280,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -307,15 +299,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \ return 0; \
} }
...@@ -62,7 +64,7 @@ index a1f07f9f25..179df3b981 100644 ...@@ -62,7 +64,7 @@ index a1f07f9f25..179df3b981 100644
place_class, customized_name, \ place_class, customized_name, \
customized_type_value, ...) \ customized_type_value, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -311,18 +312,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -332,18 +331,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \ return 0; \
} }
...@@ -91,7 +93,7 @@ index a1f07f9f25..179df3b981 100644 ...@@ -91,7 +93,7 @@ index a1f07f9f25..179df3b981 100644
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -340,6 +345,11 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -361,6 +364,11 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \ #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
...@@ -103,7 +105,7 @@ index a1f07f9f25..179df3b981 100644 ...@@ -103,7 +105,7 @@ index a1f07f9f25..179df3b981 100644
customized_name, \ customized_name, \
customized_type_value, \ customized_type_value, \
...) \ ...) \
@@ -357,8 +367,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -378,8 +386,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \ return 0; \
} }
...@@ -116,7 +118,7 @@ index a1f07f9f25..179df3b981 100644 ...@@ -116,7 +118,7 @@ index a1f07f9f25..179df3b981 100644
op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \ op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__) __VA_ARGS__)
@@ -375,12 +387,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -396,12 +406,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__) __VA_ARGS__)
...@@ -129,7 +131,7 @@ index a1f07f9f25..179df3b981 100644 ...@@ -129,7 +131,7 @@ index a1f07f9f25..179df3b981 100644
#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \ #define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \ REGISTER_OP_KERNEL_EX( \
op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \ op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
@@ -392,7 +398,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -413,7 +417,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
* we will use and tell the compiler to * we will use and tell the compiler to
* link them into target. * link them into target.
*/ */
...@@ -140,7 +142,7 @@ index a1f07f9f25..179df3b981 100644 ...@@ -140,7 +142,7 @@ index a1f07f9f25..179df3b981 100644
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_itself_##op_type, \ __use_op_itself_##op_type, \
"USE_OP_ITSELF must be called in global namespace"); \ "USE_OP_ITSELF must be called in global namespace"); \
@@ -400,6 +408,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -421,6 +427,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type() UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \ #define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \
...@@ -151,7 +153,7 @@ index a1f07f9f25..179df3b981 100644 ...@@ -151,7 +153,7 @@ index a1f07f9f25..179df3b981 100644
LIBRARY_TYPE, \ LIBRARY_TYPE, \
customized_name) \ customized_name) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -410,33 +422,58 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, @@ -431,33 +441,58 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \ UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name() TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
......
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 58ad42ddd1..8ffdafcf0d 100644 index 9715fd7704..44109e1081 100644
--- a/paddle/phi/CMakeLists.txt --- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt
@@ -18,7 +18,7 @@ add_subdirectory(infermeta) @@ -18,7 +18,7 @@ add_subdirectory(infermeta)
...@@ -10,4 +10,4 @@ index 58ad42ddd1..8ffdafcf0d 100644 ...@@ -10,4 +10,4 @@ index 58ad42ddd1..8ffdafcf0d 100644
+#add_subdirectory(tools) +#add_subdirectory(tools)
# phi tests # phi tests
add_subdirectory(tests) add_subdirectory(tests)
# phi capi
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 394ce7799e..8edbef50be 100644 index 7a9c5b889d..c847a5d523 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -390,6 +390,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( @@ -418,6 +418,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
graph->Has(framework::ir::kEmbEltwiseLayernormPass) && graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
graph->Has(framework::ir::kMultiheadMatmulPass)); graph->Has(framework::ir::kMultiheadMatmulPass));
...@@ -10,7 +10,7 @@ index 394ce7799e..8edbef50be 100644 ...@@ -10,7 +10,7 @@ index 394ce7799e..8edbef50be 100644
if (use_static_engine) { if (use_static_engine) {
trt_engine_serialized_data = GetTrtEngineSerializedData( trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key); Get<std::string>("model_opt_cache_dir"), engine_key);
@@ -399,6 +400,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp( @@ -427,6 +428,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
LOG(INFO) << "Load TRT Optimized Info from " LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath( << GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key); Get<std::string>("model_opt_cache_dir"), engine_key);
...@@ -30,16 +30,18 @@ index 394ce7799e..8edbef50be 100644 ...@@ -30,16 +30,18 @@ index 394ce7799e..8edbef50be 100644
return; return;
} }
} }
@@ -411,12 +425,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp( @@ -439,7 +453,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
auto *scope = param_scope(); auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
- std::unordered_set<std::string> param_set(params.begin(), params.end()); - std::unordered_set<std::string> param_set(params.begin(), params.end());
inference::Singleton<inference::tensorrt::OpConverter>::Global() inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlockToTRTEngine( .ConvertBlockToTRTEngine(
&block_desc_temp, *scope, &block_desc_temp,
std::vector<std::string>(input_names.begin(), input_names.end()), @@ -449,6 +462,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
param_set, output_mapping, trt_engine); output_mapping,
trt_engine);
+ const auto* root_scope{scope}; + const auto* root_scope{scope};
+ for (;root_scope->parent();) { + for (;root_scope->parent();) {
+ root_scope = root_scope->parent(); + root_scope = root_scope->parent();
...@@ -54,10 +56,11 @@ index 394ce7799e..8edbef50be 100644 ...@@ -54,10 +56,11 @@ index 394ce7799e..8edbef50be 100644
+ memory::Release(platform::CUDAPlace(dev_id)); + memory::Release(platform::CUDAPlace(dev_id));
+ } + }
+ memory::Release(platform::CPUPlace()); + memory::Release(platform::CPUPlace());
+
if (use_static_engine) { if (use_static_engine) {
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
@@ -431,6 +458,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( trt_engine_serialized_data =
@@ -462,6 +490,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
<< GetTrtEngineSerializedPath( << GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key); Get<std::string>("model_opt_cache_dir"), engine_key);
} }
......
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
index f125670a59..f858a30301 100644 index 875e57cfd4..b111ada3ab 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc --- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
@@ -13,18 +13,62 @@ @@ -13,19 +13,62 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
...@@ -64,12 +64,13 @@ index f125670a59..f858a30301 100644 ...@@ -64,12 +64,13 @@ index f125670a59..f858a30301 100644
- buddy_allocator_.reset(new memory::detail::BuddyAllocator( - buddy_allocator_.reset(new memory::detail::BuddyAllocator(
- std::unique_ptr<memory::detail::SystemAllocator>( - std::unique_ptr<memory::detail::SystemAllocator>(
- new memory::detail::GPUAllocator(place_.device)), - new memory::detail::GPUAllocator(place_.device)),
- platform::GpuMinChunkSize(), platform::GpuMaxChunkSize())); - platform::GpuMinChunkSize(),
- platform::GpuMaxChunkSize()));
+ direct_allocator_.reset(new DirectAllocator{place_}); + direct_allocator_.reset(new DirectAllocator{place_});
} else { } else {
PADDLE_THROW(platform::errors::Unavailable( PADDLE_THROW(platform::errors::Unavailable(
"Thread local allocator only supports CUDAPlace now.")); "Thread local allocator only supports CUDAPlace now."));
@@ -59,7 +103,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool() @@ -61,7 +104,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) { ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size; VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
...@@ -78,7 +79,7 @@ index f125670a59..f858a30301 100644 ...@@ -78,7 +79,7 @@ index f125670a59..f858a30301 100644
auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_); auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this()); tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
return tl_allocation; return tl_allocation;
@@ -67,12 +111,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) { @@ -69,12 +112,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) { void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation; VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册