未验证 提交 7d14613d 编写于 作者: S Shang Zhizhou 提交者: GitHub

Fix prune tool (#43849)

* update patches

* test=document_fix; add patch file
上级 c4a52b83
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index 23650c8efc..24466e4327 100644
index d71497275d..cb56e1d949 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -97,8 +97,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
@@ -102,8 +102,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
}
ret();
}
- void L(const char* label) { Xbyak::CodeGenerator::L(label); }
- void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } // NOLINT
+ void L(const char* label) { }
+ void L(Xbyak::Label& label) { } // NOLINT
+ void L(const char* label) { }
+ void L(Xbyak::Label& label) { } // NOLINT
// Enhanced vector extension
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
bool bcast = false) {
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
int offt,
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index a1f07f9f25..179df3b981 100644
index d38efbff31..f5bef776d6 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -178,9 +178,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
RegisterKernelClass<PlaceType, T>(
op_type, library_type, customized_type_value,
@@ -186,9 +186,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
library_type,
customized_type_value,
- [op_type](const framework::ExecutionContext& ctx) {
+ [](const framework::ExecutionContext& ctx) {
......@@ -13,12 +13,14 @@ index a1f07f9f25..179df3b981 100644
});
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
@@ -240,13 +239,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
void operator()(const char* op_type, const char* library_type,
@@ -257,15 +256,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
void operator()(const char* op_type,
const char* library_type,
int customized_type_value) const {
- RegisterKernelClass<PlaceType, T>(
- op_type, library_type, customized_type_value,
- op_type,
- library_type,
- customized_type_value,
-
- [op_type](const framework::ExecutionContext& ctx) {
- Functor()(ctx);
......@@ -29,7 +31,7 @@ index a1f07f9f25..179df3b981 100644
constexpr auto size =
std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
@@ -275,7 +269,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -296,7 +288,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
VarTypeInference
InferShapeBase
*/
......@@ -38,7 +40,7 @@ index a1f07f9f25..179df3b981 100644
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, \
"REGISTER_OPERATOR must be called in global namespace"); \
@@ -286,15 +280,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -307,15 +299,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \
}
......@@ -62,7 +64,7 @@ index a1f07f9f25..179df3b981 100644
place_class, customized_name, \
customized_type_value, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -311,18 +312,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -332,18 +331,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \
}
......@@ -91,7 +93,7 @@ index a1f07f9f25..179df3b981 100644
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -340,6 +345,11 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -361,6 +364,11 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
......@@ -103,7 +105,7 @@ index a1f07f9f25..179df3b981 100644
customized_name, \
customized_type_value, \
...) \
@@ -357,8 +367,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -378,8 +386,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \
}
......@@ -116,7 +118,7 @@ index a1f07f9f25..179df3b981 100644
op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
@@ -375,12 +387,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -396,12 +406,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
......@@ -129,7 +131,7 @@ index a1f07f9f25..179df3b981 100644
#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
@@ -392,7 +398,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -413,7 +417,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
* we will use and tell the compiler to
* link them into target.
*/
......@@ -140,7 +142,7 @@ index a1f07f9f25..179df3b981 100644
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_itself_##op_type, \
"USE_OP_ITSELF must be called in global namespace"); \
@@ -400,6 +408,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -421,6 +427,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \
......@@ -151,7 +153,7 @@ index a1f07f9f25..179df3b981 100644
LIBRARY_TYPE, \
customized_name) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -410,33 +422,58 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -431,33 +441,58 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
......
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 58ad42ddd1..8ffdafcf0d 100644
index 9715fd7704..44109e1081 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -18,7 +18,7 @@ add_subdirectory(infermeta)
......@@ -10,4 +10,4 @@ index 58ad42ddd1..8ffdafcf0d 100644
+#add_subdirectory(tools)
# phi tests
add_subdirectory(tests)
# phi capi
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 394ce7799e..8edbef50be 100644
index 7a9c5b889d..c847a5d523 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -390,6 +390,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
@@ -418,6 +418,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
graph->Has(framework::ir::kMultiheadMatmulPass));
......@@ -10,7 +10,7 @@ index 394ce7799e..8edbef50be 100644
if (use_static_engine) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
@@ -399,6 +400,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
@@ -427,6 +428,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
......@@ -30,16 +30,18 @@ index 394ce7799e..8edbef50be 100644
return;
}
}
@@ -411,12 +425,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
@@ -439,7 +453,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
- std::unordered_set<std::string> param_set(params.begin(), params.end());
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine);
&block_desc_temp,
@@ -449,6 +462,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
output_mapping,
trt_engine);
+ const auto* root_scope{scope};
+ for (;root_scope->parent();) {
+ root_scope = root_scope->parent();
......@@ -54,10 +56,11 @@ index 394ce7799e..8edbef50be 100644
+ memory::Release(platform::CUDAPlace(dev_id));
+ }
+ memory::Release(platform::CPUPlace());
+
if (use_static_engine) {
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
@@ -431,6 +458,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
trt_engine_serialized_data =
@@ -462,6 +490,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}
......
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
index f125670a59..f858a30301 100644
index 875e57cfd4..b111ada3ab 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
@@ -13,18 +13,62 @@
@@ -13,19 +13,62 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
......@@ -64,12 +64,13 @@ index f125670a59..f858a30301 100644
- buddy_allocator_.reset(new memory::detail::BuddyAllocator(
- std::unique_ptr<memory::detail::SystemAllocator>(
- new memory::detail::GPUAllocator(place_.device)),
- platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
- platform::GpuMinChunkSize(),
- platform::GpuMaxChunkSize()));
+ direct_allocator_.reset(new DirectAllocator{place_});
} else {
PADDLE_THROW(platform::errors::Unavailable(
"Thread local allocator only supports CUDAPlace now."));
@@ -59,7 +103,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
@@ -61,7 +104,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
......@@ -78,7 +79,7 @@ index f125670a59..f858a30301 100644
auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
return tl_allocation;
@@ -67,12 +111,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
@@ -69,12 +112,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册