Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
7d14613d
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2305
Star
20932
Fork
5423
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
7d14613d
编写于
6月 27, 2022
作者:
S
Shang Zhizhou
提交者:
GitHub
6月 27, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix prune tool (#43849)
* update pateches * test=document_fix; add patch file
上级
c4a52b83
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
45 addition
and
39 deletion
+45
-39
tools/infer_prune_patches/jitcode.h.patch
tools/infer_prune_patches/jitcode.h.patch
+6
-6
tools/infer_prune_patches/op_registry.h.patch
tools/infer_prune_patches/op_registry.h.patch
+19
-17
tools/infer_prune_patches/phi_cmake.patch
tools/infer_prune_patches/phi_cmake.patch
+2
-2
tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
+12
-9
tools/infer_prune_patches/thread_local_allocator.cc.patch
tools/infer_prune_patches/thread_local_allocator.cc.patch
+6
-5
未找到文件。
tools/infer_prune_patches/jitcode.h.patch
浏览文件 @
7d14613d
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index
23650c8efc..24466e4327
100644
index
d71497275d..cb56e1d949
100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -
97,8 +97
,8 @@
class JitCode : public GenBase, public Xbyak::CodeGenerator {
@@ -
102,8 +102
,8 @@
class JitCode : public GenBase, public Xbyak::CodeGenerator {
}
ret();
}
- void L(const char* label) { Xbyak::CodeGenerator::L(label); }
- void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } // NOLINT
+ void L(const char* label) { }
+ void L(Xbyak::Label& label) { } // NOLINT
+ void L(const char* label) {
}
+ void L(Xbyak::Label& label) {
} // NOLINT
// Enhanced vector extension
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
int offt,
bool bcast = false) {
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
int offt,
tools/infer_prune_patches/op_registry.h.patch
浏览文件 @
7d14613d
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index
a1f07f9f25..179df3b981
100644
index
d38efbff31..f5bef776d6
100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -1
78,9 +178
,8 @@
struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
RegisterKernelClass<PlaceType, T>(
op_type, library_type,
customized_type_value,
@@ -1
86,9 +186
,8 @@
struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
library_type,
customized_type_value,
- [op_type](const framework::ExecutionContext& ctx) {
+ [](const framework::ExecutionContext& ctx) {
...
...
@@ -13,12 +13,14 @@ index a1f07f9f25..179df3b981 100644
});
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
@@ -2
40,13 +239,8 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
void operator()(const char* op_type,
const char* library_type,
@@ -2
57,15 +256,8 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
void operator()(const char* op_type,
const char* library_type,
int customized_type_value) const {
- RegisterKernelClass<PlaceType, T>(
- op_type, library_type, customized_type_value,
- op_type,
- library_type,
- customized_type_value,
-
- [op_type](const framework::ExecutionContext& ctx) {
- Functor()(ctx);
...
...
@@ -29,7 +31,7 @@ index a1f07f9f25..179df3b981 100644
constexpr auto size =
std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
@@ -2
75,7 +269,7 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
@@ -2
96,7 +288,7 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
VarTypeInference
InferShapeBase
*/
...
...
@@ -38,7 +40,7 @@ index a1f07f9f25..179df3b981 100644
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, \
"REGISTER_OPERATOR must be called in global namespace"); \
@@ -
286,15 +280,22 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
@@ -
307,15 +299,22 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
return 0; \
}
...
...
@@ -62,7 +64,7 @@ index a1f07f9f25..179df3b981 100644
place_class, customized_name, \
customized_type_value, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -3
11,18 +312,22 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
@@ -3
32,18 +331,22 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
return 0; \
}
...
...
@@ -91,7 +93,7 @@ index a1f07f9f25..179df3b981 100644
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -3
40,6 +345,11 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
@@ -3
61,6 +364,11 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
...
...
@@ -103,7 +105,7 @@ index a1f07f9f25..179df3b981 100644
customized_name, \
customized_type_value, \
...) \
@@ -3
57,8 +367,10 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
@@ -3
78,8 +386,10 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
return 0; \
}
...
...
@@ -116,7 +118,7 @@ index a1f07f9f25..179df3b981 100644
op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
@@ -3
75,12 +387,6 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
@@ -3
96,12 +406,6 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
...
...
@@ -129,7 +131,7 @@ index a1f07f9f25..179df3b981 100644
#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
@@ -
392,7 +398,9 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
@@ -
413,7 +417,9 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
* we will use and tell the compiler to
* link them into target.
*/
...
...
@@ -140,7 +142,7 @@ index a1f07f9f25..179df3b981 100644
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_itself_##op_type, \
"USE_OP_ITSELF must be called in global namespace"); \
@@ -4
00,6 +408,10 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
@@ -4
21,6 +427,10 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \
...
...
@@ -151,7 +153,7 @@ index a1f07f9f25..179df3b981 100644
LIBRARY_TYPE, \
customized_name) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -4
10,33 +422,58 @@
struct OpKernelRegistrarFunctorEx<PlaceType, false, I
,
@@ -4
31,33 +441,58 @@
struct OpKernelRegistrarFunctorEx<PlaceType
,
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
...
...
tools/infer_prune_patches/phi_cmake.patch
浏览文件 @
7d14613d
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index
58ad42ddd1..8ffdafcf0d
100644
index
9715fd7704..44109e1081
100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -18,7 +18,7 @@
add_subdirectory(infermeta)
...
...
@@ -10,4 +10,4 @@ index 58ad42ddd1..8ffdafcf0d 100644
+#add_subdirectory(tools)
# phi tests
add_subdirectory(tests)
# phi capi
tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
浏览文件 @
7d14613d
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index
394ce7799e..8edbef50be
100644
index
7a9c5b889d..c847a5d523
100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -
390,6 +390
,7 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
@@ -
418,6 +418
,7 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
graph->Has(framework::ir::kMultiheadMatmulPass));
...
...
@@ -10,7 +10,7 @@ index 394ce7799e..8edbef50be 100644
if (use_static_engine) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
@@ -
399,6 +400
,19 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
@@ -
427,6 +428
,19 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
...
...
@@ -30,16 +30,18 @@ index 394ce7799e..8edbef50be 100644
return;
}
}
@@ -4
11,12 +425,25
@@
void TensorRtSubgraphPass::CreateTensorRTOp(
@@ -4
39,7 +453,6
@@
void TensorRtSubgraphPass::CreateTensorRTOp(
auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
- std::unordered_set<std::string> param_set(params.begin(), params.end());
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine);
&block_desc_temp,
@@ -449,6 +462,21 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
output_mapping,
trt_engine);
+ const auto* root_scope{scope};
+ for (;root_scope->parent();) {
+ root_scope = root_scope->parent();
...
...
@@ -54,10 +56,11 @@ index 394ce7799e..8edbef50be 100644
+ memory::Release(platform::CUDAPlace(dev_id));
+ }
+ memory::Release(platform::CPUPlace());
+
if (use_static_engine) {
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
@@ -431,6 +458,8 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
trt_engine_serialized_data =
@@ -462,6 +490,8 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}
...
...
tools/infer_prune_patches/thread_local_allocator.cc.patch
浏览文件 @
7d14613d
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
index
f125670a59..f858a30301
100644
index
875e57cfd4..b111ada3ab
100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
@@ -13,1
8
+13,62 @@
@@ -13,1
9
+13,62 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
...
...
@@ -64,12 +64,13 @@ index f125670a59..f858a30301 100644
- buddy_allocator_.reset(new memory::detail::BuddyAllocator(
- std::unique_ptr<memory::detail::SystemAllocator>(
- new memory::detail::GPUAllocator(place_.device)),
- platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
- platform::GpuMinChunkSize(),
- platform::GpuMaxChunkSize()));
+ direct_allocator_.reset(new DirectAllocator{place_});
} else {
PADDLE_THROW(platform::errors::Unavailable(
"Thread local allocator only supports CUDAPlace now."));
@@ -
59,7 +103
,7 @@
ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
@@ -
61,7 +104
,7 @@
ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
...
...
@@ -78,7 +79,7 @@ index f125670a59..f858a30301 100644
auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
return tl_allocation;
@@ -6
7,12 +111
,12 @@
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
@@ -6
9,12 +112
,12 @@
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录