机器未来 / Paddle, forked from PaddlePaddle / Paddle (in sync with the upstream project)
Commit 7d14613d (Unverified)
Authored on Jun 27, 2022 by Shang Zhizhou; committed via GitHub on Jun 27, 2022
Parent: c4a52b83

Fix prune tool (#43849)
* update pateches
* test=document_fix; add patch file
Showing 5 changed files with 45 additions and 39 deletions (+45 -39)
tools/infer_prune_patches/jitcode.h.patch (+6 -6)
tools/infer_prune_patches/op_registry.h.patch (+19 -17)
tools/infer_prune_patches/phi_cmake.patch (+2 -2)
tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch (+12 -9)
tools/infer_prune_patches/thread_local_allocator.cc.patch (+6 -5)
tools/infer_prune_patches/jitcode.h.patch
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index 23650c8efc..24466e4327 100644
index d71497275d..cb56e1d949 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -97,8 +97,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
@@ -102,8 +102,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
}
ret();
}
- void L(const char* label) { Xbyak::CodeGenerator::L(label); }
- void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } // NOLINT
+ void L(const char* label) { }
+ void L(Xbyak::Label& label) { } // NOLINT
+ void L(const char* label) {}
+ void L(Xbyak::Label& label) {}  // NOLINT
// Enhanced vector extension
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
int offt,
bool bcast = false) {
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
int offt,
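
Note: judging from the visible lines, this patch still stubs out the two label helpers in JitCode; the update mainly shifts the hunk offsets (97 to 102) and adapts the context to the reformatted upstream header. A minimal, self-contained C++ sketch of that before/after shape (toy types only; the real class derives from GenBase and Xbyak::CodeGenerator):

#include <cstdio>

// Stand-in for Xbyak::Label; the real type comes from the Xbyak headers.
struct Label {};

// Pre-patch shape: in the real class both overloads forward to
// Xbyak::CodeGenerator::L() to bind the label; printf stands in for that here.
struct JitCodeUnpatched {
  void L(const char* label) { std::printf("bind label %s\n", label); }
  void L(Label& /*label*/) { std::printf("bind anonymous label\n"); }  // NOLINT
};

// Post-patch shape: the bodies are emptied, so callers still compile but no
// label bookkeeping happens in the pruned build.
struct JitCodePatched {
  void L(const char* /*label*/) {}
  void L(Label& /*label*/) {}  // NOLINT
};

int main() {
  Label anon;
  JitCodePatched jit;
  jit.L("loop_start");  // no-op after the patch
  jit.L(anon);          // no-op after the patch
  return 0;
}
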
tools/infer_prune_patches/op_registry.h.patch
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index a1f07f9f25..179df3b981 100644
index d38efbff31..f5bef776d6 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -178,9 +178,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
RegisterKernelClass<PlaceType, T>(
op_type, library_type,
customized_type_value,
@@ -186,9 +186,8 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
library_type,
customized_type_value,
- [op_type](const framework::ExecutionContext& ctx) {
+ [](const framework::ExecutionContext& ctx) {
...
...
@@ -13,12 +13,14 @@ index a1f07f9f25..179df3b981 100644
});
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
@@ -240,13 +239,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
void operator()(const char* op_type,
const char* library_type,
@@ -257,15 +256,8 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
void operator()(const char* op_type,
const char* library_type,
int customized_type_value) const {
- RegisterKernelClass<PlaceType, T>(
- op_type, library_type, customized_type_value,
- op_type,
- library_type,
- customized_type_value,
-
- [op_type](const framework::ExecutionContext& ctx) {
- Functor()(ctx);
...
...
@@ -29,7 +31,7 @@ index a1f07f9f25..179df3b981 100644
constexpr auto size =
std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
@@ -275,7 +269,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -296,7 +288,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
VarTypeInference
InferShapeBase
*/
...
...
@@ -38,7 +40,7 @@ index a1f07f9f25..179df3b981 100644
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, \
"REGISTER_OPERATOR must be called in global namespace"); \
@@ -286,15 +280,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -307,15 +299,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \
}
...
...
@@ -62,7 +64,7 @@ index a1f07f9f25..179df3b981 100644
place_class, customized_name, \
customized_type_value, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -311,18 +312,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -332,18 +331,22 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \
}
...
...
@@ -91,7 +93,7 @@ index a1f07f9f25..179df3b981 100644
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -340,6 +345,11 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -361,6 +364,11 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
...
...
@@ -103,7 +105,7 @@ index a1f07f9f25..179df3b981 100644
customized_name, \
customized_type_value, \
...) \
@@ -357,8 +367,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -378,8 +386,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \
}
...
...
@@ -116,7 +118,7 @@ index a1f07f9f25..179df3b981 100644
op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
@@ -375,12 +387,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -396,12 +406,6 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
...
...
@@ -129,7 +131,7 @@ index a1f07f9f25..179df3b981 100644
#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
@@ -392,7 +398,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -413,7 +417,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
* we will use and tell the compiler to
* link them into target.
*/
...
...
@@ -140,7 +142,7 @@ index a1f07f9f25..179df3b981 100644
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_itself_##op_type, \
"USE_OP_ITSELF must be called in global namespace"); \
@@ -400,6 +408,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -421,6 +427,10 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \
...
...
@@ -151,7 +153,7 @@ index a1f07f9f25..179df3b981 100644
LIBRARY_TYPE, \
customized_name) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -410,33 +422,58 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
@@ -431,33 +441,58 @@ struct OpKernelRegistrarFunctorEx<PlaceType,
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
...
...
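
Note: among the lines visible above, a kernel-registration lambda that captures op_type is replaced by a capture-free one ([op_type](const framework::ExecutionContext& ctx) becomes [](const framework::ExecutionContext& ctx)), while the body shown in the hunk (Functor()(ctx)) does not use the capture. A self-contained sketch of that capture change with a toy registry; ExecutionContext, Registry and RegisterKernel below are illustrative stand-ins, not Paddle's API:

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Toy stand-ins for framework::ExecutionContext and the kernel registry.
struct ExecutionContext {};
using KernelFn = std::function<void(const ExecutionContext&)>;

std::map<std::string, KernelFn>& Registry() {
  static std::map<std::string, KernelFn> registry;
  return registry;
}

void RegisterKernel(const std::string& op_type, KernelFn fn) {
  Registry()[op_type] = std::move(fn);
}

int main() {
  const std::string op_type = "relu";

  // Old form: the lambda captures op_type even though the body never reads it.
  RegisterKernel(op_type, [op_type](const ExecutionContext& ctx) { (void)ctx; });

  // Updated form: a capture-free lambda performs the same registration.
  RegisterKernel(op_type, [](const ExecutionContext& ctx) { (void)ctx; });

  std::cout << "registered kernels: " << Registry().size() << "\n";  // prints 1
  return 0;
}
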
tools/infer_prune_patches/phi_cmake.patch
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 58ad42ddd1..8ffdafcf0d 100644
index 9715fd7704..44109e1081 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -18,7 +18,7 @@
add_subdirectory(infermeta)
...
...
@@ -10,4 +10,4 @@ index 58ad42ddd1..8ffdafcf0d 100644
+#add_subdirectory(tools)
# phi tests
add_subdirectory(tests)
# phi capi
tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 394ce7799e..8edbef50be 100644
index 7a9c5b889d..c847a5d523 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -390,6 +390,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
@@ -418,6 +418,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
graph->Has(framework::ir::kMultiheadMatmulPass));
...
...
@@ -10,7 +10,7 @@ index 394ce7799e..8edbef50be 100644
if (use_static_engine) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
@@ -399,6 +400,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
@@ -427,6 +428,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
...
...
@@ -30,16 +30,18 @@ index 394ce7799e..8edbef50be 100644
return;
}
}
@@ -411,12 +425,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
@@ -439,7 +453,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
- std::unordered_set<std::string> param_set(params.begin(), params.end());
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine);
&block_desc_temp,
@@ -449,6 +462,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
output_mapping,
trt_engine);
+ const auto* root_scope{scope};
+ for (;root_scope->parent();) {
+ root_scope = root_scope->parent();
...
...
@@ -54,10 +56,11 @@ index 394ce7799e..8edbef50be 100644
+ memory::Release(platform::CUDAPlace(dev_id));
+ }
+ memory::Release(platform::CPUPlace());
+
if (use_static_engine) {
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
@@ -431,6 +458,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
trt_engine_serialized_data =
@@ -462,6 +490,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}
...
...
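
Note: the added lines visible in this hunk walk from the pass's scope up to the root scope (for (; root_scope->parent();) ...) and then release cached allocations via memory::Release for the CUDA and CPU places once the TensorRT engine has been built; what root_scope is used for afterwards is elided in the view above. A self-contained sketch of that walk-to-root pattern with a toy Scope (ReleaseCachedMemory is an illustrative stand-in, not Paddle's memory::Release):

#include <iostream>
#include <string>

// Toy scope tree; the real framework::Scope also exposes a parent() accessor.
class Scope {
 public:
  explicit Scope(const Scope* parent = nullptr) : parent_(parent) {}
  const Scope* parent() const { return parent_; }

 private:
  const Scope* parent_;
};

// Stand-in for memory::Release(place): only reports what would be released.
void ReleaseCachedMemory(const std::string& place) {
  std::cout << "release cached allocations on " << place << "\n";
}

int main() {
  Scope root;
  Scope child(&root);
  Scope grandchild(&child);

  // Same shape as the added patch lines: climb until parent() returns null.
  const Scope* root_scope = &grandchild;
  for (; root_scope->parent();) {
    root_scope = root_scope->parent();
  }
  std::cout << "reached root: " << std::boolalpha << (root_scope == &root) << "\n";

  // The patch then frees allocator caches; device id and places are illustrative.
  ReleaseCachedMemory("CUDAPlace(0)");
  ReleaseCachedMemory("CPUPlace");
  return 0;
}
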
tools/infer_prune_patches/thread_local_allocator.cc.patch
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
index f125670a59..f858a30301 100644
index 875e57cfd4..b111ada3ab 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
@@ -13,18 +13,62 @@
@@ -13,19 +13,62 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
...
...
@@ -64,12 +64,13 @@ index f125670a59..f858a30301 100644
- buddy_allocator_.reset(new memory::detail::BuddyAllocator(
- std::unique_ptr<memory::detail::SystemAllocator>(
- new memory::detail::GPUAllocator(place_.device)),
- platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
- platform::GpuMinChunkSize(),
- platform::GpuMaxChunkSize()));
+ direct_allocator_.reset(new DirectAllocator{place_});
} else {
PADDLE_THROW(platform::errors::Unavailable(
"Thread local allocator only supports CUDAPlace now."));
@@ -59,7 +103,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
@@ -61,7 +104,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
...
...
@@ -78,7 +79,7 @@ index f125670a59..f858a30301 100644
auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
return tl_allocation;
@@ -67,12 +111,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
@@ -69,12 +112,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
...
...