Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
a37ef769
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
a37ef769
编写于
5月 09, 2023
作者:
K
Kim Yann
提交者:
GitHub
5月 09, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
rem tools/infer_prune_patches (#53596)
上级
7e9c87c5
变更
12
隐藏空白更改
内联
并排
Showing
12 changed files
with
0 additions
and
582 deletions
+0
-582
tools/infer_prune_patches/analysis_predictor.cc.patch
tools/infer_prune_patches/analysis_predictor.cc.patch
+0
-31
tools/infer_prune_patches/analyzer.cc.patch
tools/infer_prune_patches/analyzer.cc.patch
+0
-14
tools/infer_prune_patches/device_context.cc.patch
tools/infer_prune_patches/device_context.cc.patch
+0
-46
tools/infer_prune_patches/jitcode.h.patch
tools/infer_prune_patches/jitcode.h.patch
+0
-15
tools/infer_prune_patches/op_registry.h.patch
tools/infer_prune_patches/op_registry.h.patch
+0
-217
tools/infer_prune_patches/paddle_analysis_config.h.patch
tools/infer_prune_patches/paddle_analysis_config.h.patch
+0
-21
tools/infer_prune_patches/paddle_api.h.patch
tools/infer_prune_patches/paddle_api.h.patch
+0
-12
tools/infer_prune_patches/paddle_inference_api.h.patch
tools/infer_prune_patches/paddle_inference_api.h.patch
+0
-16
tools/infer_prune_patches/phi_cmake.patch
tools/infer_prune_patches/phi_cmake.patch
+0
-13
tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
+0
-71
tools/infer_prune_patches/thread_local_allocator.cc.patch
tools/infer_prune_patches/thread_local_allocator.cc.patch
+0
-96
tools/infer_prune_patches/thread_local_allocator.h.patch
tools/infer_prune_patches/thread_local_allocator.h.patch
+0
-30
未找到文件。
tools/infer_prune_patches/analysis_predictor.cc.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 0645af611b..6b05a7fffb 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1923,7 +1923,7 @@
USE_TRT_CONVERTER(shuffle_channel);
USE_TRT_CONVERTER(swish);
USE_TRT_CONVERTER(group_norm);
USE_TRT_CONVERTER(instance_norm);
-USE_TRT_CONVERTER(layer_norm);
+//USE_TRT_CONVERTER(layer_norm);
USE_TRT_CONVERTER(gelu);
USE_TRT_CONVERTER(multihead_matmul);
USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
@@ -1933,13 +1933,13 @@
USE_TRT_CONVERTER(scale);
USE_TRT_CONVERTER(stack);
USE_TRT_CONVERTER(clip);
USE_TRT_CONVERTER(gather);
-USE_TRT_CONVERTER(anchor_generator);
+//USE_TRT_CONVERTER(anchor_generator);
USE_TRT_CONVERTER(yolo_box);
USE_TRT_CONVERTER(yolo_box_head);
USE_TRT_CONVERTER(arg_max);
-USE_TRT_CONVERTER(roi_align);
-USE_TRT_CONVERTER(affine_channel);
-USE_TRT_CONVERTER(multiclass_nms);
+//USE_TRT_CONVERTER(roi_align);
+//USE_TRT_CONVERTER(affine_channel);
+//USE_TRT_CONVERTER(multiclass_nms);
USE_TRT_CONVERTER(multiclass_nms3);
USE_TRT_CONVERTER(nearest_interp);
USE_TRT_CONVERTER(nearest_interp_v2);
tools/infer_prune_patches/analyzer.cc.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index be7d6ab868..498e09cb4d 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -32,6 +32,9 @@
void Analyzer::RunAnalysis(Argument *argument) {
"analsis_passes is not valid in the argument."));
const bool disable_logs = argument->disable_logs();
for (auto &pass : argument->analysis_passes()) {
+ if (pass == "ir_params_sync_among_devices_pass") {
+ continue;
+ }
if (!disable_logs) {
string::PrettyLogH1("--- Running analysis [%s]", pass);
}
tools/infer_prune_patches/device_context.cc.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 904e4854ba..4f8c955d8c 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -466,15 +466,15 @@
CUDAContext::CUDAContext(const CUDAPlace& place,
place_ = place;
CUDADeviceGuard guard(place_.device);
stream_.reset(new stream::CUDAStream(place, priority, flag));
- InitEigenContext();
- InitCuBlasContext();
- InitCuDNNContext();
+ //InitEigenContext();
+ //InitCuBlasContext();
+ //InitCuDNNContext();
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
- InitCuBlasLtContext();
+ //InitCuBlasLtContext();
#endif
- InitCuSparseContext();
- InitCuSolverContext();
+ //InitCuSparseContext();
+ //InitCuSolverContext();
#endif
}
@@ -506,14 +506,14 @@
void CUDAContext::SetStream(gpuStream_t stream) {
CUDAContext::~CUDAContext() {
CUDADeviceGuard guard(place_.device);
- DestoryCuDNNContext();
- DestoryCuBlasContext();
+ //DestoryCuDNNContext();
+ //DestoryCuBlasContext();
#ifndef PADDLE_WITH_HIP
#if CUDA_VERSION >= 11060
- InitCuBlasLtContext();
+ //InitCuBlasLtContext();
#endif
- DestoryCuSparseContext();
- DestoryCuSolverContext();
+ //DestoryCuSparseContext();
+ //DestoryCuSolverContext();
#endif
}
tools/infer_prune_patches/jitcode.h.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
index d71497275d..cb56e1d949 100644
--- a/paddle/fluid/operators/jit/gen/jitcode.h
+++ b/paddle/fluid/operators/jit/gen/jitcode.h
@@ -102,8 +102,8 @@
class JitCode : public GenBase, public Xbyak::CodeGenerator {
}
ret();
}
- void L(const char* label) { Xbyak::CodeGenerator::L(label); }
- void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } // NOLINT
+ void L(const char* label) { }
+ void L(Xbyak::Label& label) { } // NOLINT
// Enhanced vector extension
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base,
int offt,
tools/infer_prune_patches/op_registry.h.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index d38efbff31..f5bef776d6 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -186,9 +186,8 @@
struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
library_type,
customized_type_value,
- [op_type](const framework::ExecutionContext& ctx) {
+ [](const framework::ExecutionContext& ctx) {
KERNEL_TYPE().Compute(ctx);
- CheckKernelLaunch<PlaceType>(op_type);
});
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
@@ -257,15 +256,8 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
void operator()(const char* op_type,
const char* library_type,
int customized_type_value) const {
- RegisterKernelClass<PlaceType, T>(
- op_type,
- library_type,
- customized_type_value,
-
- [op_type](const framework::ExecutionContext& ctx) {
- Functor()(ctx);
- CheckKernelLaunch<PlaceType>(op_type);
- });
+ RegisterKernelClass<PlaceType, T>(op_type, library_type,
+ customized_type_value, Functor());
constexpr auto size =
std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
@@ -296,7 +288,7 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
VarTypeInference
InferShapeBase
*/
-#define REGISTER_OPERATOR(op_type, op_class, ...) \
+#define REGISTER_OPERATOR__(op_type, op_class, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__reg_op__##op_type, \
"REGISTER_OPERATOR must be called in global namespace"); \
@@ -307,15 +299,22 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \
}
-#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, ...) \
+#define REGISTER_OPERATOR(op_type, op_class, ...)
+
+#define REGISTER_OP_WITHOUT_GRADIENT__(op_type, op_class, ...) \
REGISTER_OPERATOR(op_type, op_class, __VA_ARGS__, \
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>, \
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>)
+#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, ...)
/**
* Macro to register OperatorKernel.
*/
#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \
+ place_class, customized_name, \
+ customized_type_value, ...)
+
+#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE__(op_type, library_type, \
place_class, customized_name, \
customized_type_value, ...) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -332,18 +331,22 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \
}
-#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \
- REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( \
+#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...)
+
+#define REGISTER_OP_KERNEL__(op_type, library_type, place_class, ...) \
+ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE__( \
op_type, library_type, place_class, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#define REGISTER_OP_CUDA_KERNEL__(op_type, ...) \
+ REGISTER_OP_KERNEL__(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
+
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
-#else
-#define REGISTER_OP_CUDA_KERNEL(op_type, ...)
-#endif
+
+#define REGISTER_OP_CPU_KERNEL__(op_type, ...) \
+ REGISTER_OP_KERNEL__(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -361,6 +364,11 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
REGISTER_OP_KERNEL(op_type, MLU, ::paddle::platform::MLUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
+ customized_name, \
+ customized_type_value, \
+ ...)
+
+#define REGISTER_OP_KERNEL_EX__(op_type, library_type, place_class, \
customized_name, \
customized_type_value, \
...) \
@@ -378,8 +386,10 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
return 0; \
}
-#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \
- REGISTER_OP_KERNEL_EX( \
+#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...)
+
+#define REGISTER_OP_CUDA_KERNEL_FUNCTOR__(op_type, ...) \
+ REGISTER_OP_KERNEL_EX__( \
op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
@@ -396,12 +406,6 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
-#define REGISTER_OP_NPU_KERNEL_FUNCTOR(op_type, ...) \
- REGISTER_OP_KERNEL_EX( \
- op_type, NPU, ::paddle::platform::NPUPlace, DEFAULT_TYPE, \
- ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
- __VA_ARGS__)
-
#define REGISTER_OP_MLU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX( \
op_type, MLU, ::paddle::platform::MLUPlace, DEFAULT_TYPE, \
@@ -413,7 +417,9 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
* we will use and tell the compiler to
* link them into target.
*/
-#define USE_OP_ITSELF(op_type) \
+#define USE_OP_ITSELF(op_type)
+
+#define USE_OP_ITSELF__(op_type) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
__use_op_itself_##op_type, \
"USE_OP_ITSELF must be called in global namespace"); \
@@ -421,6 +427,10 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \
+ LIBRARY_TYPE, \
+ customized_name)
+
+#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE__(op_type, \
LIBRARY_TYPE, \
customized_name) \
STATIC_ASSERT_GLOBAL_NAMESPACE( \
@@ -431,33 +441,58 @@
struct OpKernelRegistrarFunctorEx<PlaceType,
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
-#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
- USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, DEFAULT_TYPE)
+#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \
+ LIBRARY_TYPE, \
+ customized_name)
+
+#define USE_OP_DEVICE_KERNEL__(op_type, LIBRARY_TYPE) \
+ USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE__(op_type, LIBRARY_TYPE, DEFAULT_TYPE)
+
+#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE)
// TODO(fengjiayi): The following macros
// seems ugly, do we have better method?
-#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
+#ifndef PADDLE_WITH_CUDA
#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
+#define USE_OP_KERNEL__(op_type) USE_OP_DEVICE_KERNEL__(op_type, CPU)
#else
#define USE_OP_KERNEL(op_type) \
USE_OP_DEVICE_KERNEL(op_type, CPU); \
USE_OP_DEVICE_KERNEL(op_type, CUDA)
+
+#define USE_OP_KERNEL__(op_type) \
+ USE_OP_DEVICE_KERNEL__(op_type, CPU); \
+ USE_OP_DEVICE_KERNEL__(op_type, CUDA)
#endif
#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
+#define USE_NO_KERNEL_OP__(op_type) USE_OP_ITSELF__(op_type);
+
#define USE_CPU_ONLY_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CPU);
+#define USE_CPU_ONLY_OP__(op_type) \
+ USE_OP_ITSELF__(op_type); \
+ USE_OP_DEVICE_KERNEL__(op_type, CPU);
+
#define USE_CUDA_ONLY_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CUDA)
+#define USE_CUDA_ONLY_OP__(op_type) \
+ USE_OP_ITSELF__(op_type); \
+ USE_OP_DEVICE_KERNEL__(op_type, CUDA)
+
#define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \
USE_OP_KERNEL(op_type)
+
+#define USE_OP__(op_type) \
+ USE_OP_ITSELF__(op_type); \
+ USE_OP_KERNEL__(op_type)
// clang-format on
} // namespace framework
tools/infer_prune_patches/paddle_analysis_config.h.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index d6a0b643c2..511844b482 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -46,6 +46,7 @@
namespace paddle {
class AnalysisPredictor;
+class TensorRTPredictor;
struct MkldnnQuantizerConfig;
struct LiteNNAdapterConfig {
@@ -700,6 +701,8 @@
struct PD_INFER_DECL AnalysisConfig {
friend class ::paddle::AnalysisPredictor;
+ friend class ::paddle::TensorRTPredictor;
+
///
/// \brief Get a pass builder for customize the passes in IR analysis phase.
/// NOTE: Just for developer, not an official API, easy to be broken.
tools/infer_prune_patches/paddle_api.h.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index b28370fb82..aaf20a28b0 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -194,6 +194,7 @@
class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor {
private:
friend class AnalysisPredictor;
+ friend class TensorRTPredictor;
friend class ONNXRuntimePredictor;
explicit ZeroCopyTensor(void* scope, const void* device_contexts)
: paddle_infer::Tensor{scope, device_contexts} {}
tools/infer_prune_patches/paddle_inference_api.h.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 35b90bfa54..ba8220d06a 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -41,6 +41,11 @@
limitations under the License. */
/// \since 2.0.0-beta
///
+namespace paddle {
+std::unique_ptr<PaddlePredictor> CreateTensorRTPredictor(
+ const AnalysisConfig& config);
+}
+
namespace paddle_infer {
using PrecisionType = paddle::AnalysisConfig::Precision;
tools/infer_prune_patches/phi_cmake.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 9715fd7704..44109e1081 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -18,7 +18,7 @@
add_subdirectory(infermeta)
# phi operator definitions
add_subdirectory(ops)
# phi tools
-add_subdirectory(tools)
+#add_subdirectory(tools)
# phi tests
add_subdirectory(tests)
# phi capi
tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 7a9c5b889d..c847a5d523 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -418,6 +418,7 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
graph->Has(framework::ir::kEmbEltwiseLayernormPass) &&
graph->Has(framework::ir::kMultiheadMatmulPass));
+ std::unordered_set<std::string> param_set(params.begin(), params.end());
if (use_static_engine) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
@@ -427,6 +428,19 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
+ const auto* root_scope{param_scope()};
+ for (;root_scope->parent();) {
+ root_scope = root_scope->parent();
+ }
+ for (const auto& name: param_set) {
+ LOG(INFO) << " ===== Clear param: " << name;
+ root_scope->FindLocalVar(name)->Clear();
+ }
+ for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount();
+ ++dev_id) {
+ memory::Release(platform::CUDAPlace(dev_id));
+ }
+ memory::Release(platform::CPUPlace());
return;
}
}
@@ -439,7 +453,6 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
- std::unordered_set<std::string> param_set(params.begin(), params.end());
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlockToTRTEngine(
&block_desc_temp,
@@ -449,6 +462,21 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
output_mapping,
trt_engine);
+ const auto* root_scope{scope};
+ for (;root_scope->parent();) {
+ root_scope = root_scope->parent();
+ }
+ VLOG(4) << "root_scope->LocalVarNames().size: " << root_scope->LocalVarNames().size();
+ for (const auto& name: param_set) {
+ VLOG(4) << " ===== Clear param: " << name;
+ root_scope->FindLocalVar(name)->Clear();
+ }
+ for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount();
+ ++dev_id) {
+ memory::Release(platform::CUDAPlace(dev_id));
+ }
+ memory::Release(platform::CPUPlace());
+
if (use_static_engine) {
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
trt_engine_serialized_data =
@@ -462,6 +490,8 @@
void TensorRtSubgraphPass::CreateTensorRTOp(
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
}
+ trt_engine_serialized_data.clear();
+ trt_engine_serialized_data.shrink_to_fit();
}
} // namespace analysis
tools/infer_prune_patches/thread_local_allocator.cc.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
index 875e57cfd4..b111ada3ab 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
@@ -13,19 +13,62 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
+#include "paddle/fluid/platform/cuda_device_guard.h"
namespace paddle {
namespace memory {
namespace allocation {
+const int MALLOC_ALIGN = 64;
+
+#define CUDA_CALL(func) \
+ { \
+ auto e = (func); \
+ CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
+ << "CUDA: " << cudaGetErrorString(e); \
+ }
+
+void* DirectAllocator::Alloc(size_t unaligned_size) {
+ if (platform::is_cpu_place(place_)) {
+ size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
+ char* p = static_cast<char*>(std::malloc(offset + unaligned_size));
+ // Memory checking
+ CHECK(p) << "Error occurred in malloc period: available space is not enough "
+ "for mallocing "
+ << unaligned_size << " bytes.";
+ // Byte alignment
+ void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) &
+ (~(MALLOC_ALIGN - 1)));
+ static_cast<void**>(r)[-1] = p;
+ return r;
+ } else if (platform::is_gpu_place(place_)) {
+ int dev_id = place_.GetDeviceId();
+ platform::CUDADeviceGuard guard(dev_id);
+ void* ptr{};
+ CUDA_CALL(cudaMalloc(&ptr, unaligned_size));
+ return ptr;
+ }
+ return nullptr;
+}
+
+void DirectAllocator::Free(void* ptr) {
+ if (platform::is_cpu_place(place_)) {
+ if (ptr) {
+ std::free(static_cast<void**>(ptr)[-1]);
+ }
+ } else if (platform::is_gpu_place(place_)) {
+ int dev_id = place_.GetDeviceId();
+ platform::CUDADeviceGuard guard(dev_id);
+ CUDA_CALL(cudaFree(ptr));
+ }
+}
+
+
+
ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
: place_(p) {
if (platform::is_gpu_place(place_)) {
- buddy_allocator_.reset(new memory::detail::BuddyAllocator(
- std::unique_ptr<memory::detail::SystemAllocator>(
- new memory::detail::GPUAllocator(place_.device)),
- platform::GpuMinChunkSize(),
- platform::GpuMaxChunkSize()));
+ direct_allocator_.reset(new DirectAllocator{place_});
} else {
PADDLE_THROW(platform::errors::Unavailable(
"Thread local allocator only supports CUDAPlace now."));
@@ -61,7 +104,7 @@
ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
- void* ptr = buddy_allocator_->Alloc(size);
+ void* ptr = direct_allocator_->Alloc(size);
auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
return tl_allocation;
@@ -69,12 +112,12 @@
ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
- buddy_allocator_->Free(allocation->ptr());
+ direct_allocator_->Free(allocation->ptr());
delete allocation;
}
uint64_t ThreadLocalAllocatorImpl::ReleaseImpl() {
- return buddy_allocator_->Release();
+ return direct_allocator_->Release();
}
} // namespace allocation
tools/infer_prune_patches/thread_local_allocator.h.patch
已删除
100644 → 0
浏览文件 @
7e9c87c5
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h
index 654fb3fe7b..44c5dbf87f 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.h
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.h
@@ -26,6 +26,16 @@
namespace paddle {
namespace memory {
namespace allocation {
+class DirectAllocator {
+public:
+ DirectAllocator(const platform::Place& place) : place_{place} {}
+ void* Alloc(size_t unaligned_size);
+ void Free(void* ptr);
+ uint64_t Release() { return 0;}
+private:
+ platform::Place place_;
+};
+
class ThreadLocalAllocatorImpl;
class ThreadLocalAllocation : public Allocation {
@@ -55,7 +65,7 @@
class ThreadLocalAllocatorImpl
uint64_t ReleaseImpl();
private:
- std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
+ std::unique_ptr<DirectAllocator> direct_allocator_;
platform::Place place_;
};
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录