From d74d1838fffdd6ac2d5e1e06a328eb2a8d54a605 Mon Sep 17 00:00:00 2001
From: Shang Zhizhou
Date: Tue, 14 Jun 2022 10:30:19 +0800
Subject: [PATCH] Add a library size pruning tool for Jetson inference (#43453)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* test=document_fix
* test=document_fix; add patch file
* test=document_fix;update style
* test=document_fix;update patch file
* test=document_fix;remove useless patch file
---
 .../analysis_predictor.cc.patch               |  31 +++
 tools/infer_prune_patches/analyzer.cc.patch   |  14 ++
 .../device_context.cc.patch                   |  46 ++++
 tools/infer_prune_patches/jitcode.h.patch     |  15 ++
 tools/infer_prune_patches/op_registry.h.patch | 215 ++++++++++++++++++
 .../paddle_analysis_config.h.patch            |  21 ++
 tools/infer_prune_patches/paddle_api.h.patch  |  12 +
 .../paddle_inference_api.h.patch              |  16 ++
 tools/infer_prune_patches/phi_cmake.patch     |  13 ++
 .../tensorrt_subgraph_pass.cc.patch           |  68 ++++++
 .../thread_local_allocator.cc.patch           |  95 ++++++++
 .../thread_local_allocator.h.patch            |  30 +++
 tools/prune_for_jetson.py                     | 190 ++++++++++++++++
 13 files changed, 766 insertions(+)
 create mode 100644 tools/infer_prune_patches/analysis_predictor.cc.patch
 create mode 100644 tools/infer_prune_patches/analyzer.cc.patch
 create mode 100644 tools/infer_prune_patches/device_context.cc.patch
 create mode 100644 tools/infer_prune_patches/jitcode.h.patch
 create mode 100644 tools/infer_prune_patches/op_registry.h.patch
 create mode 100644 tools/infer_prune_patches/paddle_analysis_config.h.patch
 create mode 100644 tools/infer_prune_patches/paddle_api.h.patch
 create mode 100644 tools/infer_prune_patches/paddle_inference_api.h.patch
 create mode 100644 tools/infer_prune_patches/phi_cmake.patch
 create mode 100644 tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch
 create mode 100644 tools/infer_prune_patches/thread_local_allocator.cc.patch
 create mode 100644 tools/infer_prune_patches/thread_local_allocator.h.patch
 create mode 100644 tools/prune_for_jetson.py

diff --git a/tools/infer_prune_patches/analysis_predictor.cc.patch b/tools/infer_prune_patches/analysis_predictor.cc.patch
new file mode 100644
index 00000000000..21fa24dd3d9
--- /dev/null
+++ b/tools/infer_prune_patches/analysis_predictor.cc.patch
@@ -0,0 +1,31 @@
+diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
+index 0645af611b..6b05a7fffb 100644
+--- a/paddle/fluid/inference/api/analysis_predictor.cc
++++ b/paddle/fluid/inference/api/analysis_predictor.cc
+@@ -1923,7 +1923,7 @@ USE_TRT_CONVERTER(shuffle_channel);
+ USE_TRT_CONVERTER(swish);
+ USE_TRT_CONVERTER(group_norm);
+ USE_TRT_CONVERTER(instance_norm);
+-USE_TRT_CONVERTER(layer_norm);
++//USE_TRT_CONVERTER(layer_norm);
+ USE_TRT_CONVERTER(gelu);
+ USE_TRT_CONVERTER(multihead_matmul);
+ USE_TRT_CONVERTER(fused_embedding_eltwise_layernorm);
+@@ -1933,13 +1933,13 @@ USE_TRT_CONVERTER(scale);
+ USE_TRT_CONVERTER(stack);
+ USE_TRT_CONVERTER(clip);
+ USE_TRT_CONVERTER(gather);
+-USE_TRT_CONVERTER(anchor_generator);
++//USE_TRT_CONVERTER(anchor_generator);
+ USE_TRT_CONVERTER(yolo_box);
+ USE_TRT_CONVERTER(yolo_box_head);
+ USE_TRT_CONVERTER(arg_max);
+-USE_TRT_CONVERTER(roi_align);
+-USE_TRT_CONVERTER(affine_channel);
+-USE_TRT_CONVERTER(multiclass_nms);
++//USE_TRT_CONVERTER(roi_align);
++//USE_TRT_CONVERTER(affine_channel);
++//USE_TRT_CONVERTER(multiclass_nms);
+ USE_TRT_CONVERTER(multiclass_nms3);
+ USE_TRT_CONVERTER(nearest_interp);
+ USE_TRT_CONVERTER(nearest_interp_v2);
diff --git a/tools/infer_prune_patches/analyzer.cc.patch b/tools/infer_prune_patches/analyzer.cc.patch
new file mode 100644
index 00000000000..59a7b4d6b8c
--- /dev/null
+++ b/tools/infer_prune_patches/analyzer.cc.patch
@@ -0,0 +1,14 @@
+diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
+index be7d6ab868..498e09cb4d 100644
+--- a/paddle/fluid/inference/analysis/analyzer.cc
++++ b/paddle/fluid/inference/analysis/analyzer.cc
+@@ -32,6 +32,9 @@ void Analyzer::RunAnalysis(Argument *argument) {
+                         "analsis_passes is not valid in the argument."));
+   const bool disable_logs = argument->disable_logs();
+   for (auto &pass : argument->analysis_passes()) {
++    if (pass == "ir_params_sync_among_devices_pass") {
++      continue;
++    }
+     if (!disable_logs) {
+       string::PrettyLogH1("--- Running analysis [%s]", pass);
+     }
diff --git a/tools/infer_prune_patches/device_context.cc.patch b/tools/infer_prune_patches/device_context.cc.patch
new file mode 100644
index 00000000000..75be9a0d1d9
--- /dev/null
+++ b/tools/infer_prune_patches/device_context.cc.patch
@@ -0,0 +1,46 @@
+diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
+index 904e4854ba..4f8c955d8c 100644
+--- a/paddle/fluid/platform/device_context.cc
++++ b/paddle/fluid/platform/device_context.cc
+@@ -466,15 +466,15 @@ CUDAContext::CUDAContext(const CUDAPlace& place,
+   place_ = place;
+   CUDADeviceGuard guard(place_.device);
+   stream_.reset(new stream::CUDAStream(place, priority, flag));
+-  InitEigenContext();
+-  InitCuBlasContext();
+-  InitCuDNNContext();
++  //InitEigenContext();
++  //InitCuBlasContext();
++  //InitCuDNNContext();
+ #ifndef PADDLE_WITH_HIP
+ #if CUDA_VERSION >= 11060
+-  InitCuBlasLtContext();
++  //InitCuBlasLtContext();
+ #endif
+-  InitCuSparseContext();
+-  InitCuSolverContext();
++  //InitCuSparseContext();
++  //InitCuSolverContext();
+ #endif
+ }
+
+@@ -506,14 +506,14 @@ void CUDAContext::SetStream(gpuStream_t stream) {
+
+ CUDAContext::~CUDAContext() {
+   CUDADeviceGuard guard(place_.device);
+-  DestoryCuDNNContext();
+-  DestoryCuBlasContext();
++  //DestoryCuDNNContext();
++  //DestoryCuBlasContext();
+ #ifndef PADDLE_WITH_HIP
+ #if CUDA_VERSION >= 11060
+-  InitCuBlasLtContext();
++  //InitCuBlasLtContext();
+ #endif
+-  DestoryCuSparseContext();
+-  DestoryCuSolverContext();
++  //DestoryCuSparseContext();
++  //DestoryCuSolverContext();
+ #endif
+ }
+
diff --git a/tools/infer_prune_patches/jitcode.h.patch b/tools/infer_prune_patches/jitcode.h.patch
new file mode 100644
index 00000000000..9022b459db5
--- /dev/null
+++ b/tools/infer_prune_patches/jitcode.h.patch
@@ -0,0 +1,15 @@
+diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h
+index 23650c8efc..24466e4327 100644
+--- a/paddle/fluid/operators/jit/gen/jitcode.h
++++ b/paddle/fluid/operators/jit/gen/jitcode.h
+@@ -97,8 +97,8 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
+     }
+     ret();
+   }
+-  void L(const char* label) { Xbyak::CodeGenerator::L(label); }
+-  void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }  // NOLINT
++  void L(const char* label) { }
++  void L(Xbyak::Label& label) { }  // NOLINT
+   // Enhanced vector extension
+   Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
+                                     bool bcast = false) {
diff --git a/tools/infer_prune_patches/op_registry.h.patch b/tools/infer_prune_patches/op_registry.h.patch
new
file mode 100644 index 00000000000..a1d2a66347c --- /dev/null +++ b/tools/infer_prune_patches/op_registry.h.patch @@ -0,0 +1,215 @@ +diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h +index a1f07f9f25..179df3b981 100644 +--- a/paddle/fluid/framework/op_registry.h ++++ b/paddle/fluid/framework/op_registry.h +@@ -178,9 +178,8 @@ struct OpKernelRegistrarFunctor { + RegisterKernelClass( + op_type, library_type, customized_type_value, + +- [op_type](const framework::ExecutionContext& ctx) { ++ [](const framework::ExecutionContext& ctx) { + KERNEL_TYPE().Compute(ctx); +- CheckKernelLaunch(op_type); + }); + constexpr auto size = std::tuple_size>::value; + OpKernelRegistrarFunctor +@@ -240,13 +239,8 @@ struct OpKernelRegistrarFunctorEx( +- op_type, library_type, customized_type_value, +- +- [op_type](const framework::ExecutionContext& ctx) { +- Functor()(ctx); +- CheckKernelLaunch(op_type); +- }); ++ RegisterKernelClass(op_type, library_type, ++ customized_type_value, Functor()); + + constexpr auto size = + std::tuple_size>::value; +@@ -275,7 +269,7 @@ struct OpKernelRegistrarFunctorEx, \ + paddle::framework::EmptyGradOpMaker) + ++#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, ...) + /** + * Macro to register OperatorKernel. + */ + #define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \ ++ place_class, customized_name, \ ++ customized_type_value, ...) ++ ++#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE__(op_type, library_type, \ + place_class, customized_name, \ + customized_type_value, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ +@@ -311,18 +312,22 @@ struct OpKernelRegistrarFunctorEx CreateTensorRTPredictor( ++ const AnalysisConfig& config); ++} ++ + namespace paddle_infer { + + using PrecisionType = paddle::AnalysisConfig::Precision; diff --git a/tools/infer_prune_patches/phi_cmake.patch b/tools/infer_prune_patches/phi_cmake.patch new file mode 100644 index 00000000000..2eba0e0c14c --- /dev/null +++ b/tools/infer_prune_patches/phi_cmake.patch @@ -0,0 +1,13 @@ +diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt +index 58ad42ddd1..8ffdafcf0d 100644 +--- a/paddle/phi/CMakeLists.txt ++++ b/paddle/phi/CMakeLists.txt +@@ -18,7 +18,7 @@ add_subdirectory(infermeta) + # phi operator definitions + add_subdirectory(ops) + # phi tools +-add_subdirectory(tools) ++#add_subdirectory(tools) + # phi tests + add_subdirectory(tests) + diff --git a/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch new file mode 100644 index 00000000000..307f12ee971 --- /dev/null +++ b/tools/infer_prune_patches/tensorrt_subgraph_pass.cc.patch @@ -0,0 +1,68 @@ +diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +index 394ce7799e..8edbef50be 100644 +--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc ++++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +@@ -390,6 +390,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + graph->Has(framework::ir::kEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass)); + ++ std::unordered_set param_set(params.begin(), params.end()); + if (use_static_engine) { + trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); +@@ -399,6 +400,19 @@ void TensorRtSubgraphPass::CreateTensorRTOp( + LOG(INFO) << "Load TRT Optimized Info from " + << 
GetTrtEngineSerializedPath(
+                        Get("model_opt_cache_dir"), engine_key);
++      const auto* root_scope{param_scope()};
++      for (;root_scope->parent();) {
++        root_scope = root_scope->parent();
++      }
++      for (const auto& name: param_set) {
++        LOG(INFO) << " ===== Clear param: " << name;
++        root_scope->FindLocalVar(name)->Clear();
++      }
++      for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount();
++           ++dev_id) {
++        memory::Release(platform::CUDAPlace(dev_id));
++      }
++      memory::Release(platform::CPUPlace());
+       return;
+     }
+   }
+@@ -411,12 +425,25 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+
+   auto *scope = param_scope();
+   framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
+-  std::unordered_set param_set(params.begin(), params.end());
+   inference::Singleton::Global()
+       .ConvertBlockToTRTEngine(
+           &block_desc_temp, *scope,
+           std::vector(input_names.begin(), input_names.end()),
+           param_set, output_mapping, trt_engine);
++  const auto* root_scope{scope};
++  for (;root_scope->parent();) {
++    root_scope = root_scope->parent();
++  }
++  VLOG(4) << "root_scope->LocalVarNames().size: " << root_scope->LocalVarNames().size();
++  for (const auto& name: param_set) {
++    VLOG(4) << " ===== Clear param: " << name;
++    root_scope->FindLocalVar(name)->Clear();
++  }
++  for (int dev_id = 0; dev_id < paddle::platform::GetGPUDeviceCount();
++       ++dev_id) {
++    memory::Release(platform::CUDAPlace(dev_id));
++  }
++  memory::Release(platform::CPUPlace());
+
+   if (use_static_engine) {
+     nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
+@@ -431,6 +458,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
+           << GetTrtEngineSerializedPath(
+                  Get("model_opt_cache_dir"), engine_key);
+   }
++  trt_engine_serialized_data.clear();
++  trt_engine_serialized_data.shrink_to_fit();
+ }
+
+ } // namespace analysis
diff --git a/tools/infer_prune_patches/thread_local_allocator.cc.patch b/tools/infer_prune_patches/thread_local_allocator.cc.patch
new file mode 100644
index 00000000000..6a4486aae94
--- /dev/null
+++ b/tools/infer_prune_patches/thread_local_allocator.cc.patch
@@ -0,0 +1,95 @@
+diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
+index f125670a59..f858a30301 100644
+--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
++++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
+@@ -13,18 +13,62 @@
+ // limitations under the License.
+
+ #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
++#include "paddle/fluid/platform/cuda_device_guard.h"
+
+ namespace paddle {
+ namespace memory {
+ namespace allocation {
+
++const int MALLOC_ALIGN = 64;
++
++#define CUDA_CALL(func) \
++  { \
++    auto e = (func); \
++    CHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
++        << "CUDA: " << cudaGetErrorString(e); \
++  }
++
++void* DirectAllocator::Alloc(size_t unaligned_size) {
++  if (platform::is_cpu_place(place_)) {
++    size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
++    char* p = static_cast(std::malloc(offset + unaligned_size));
++    // Memory checking
++    CHECK(p) << "Error occurred in malloc period: available space is not enough "
++                "for mallocing "
++             << unaligned_size << " bytes.";
++    // Byte alignment
++    void* r = reinterpret_cast(reinterpret_cast(p + offset) &
++                               (~(MALLOC_ALIGN - 1)));
++    static_cast(r)[-1] = p;
++    return r;
++  } else if (platform::is_gpu_place(place_)) {
++    int dev_id = place_.GetDeviceId();
++    platform::CUDADeviceGuard guard(dev_id);
++    void* ptr{};
++    CUDA_CALL(cudaMalloc(&ptr, unaligned_size));
++    return ptr;
++  }
++  return nullptr;
++}
++
++void DirectAllocator::Free(void* ptr) {
++  if (platform::is_cpu_place(place_)) {
++    if (ptr) {
++      std::free(static_cast(ptr)[-1]);
++    }
++  } else if (platform::is_gpu_place(place_)) {
++    int dev_id = place_.GetDeviceId();
++    platform::CUDADeviceGuard guard(dev_id);
++    CUDA_CALL(cudaFree(ptr));
++  }
++}
++
++
++
+ ThreadLocalAllocatorImpl::ThreadLocalAllocatorImpl(const platform::Place& p)
+     : place_(p) {
+   if (platform::is_gpu_place(place_)) {
+-    buddy_allocator_.reset(new memory::detail::BuddyAllocator(
+-        std::unique_ptr(
+-            new memory::detail::GPUAllocator(place_.device)),
+-        platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
++    direct_allocator_.reset(new DirectAllocator{place_});
+   } else {
+     PADDLE_THROW(platform::errors::Unavailable(
+         "Thread local allocator only supports CUDAPlace now."));
+@@ -59,7 +103,7 @@ ThreadLocalCUDAAllocatorPool::ThreadLocalCUDAAllocatorPool()
+
+ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
+   VLOG(10) << "ThreadLocalAllocatorImpl::AllocateImpl " << size;
+-  void* ptr = buddy_allocator_->Alloc(size);
++  void* ptr = direct_allocator_->Alloc(size);
+   auto* tl_allocation = new ThreadLocalAllocation(ptr, size, place_);
+   tl_allocation->SetThreadLocalAllocatorImpl(shared_from_this());
+   return tl_allocation;
+@@ -67,12 +111,12 @@ ThreadLocalAllocation* ThreadLocalAllocatorImpl::AllocateImpl(size_t size) {
+
+ void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
+   VLOG(10) << "ThreadLocalAllocatorImpl::FreeImpl " << allocation;
+-  buddy_allocator_->Free(allocation->ptr());
++  direct_allocator_->Free(allocation->ptr());
+   delete allocation;
+ }
+
+ uint64_t ThreadLocalAllocatorImpl::ReleaseImpl() {
+-  return buddy_allocator_->Release();
++  return direct_allocator_->Release();
+ }
+
+ } // namespace allocation
diff --git a/tools/infer_prune_patches/thread_local_allocator.h.patch b/tools/infer_prune_patches/thread_local_allocator.h.patch
new file mode 100644
index 00000000000..a3c24178d20
--- /dev/null
+++ b/tools/infer_prune_patches/thread_local_allocator.h.patch
@@ -0,0 +1,30 @@
+diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h
+index 654fb3fe7b..44c5dbf87f 100644
+--- a/paddle/fluid/memory/allocation/thread_local_allocator.h
++++ b/paddle/fluid/memory/allocation/thread_local_allocator.h
+@@ -26,6 +26,16 @@ namespace paddle {
+ namespace memory {
+ namespace allocation {
+
++class DirectAllocator {
++public:
++  DirectAllocator(const platform::Place& place) : place_{place} {}
++  void* Alloc(size_t unaligned_size);
++  void Free(void* ptr);
++  uint64_t Release() { return 0;}
++private:
++  platform::Place place_;
++};
++
+ class ThreadLocalAllocatorImpl;
+
+ class ThreadLocalAllocation : public Allocation {
+@@ -55,7 +65,7 @@ class ThreadLocalAllocatorImpl
+   uint64_t ReleaseImpl();
+
+  private:
+-  std::unique_ptr buddy_allocator_;
++  std::unique_ptr direct_allocator_;
+   platform::Place place_;
+ };
+
diff --git a/tools/prune_for_jetson.py b/tools/prune_for_jetson.py
new file mode 100644
index 00000000000..d53b21d6c37
--- /dev/null
+++ b/tools/prune_for_jetson.py
@@ -0,0 +1,190 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script prunes the Paddle inference library for Jetson: it applies the patches in tools/infer_prune_patches,
+keeps the fluid ops/kernels needed by the TensorRT engine, and removes prunable phi kernel registrations. Use it when building with cmake ON_INFER=ON to greatly reduce the library size.
+"""
+
+import os
+import sys
+import re
+import glob
+import io
+
+
+def find_type_files(cur_dir, file_type, file_list=[]):
+    next_level_dirs = os.listdir(cur_dir)
+    for next_level_name in next_level_dirs:
+        next_level_dir = os.path.join(cur_dir, next_level_name)
+        if os.path.isfile(next_level_dir):
+            if os.path.splitext(next_level_dir)[1] == file_type:
+                file_list.append(next_level_dir)
+        elif os.path.isdir(next_level_dir):
+            find_type_files(next_level_dir, file_type, file_list)
+    return file_list
+
+
+def find_kernel(content, pattern):
+    res = re.findall(pattern, content, flags=re.DOTALL)
+    ret = []
+    for p in res:
+        left, right = 0, 0
+        for c in p:
+            if c == '{':
+                left += 1
+            elif c == '}':
+                right += 1
+
+        if left == right:
+            ret.append(p)
+
+    return ret, len(ret)
+
+
+def prune_phi_kernels():
+    tool_dir = os.path.dirname(os.path.abspath(__file__))
+    if sys.version_info[0] == 3:
+        all_op = glob.glob(os.path.join(tool_dir,
+                                        '../paddle/phi/kernels/**/*.cc'),
+                           recursive=True)
+        all_op += glob.glob(os.path.join(tool_dir,
+                                         '../paddle/phi/kernels/**/*.cu'),
+                            recursive=True)
+    elif sys.version_info[0] == 2:
+        all_op = find_type_files(
+            os.path.join(tool_dir, '../paddle/phi/kernels/'), '.cc')
+        all_op = find_type_files(
+            os.path.join(tool_dir, '../paddle/phi/kernels/'), '.cu', all_op)
+
+    register_op_count = 0
+    for op_file in all_op:
+        need_continue = False
+        file_blacklist = [
+            "kernels/empty_kernel.cc", "/cast_kernel.c", "/batch_norm_kernel.c"
+        ]
+        for bname in file_blacklist:
+            if op_file.find(bname) >= 0:
+                need_continue = True
+                break
+
+        if need_continue:
+            print("continue:", op_file)
+            continue
+
+        op_name = os.path.split(op_file)[1]
+        all_matches = []
+        with io.open(op_file, 'r', encoding='utf-8') as f:
+            content = ''.join(f.readlines())
+            op_pattern = 'PD_REGISTER_KERNEL\(.*?\).*?\{.*?\}'
+            op, op_count = find_kernel(content, op_pattern)
+            register_op_count += op_count
+            all_matches.extend(op)
+
+        for p in all_matches:
+            content = content.replace(p, '')
+
+        with io.open(op_file, 'w', encoding='utf-8') as f:
+            f.write(u'{}'.format(content))
+
+    print('We erase all prunable phi kernel registrations for the Paddle-Inference lib.')
+    print('%50s%10s' % ('type', 'count'))
+    print('%50s%10s' % ('PD_REGISTER_KERNEL', register_op_count))
+    return True
+
+
+def apply_patches():
+    work_path = os.path.dirname(os.path.abspath(__file__)) + "/../"
+    ret = os.system(
+        "cd %s && rm -f paddle/fluid/inference/api/tensorrt_predictor.* "
+        " && rm -f paddle/fluid/inference/api/paddle_tensorrt_predictor.h "
+        " && git apply tools/infer_prune_patches/*.patch && cd -" % work_path)
+    return ret == 0
+
+
+def append_fluid_kernels():
+    op_white_list = ["load", "load_combine"]
+
+    #1. add to makefile
+    file_name = os.path.dirname(os.path.abspath(__file__)) \
+        + "/../paddle/fluid/inference/tensorrt/CMakeLists.txt"
+    append_str = "\nfile(APPEND ${pybind_file} \"USE_NO_KERNEL_OP__(tensorrt_engine);\\n\")\n"
+    for op in op_white_list:
+        append_str = append_str + "file(APPEND ${pybind_file} \"USE_OP__(%s);\\n\")\n" % op
+
+    with io.open(file_name, 'r', encoding='utf-8') as f:
+        content = ''.join(f.readlines())
+
+    location_str = "nv_library(\n tensorrt_op_teller\n SRCS op_teller.cc\n DEPS framework_proto device_context boost)"
+    new_content = content.replace(location_str, location_str + append_str)
+
+    if new_content == content:
+        print("ERROR: can not find \"%s\" in file \"%s\"" %
+              (location_str, file_name))
+        return False
+
+    with io.open(file_name, 'w', encoding='utf-8') as f:
+        f.write(u'{}'.format(new_content))
+
+    #2. add op and kernel register
+    op_white_list.append("tensorrt_engine")
+    tool_dir = os.path.dirname(os.path.abspath(__file__))
+    if sys.version_info[0] == 3:
+        all_op = glob.glob(os.path.join(tool_dir,
+                                        '../paddle/fluid/operators/**/*.cc'),
+                           recursive=True)
+        all_op += glob.glob(os.path.join(tool_dir,
+                                         '../paddle/fluid/operators/**/*.cu'),
+                            recursive=True)
+    elif sys.version_info[0] == 2:
+        all_op = find_type_files(
+            os.path.join(tool_dir, '../paddle/fluid/operators/'), '.cc')
+        all_op = find_type_files(
+            os.path.join(tool_dir, '../paddle/fluid/operators/'), '.cu', all_op)
+
+    for op_file in all_op:
+        with io.open(op_file, 'r', encoding='utf-8') as f:
+            content = ''.join(f.readlines())
+
+        for op in op_white_list:
+            patterns = {
+                "REGISTER_OPERATOR": "REGISTER_OPERATOR\(\s*%s\s*," % op,
+                "REGISTER_OP_CPU_KERNEL":
+                "REGISTER_OP_CPU_KERNEL\(\s*%s\s*," % op,
+                "REGISTER_OP_CUDA_KERNEL":
+                "REGISTER_OP_CUDA_KERNEL\(\s*%s\s*," % op
+            }
+            for k, p in patterns.items():
+                matches = re.findall(p, content, flags=re.DOTALL)
+                if len(matches) > 0:
+                    content = content.replace(matches[0],
+                                              matches[0].replace(k, k + "__"))
+        with io.open(op_file, 'w', encoding='utf-8') as f:
+            f.write(u'{}'.format(content))
+
+    return True
+
+
+if __name__ == '__main__':
+
+    print("================ step 1: apply patches =======================")
+    assert (apply_patches())
+    print("==============================================================\n")
+
+    print("================ step 2: append fluid op/kernels==============")
+    assert (append_fluid_kernels())
+    print("==============================================================\n")
+
+    print("================ step 3:prune phi kernels ====================")
+    assert (prune_phi_kernels())
+    print("==============================================================\n")
--
GitLab
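
For illustration only (not part of the commit): a minimal, self-contained Python sketch of the PD_REGISTER_KERNEL matching rule that prune_for_jetson.py applies to the phi kernel sources. The kernel registrations in the sample string are invented; only the regex and the brace-balance check mirror the script above.

    import re

    # Same pattern the script uses to locate phi kernel registrations.
    OP_PATTERN = r'PD_REGISTER_KERNEL\(.*?\).*?\{.*?\}'

    def balanced_matches(content, pattern):
        # Same idea as find_kernel() in prune_for_jetson.py: the non-greedy match
        # can stop at the first '}', so keep only matches whose braces are balanced.
        return [m for m in re.findall(pattern, content, flags=re.DOTALL)
                if m.count('{') == m.count('}')]

    sample = '''
    PD_REGISTER_KERNEL(fake_mean, CPU, ALL_LAYOUT, phi::FakeMeanKernel, float) {}

    PD_REGISTER_KERNEL(fake_cast, CPU, ALL_LAYOUT, phi::FakeCastKernel, float) {
      if (true) { kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); }
    }
    '''

    for block in balanced_matches(sample, OP_PATTERN):
        # Each balanced block is what the pruner would erase from the source file.
        print('would erase:', block.splitlines()[0])

Only the first registration is reported: the second one nests braces inside its body, so the non-greedy match ends up unbalanced and the script leaves that registration in place rather than cutting it in half.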