diff --git a/cmake/phi.cmake b/cmake/phi.cmake index 96760969adf902d2e8c644468e682dbc38144c18..6beb57daae10722d7a0e3cc0c9728efee605b01a 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -64,6 +64,12 @@ function(generate_unify_header DIR_NAME) endif() endif() endforeach() + if(DEFINED REDUCE_INFERENCE_LIB_SIZE) + if(${kernel_name} MATCHES ".*_grad") + continue() + endif() + endif() + # append header into extension.h string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}") file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n") diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index fe8548f9fb5fc3aa3099560e0f5e25d32cdb0387..bbffe745eba01e7e0fbbfa500116e3910bcd9fa6 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -116,6 +116,11 @@ file( "strings/gpu/*.cu" "fusion/gpu/*.cu") +if(DEFINED REDUCE_INFERENCE_LIB_SIZE) + list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$") + list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$") +endif() + if(WITH_CUTLASS) execute_process( COMMAND ${CMAKE_COMMAND} -E make_directory @@ -176,6 +181,9 @@ else() "fusion/*.cc" "fusion/cpu/*.cc") endif() +if(DEFINED REDUCE_INFERENCE_LIB_SIZE) + list(FILTER kernel_cc EXCLUDE REGEX ".*_grad_kernel\\.cc$") +endif() file(GLOB kernel_xpu "xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc") diff --git a/tools/reduce_lib_size_util.py b/tools/reduce_lib_size_util.py new file mode 100644 index 0000000000000000000000000000000000000000..0815e7b1ddc4d5932a143297bde6463875f39728 --- /dev/null +++ b/tools/reduce_lib_size_util.py @@ -0,0 +1,128 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
"""
This script simply removes grad kernels. You should use this script
when cmake ON_INFER=ON, which can greatly reduce the volume of the inference library.
"""

import glob
import os

# Bracket characters tracked by the balance checks below.
_OPENERS = '({'
_CLOSERS = {')': '(', '}': '{'}


def is_balanced(content):
    """
    Check whether a string contains a '{' and has balanced '()' / '{}' pairs.

    Args:
        content (str): text to check.

    Returns:
        bool: True only if ``content`` contains at least one '{' and every
            '(' / '{' is closed by the matching ')' / '}' in the right order.
    """
    # A registration is only complete once its trailing '{...}' body shows up.
    if '{' not in content:
        return False
    stack = []
    for c in content:
        if c in _OPENERS:
            stack.append(c)
        elif c in _CLOSERS:
            if not stack or stack.pop() != _CLOSERS[c]:
                return False
    return not stack


def _balanced_end(content, begin):
    """
    Find the shortest prefix of ``content[begin:]`` accepted by is_balanced.

    Args:
        content (str): text to scan.
        begin (int): index at which the scan starts.

    Returns:
        int: exclusive end index of that prefix, or -1 if there is none
            (bracket mismatch, or end of text reached first).
    """
    stack = []
    seen_brace = False
    for pos in range(begin, len(content)):
        c = content[pos]
        if c in _OPENERS:
            stack.append(c)
            if c == '{':
                seen_brace = True
        elif c in _CLOSERS:
            # A mismatch makes every longer prefix unbalanced too, so stop.
            if not stack or stack.pop() != _CLOSERS[c]:
                return -1
            # The stack can only become empty right after a pop.
            if seen_brace and not stack:
                return pos + 1
    return -1


def grad_kernel_definition(content, kernel_pattern, grad_pattern):
    """
    Collect the grad kernel registrations found in a file's content.

    Each occurrence of ``kernel_pattern`` is extended to the shortest text
    accepted by is_balanced; it is kept when it contains ``grad_pattern``.
    Scanning stops at the first occurrence that cannot be completed.

    Args:
        content (str): file content.
        kernel_pattern (str): text that starts a kernel registration.
        grad_pattern (str): text that marks a registration as a grad kernel.

    Returns:
        (list, int): grad kernel definitions in file and count.
    """
    results = []
    start = 0
    while True:
        index = content.find(kernel_pattern, start)
        if index == -1:
            break
        # Single incremental scan instead of re-checking every prefix.
        end = _balanced_end(content, index)
        if end == -1:
            break
        definition = content[index:end]
        if grad_pattern in definition:
            results.append(definition)
        start = end
    return results, len(results)


def _strip_grad_kernels(content, kernel_pattern, grad_pattern):
    """Return (content without grad kernel definitions, removed definitions)."""
    removed, _ = grad_kernel_definition(content, kernel_pattern, grad_pattern)
    for definition in removed:
        content = content.replace(definition, '')
    return content, removed


def remove_grad_kernels(dry_run=False, operators_dir=None):
    """
    Remove PD_REGISTER_STRUCT_KERNEL grad registrations from operator sources.

    Every ``.cc`` and ``.cu`` file below the operators directory is scanned
    recursively. Registrations containing ``_grad,`` are deleted from the file.

    Args:
        dry_run (bool): if True, print ``<file> <definition>`` for each
            registration that would be removed and leave the files untouched.
        operators_dir (str): directory to scan. Defaults to
            ``../paddle/fluid/operators`` relative to this script.

    Returns:
        int: number of kernel(grad) removed (or, in dry-run mode, found).
    """
    pd_kernel_pattern = 'PD_REGISTER_STRUCT_KERNEL'

    if operators_dir is None:
        tool_dir = os.path.dirname(os.path.abspath(__file__))
        operators_dir = os.path.join(tool_dir, '../paddle/fluid/operators')

    all_op = []
    for file_glob in ('*.cc', '*.cu'):
        all_op += glob.glob(
            os.path.join(operators_dir, '**', file_glob),
            recursive=True,
        )

    register_op_pd_kernel_count = 0
    for op_file in all_op:
        with open(op_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Matches are collected per file so that dry-run output is attributed
        # to the file that actually contains them.
        stripped, removed = _strip_grad_kernels(
            content, pd_kernel_pattern, '_grad,'
        )
        register_op_pd_kernel_count += len(removed)

        if dry_run:
            for to_remove in removed:
                print(op_file, to_remove)
        elif removed:
            # Only rewrite files that changed.
            with open(op_file, 'w', encoding='utf-8') as f:
                f.write(stripped)

    return register_op_pd_kernel_count
def parse_args(argv=None):
    """Parse input arguments.

    Args:
        argv (list): argument strings to parse. When None, argparse reads
            them from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace: namespace with the boolean attributes
            ``only_kernel`` and ``dry_run`` (both False unless the flag is given).
    """
    parser = argparse.ArgumentParser(description='Remove grad op and kernels.')
    parser.add_argument(
        '--only_kernel',
        action='store_true',
        help='keep grad operator registrations and remove only grad kernels',
    )
    parser.add_argument(
        '--dry_run',
        action='store_true',
        help='print the registrations that would be removed without '
        'rewriting the operator source files',
    )
    return parser.parse_args(argv)
# remove all gpu grad kernel - gpu_kernel_pattern1 = r'REGISTER_OP_CUDA_KERNEL\(.*?\);?' - gpu_kernel_pattern2 = r'REGISTER_OP_CUDA_KERNEL\(.*?_grad,.*?\);?' + gpu_kernel_pattern1 = r'REGISTER_OP_CUDA_KERNEL\(.*?\);?|REGISTER_OP_CUDA_KERNEL_FUNCTOR\(.*?\);?' + gpu_kernel_pattern2 = r'REGISTER_OP_CUDA_KERNEL\(.*?_grad,.*?\);?|REGISTER_OP_CUDA_KERNEL_FUNCTOR\(.*?_grad,.*?\);?' # remove all xpu grad kernel xpu_kernel_pattern1 = r'REGISTER_OP_XPU_KERNEL\(.*?\);?' @@ -166,11 +188,14 @@ if __name__ == '__main__': all_matches.extend(op_kernel) all_matches.extend(custom_kernel) - for i in all_matches: - content = content.replace(i, '') + for to_remove in all_matches: + content = content.replace(to_remove, '') + if args.dry_run: + print(op_file, to_remove) - with open(op_file, 'w', encoding='utf-8') as f: - f.write(content) + if not args.dry_run: + with open(op_file, 'w', encoding='utf-8') as f: + f.write(content) # 2. update operators/CMakeLists.txt cmake_file = os.path.join( @@ -178,6 +203,10 @@ if __name__ == '__main__': ) update_operator_cmake(cmake_file) + register_pd_kernel_count = reduce_lib_size_util.remove_grad_kernels( + args.dry_run + ) + print('We erase all grad op and kernel for Paddle-Inference lib.') print('%50s%10s' % ('type', 'count')) print('%50s%10s' % ('REGISTER_OPERATOR', register_op_count)) @@ -194,3 +223,4 @@ if __name__ == '__main__': register_op_kernel_with_custom_type_count, ) ) + print('%50s%10s' % ('REGISTER_OP_PD_KERNEL', register_pd_kernel_count))