Reduce inference library size and compile time (#53369)

* Reduce inference library size and compile time * resolve conflicts

Reduce inference library size and compile time (#53369)
* Reduce inference library size and compile time * resolve conflicts
0ef51804 · chalsliu · GitHub · 972daa46 · 0ef51804 · 0ef51804
4 changed files
--- a/cmake/phi.cmake
+++ b/cmake/phi.cmake
@@ -64,6 +64,11 @@ function(generate_unify_header DIR_NAME)
      endif()
    endif()
  endforeach()
+  if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
+    if(${kernel_name} MATCHES ".*_grad")
+      continue()
+    endif()
+  endif()
  # append header into extension.h
  string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}")
  file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n")

--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -118,6 +118,11 @@ file(
  "strings/gpu/*.cu"
  "fusion/gpu/*.cu")
+# When REDUCE_INFERENCE_LIB_SIZE is defined, drop every kernel_cu entry whose
+# path ends in "_grad_kernel.cc" or "_grad_kernel.cu".
+# NOTE(review): if(DEFINED ...) is true for any definition of the variable,
+# including a false value such as OFF -- confirm that is intended.
+# NOTE(review): the ".cc" pattern only has an effect if kernel_cu can contain
+# .cc paths -- confirm against the glob that populates kernel_cu.
+if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
+  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cc$")
+  list(FILTER kernel_cu EXCLUDE REGEX ".*_grad_kernel\\.cu$")
+endif()
 if(WITH_CUTLASS)
  execute_process(
    COMMAND ${CMAKE_COMMAND} -E make_directory
@@ -184,6 +189,10 @@ else()
    "fusion/cpu/*.cc")
 endif()
+# When REDUCE_INFERENCE_LIB_SIZE is defined, drop every kernel_cc entry whose
+# path ends in "_grad_kernel.cc".
+# NOTE(review): if(DEFINED ...) is true for any definition of the variable,
+# including a false value such as OFF -- confirm that is intended.
+if(DEFINED REDUCE_INFERENCE_LIB_SIZE)
+  list(FILTER kernel_cc EXCLUDE REGEX ".*_grad_kernel\\.cc$")
+endif()
 file(
  GLOB
  kernel_xpu

--- a/tools/reduce_lib_size_util.py
+++ b/tools/reduce_lib_size_util.py
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script removes grad kernel registrations (PD_REGISTER_STRUCT_KERNEL). Use it
+when building with cmake ON_INFER=ON; it can greatly reduce the size of the inference library.
+"""
+import glob
+import os
+def is_balanced(content):
+    """
+    Tell whether a string holds a brace block with correctly nested brackets.
+    Only round and curly brackets are inspected; all other characters are ignored.
+    Args:
+       content (str): text to inspect.
+    Returns:
+        boolean: True when content contains at least one '{' and every round or
+        curly bracket is closed in the right order, False otherwise.
+    """
+    # A string without any '{' is never accepted, even if its parentheses match.
+    if '{' not in content:
+        return False
+    expected_opener = {')': '(', '}': '{'}
+    open_brackets = []
+    for ch in content:
+        if ch == '(' or ch == '{':
+            open_brackets.append(ch)
+            continue
+        opener = expected_opener.get(ch)
+        if opener is None:
+            # Not a bracket character.
+            continue
+        # A closer with nothing open, or closing the wrong kind, is a mismatch.
+        if not open_brackets or open_brackets.pop() != opener:
+            return False
+    return len(open_brackets) == 0
+def grad_kernel_definition(content, kernel_pattern, grad_pattern):
+    """
+    Collect the kernel registrations in a file that refer to a grad kernel.
+    A registration starts at an occurrence of kernel_pattern and ends at the
+    first position where the text since that start contains a '{' and all of
+    its round and curly brackets are closed in order. Scanning resumes right
+    after each registration, and stops for good at the first registration that
+    is mismatched or never closes.
+    Args:
+       content(str): file content
+       kernel_pattern(str): kernel pattern
+       grad_pattern(str): grad pattern
+    Returns:
+        (list, int): grad kernel definitions in file and count.
+    """
+    results = []
+    count = 0
+    start = 0
+    lens = len(content)
+    expected_opener = {')': '(', '}': '{'}
+    while True:
+        index = content.find(kernel_pattern, start)
+        if index == -1:
+            return results, count
+        # Track bracket state incrementally in one forward pass instead of
+        # re-validating an ever-growing slice for every candidate end index.
+        open_brackets = []
+        seen_brace = False
+        end = -1
+        for pos in range(index, lens):
+            c = content[pos]
+            if c == '(' or c == '{':
+                open_brackets.append(c)
+                if c == '{':
+                    seen_brace = True
+            elif c in expected_opener:
+                if not open_brackets or open_brackets.pop() != expected_opener[c]:
+                    # A mismatch can never be repaired by reading further.
+                    break
+                if seen_brace and not open_brackets:
+                    end = pos + 1
+                    break
+        if end == -1:
+            # Unterminated or malformed registration: nothing more to extract.
+            return results, count
+        check_str = content[index:end]
+        if grad_pattern in check_str:
+            results.append(check_str)
+            count += 1
+        start = end
+def remove_grad_kernels(dry_run=False):
+    """
+    Strip grad PD_REGISTER_STRUCT_KERNEL registrations from operator sources.
+    Every .cc and .cu file below ../paddle/fluid/operators (relative to this
+    script) is scanned; registrations containing '_grad,' are deleted from the
+    file they were found in. Files without such registrations are not rewritten.
+    Args:
+       dry_run(bool): whether just print
+    Returns:
+        int: number of kernel(grad) removed
+    """
+    pd_kernel_pattern = 'PD_REGISTER_STRUCT_KERNEL'
+    register_op_pd_kernel_count = 0
+    tool_dir = os.path.dirname(os.path.abspath(__file__))
+    all_op = glob.glob(
+        os.path.join(tool_dir, '../paddle/fluid/operators/**/*.cc'),
+        recursive=True,
+    )
+    all_op += glob.glob(
+        os.path.join(tool_dir, '../paddle/fluid/operators/**/*.cu'),
+        recursive=True,
+    )
+    for op_file in all_op:
+        with open(op_file, 'r', encoding='utf-8') as f:
+            content = f.read()
+        # Matches are kept per file: a list shared across files would re-apply
+        # every earlier file's matches to each later file and make the dry-run
+        # output attribute them to the wrong file.
+        matches, pd_kernel_count = grad_kernel_definition(
+            content, pd_kernel_pattern, '_grad,'
+        )
+        register_op_pd_kernel_count += pd_kernel_count
+        if not matches:
+            # Nothing to remove; leave the file (and its mtime) untouched.
+            continue
+        for to_remove in matches:
+            content = content.replace(to_remove, '')
+            if dry_run:
+                print(op_file, to_remove)
+        if not dry_run:
+            with open(op_file, 'w', encoding='utf-8') as f:
+                f.write(content)
+    return register_op_pd_kernel_count
--- a/tools/remove_grad_op_and_kernel.py
+++ b/tools/remove_grad_op_and_kernel.py
@@ -12,14 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-This script simply removes all grad ops and kernels. You should use this script
+This script simply removes grad ops and kernels. You should use this script
-when cmake ON_INFER=ON, which can greatly reduce the volume of the prediction library.
+when cmake ON_INFER=ON, which can greatly reduce the volume of the inference library.
 """
+import argparse
 import glob
 import os
 import re
+import reduce_lib_size_util
+def parse_args():
+    """Build the command-line parser and return the parsed flags."""
+    cli = argparse.ArgumentParser(description='Remove grad op and kernels.')
+    # Both options are plain boolean switches that default to off.
+    for flag in ('--only_kernel', '--dry_run'):
+        cli.add_argument(flag, action='store_true', default=False)
+    return cli.parse_args()
 def find_type_files(cur_dir, file_type, file_list=[]):
    next_level_dirs = os.listdir(cur_dir)
@@ -42,6 +55,10 @@ def remove_grad_op_and_kernel(content, pattern1, pattern2):
 def update_operator_cmake(cmake_file):
+    """Update operator cmake.
+    Args:
+        cmake_file (str): cmake file path.
+    """
    pat1 = 'add_subdirectory(optimizers)'
    pat2 = r'register_operators\(EXCLUDES.*?py_func_op.*?\)'
@@ -66,6 +83,8 @@ def update_operator_cmake(cmake_file):
 if __name__ == '__main__':
+    args = parse_args()
    tool_dir = os.path.dirname(os.path.abspath(__file__))
    all_op = glob.glob(
@@ -92,14 +111,17 @@ if __name__ == '__main__':
        # remove all grad op
        op_pattern1 = r'REGISTER_OPERATOR\(.*?\);?'
        op_pattern2 = r'REGISTER_OPERATOR\(.*?_grad,.*?\);?'
+        if args.only_kernel:
+            op_pattern1 = 'DISABLE_REMOVE_GRAD_OP_' + op_pattern1
+            op_pattern2 = 'DISABLE_REMOVE_GRAD_OP_' + op_pattern2
        # remove all cpu grad kernel
-        cpu_kernel_pattern1 = r'REGISTER_OP_CPU_KERNEL\(.*?\);?'
+        cpu_kernel_pattern1 = r'REGISTER_OP_CPU_KERNEL\(.*?\);?|REGISTER_OP_CPU_KERNEL_FUNCTOR\(.*?\);?'
-        cpu_kernel_pattern2 = r'REGISTER_OP_CPU_KERNEL\(.*?_grad,.*?\);?'
+        cpu_kernel_pattern2 = r'REGISTER_OP_CPU_KERNEL\(.*?_grad,.*?\);?|REGISTER_OP_CPU_KERNEL_FUNCTOR\(.*?_grad,.*?\);?'
        # remove all gpu grad kernel
-        gpu_kernel_pattern1 = r'REGISTER_OP_CUDA_KERNEL\(.*?\);?'
+        gpu_kernel_pattern1 = r'REGISTER_OP_CUDA_KERNEL\(.*?\);?|REGISTER_OP_CUDA_KERNEL_FUNCTOR\(.*?\);?'
-        gpu_kernel_pattern2 = r'REGISTER_OP_CUDA_KERNEL\(.*?_grad,.*?\);?'
+        gpu_kernel_pattern2 = r'REGISTER_OP_CUDA_KERNEL\(.*?_grad,.*?\);?|REGISTER_OP_CUDA_KERNEL_FUNCTOR\(.*?_grad,.*?\);?'
        # remove all xpu grad kernel
        xpu_kernel_pattern1 = r'REGISTER_OP_XPU_KERNEL\(.*?\);?'
@@ -166,9 +188,12 @@ if __name__ == '__main__':
            all_matches.extend(op_kernel)
            all_matches.extend(custom_kernel)
-        for i in all_matches:
+        for to_remove in all_matches:
-            content = content.replace(i, '')
+            content = content.replace(to_remove, '')
+            if args.dry_run:
+                print(op_file, to_remove)
+        if not args.dry_run:
            with open(op_file, 'w', encoding='utf-8') as f:
                f.write(content)
@@ -178,6 +203,10 @@ if __name__ == '__main__':
    )
    update_operator_cmake(cmake_file)
+    register_pd_kernel_count = reduce_lib_size_util.remove_grad_kernels(
+        args.dry_run
+    )
    print('We erase all grad op and kernel for Paddle-Inference lib.')
    print('%50s%10s' % ('type', 'count'))
    print('%50s%10s' % ('REGISTER_OPERATOR', register_op_count))
@@ -194,3 +223,4 @@ if __name__ == '__main__':
            register_op_kernel_with_custom_type_count,
        )
    )
+    print('%50s%10s' % ('REGISTER_OP_PD_KERNEL', register_pd_kernel_count))