remove infrt V1.1 (#52672)

6913feb0 · jjyaoao · GitHub · 61fe2198 · 6913feb0 · 6913feb0
7 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -73,16 +73,7 @@ tools/nvcc_lazy

 # This file is automatically generated.
 # TODO(zhiqiang) Move this file to build directory.
-paddle/infrt/dialect/pd/ir/pd_ops.td
-paddle/infrt/dialect/phi/ir/phi_cpu_kernels.td
-paddle/infrt/dialect/phi/ir/phi_gpu_kernels.td
-tools/infrt/kernels.json
-tools/infrt/kernel_signature.json
-paddle/infrt/dialect/pd/common/pd_ops_info.h
 .lit_test_times.txt
-paddle/infrt/tests/dialect/Output
-paddle/infrt/tests/lit.cfg.py
-paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launchers.cc
 paddle/fluid/pybind/eager_op_function.cc
 tools/nvcc_lazy


--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -269,7 +269,6 @@ option(
  OFF)
 option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
 option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
-option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF)
 option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
 option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
 option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)

--- a/cmake/external/llvm.cmake
+++ b/cmake/external/llvm.cmake
-include(FetchContent)
-
-set(LLVM_DOWNLOAD_URL
-    https://paddle-inference-dist.bj.bcebos.com/infrt/llvm_b5149f4e66a49a98b67e8e2de4e24a4af8e2781b.tar.gz
-)
-set(LLVM_MD5 022819bb5760817013cf4b8a37e97d5e)
-
-set(FETCHCONTENT_BASE_DIR ${THIRD_PARTY_PATH}/llvm)
-set(FETCHCONTENT_QUIET OFF)
-FetchContent_Declare(
-  external_llvm
-  URL ${LLVM_DOWNLOAD_URL}
-  URL_MD5 ${LLVM_MD5}
-  PREFIX ${THIRD_PARTY_PATH}/llvm SOURCE_DIR ${THIRD_PARTY_PATH}/install/llvm)
-if(NOT LLVM_PATH)
-  FetchContent_GetProperties(external_llvm)
-  if(NOT external_llvm_POPULATED)
-    FetchContent_Populate(external_llvm)
-  endif()
-  set(LLVM_PATH ${THIRD_PARTY_PATH}/install/llvm)
-  set(LLVM_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/llvm)
-  set(MLIR_DIR ${THIRD_PARTY_PATH}/install/llvm/lib/cmake/mlir)
-else()
-  set(LLVM_DIR ${LLVM_PATH}/lib/cmake/llvm)
-  set(MLIR_DIR ${LLVM_PATH}/lib/cmake/mlir)
-endif()
-
-if(${CMAKE_CXX_COMPILER} STREQUAL "clang++")
-  set(CMAKE_EXE_LINKER_FLAGS
-      "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
-endif()
-
-message(STATUS "set LLVM_DIR: ${LLVM_DIR}")
-message(STATUS "set MLIR_DIR: ${MLIR_DIR}")
-find_package(LLVM REQUIRED CONFIG HINTS ${LLVM_DIR})
-find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR})
-find_package(ZLIB REQUIRED)
-
-list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
-include(AddLLVM)
-
-include_directories(${LLVM_INCLUDE_DIRS})
-list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
-list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}")
-include(AddLLVM)
-include(TableGen)
-include(AddMLIR)
-
-message(STATUS "Found MLIR: ${MLIR_DIR}")
-message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
-message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
-
-# To build with MLIR, the LLVM is build from source code using the following flags:
-
-#[==[
-cmake ../llvm  -G "Unix Makefiles" \
-  -DLLVM_ENABLE_PROJECTS="mlir;clang" \
-  -DLLVM_BUILD_EXAMPLES=OFF \
-  -DLLVM_TARGETS_TO_BUILD="X86" \
-  -DCMAKE_BUILD_TYPE=Release \
-  -DLLVM_ENABLE_ASSERTIONS=ON \
-  -DLLVM_ENABLE_ZLIB=OFF \
-  -DLLVM_ENABLE_RTTI=ON \
-  -DLLVM_INSTALL_UTILS=ON \
-  -DCMAKE_INSTALL_PREFIX=./install
-#]==]
-# The matched llvm-project version is b5149f4e66a49a98b67e8e2de4e24a4af8e2781b (currently a temporary commit)
-
-add_definitions(${LLVM_DEFINITIONS})
-
-llvm_map_components_to_libnames(
-  llvm_libs
-  Support
-  Core
-  irreader
-  X86
-  executionengine
-  orcjit
-  mcjit
-  all
-  codegen)
-
-message(STATUS "LLVM libs: ${llvm_libs}")
-
-get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS)
-message(STATUS "MLIR libs: ${mlir_libs}")
-add_definitions(${LLVM_DEFINITIONS})
-
-# The minimum needed libraries for MLIR IR parse and transform.
-set(MLIR_IR_LIBS MLIRAnalysis MLIRPass MLIRParser MLIRDialect MLIRIR MLIROptLib)
-
-# tb_base is the name of a xxx.td file (without the .td suffix)
-function(mlir_tablegen_on td_base)
-  set(options)
-  set(oneValueArgs DIALECT)
-  cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}"
-                        "${multiValueArgs}" ${ARGN})
-
-  set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
-  mlir_tablegen(${td_base}.hpp.inc -gen-op-decls)
-  mlir_tablegen(${td_base}.cpp.inc -gen-op-defs)
-  if(mlir_tablegen_on_DIALECT)
-    mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls
-                  -dialect=${mlir_tablegen_on_DIALECT})
-    mlir_tablegen(${td_base}_dialect.cpp.inc --gen-dialect-defs
-                  -dialect=${mlir_tablegen_on_DIALECT})
-  endif()
-  add_public_tablegen_target(${td_base}_IncGen)
-  add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen)
-endfunction()
-
-function(mlir_add_rewriter td_base)
-  set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
-  set(LLVM_TARGET_DEPENDS
-      ${LLVM_TARGET_DEPENDS}
-      ${CMAKE_SOURCE_DIR}/paddle/infrt/dialect/infrt/ir/infrt_base.td)
-  mlir_tablegen(${td_base}.cpp.inc -gen-rewriters)
-  add_public_tablegen_target(MLIR${td_base}IncGen)
-  add_dependencies(mlir-headers MLIR${td_base}IncGen)
-endfunction()
-
-# Execute the mlir script with infrt-exec program.
-# @name: name of the test
-# @script: path to the mlir script file
-function(infrt_exec_check name script)
-  add_test(
-    NAME ${name}
-    COMMAND
-      sh -c
-      "${CMAKE_BINARY_DIR}/paddle/infrt/host_context/infrt-exec -i ${CMAKE_CURRENT_SOURCE_DIR}/${script}| ${LLVM_PATH}/bin/FileCheck  ${CMAKE_CURRENT_SOURCE_DIR}/${script}"
-  )
-endfunction()
--- a/cmake/infrt_lib.cmake
+++ b/cmake/infrt_lib.cmake
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set(INFRT_INSTALL_DIR
-    "${CMAKE_BINARY_DIR}/paddle_infrt_install_dir"
-    CACHE STRING "A path setting paddle infrt shared and static libraries")
-
-function(copy TARGET)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DSTS)
-  cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}"
-                        "${multiValueArgs}" ${ARGN})
-
-  list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
-  list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
-  if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
-    message(
-      FATAL_ERROR
-        "${TARGET} source numbers are not equal to destination numbers")
-  endif()
-  math(EXPR len "${copy_lib_SRCS_len} - 1")
-  foreach(index RANGE ${len})
-    list(GET copy_lib_SRCS ${index} src)
-    list(GET copy_lib_DSTS ${index} dst)
-    add_custom_command(
-      TARGET ${TARGET}
-      POST_BUILD
-      COMMAND mkdir -p "${dst}"
-      COMMAND cp -r "${src}" "${dst}"
-      COMMENT "copying ${src} -> ${dst}")
-  endforeach()
-endfunction()
-
-function(copy_part_of_thrid_party TARGET DST)
-  set(dst_dir "${DST}/third_party/install/glog")
-  copy(
-    ${TARGET}
-    SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
-    DSTS ${dst_dir} ${dst_dir}/lib)
-endfunction()
-
-# inference library for only inference
-set(infrt_lib_deps third_party infrt infrt_static)
-add_custom_target(infrt_lib_dist DEPENDS ${infrt_lib_deps})
-
-# CMakeCache Info
-copy(
-  infrt_lib_dist
-  SRCS ${CMAKE_BINARY_DIR}/CMakeCache.txt
-  DSTS ${INFRT_INSTALL_DIR})
-
-set(infrt_lib ${INFRT_BINARY_DIR}/libinfrt.*)
-copy(
-  infrt_lib_dist
-  SRCS ${INFRT_SOURCE_DIR}/api/infrt_api.h ${infrt_lib}
-  DSTS ${INFRT_INSTALL_DIR}/infrt/include ${INFRT_INSTALL_DIR}/infrt/lib)
-
-copy(
-  infrt_lib_dist
-  SRCS ${INFRT_BINARY_DIR}/paddle/framework.pb.h
-  DSTS ${INFRT_INSTALL_DIR}/infrt/include/internal)
-
-# paddle fluid version
-function(version version_file)
-  execute_process(
-    COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
-    OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-  file(WRITE ${version_file} "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n")
-  file(APPEND ${version_file}
-       "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n")
-endfunction()
-version(${INFRT_INSTALL_DIR}/version.txt)
--- a/cmake/third_party.cmake
+++ b/cmake/third_party.cmake
@@ -493,11 +493,6 @@ if(WIN32)
  list(APPEND third_party_deps extern_dirent)
 endif()

-if(WITH_INFRT)
-  include(external/llvm)
-  list(APPEND third_party_deps ${llvm_libs})
-endif()
-
 if(WITH_IPU)
  include(external/poplar)
  list(APPEND third_party_deps extern_poplar)

--- a/paddle/phi/README.md
+++ b/paddle/phi/README.md
@@ -22,11 +22,11 @@ The root cause of poor reusability is the inflexibility of the original Op archi

 1. When an Op reuses the `Opkernel::Compute` method of another Op, an `ExecutionContext` needs to be constructed first, and the reuse method is relatively cumbersome

-    > It will be much more convenient if you can directly call the Kernel in the form of a function
+   > It will be much more convenient if you can directly call the Kernel in the form of a function

 2. Due to the overhead introduced by additional data structure construction and independent Op scheduling, from the perspective of computing performance, it is better to copy the calculation code directly when reusing Op, which leads us to gradually abandon the earlier principle of backward Op reusing forward Op, and began to implement Kernel separately for each backward Op, so that Paddle maintains a large number of backward OpKernel implementation codes internally.

-    > Only when the overhead of reusing Ops is small enough, reusing existing Ops to implement new Ops can be widely promoted
+   > Only when the overhead of reusing Ops is small enough, reusing existing Ops to implement new Ops can be widely promoted

 ### 1.2 Conciseness and fine-grained execution scheduling

@@ -54,11 +54,7 @@ For a long time, because the Paddle and Paddle-Lite operators are maintained sep

 Therefore, this functional operator library will be jointly constructed by training and inference team, and will serve as an independent compilation component and underlying infrastructure (not yet independently split), which can serve training, server-inference, and -inference execution systems at the same time.

-### 1.5 The adaptation of the new inference Runtime design 'infrt'
-
-Inference team designed a new runtime `infrt`. It is expected to unify the execution system of Paddle-Inference and Paddle-Lite. It is necessary to directly call the operators in the PHI operator library jointly built this time. Therefore, the adaptation to `infrt` needs to be considered in the design. (Currently the `infrt` project is temporarily on hold).
-
-### 1.6 Op and Kernel parameter normalization
+### 1.5 Op and Kernel parameter normalization

 The Python 2.0 API project in 2020 standardized the argument list of the Paddle Python-side API, making it concise, easy to use, and standard. However, due to cost considerations, the argument list at the Op level was not standardized, so there will be many early developed operators that differ greatly in arguments from the Python API. For example, `conv` op, the Python API has only 8 arguments, but the corresponding C++ `Conv` Op has 29 arguments. API and Op are essentially the same layer of concepts, both are descriptions of an operation, and the arguments should be consistent. In order to solve this problem, 'the operator definition enhancement project' was launched, and the declarations of 'AsExtra' and 'AsQuant' were added to some unnecessary arguments, but the problem was not fundamentally solved, which is what the construction of the PHI operator library hopes to solve.

@@ -68,7 +64,7 @@ We hope to be able to achieve the same three-layer arguments of Python API -> Op

 ### 2.1 Location

-The PHI code directory is inside the paddle directory, which is at the same level as fluid, rather than inside the fluid directory. PHI is a basic component that is called by various upper-layer runtime such as fluid, lite, and infrt, and it will be used later as a separately compiled dynamic library, therefore PHI is not suitable as the submodule of fluid.
+The PHI code directory is inside the paddle directory, which is at the same level as fluid, rather than inside the fluid directory. PHI is a basic component that is called by various upper-layer runtimes such as fluid and lite, and it will be used later as a separately compiled dynamic library; therefore PHI is not suitable as a submodule of fluid.

 ### 2.2 Directory Structure

@@ -78,27 +74,31 @@ Training and inference require a clear operator library directory structure:

 - The directory design should support various split compilation requirements of the operator library, which including:

-    - Split and compile according to the computing device.
-        - For example, compile for cpu only, or compile for gpu only.
-    - Split and compile according to the training and inference scenarios.
-        - For example, the inference scenario does not compile backward-relevant kernels (xxx_grad_kernel.cc|cu)
-    - Precisely crop and compile according to the operators actually used by the mobile device (not supported yet)
-        - For example, a model uses `add` and `multiply` only, ideally it could be cropped to only 2 kernels.
+  - Split and compile according to the computing device.
+    - For example, compile for cpu only, or compile for gpu only.
+  - Split and compile according to the training and inference scenarios.
+    - For example, the inference scenario does not compile backward-relevant kernels (xxx_grad_kernel.cc|cu)
+  - Precisely crop and compile according to the operators actually used by the mobile device (not supported yet)
+    - For example, a model uses `add` and `multiply` only, ideally it could be cropped to only 2 kernels.

 - In the long run, support the requirement of easily reusing kernel implementation.
-    - Explanation: When reusing the kernel, the corresponding function implementation should be introduced through `include` easily, rather than cannot find the kernel because of the complex directory structure.
+
+  - Explanation: When reusing a kernel, the corresponding function implementation should be easy to introduce through `include`, rather than the kernel being hard to find because of a complex directory structure.

 - In the long run, support the requirement of the unified writing method among cross-device kernels, and the writing method is intuitive and easy to use, without introducing unnecessary template parameters.
-    - Explanation: Kernel Primitive API module is at the lower layer of the operator library. Its long-term vision is that each operation uses only one kernel to adapt to various devices, the code that truly distinguishes the device is only in the implementation of the Kernel Primitive API. In the future, the template parameters should be limited to as concise as possible when passing complex parameters into the reused kernel.
+
+  - Explanation: Kernel Primitive API module is at the lower layer of the operator library. Its long-term vision is that each operation uses only one kernel to adapt to various devices, the code that truly distinguishes the device is only in the implementation of the Kernel Primitive API. In the future, the template parameters should be limited to as concise as possible when passing complex parameters into the reused kernel.

 - In terms of ease of use, developers can accurately understand where the newly added kernel should be placed, without ambiguity.
-    - Explanation: When developers add an API, they will not be confused about which directory they should put the corresponding kernel in. Moreover, different people should have no ambiguous understanding of where the same kernel should be placed.
+
+  - Explanation: When developers add an API, they will not be confused about which directory they should put the corresponding kernel in. Moreover, different people should have no ambiguous understanding of where the same kernel should be placed.

 - Do not introduce a lot of duplicate directory design.
-    - Explanation: Concept splitting is needed, but also with boundaries. Avoid subdirectories with the same name occurring in multiple directories. For example, if `eigen`, `funcs`, `math` directories are placed under the cpu directory, then they shouldn't be placed under the gpu directory. The directory design of the new operator library is mainly divided according to the device, and the directory splitting at other levels should be weakened as much as possible. For example, try not to split based on functions, try not to split based on fields, etc.
+
+  - Explanation: Concept splitting is needed, but also with boundaries. Avoid subdirectories with the same name occurring in multiple directories. For example, if `eigen`, `funcs`, `math` directories are placed under the cpu directory, then they shouldn't be placed under the gpu directory. The directory design of the new operator library is mainly divided according to the device, and the directory splitting at other levels should be weakened as much as possible. For example, try not to split based on functions, try not to split based on fields, etc.

 - Do not introduce too deep directory design.
-    - Explanation: The directory level should not be too deep, otherwise it will lead to higher understanding and maintenance costs.
+  - Explanation: The directory level should not be too deep, otherwise it will lead to higher understanding and maintenance costs.

 #### 2.2.2 Directory design details

@@ -129,13 +129,12 @@ Some directory structure description:
 - `common`: Data structures to be used both inside PHI `core` and PHI `api` directory. These data structures neither belong to the `core` nor the `api` directory.
 - `core`: PHI has some public module implementations that it needs, such as `DenseTensor`, kernel registration and management modules.
 - `backends`: The backends include data structures that need to be added for each backend, such as `CPUContext`, `GPUContext`, etc.
-    - The basic data structures are placed in the `core`, while the dedicated data structures of specific backends are not placed in the `core`, and the dependencies strictly ensure that the `backends` depend on the `core`, but the `core` cannot depend on the `backends`.
-    - Example 1: If Context is a base class, then put it in `core`, inherited `CPUContext` is in `backends/cpu` and `GPUContext` is in `backends/gpu`.
-    - Example 2: TensorBase is in `core`, `DenseTensor` is used by most devices so that it is also in the `core`. If there is `OneDNNTensor`, which is only used for `OneDNN`, then it should be placed in `backends/onednn`.
+  - The basic data structures are placed in the `core`, while the dedicated data structures of specific backends are not placed in the `core`, and the dependencies strictly ensure that the `backends` depend on the `core`, but the `core` cannot depend on the `backends`.
+  - Example 1: If Context is a base class, then put it in `core`, inherited `CPUContext` is in `backends/cpu` and `GPUContext` is in `backends/gpu`.
+  - Example 2: TensorBase is in `core`, `DenseTensor` is used by most devices so that it is also in the `core`. If there is `OneDNNTensor`, which is only used for `OneDNN`, then it should be placed in `backends/onednn`.
 - `infermeta`: The location of the infermeta function, the infermeta function is equivalent to `infershape + inferdtype + inferlayout`, etc.
 - `kernels`: Kernels related to each device.
-    - `cpu, gpu, ...`
-
+  - `cpu, gpu, ...`

 ##### 2.2.2.2 Kernels directory

@@ -156,25 +155,26 @@ paddle/phi/kernels
 The directory structure is described as follows:

 - The root directory under kernels includes device-independent `kernel.h` and `kernel.cc`. In principle, each kernel has one .h and .cc
-    - For example, if a kernel is implemented using Primitive api, or is implemented by reusing other basic kernels, there should be only one implementation for all devices, so its declaration and implementation can be placed directly in the kernels directory. (This is the ideal state in the future.)
-    - At present, most of our kernels do not have the feature of unity implementation across devices, but the input parameters and return values of the kernel should be consistent except for `DeviceContext`, so the kernel parameter declaration header file is also placed in the current directory (consistent with the original design, `DeviceContext` and `T` are used as template parameters), The functions implementation of each device are placed in the corresponding device folder.
-        - Note that the unity implementation across devices here does not mean that the CPU and GPU implementations of a kernel are unified, but the implementations of all devices are the same. Currently, it includes at least `CPU`, `GPU`, `XPU`, `ONEDNN`, `GPUDNN`, etc.
-    - If the backward kernel does not need to support cropping, it can be merged appropriately (but if you want to leave the possibility of supporting end-to-side training, the backward kernel may also be a potential target for cropping)
+  - For example, if a kernel is implemented using Primitive api, or is implemented by reusing other basic kernels, there should be only one implementation for all devices, so its declaration and implementation can be placed directly in the kernels directory. (This is the ideal state in the future.)
+  - At present, most of our kernels do not have the feature of unity implementation across devices, but the input parameters and return values of the kernel should be consistent except for `DeviceContext`, so the kernel parameter declaration header file is also placed in the current directory (consistent with the original design, `DeviceContext` and `T` are used as template parameters). The function implementations for each device are placed in the corresponding device folder.
+    - Note that the unity implementation across devices here does not mean that the CPU and GPU implementations of a kernel are unified, but the implementations of all devices are the same. Currently, it includes at least `CPU`, `GPU`, `XPU`, `ONEDNN`, `GPUDNN`, etc.
+  - If the backward kernel does not need to support cropping, it can be merged appropriately (but if you want to leave the possibility of supporting end-to-side training, the backward kernel may also be a potential target for cropping)
 - The next-level subdirectory of kernels, in principle, is created according to the backend classification, and only two special directories are reserved:
-    - `funcs`: In order to be compatible with the directories of functor and function in the original fluid/operators directory, when placing functions and functor that support multiple backends, we organize them according to the original design that one header file corresponding to multiple .cc(u) (This part of the code may be removed in the future, because it will be gradually replaced by Kernel Primitive API and reuse between Kernels, so no over-design here.)
-        - Example 1: A common function `XXXFunction` is called in both reduce CPU and reduce GPU kernel implementations, and the reduce CPU and reduce GPU kernel implementations are different, then `XXXFunction` should be in the `funcs` directory.
-    - `primitive`: Kernel Primitive API, some basic tools for multi-device unified kernel implementation.
-    - `impl`: Many paddle's original op kernel implementation reuse the same code for CPU and GPU, and they are in a large number of `xx_op.h`. This part of the code is not suitable to be placed in the `cpu` or `gpu` directory, nor in the `funcs` directory (putting it in the `funcs` directory will cause a considerable part of the kernel implementation to be placed in the `funcs` directory, which is too bloated and confusing. The `funcs` directory is created to place the `functor` and `function` tools as in the original operators/math directory). This part of the code is also not suitable to be placed in the root directory of `kernels` (it is not a device-independent implementation, only an implementation shared by cpu and gpu). Therefore, in order not to overthink this part of the code when migrating, and the location of the placement is relatively consistent with its implementation nature, the `impl` directory was created.
-        - In the `impl` directory, only the kernel functions that are consistent across some devices are placed. They are all header files, and the names are all suffixed with `xxx_kernel_impl.h`
-        - For example: `scale`, `fill_constant`, `fill_any_like` kernels are all such cases.
+  - `funcs`: In order to be compatible with the directories of functor and function in the original fluid/operators directory, when placing functions and functor that support multiple backends, we organize them according to the original design that one header file corresponding to multiple .cc(u) (This part of the code may be removed in the future, because it will be gradually replaced by Kernel Primitive API and reuse between Kernels, so no over-design here.)
+    - Example 1: A common function `XXXFunction` is called in both reduce CPU and reduce GPU kernel implementations, and the reduce CPU and reduce GPU kernel implementations are different, then `XXXFunction` should be in the `funcs` directory.
+  - `primitive`: Kernel Primitive API, some basic tools for multi-device unified kernel implementation.
+  - `impl`: Many of Paddle's original op kernel implementations reuse the same code for CPU and GPU, and they are in a large number of `xx_op.h` files. This part of the code is not suitable to be placed in the `cpu` or `gpu` directory, nor in the `funcs` directory (putting it in the `funcs` directory will cause a considerable part of the kernel implementation to be placed in the `funcs` directory, which is too bloated and confusing. The `funcs` directory is created to place the `functor` and `function` tools as in the original operators/math directory). This part of the code is also not suitable to be placed in the root directory of `kernels` (it is not a device-independent implementation, only an implementation shared by cpu and gpu). Therefore, in order not to overthink this part of the code when migrating, and the location of the placement is relatively consistent with its implementation nature, the `impl` directory was created.
+    - In the `impl` directory, only the kernel functions that are consistent across some devices are placed. They are all header files, and the names are all suffixed with `xxx_kernel_impl.h`
+    - For example: `scale`, `fill_constant`, `fill_any_like` kernels are all such cases.
 - The auxiliary functions that are only used by the current kernel, they are always placed in the same backend folder as the kernel implementation, and the .h file is used to manage the code. Auxiliary function codes are no longer placed elsewhere, unless their implementations are used in multiple places.
-    - Even if there are multiple calls, if it is still limited to the same device, directly build the header file and put it in the same directory.
+  - Even if there are multiple calls, if it is still limited to the same device, directly build the header file and put it in the same directory.
 - The implementation of the backward kernel and the forward kernel are placed in different files, and the file suffix is `*_grad_kernel.*`, which is convenient for cmake to separate and compile.
-    - No more directories are created for the backward kernel, otherwise directories such as cpu/gpu will also be created under the backward kernel directory.
-    - The implementation of the second-order derivative and the third-order derivative is also placed in the grad kernel implementation file.
+
+  - No more directories are created for the backward kernel, otherwise directories such as cpu/gpu will also be created under the backward kernel directory.
+  - The implementation of the second-order derivative and the third-order derivative is also placed in the grad kernel implementation file.

 - Why is the directory named `gpu` instead of `cuda` and `hip`?
-    - The code of `cuda` and `hip` is very repetitive, and the unified implementation is easier to maintain.
+  - The code of `cuda` and `hip` is very repetitive, and the unified implementation is easier to maintain.

 #### 2.2.3 Namespace

@@ -230,26 +230,28 @@ void FullKernel(const Context& dev_ctx,
 ##### 2.3.2.1 API Tensor interface

 - The top-layer is the API-level Tensor interface, which contains two pointer members, `TensorBase` and `AbstractAutogradMeta`.
-    - Both members are designed as Interface and do not depend on real Tensor and `Autograd` implementations.
-    - `AutogradMeta` is only meaningful in the dynamic graph API-level Tensor, it will not be used in the specific kernel calculation, so put it in the top-layer Tensor interface.
-    - In addition, such a design facilitates data sharing and reduces copy overhead.
-        - When a Tensor is assigned to another Tensor, or Tensor is used as a function return value, only the pointer is actually copied, and no real data copy is performed.
+
+  - Both members are designed as Interface and do not depend on real Tensor and `Autograd` implementations.
+  - `AutogradMeta` is only meaningful in the dynamic graph API-level Tensor, it will not be used in the specific kernel calculation, so put it in the top-layer Tensor interface.
+  - In addition, such a design facilitates data sharing and reduces copy overhead.
+    - When a Tensor is assigned to another Tensor, or Tensor is used as a function return value, only the pointer is actually copied, and no real data copy is performed.

 - The top-layer C++ Tensor plays a similar role as the Python-side Tensor, and the interface design is as consistent as possible with the Python-side.
-    - Contain basic property access and data access methods of Tensor.
-        - `shape`, `place`, `dtype`, `data`.
-    - Contain the `autograd` methods required by the dynamic graph Tensor.
-        - `gradient`, `backward`.
-    - Contain conversion methods between Tensors.
-        - cpu, gpu, xpu etc.
-    - Contain calculation methods related to Tensor (not added yet).
-        - All methods of the `paddle.tensor` module.
+
+  - Contain basic property access and data access methods of Tensor.
+    - `shape`, `place`, `dtype`, `data`.
+  - Contain the `autograd` methods required by the dynamic graph Tensor.
+    - `gradient`, `backward`.
+  - Contain conversion methods between Tensors.
+    - cpu, gpu, xpu etc.
+  - Contain calculation methods related to Tensor (not added yet).
+    - All methods of the `paddle.tensor` module.

 - Compilation decoupling:

-    - The `autograd` information here is just a pointer index, which is empty by default.
-        - `std::unique_ptr<AbstractAutogradMeta> autograd_meta_ = nullptr;`
-    - `AbstractAutogradMeta` is an abstract class interface that does not depend on any module of `autograd`, so it will not affect the independent compilation of PHI, and at the same time takes into account the need for dynamic graph Tensor to hold backward information.
+  - The `autograd` information here is just a pointer index, which is empty by default.
+    - `std::unique_ptr<AbstractAutogradMeta> autograd_meta_ = nullptr;`
+  - `AbstractAutogradMeta` is an abstract class interface that does not depend on any module of `autograd`, so it will not affect the independent compilation of PHI, and at the same time takes into account the need for dynamic graph Tensor to hold backward information.

 - `AutogradMeta` is only set in the dynamic graph scenario. For unneeded scenarios, such as in static graphs, `AutogradMeta` is just a null pointer.

@@ -277,22 +279,23 @@ Tensor ondnn() const;
 ```

 - This conversion process may be `cast` or `copy`:
-    - `cast` if no data copy required.
-    - `copy` if data copy required.
-    - Transformations are implemented by functional kernels.
+
+  - `cast` if no data copy is required.
+  - `copy` if a data copy is required.
+  - Transformations are implemented by functional kernels.

 - Usage in API Scenarios
-    - In a complete training scenario, when a user uses an API, such as `DataLoader`, the data is generally read from the disk, put it into the CPU, and then converted to the specific execution device.
+  - In a complete training scenario, when a user uses an API, such as `DataLoader`, the data is generally read from the disk, put into the CPU, and then converted to the specific execution device.

 ##### 2.3.2.2 TensorBase

 - The interface implemented by Tensor only contains the necessary pure virtual Tensor methods, and does not contain members with real meaning. The methods here should also be strictly monitored during the development process.

 - Why use abstract class design at this level?
-    - On the one hand, it is to isolate the Tensor API from the specific implementation of Tensor without generating too many dependencies. If the Tensor API needs to be redesigned in the future, or the `autograd` information needs to be abandoned, only the Tensor API needs to be redesigned, which has little effect on the implementation of the underlying Tensor.
-    - On the other hand, in order to reserve sufficient expansion space for heterogeneous Tensors, the framework-level API only needs one Tensor data structure, and there is no need to expose multiple data structures. In fact, a large-scale definition is made here: all data structures in the framework are Tensors.
-        - For a basically consistent memory layout, or a basically consistent implementation of Tensor descriptions, it can be inherited based on an implementation of `DenseTensor`.
-        - For Tensors with a high degree of heterogeneity, new Tensor classes (such as Tensors with only one Object) can be directly inherited from Interface. This ensures that Tensor has no bottlenecks in scaling flexibility.
+  - On the one hand, it is to isolate the Tensor API from the specific implementation of Tensor without generating too many dependencies. If the Tensor API needs to be redesigned in the future, or the `autograd` information needs to be abandoned, only the Tensor API needs to be redesigned, which has little effect on the implementation of the underlying Tensor.
+  - On the other hand, in order to reserve sufficient expansion space for heterogeneous Tensors, the framework-level API only needs one Tensor data structure, and there is no need to expose multiple data structures. In fact, a large-scale definition is made here: all data structures in the framework are Tensors.
+    - For a basically consistent memory layout, or a basically consistent implementation of Tensor descriptions, it can be inherited based on an implementation of `DenseTensor`.
+    - For Tensors with a high degree of heterogeneity, new Tensor classes (such as Tensors with only one Object) can be directly inherited from Interface. This ensures that Tensor has no bottlenecks in scaling flexibility.

 ##### 2.3.3.3 DenseTensor、SparseTensor

@@ -334,12 +337,12 @@ Inherit other Tensors with high degrees of freedom: directly inherit `TensorBase

 - `TensorBase` is an abstract class, which leaves a lot of room for the description of specific Tensor. If the description of traditional Tensor cannot meet the requirements, a specialized Tensor implementation can be designed.

-
 #### 2.3.3 C++ API

 ##### 2.3.3.1 C++ API form

 > Highlights of this section:
+>
 > 1. The C++ API corresponds to the Python 2.0 API: the function name, parameter name, parameter order, and return value are the same.

 After investigation, we found that very few framework products are designed with the ease of use of the C++ API in mind. For the long-term consideration, if we want to attract more developers to build the paddle ecology, it is also very important to provide a standardized and easy-to-use C++ API architecture. At the same time, the Python 2.0 API project has laid a good reference foundation for the C++ API, and we can directly inherit its achievements.
@@ -362,11 +365,11 @@ Described as follows:
 **What scenarios is this new C++ API architecture mainly used for?**

 1. C++ API that can be called when developing custom operators, it improves ease of use.
-    - For example, the user needs to initialize a Tensor in a custom operator, loop through the Tensor data and assign values, then you can directly call `paddle::ones`, `paddle::full` APIs.
+   - For example, if the user needs to initialize a Tensor in a custom operator, loop through the Tensor data, and assign values, they can directly call APIs such as `paddle::ones` and `paddle::full`.
 2. The architecture serves as the basic calling unit of the new dynamic graph.
-    - The new dynamic graph will use the API as the scheduling calculation unit, and will no longer call the Op architecture, thus improving the scheduling performance.
+   - The new dynamic graph will use the API as the scheduling calculation unit, and will no longer call the Op architecture, thus improving the scheduling performance.
 3. As a basis for the development of backward Op reuse forward Op.
-    - Now the backward op kernel needs to be implemented separately. After the API architecture is completed, it is hoped that the backward op implementation can be completed by reusing the forward API.
+   - Now the backward op kernel needs to be implemented separately. After the API architecture is completed, it is hoped that the backward op implementation can be completed by reusing the forward API.

 ##### 2.3.3.2 C++ API auto-generate

@@ -386,24 +389,24 @@ The key to C++ API generation lies in the configuration of the YAML file. Taking

 ```yaml
 ## Forward API configuration
- api : matmul
-  args : (Tensor x, Tensor y, bool transpose_x=false, bool transpose_y=false)
-  output : Tensor
-  infer_meta :
-    func : MatmulInferMeta
-  kernel :
-    func : matmul
-  backward : matmul_grad
+- api: matmul
+  args: (Tensor x, Tensor y, bool transpose_x=false, bool transpose_y=false)
+  output: Tensor
+  infer_meta:
+    func: MatmulInferMeta
+  kernel:
+    func: matmul
+  backward: matmul_grad

 ## Backward API configuration
- backward_api : matmul_grad
-  forward : matmul (Tensor x, Tensor y, bool transpose_x, bool transpose_y) -> Tensor(out)
-  args : (Tensor x, Tensor y, Tensor out_grad, bool transpose_x=false, bool transpose_y=false)
-  output : Tensor(x_grad), Tensor(y_grad)
-  infer_meta :
-    func : MatmulGradInferMeta
-  kernel :
-    func : matmul_grad
+- backward_api: matmul_grad
+  forward: matmul (Tensor x, Tensor y, bool transpose_x, bool transpose_y) -> Tensor(out)
+  args: (Tensor x, Tensor y, Tensor out_grad, bool transpose_x=false, bool transpose_y=false)
+  output: Tensor(x_grad), Tensor(y_grad)
+  infer_meta:
+    func: MatmulGradInferMeta
+  kernel:
+    func: matmul_grad
 ```

 The meaning of each configuration parameter:
@@ -412,9 +415,9 @@ The meaning of each configuration parameter:
 - args: the function parameters. Their order and data type must be exactly the same as the PHI Kernel function of the same name, and the `Attributes` type must be ranked after the `Tensor` type.
 - output: the output type. If there are multiple outputs, then separate them by commas (","). You can optionally mark the name of each input with "()" after the type (e.g. `Tensor(out)`). If there is no mark, the default markers is `out0`, `out1`, ...
 - infer_meta: calculate the dimension and type of the returned Tensor (see the introduction of the `InferMeta` function for details).
-    - func: the called `InferMeta` function. It's default input is all the parameters of the args item and the output parameter of api, the Tensor type variable in it will be automatically replaced with `MetaTensor`.
+  - func: the called `InferMeta` function. Its default input is all the parameters of the args item and the output parameter of api; the Tensor type variables in it will be automatically replaced with `MetaTensor`.
 - kernel: the specific Kernel function called by the API.
-    - func: the registered name of the kernel function (the name used by `REGISTER`, not the function name). It's default input is all the parameters of the args item and the output parameter of api.
+  - func: the registered name of the kernel function (the name used by `REGISTER`, not the function name). Its default input is all the parameters of the args item and the output parameter of api.
 - backward: (optional). The corresponding backward function name, if not set only the forward API will be generated.

 The YAML parsing script will automatically generate the corresponding C++ API according to the above configuration items. The generated code includes the relevant processing logic such as Kernel automatic selection, Tensor transformation, Data Transform, `InferMeta` and Kernel calling. For details, please refer to the generated code in `api.cc` .
@@ -426,10 +429,11 @@ Due to the large number of C++ APIs and their various forms and functions, some
 ##### 2.3.4.1 Kernel form

 > Highlights of this section:
+>
 > 1. Notes on Kernel function form:
-> (1) Data type `T` and `DeviceContext` (abbreviated as `Context`) as template parameters;
-> (2) `Context` is the first parameter of Kernel;
-> (3) The return value Tensor takes the form of a pointer as an input parameter, and the return value of Kernel itself is void.
+>    (1) Data type `T` and `DeviceContext` (abbreviated as `Context`) as template parameters;
+>    (2) `Context` is the first parameter of Kernel;
+>    (3) The return value Tensor takes the form of a pointer as an input parameter, and the return value of Kernel itself is void.

 This part includes the specific Kernel. The functions implemented in this part will be registered in the framework as Kernel for unified search and scheduling by the framework.

@@ -451,33 +455,37 @@ Described as follows:

 - The kernels of different devices must have different function implementations. The function names are named in **camel case**. Except for the capitalization of the first letter, the naming should be as consistent as possible with the API function name. The function names of the same calculation are kept the same, and the functions of different devices are managed through different files or directories.
 - There are generally two template parameters, `T` and `Context`, which are used to determine the data type and device type at runtime.
-    - According to our current architecture, the vast majority of Kernels reduce the code in the way of **specialized DeviceContext and data type**, which is consistent with the original `OpKernel` form.
-    - The form should be unified. If the Kernel level is also exposed as a fine-grained API in the future, the ease of use is guaranteed.
+  - According to our current architecture, the vast majority of Kernels reduce the code in the way of **specialized DeviceContext and data type**, which is consistent with the original `OpKernel` form.
+  - The form should be unified. If the Kernel level is also exposed as a fine-grained API in the future, the ease of use is guaranteed.
 - Specification of function input parameters:
-    - Take a specific `DeviceContext` (such as `CPUContext`, `GPUContext`) as the first input parameter to meet the needs of specific context information required at runtime. Pass the stream in if there are multiple streams.
-        - Currently, it is not supported to pass multiple `DeviceContext` parameters to one Kernel. At present, such a requirement is considered unreasonable.
-    - The parameter list is consistent with the API. If there is other special information that needs to be passed into the Kernel, pass it through the `Context`.
-    - Then all input Tensors and input Attributes are passed in with const &, and POD types are passed in directly by value.
-    - The input Tensor is a specific Tensor type, such as `DenseTensor` or `SelectedRows`, not the Tensor of the external interface API.
-    - Finally, the Tensor return value of the function, passed in as a pointer.
-    - In order to make the mechanism more flexible and allow the kernel to adapt to more scenarios, the declaration of flexible types of input, output and parameters will be allowed subsequently to adapt to non-Tensor input, output and Tensor Attribute.
+  - Take a specific `DeviceContext` (such as `CPUContext`, `GPUContext`) as the first input parameter to meet the needs of specific context information required at runtime. Pass the stream in if there are multiple streams.
+    - Currently, passing multiple `DeviceContext` parameters to one Kernel is not supported. At present, such a requirement is considered unreasonable.
+  - The parameter list is consistent with the API. If there is other special information that needs to be passed into the Kernel, pass it through the `Context`.
+  - Then all input Tensors and input Attributes are passed in with const &, and POD types are passed in directly by value.
+  - The input Tensor is a specific Tensor type, such as `DenseTensor` or `SelectedRows`, not the Tensor of the external interface API.
+  - Finally, the Tensor return value of the function is passed in as a pointer.
+  - In order to make the mechanism more flexible and allow the kernel to adapt to more scenarios, the declaration of flexible types of input, output and parameters will be allowed subsequently to adapt to non-Tensor input, output and Tensor Attribute.
 - The internal implementation of the function is determined on demand:
-    - Short term:
-        - Migrate the implementation of the existing `OpKernel` to the specific device Kernel.
-        - Abstract the implementation of `OpKernel` with common devices into functions, which are called by multiple device Kernels.
-    - Long term:
-        - The complex kernel directly calls the basic kernel to complete the calculation, encourages kernel reuse, thus simplifies the code.
+  - Short term:
+    - Migrate the implementation of the existing `OpKernel` to the specific device Kernel.
+    - Abstract the implementation of `OpKernel` with common devices into functions, which are called by multiple device Kernels.
+  - Long term:
+    - Complex kernels directly call basic kernels to complete the calculation, which encourages kernel reuse and thus simplifies the code.

 > FAQ:

->- Why does the first parameter need to be `DeviceContext`? Why must this parameter be passed in?
+> - Why does the first parameter need to be `DeviceContext`? Why must this parameter be passed in?
+
    - The PHI kernel requires a pure function format. The variables used in the function are passed in through parameters or created inside the function, global singletons are not allowed inside the function. In order to adapt to various kernel requirements, the `DeviceContext` parameter that stores context information is necessary.
->- Why are two template parameters needed?
+
+> - Why are two template parameters needed?
+
    - In order to efficiently support the reusing of device-independent kernels. If we want to implement a Fourier transform `fft` kernel, assuming that the kernel can be derived by combining the basic kernels, the form of `Xxx<T, Device>()` can avoid dynamically redistributing devices.

 ##### 2.3.4.3 Kernel implementation

 > Highlights of this section:
+>
 > 1. Kernel focuses on computing logic without mixing scheduling logic.
 > 2. Kernel is fine-grained enough, with clear boundaries, no optional parameters, easy to reuse.

@@ -531,13 +539,14 @@ In addition to the change of kernel form from structure format to functional for
 2. In the PHI kernel, the memory application of the output Tensor is required to use the `ctx.Alloc` or `ctx.HostAlloc` method, and no longer use the original `mutable_data` to apply for memory.

 > FAQ
+>
 > 1. Why is `mutable_data` replaced by `ctx.Alloc`?
-> Answer: Because the global method `memory::AllocShared` called in the original `mutable_data` method uses a global singleton for memory allocation, which does not conform to the pure function design principle mentioned above. In terms of business requirements, if a single instance is used in the kernel to determine the way of memory allocation, in the multi-threaded environment of inference, different threads will not be able to flexibly specify different memory allocation ways.
-
+>    Answer: Because the global method `memory::AllocShared` called in the original `mutable_data` method uses a global singleton for memory allocation, which does not conform to the pure function design principle mentioned above. In terms of business requirements, if a single instance is used in the kernel to determine the way of memory allocation, in the multi-threaded environment of inference, different threads will not be able to flexibly specify different memory allocation ways.

 ##### 2.3.4.4 Kernel registration

 > Highlights of this section:
+>
 > 1. Kernel needs to expose all its key information to the framework and record its input, output and attribute information, otherwise it will lead to unclear boundaries between framework scheduling and Kernel calculation.

 When fluid Kernel is registered, only the `place`, `layout`, `dtype`, `input` and `output` of the Kernel are recorded and managed by `ExecutionContext`, and there is no corresponding information record. Now the kernel needs to be changed to a functional type. The input, output and attributes of each function are clear. We hope to record the information of each input and output here, which is also compatible with paddle-lite scheduling.
@@ -546,69 +555,69 @@ Meanwhile, we need to simplify the writing method of Kernel registration. The ex

 1. There is a lot of redundant information in the Kernel registration method of fluid. Taking `scale` as an example, you can see that in addition to the last data type of each kernel, the preceding function names and `DeviceContext` specialization information are redundant.

-    ```c++
-    REGISTER_OP_CPU_KERNEL(
-        scale, ops::ScaleKernel<phi::CPUContext, float>,
-        ops::ScaleKernel<phi::CPUContext, double>,
-        ops::ScaleKernel<phi::CPUContext,
-                         phi::dtype::bfloat16>,
-        ops::ScaleKernel<phi::CPUContext, uint8_t>,
-        ops::ScaleKernel<phi::CPUContext, int8_t>,
-        ops::ScaleKernel<phi::CPUContext, int16_t>,
-        ops::ScaleKernel<phi::CPUContext, int>,
-        ops::ScaleKernel<phi::CPUContext, int64_t>);
-    ```
+   ```c++
+   REGISTER_OP_CPU_KERNEL(
+       scale, ops::ScaleKernel<phi::CPUContext, float>,
+       ops::ScaleKernel<phi::CPUContext, double>,
+       ops::ScaleKernel<phi::CPUContext,
+                        phi::dtype::bfloat16>,
+       ops::ScaleKernel<phi::CPUContext, uint8_t>,
+       ops::ScaleKernel<phi::CPUContext, int8_t>,
+       ops::ScaleKernel<phi::CPUContext, int16_t>,
+       ops::ScaleKernel<phi::CPUContext, int>,
+       ops::ScaleKernel<phi::CPUContext, int64_t>);
+   ```

 2. Paddle-Lite's kernel registration method declares input and output information for each Kernel, but since the kernel of each data type is different, it will also cause redundancy in the writing method. As you can see in the following code, except for the data type, other information is basically redundant.

-    ```c++
-    #ifdef LITE_BUILD_EXTRA
-    using scale_int32_f =
-        paddle::lite::kernels::arm::ScaleCompute<int, PRECISION(kFloat)>;
-    REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_int32_f, int32)
-        .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-        .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-        .Finalize();
-
-    using scale_int64_f =
-        paddle::lite::kernels::arm::ScaleCompute<int64_t, PRECISION(kFloat)>;
-    REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_int64_f, int64)
-        .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
-        .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
-        .Finalize();
-    #endif  // LITE_BUILD_EXTRA
-
-    #ifdef ENABLE_ARM_FP16
-    using scale_float16 =
-        paddle::lite::kernels::arm::ScaleCompute<float16_t, PRECISION(kFP16)>;
-    REGISTER_LITE_KERNEL(scale, kARM, kFP16, kNCHW, scale_float16, def)
-        .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
-        .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
-        .Finalize();
-
-    #endif  // ENABLE_ARM_FP16
-
-    using scale_float =
-        paddle::lite::kernels::arm::ScaleCompute<float, PRECISION(kFloat)>;
-    REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_float, def)
-        .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
-        .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
-        .Finalize();
-
-    using scale_int32 =
-        paddle::lite::kernels::arm::ScaleCompute<int, PRECISION(kInt32)>;
-    REGISTER_LITE_KERNEL(scale, kARM, kInt32, kNCHW, scale_int32, def)
-        .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-        .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-        .Finalize();
-
-    using scale_int64 =
-        paddle::lite::kernels::arm::ScaleCompute<int64_t, PRECISION(kInt64)>;
-    REGISTER_LITE_KERNEL(scale, kARM, kInt64, kNCHW, scale_int64, def)
-        .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
-        .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
-        .Finalize();
-    ```
+   ```c++
+   #ifdef LITE_BUILD_EXTRA
+   using scale_int32_f =
+       paddle::lite::kernels::arm::ScaleCompute<int, PRECISION(kFloat)>;
+   REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_int32_f, int32)
+       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+       .Finalize();
+
+   using scale_int64_f =
+       paddle::lite::kernels::arm::ScaleCompute<int64_t, PRECISION(kFloat)>;
+   REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_int64_f, int64)
+       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+       .Finalize();
+   #endif  // LITE_BUILD_EXTRA
+
+   #ifdef ENABLE_ARM_FP16
+   using scale_float16 =
+       paddle::lite::kernels::arm::ScaleCompute<float16_t, PRECISION(kFP16)>;
+   REGISTER_LITE_KERNEL(scale, kARM, kFP16, kNCHW, scale_float16, def)
+       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
+       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFP16))})
+       .Finalize();
+
+   #endif  // ENABLE_ARM_FP16
+
+   using scale_float =
+       paddle::lite::kernels::arm::ScaleCompute<float, PRECISION(kFloat)>;
+   REGISTER_LITE_KERNEL(scale, kARM, kFloat, kNCHW, scale_float, def)
+       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
+       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))})
+       .Finalize();
+
+   using scale_int32 =
+       paddle::lite::kernels::arm::ScaleCompute<int, PRECISION(kInt32)>;
+   REGISTER_LITE_KERNEL(scale, kARM, kInt32, kNCHW, scale_int32, def)
+       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+       .Finalize();
+
+   using scale_int64 =
+       paddle::lite::kernels::arm::ScaleCompute<int64_t, PRECISION(kInt64)>;
+   REGISTER_LITE_KERNEL(scale, kARM, kInt64, kNCHW, scale_int64, def)
+       .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+       .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+       .Finalize();
+   ```

 Therefore, in this design, we do not want to continue to maintain this redundant writing method. We hope that the writing method of kernel registration is concise enough, and at the same time, it can flexibly meet the requirements of Kernel input and output information configuration.

@@ -655,6 +664,7 @@ In addition, only basic template adaptation has been implemented at present, and
 ##### 2.3.4.4 Kernel management

 > Highlights of this section:
+>
 > 1. Introduce the design of the current Kernel management components

 For the management of the new form of Kernel, described as follows:
@@ -663,10 +673,10 @@ For the management of the new form of Kernel, described as follows:
 - `KernelKey` is similar to the original `OpKernelType`, but the `palce` and `library_type` fields are combined into one and called `Backend`, because the original `LibraryType` is a limited enumeration class, which is strongly related to place, the splitting increases the cost of understanding instead.
 - `Kernel` holds more information than the original `OpKernel`. In addition to the Function during execution, it also holds information about specific parameters, namely `KernelArgsDef`. For Tensor type input and output, it saves Tensor type information, Device, data Type, data layout. For Attribute type input and output, it saves type information.

-
 #### 2.3.5 Kernel Compilation and Dependencies

 > Highlights of this section:
+>
 > 1. Introduce the compilation design of the kernel.
 > 2. Introduce the establishment of kernel dependencies.

@@ -714,8 +724,9 @@ The original `InferShape` of fluid Op is the same as `OpKernel`, has the problem
 We also rewrite `InferShape` into a functional form, which supports different Ops to call the same `InferShape` function, which improves ease of use and reduces maintenance costs.

 > FAQ:
+>
 > 1. Why call it `InferMeta` instead of continuing to call it `InferShape`?
-> Answer: The `Meta` of `InferMeta` comes from the `meta` member in `DenseTensor`. In PHI, an op has two components, `InferMeta` and `Kernel`. `InferMeta` covers the functions of `InferShape`, but it is not limited to `InferShape`. In addition to the inference of dims and lod, `InferMeta` also infers dtype and layout, which is different from the original.
+>    Answer: The `Meta` of `InferMeta` comes from the `meta` member in `DenseTensor`. In PHI, an op has two components, `InferMeta` and `Kernel`. `InferMeta` covers the functions of `InferShape`, but it is not limited to `InferShape`. In addition to the inference of dims and lod, `InferMeta` also infers dtype and layout, which is different from the original.

 ##### 2.3.6.1 InferMeta related design

@@ -757,8 +768,8 @@ The purpose of using `MetaTensor` is to mask multiple Tensor types, and to be co

 The basic design of `MetaTensor` see the `paddle/phi/core/meta_tensor.h`. There is a pointer member `TensorBase` in the base class `MetaTensor`, so it can be compatible with `DenseTensor`, `SelectedRows`, `SparseCsrTensor` and other types in PHI.

-
 > Note:
 > Only the content related to the design of PHI itself in this README. If you want to know more about the design of how phi and fluid are compatible, please refer to:
+>
 > 1. [Paddle HIgh reusability operator library (PHI) Design Document (CN Version)](https://github.com/PaddlePaddle/docs/blob/develop/docs/design/phi/design_cn.md)
 > 2. [Paddle HIgh reusability operator library (PHI) Design Document (EN Version)](https://github.com/PaddlePaddle/docs/blob/develop/docs/design/phi/design_en.md)
--- a/tools/get_pr_ut.py
+++ b/tools/get_pr_ut.py
@@ -330,9 +330,7 @@ class PRChecker:
            if filename.startswith(PADDLE_ROOT + 'python/'):
                file_list.append(filename)
            elif filename.startswith(PADDLE_ROOT + 'paddle/'):
-                if filename.startswith(PADDLE_ROOT + 'paddle/infrt'):
-                    filterFiles.append(filename)
-                elif filename.startswith(PADDLE_ROOT + 'paddle/scripts'):
+                if filename.startswith(PADDLE_ROOT + 'paddle/scripts'):
                    if filename.startswith(
                        (
                            PADDLE_ROOT + 'paddle/scripts/paddle_build.sh',