diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 355212280d40aef30c80cfc08d275af4a5657451..530e5e7f2472f1b86f744c1baa260c646e79c883 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -245,7 +245,7 @@ copy(inference_lib_dist
 # the header file of pten is copied to the experimental directory,
 # the include path of pten needs to be changed to adapt to inference api path
 add_custom_command(TARGET inference_lib_dist POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten.cmake"
+        COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten_header.cmake"
         COMMENT "Change pten header include path to adapt to inference api path")
 
 # CAPI inference library for only inference
diff --git a/cmake/pten.cmake b/cmake/pten_header.cmake
similarity index 100%
rename from cmake/pten.cmake
rename to cmake/pten_header.cmake
diff --git a/cmake/pten_kernel.cmake b/cmake/pten_kernel.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..3934a828c2c2a1e98c2d5459b47bb45a9e6a7430
--- /dev/null
+++ b/cmake/pten_kernel.cmake
@@ -0,0 +1,175 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Append a PT_DECLARE_KERNEL line to ${kernel_declare_file} for the first
+# PT_REGISTER_CTX_KERNEL in every file passed (TARGET_LIST plus extra args)
+function(kernel_declare TARGET_LIST)
+    foreach(kernel_path ${TARGET_LIST} ${ARGN})
+        file(READ ${kernel_path} kernel_impl)
+        # TODO(chenweihang): rename PT_REGISTER_CTX_KERNEL to PT_REGISTER_KERNEL
+        # NOTE(chenweihang): now we don't recommend to use digit in kernel name
+        string(REGEX MATCH "PT_REGISTER_CTX_KERNEL\\([ \t\r\n]*[a-z_]*," first_registry "${kernel_impl}")
+        if (NOT first_registry STREQUAL "")
+            # parse the first kernel name
+            string(REPLACE "PT_REGISTER_CTX_KERNEL(" "" kernel_name "${first_registry}")
+            string(REPLACE "," "" kernel_name "${kernel_name}")
+            string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
+            # the backend is the name of the directory that directly holds the file
+            if ("${kernel_path}" MATCHES "/gpu/[^/]*$")
+                set(kernel_backend GPU)
+            elseif ("${kernel_path}" MATCHES "/xpu/[^/]*$")
+                set(kernel_backend XPU)
+            elseif ("${kernel_path}" MATCHES "/npu/[^/]*$")
+                set(kernel_backend NPU)
+            else ()
+                # cpu kernels, and device independent kernels (CPU temporarily)
+                set(kernel_backend CPU)
+            endif()
+            # TODO(chenweihang): default declare ALL_LAYOUT for each kernel
+            file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, ${kernel_backend}, ALL_LAYOUT);\n")
+        endif()
+    endforeach()
+endfunction()
+
+# Build library ${TARGET} from ${TARGET}.cc, or else from cpu/${TARGET}.cc,
+# gpu/${TARGET}.cu, xpu/${TARGET}.cc, npu/${TARGET}.cc in the current source
+# dir; add it to the global PTEN_KERNELS property and declare its kernels.
+# Usage: kernel_library(<target> [DEPS <dep>...])
+function(kernel_library TARGET)
+    set(common_srcs)
+    set(cpu_srcs)
+    set(gpu_srcs)
+    set(xpu_srcs)
+    set(npu_srcs)
+
+    # set explicitly: otherwise ${options} resolves to whatever the caller defined
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}"
+        "${multiValueArgs}" ${ARGN})
+
+    list(LENGTH kernel_library_SRCS kernel_library_SRCS_len)
+    # one kernel only match one impl file in each backend
+    # TODO(chenweihang): parse compile deps by include headers
+    if (${kernel_library_SRCS_len} EQUAL 0)
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+            list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+        endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
+            list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
+        endif()
+        if ((WITH_GPU OR WITH_ROCM) AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
+            list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu)
+        endif()
+        if (WITH_XPU AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
+            list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc)
+        endif()
+        if (WITH_ASCEND_CL AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
+            list(APPEND npu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/npu/${TARGET}.cc)
+        endif()
+    else()
+        # TODO(chenweihang): impl compile by source later
+    endif()
+
+    list(LENGTH common_srcs common_srcs_len)
+    list(LENGTH cpu_srcs cpu_srcs_len)
+    list(LENGTH gpu_srcs gpu_srcs_len)
+    list(LENGTH xpu_srcs xpu_srcs_len)
+    list(LENGTH npu_srcs npu_srcs_len)
+
+    if (${common_srcs_len} GREATER 0)
+        # If the kernel has a device independent public implementation,
+        # we will use this implementation and will not adopt the implementation
+        # under specific devices
+        if (WITH_GPU)
+            nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
+        elseif (WITH_ROCM)
+            hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
+        else()
+            cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS})
+        endif()
+    else()
+        # If the kernel has a header file declaration, but no corresponding
+        # implementation can be found, this is not allowed
+        if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND
+            ${xpu_srcs_len} EQUAL 0 AND ${npu_srcs_len} EQUAL 0)
+            message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
+        else()
+            if (WITH_GPU)
+                if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                    nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
+                endif()
+            elseif (WITH_ROCM)
+                if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+                    hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS})
+                endif()
+            else()
+                if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
+                    cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${npu_srcs} DEPS ${kernel_library_DEPS})
+                endif()
+            endif()
+        endif()
+    endif()
+
+    if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
+        ${xpu_srcs_len} GREATER 0 OR ${npu_srcs_len} GREATER 0)
+        # append target into PTEN_KERNELS property
+        get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
+        set(pten_kernels ${pten_kernels} ${TARGET})
+        set_property(GLOBAL PROPERTY PTEN_KERNELS ${pten_kernels})
+    endif()
+
+    # parse kernel name and auto generate kernel declaration
+    # here, we don't need to check WITH_XXX, because if not WITH_XXX, the
+    # xxx_srcs_len will be equal to 0
+    if (${common_srcs_len} GREATER 0)
+        kernel_declare(${common_srcs})
+    endif()
+    if (${cpu_srcs_len} GREATER 0)
+        kernel_declare(${cpu_srcs})
+    endif()
+    if (${gpu_srcs_len} GREATER 0)
+        kernel_declare(${gpu_srcs})
+    endif()
+    if (${xpu_srcs_len} GREATER 0)
+        kernel_declare(${xpu_srcs})
+    endif()
+    if (${npu_srcs_len} GREATER 0)
+        kernel_declare(${npu_srcs})
+    endif()
+endfunction()
+
+# For every *_kernel.h in the current source directory that is not listed in
+# EXCLUDES, call kernel_library(<name>), forwarding DEPS when any are given.
+function(register_kernels)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs EXCLUDES DEPS)
+    cmake_parse_arguments(register_kernels "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    file(GLOB kernel_headers RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_kernel.h")
+    string(REPLACE ".h" "" kernel_targets "${kernel_headers}")
+    list(LENGTH register_kernels_DEPS deps_count)
+
+    foreach(kernel_target ${kernel_targets})
+        list(FIND register_kernels_EXCLUDES ${kernel_target} exclude_pos)
+        if (exclude_pos LESS 0)
+            if (deps_count EQUAL 0)
+                kernel_library(${kernel_target})
+            else()
+                kernel_library(${kernel_target} DEPS ${register_kernels_DEPS})
+            endif()
+        endif()
+    endforeach()
+endfunction()
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 7c870ec336437cd3a39fa789d4414c60c32346ab..b6ea57fdf964bc3318b40d2b9994e191990d44d4 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -24,10 +24,12 @@ add_subdirectory(tests)
 
 # make an unity target for compile deps
 set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context)
-set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu conj_kernel_cpu scale_kernel_cpu full_kernel_cpu flatten)
+get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
+set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels})
+set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu manipulation_cpu conj_kernel_cpu)
 set(PTEN_DEPS ${PTEN_DEPS} nary unary binary)
 if(WITH_GPU OR WITH_ROCM)
-  set(PTEN_DEPS ${PTEN_DEPS} math_gpu linalg_gpu manipulation_gpu conj_kernel_gpu scale_kernel_gpu full_kernel_gpu)
+  set(PTEN_DEPS ${PTEN_DEPS} math_gpu linalg_gpu manipulation_gpu conj_kernel_gpu)
 endif()
 if(WITH_XPU)
   set(PTEN_DEPS ${PTEN_DEPS} manipulation_xpu)
diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h
index 01a3c193a34861883c403e399548dffe78a2caa4..4f2160a761836410661408ea5a09215865988e3f 100644
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -20,25 +20,18 @@ limitations under the License. */
 // the kernel declare statement is automatically generated according to the
 // file name of the kernel, and this header file will be removed
 
-PT_DECLARE_KERNEL(full_like, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(dot, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(flatten, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(sign, CPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(conj, CPU, ALL_LAYOUT);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_KERNEL(full_like, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(dot, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(cast, GPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(flatten, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(sign, GPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(scale, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(conj, GPU, ALL_LAYOUT);
 #endif
 
 #ifdef PADDLE_WITH_XPU
-PT_DECLARE_KERNEL(flatten, XPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(reshape, XPU, ALL_LAYOUT);
 #endif
diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
index bacdc1ce67928ea5119811685732e94b9a6b0e1e..9d37effb8d7b312f4d4d7f7cd3f21029ac287d2a 100644
--- a/paddle/pten/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -1,3 +1,9 @@
+include(pten_kernel)
+
+set(kernel_declare_file ${PADDLE_BINARY_DIR}/paddle/pten/kernels/declarations.h.tmp CACHE INTERNAL "declarations.h file")
+set(kernel_declare_file_final ${PADDLE_BINARY_DIR}/paddle/pten/kernels/declarations.h)
+file(WRITE ${kernel_declare_file} "// Generated by the paddle/pten/kernels/CMakeLists.txt.  DO NOT EDIT!\n\n#pragma once\n\n")
+
 # kernel primitive api
 add_subdirectory(primitive)
 # pten hybird functors and functions called by kernels
@@ -18,17 +24,24 @@ if(WITH_XPU)
   add_subdirectory(xpu)
 endif()
 
-set(FLATTEN_DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
+# PTEN_KERNELS collects every pten kernel target; PTEN_DEPS is built from it
+set_property(GLOBAL PROPERTY PTEN_KERNELS "")
+
+set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory)
+set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function)
+
+# auto build kernel targets by cmake
+register_kernels(EXCLUDES flatten_kernel DEPS ${COMMON_KERNEL_DEPS})
+# TODO(chenweihang): auto parse compile deps by include headers later
+set(FLATTEN_DEPS ${COMMON_KERNEL_DEPS} utils_cpu unary)
 if(WITH_GPU OR WITH_ROCM)
   set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_gpu)
 elseif(WITH_XPU)
   set(FLATTEN_DEPS ${FLATTEN_DEPS} utils_xpu)
 endif()
+kernel_library(flatten_kernel DEPS ${FLATTEN_DEPS})
 
-if(WITH_GPU)
-  nv_library(flatten SRCS flatten_kernel.cc DEPS ${FLATTEN_DEPS})
-elseif(WITH_ROCM)
-  hip_library(flatten SRCS flatten_kernel.cc DEPS ${FLATTEN_DEPS})
-else()
-  cc_library(flatten SRCS flatten_kernel.cc DEPS ${FLATTEN_DEPS})
-endif()
+get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
+message(STATUS "PTEN_KERNELS: ${pten_kernels}")
+
+copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt
index 036ce68ee43c1efc96985526e99c0ab1aeec3743..7a32fab2674c34f6cb7d7218661139977fa2fc1c 100644
--- a/paddle/pten/kernels/cpu/CMakeLists.txt
+++ b/paddle/pten/kernels/cpu/CMakeLists.txt
@@ -2,6 +2,4 @@ cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
 cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
 cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
-cc_library(scale_kernel_cpu SRCS scale_kernel.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
-cc_library(full_kernel_cpu SRCS full_kernel.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
 cc_library(conj_kernel_cpu SRCS conj_kernel.cc DEPS dense_tensor kernel_context kernel_factory)
diff --git a/paddle/pten/kernels/flatten_kernel.cc b/paddle/pten/kernels/flatten_kernel.cc
index b2b5d74432a2e86f3a491fe00bcec5315db8c1bc..98df9b93d27763c6d14f69ccb0faf522463b314b 100644
--- a/paddle/pten/kernels/flatten_kernel.cc
+++ b/paddle/pten/kernels/flatten_kernel.cc
@@ -13,7 +13,6 @@
 // limitations under the License.
 
 #include "paddle/pten/kernels/flatten_kernel.h"
-#include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/backends/all_context.h"
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/infermeta/unary.h"
diff --git a/paddle/pten/kernels/gpu/CMakeLists.txt b/paddle/pten/kernels/gpu/CMakeLists.txt
index 11ff1608b814c727f935a188e4021386fc6a3c99..a0646e1cb7879270d25e6bf95dc8d00e82ff470f 100644
--- a/paddle/pten/kernels/gpu/CMakeLists.txt
+++ b/paddle/pten/kernels/gpu/CMakeLists.txt
@@ -3,15 +3,11 @@ if(WITH_GPU)
   nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   nv_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
   nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
-  nv_library(scale_kernel_gpu SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
-  nv_library(full_kernel_gpu SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
   nv_library(conj_kernel_gpu SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory)
 elseif(WITH_ROCM)
   hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
   hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   hip_library(utils_gpu SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
   hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_gpu unary)
-  hip_library(scale_kernel_gpu SRCS scale_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
-  hip_library(full_kernel_gpu SRCS full_kernel.cu DEPS dense_tensor kernel_context kernel_factory eigen_function)
   hip_library(conj_kernel_gpu SRCS conj_kernel.cu DEPS dense_tensor kernel_context kernel_factory)
 endif()
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index 029985475011eaa4aac415680dc0b9307f79a73e..e1822a62b8c51794e616ae38de424e136754429f 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -349,6 +349,7 @@ def source_include(header_file_path):
 #include "paddle/pten/core/kernel_registry.h"
 #include "paddle/pten/include/core.h"
 #include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/kernels/declarations.h"
 """