Unverified commit a8879215, authored by Aganlengzi, committed by GitHub

[PluggableDevice] Add custom kernel support based on pten kernel management (#38848)

* [Demo] custom kernel based on pten kernel

* merge and npu custom work well

* del comments

* delete other code

* fix CUDAContext

* fix not found small_vector.h

* support NPU

* fix NPUContext

* fix DeviceContext support

* add UT

* fix call

* add UT

* fix

* fix for comments and ut

* add MACRO control

* fix multi input output

* support env CUSTOM_DEVICE_ROOT

* deal with special cases

* fix for Windows

* try coverage with test_custom_kernel_dot.py

* fix test_custom_kernel_dot

* fix test_custom_kernel_dot

* fix merge

* fix merge

* fix CI

* update

* merge and fix

* remove WITH_CUSTOM_KERNEL

* fix merge

* merge and fix

* fix ut

* fix ut for mac

* add more UT

* add more UT

* fix
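
In short: a plugin author writes an ordinary C++ function, registers it with PD_REGISTER_KERNEL, compiles it into a shared library, and points the CUSTOM_DEVICE_ROOT environment variable at the directory holding the .so; Paddle then loads and registers the kernel during InitDevices(). A condensed sketch, following the custom_kernel_dot.cc example added later in this diff:

#include "paddle/extension.h"

namespace custom_kernel {
// User kernel with the unified signature this PR introduces:
// a DeviceContext first, then inputs, attributes, and output pointers.
template <typename T>
void Dot(const paddle::CPUContext& dev_ctx, const paddle::Tensor& x,
         const paddle::Tensor& y, paddle::Tensor* out) {
  auto const* x_ptr = x.data<T>();
  auto const* y_ptr = y.data<T>();
  auto* z = out->mutable_data<T>(paddle::PlaceType::kCPU);
  auto shape = x.shape();
  auto const N = x.numel();
  auto const B = shape[shape.size() - 1];  // innermost-axis length
  for (int j = 0; j < N / B; j++) {        // one dot product per row
    T ss = 0;
    for (int i = 0; i < B; i++) ss += (*x_ptr++) * (*y_ptr++);
    z[j] = ss;
  }
}
}  // namespace custom_kernel

// Files the kernel into the global OpKernelInfoMap at library-load time;
// the framework registers it into pten::KernelFactory when the .so is
// found under CUSTOM_DEVICE_ROOT.
PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, INT8, custom_kernel::Dot<int8_t>) {
  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::INT8);
}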
Parent 7e6a2190
@@ -437,11 +437,12 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
configure_file(commit.h.in commit.h)
cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api)
cc_library(custom_kernel SRCS custom_kernel.cc DEPS
tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api)
#cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
#cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator custom_kernel)
cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
@@ -451,3 +452,4 @@ endif()
cc_test(scope_guard_test SRCS scope_guard_test.cc)
cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils)
cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
#include "paddle/fluid/framework/custom_kernel.h"
#include <dirent.h>
#include <algorithm>
#include <regex>
#include "paddle/fluid/framework/op_kernel_info_helper.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/pten/core/convert_utils.h"
#include "paddle/pten/core/kernel_context.h"
#include "paddle/pten/core/kernel_registry.h"
DECLARE_bool(run_pten_kernel);
namespace paddle {
namespace framework {
// set pten::Kernel args_def_ from op_kernel_info
// because we cannot set them directly on pten::Kernel without exposing
// pten::KernelArgsDef while parsing the custom user function
static void ParseArgs(const OpKernelInfo& op_kernel_info,
pten::KernelArgsDef* args_def) {
auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);
for (auto& input : input_defs) {
args_def->AppendInput(input.backend, input.layout, input.dtype);
}
for (auto& output : output_defs) {
args_def->AppendOutput(output.backend, output.layout, output.dtype);
}
for (auto& attr : attribute_defs) {
args_def->AppendAttribute(attr.type_index);
}
}
// definition of the call function for custom pten kernels
static void RunKernelFunc(pten::KernelContext* ctx,
const OpKernelInfo& op_kernel_info) {
VLOG(3) << "[CUSTOM KERNEL] RunKernelFunc begin...";
// input and output sizes are not the numbers of parameters
// but the numbers of actual Tensors
size_t input_size = ctx->InputsSize();
size_t output_size = ctx->OutputsSize();
size_t attr_size = ctx->AttrsSize();
// numbers of parameters of the unified user kernel function
auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);
PADDLE_ENFORCE_GE(input_size, input_defs.size(),
platform::errors::InvalidArgument(
"the number of ctx inputs (%d) must be no less than "
"the number of kernel input_defs (%d).",
input_size, input_defs.size()));
PADDLE_ENFORCE_GE(output_size, output_defs.size(),
platform::errors::InvalidArgument(
"the number of ctx outputs (%d) must be no less than "
"the number of kernel output_defs (%d).",
output_size, output_defs.size()));
PADDLE_ENFORCE_EQ(attr_size, attribute_defs.size(),
platform::errors::InvalidArgument(
"the number of ctx attributes (%d) must be equal to "
"the number of kernel attribute_defs (%d).",
attr_size, attribute_defs.size()));
VLOG(3) << "[CUSTOM KERNEL] Input num: " << input_defs.size()
<< "[tensor size:" << input_size << "]"
<< " Attribute num: " << attribute_defs.size()
<< " Output num: " << output_defs.size()
<< "[tensor size:" << output_size << "].";
// Inputs mapping
std::vector<paddle::experimental::Tensor> custom_ins;
std::vector<std::vector<paddle::experimental::Tensor>> custom_vec_ins;
for (size_t in_idx = 0; in_idx < input_defs.size(); ++in_idx) {
VLOG(3) << "Mapping Input[" << in_idx << "]";
const std::pair<int, int> range = ctx->InputRangeAt(in_idx);
// is_vector tells if this Input is Tensor or std::vector<Tensor>
if (!input_defs.at(in_idx).is_vector) {
paddle::experimental::Tensor custom_t;
auto& ctx_tensor = ctx->InputAt<pten::DenseTensor>(range.first);
custom_t.set_impl(std::make_shared<pten::DenseTensor>(ctx_tensor));
custom_ins.emplace_back(custom_t);
} else {
std::vector<paddle::experimental::Tensor> custom_vec_in;
auto ctx_tensor_vec =
ctx->MoveInputsBetween<pten::DenseTensor>(range.first, range.second);
for (auto& ctx_tensor : ctx_tensor_vec) {
paddle::experimental::Tensor custom_t;
custom_t.set_impl(std::make_shared<pten::DenseTensor>(ctx_tensor));
custom_vec_in.emplace_back(custom_t);
}
custom_vec_ins.emplace_back(custom_vec_in);
}
VLOG(3) << "Mapped Input[" << in_idx << "] with range[" << range.first
<< "," << range.second << ").";
}
// Attributes mapping
std::vector<paddle::any> custom_attrs;
for (size_t attr_idx = 0; attr_idx < attribute_defs.size(); ++attr_idx) {
VLOG(3) << "Mapping Attribute[" << attr_idx << "]";
if (attribute_defs[attr_idx].type_index == std::type_index(typeid(bool))) {
bool arg = ctx->AttrAt<bool>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(int))) {
int arg = ctx->AttrAt<int>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(float))) {
float arg = ctx->AttrAt<float>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(double))) {
double arg = ctx->AttrAt<double>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(int64_t))) {
int64_t arg = ctx->AttrAt<int64_t>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(pten::dtype::float16))) {
pten::dtype::float16 arg = ctx->AttrAt<pten::dtype::float16>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(DataType))) {
DataType arg = ctx->AttrAt<DataType>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(const Scalar&))) {
const Scalar& arg = ctx->AttrAt<const Scalar&>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(const std::vector<int64_t>&))) {
const std::vector<int64_t>& arg =
ctx->AttrAt<const std::vector<int64_t>&>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(const ScalarArray&))) {
const ScalarArray& arg = ctx->AttrAt<const ScalarArray&>(attr_idx);
custom_attrs.emplace_back(arg);
} else if (attribute_defs[attr_idx].type_index ==
std::type_index(typeid(const std::vector<int>&))) {
const std::vector<int>& arg =
ctx->AttrAt<const std::vector<int>&>(attr_idx);
custom_attrs.emplace_back(arg);
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported attribute attribute_defs[%d].type_index", attr_idx));
}
VLOG(3) << "Mapped Attribute[" << attr_idx << "]";
}
// Outputs mapping
std::vector<paddle::experimental::Tensor*> custom_outs;
std::vector<std::vector<paddle::experimental::Tensor*>> custom_vec_outs;
std::vector<std::shared_ptr<pten::DenseTensor>> custom_outs_ptr;
std::vector<std::vector<std::shared_ptr<pten::DenseTensor>>>
custom_vec_outs_ptr;
for (size_t out_idx = 0; out_idx < output_defs.size(); ++out_idx) {
VLOG(3) << "Mapping Output[" << out_idx << "]";
const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
// is_vector tells if this Output is Tensor or std::vector<Tensor>
if (!output_defs.at(out_idx).is_vector) {
auto* ctx_tensor = ctx->MutableOutputAt<pten::DenseTensor>(range.first);
auto* custom_t = new paddle::experimental::Tensor();
auto custom_t_ptr = std::make_shared<pten::DenseTensor>(*ctx_tensor);
custom_t->set_impl(custom_t_ptr);
custom_outs.emplace_back(custom_t);
custom_outs_ptr.emplace_back(custom_t_ptr);
} else {
std::vector<paddle::experimental::Tensor*> custom_vec_out;
std::vector<std::shared_ptr<pten::DenseTensor>> custom_vec_out_ptr;
auto ctx_tensor_vec = ctx->MutableOutputBetween<pten::DenseTensor>(
range.first, range.second);
for (auto ctx_tensor : ctx_tensor_vec) {
auto* custom_t = new paddle::experimental::Tensor();
auto custom_t_ptr = std::make_shared<pten::DenseTensor>(*ctx_tensor);
custom_t->set_impl(custom_t_ptr);
custom_vec_out.emplace_back(custom_t);
custom_vec_out_ptr.emplace_back(custom_t_ptr);
}
custom_vec_outs.emplace_back(custom_vec_out);
custom_vec_outs_ptr.emplace_back(custom_vec_out_ptr);
}
VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
<< "," << range.second << ").";
}
// DeviceContext
// In pten, the first parameter XXContext is decided at registration time
// through a template param, but custom kernel functions use a unified
// DeviceContext as the first parameter of user_kernel_fn, so we use the
// backend from OpKernelInfo to decide which XXContext to build. In this
// temporary simple DeviceContext we only set the necessary info on dev_ctx
// (such as the stream in NPUContext); more related work is needed once
// pten::DeviceContext is exposed to the outside.
DeviceContext dev_ctx;
auto& backend = OpKernelInfoHelper::GetBackend(op_kernel_info);
if (backend == pten::Backend::CPU) {
// do nothing
} else {
LOG(ERROR) << "[CUSTOM KERNEL] Unsupported kernel backend: " << backend
<< " with compiled Paddle.";
return;
}
auto& user_kernel_fn = OpKernelInfoHelper::GetKernelFn(op_kernel_info);
// call user function
user_kernel_fn(dev_ctx, custom_ins, custom_vec_ins, custom_attrs,
&custom_outs, &custom_vec_outs);
VLOG(3) << "[CUSTOM KERNEL] finished call user kernel function.";
// NOTE: Map back the output tensors with stored shared_ptrs.
for (int out_idx = output_defs.size() - 1; out_idx >= 0; --out_idx) {
VLOG(3) << "Mapping Back Output[" << out_idx << "]";
const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
// is_vector tells if this Output is Tensor or std::vector<Tensor>
if (!output_defs.at(out_idx).is_vector) {
auto* ctx_tensor = ctx->MutableOutputAt<pten::DenseTensor>(range.first);
*ctx_tensor = *(custom_outs_ptr.back().get());
custom_outs_ptr.pop_back();
} else {
auto ctx_tensor_vec = ctx->MutableOutputBetween<pten::DenseTensor>(
range.first, range.second);
auto custom_vec_ptr_out = custom_vec_outs_ptr.back();
for (int idx = ctx_tensor_vec.size() - 1; idx >= 0; --idx) {
*(ctx_tensor_vec[idx]) = *(custom_vec_ptr_out.back().get());
custom_vec_ptr_out.pop_back();
}
custom_vec_outs_ptr.pop_back();
}
VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
<< "," << range.second << "].";
}
// delete the paddle::Tensor objects newed for outputs of the user kernel call
for (size_t i = 0; i < custom_outs.size(); ++i) {
delete custom_outs[i];
}
for (size_t i = 0; i < custom_vec_outs.size(); ++i) {
for (size_t j = 0; j < custom_vec_outs[i].size(); ++j) {
delete custom_vec_outs[i][j];
}
}
}
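// NOTE (summary of the adapter above): RunKernelFunc bridges a
// pten::KernelContext to the unified user signature in three passes:
// (1) wrap each DenseTensor input range into paddle::experimental::Tensor
//     (scalar or vector form, per is_vector);
// (2) unpack attributes by type_index into paddle::any;
// (3) wrap the outputs, invoke user_kernel_fn, copy the results back
//     through the stored shared_ptrs, and delete the temporary wrappers.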
void RegisterKernelWithMetaInfo(
const std::vector<OpKernelInfo>& op_kernel_infos) {
PADDLE_ENFORCE_EQ(FLAGS_run_pten_kernel, true,
platform::errors::Unimplemented(
"Custom Kernel depends on pten kernel enabled,"));
for (size_t i = 0; i < op_kernel_infos.size(); ++i) {
auto& kernel_info = op_kernel_infos[i];
auto op_type = OpKernelInfoHelper::GetOpName(kernel_info);
auto kernel_key = OpKernelInfoHelper::GetKernelKey(kernel_info);
VLOG(3) << "[CUSTOM KERNEL] registering [" << op_type << "]" << kernel_key;
// 1.Check whether this kernel is valid for a specific operator
PADDLE_ENFORCE_EQ(
pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type), true,
platform::errors::InvalidArgument(
"[CUSTOM KERNEL] %s is not ready for custom kernel registering.",
op_type));
// 2.Check whether this kernel_key has already been registered
PADDLE_ENFORCE_EQ(
pten::KernelFactory::Instance().kernels()[op_type].find(kernel_key),
pten::KernelFactory::Instance().kernels()[op_type].end(),
platform::errors::InvalidArgument(
"[CUSTOM KERNEL] The operator <%s>'s kernel: %s has been "
"already existed in Paddle, please contribute PR if need "
"to optimize the kernel code. Custom kernel do NOT support "
"to replace existing kernel in Paddle.",
op_type, kernel_key));
// pten::KernelFn
pten::KernelFn kernel_fn = [kernel_info](pten::KernelContext* ctx) {
VLOG(3) << "[CUSTOM KERNEL] run custom PTEN kernel func in lambda.";
RunKernelFunc(ctx, kernel_info);
};
// variadic_kernel_fn
void* variadic_kernel_fn =
OpKernelInfoHelper::GetVariadicKernelFn(kernel_info);
pten::Kernel kernel(kernel_fn, variadic_kernel_fn);
// args info
ParseArgs(kernel_info, kernel.mutable_args_def());
// register custom kernel to pten::KernelFactory
pten::KernelFactory::Instance().kernels()[op_type][kernel_key] = kernel;
VLOG(3) << "[CUSTOM KERNEL] Successed in registering operator <" << op_type
<< ">'s kernel " << kernel_key << " to Paddle. "
<< "It will be used like native ones.";
}
}
void RegisterKernelWithMetaInfoMap(
const paddle::OpKernelInfoMap& op_kernel_info_map) {
auto& kernel_info_map = op_kernel_info_map.GetMap();
VLOG(3) << "[CUSTOM KERNEL] size of op_kernel_info_map: "
<< kernel_info_map.size();
// pair: {op_type, OpKernelInfo}
for (auto& pair : kernel_info_map) {
VLOG(3) << "[CUSTOM KERNEL] pair first -> op name: " << pair.first;
RegisterKernelWithMetaInfo(pair.second);
}
}
void LoadCustomKernelLib(const std::string& dso_lib_path) {
#ifdef _LINUX
void* dso_handle = nullptr;
int dynload_flags = RTLD_NOW | RTLD_LOCAL;
dso_handle = dlopen(dso_lib_path.c_str(), dynload_flags);
// dso_lib_path MUST be valid
PADDLE_ENFORCE_NOT_NULL(
dso_handle,
platform::errors::InvalidArgument(
"Fail to open library: %s with error: %s", dso_lib_path, dlerror()));
typedef OpKernelInfoMap& get_op_kernel_info_map_t();
auto* func = reinterpret_cast<get_op_kernel_info_map_t*>(
dlsym(dso_handle, "PD_GetOpKernelInfoMap"));
if (func == nullptr) {
LOG(INFO) << "Skipped lib [" << dso_lib_path << "]: fail to find "
<< "PD_GetOpKernelInfoMap symbol in this lib.";
return;
}
auto& op_kernel_info_map = func();
RegisterKernelWithMetaInfoMap(op_kernel_info_map);
LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path;
#else
VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux.";
#endif
return;
}
// List all libs under the given path
std::vector<std::string> ListAllLib(const std::string& libs_path) {
DIR* dir = nullptr;
dir = opendir(libs_path.c_str());
// libs_path MUST be valid
PADDLE_ENFORCE_NOT_NULL(dir, platform::errors::InvalidArgument(
"Fail to open path: %s", libs_path));
dirent* ptr = nullptr;
std::vector<std::string> libs;
std::regex express(".*\\.so");
std::match_results<std::string::iterator> results;
while ((ptr = readdir(dir)) != nullptr) {
std::string filename(ptr->d_name);
if (std::regex_match(filename.begin(), filename.end(), results, express)) {
libs.emplace_back(libs_path + '/' + filename);
LOG(INFO) << "Found lib [" << filename << "]";
} else {
VLOG(3) << "Skipped file [" << filename << "] without .so postfix";
}
}
closedir(dir);
return libs;
}
// Load custom kernels under the given path
void LoadCustomKernel(const std::string& libs_path) {
VLOG(3) << "Try loading custom libs from: [" << libs_path << "]";
std::vector<std::string> libs = ListAllLib(libs_path);
for (auto& lib_path : libs) {
LoadCustomKernelLib(lib_path);
}
LOG(INFO) << "Finished in LoadCustomKernel with libs_path: [" << libs_path
<< "]";
}
} // namespace framework
} // namespace paddle
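For embedding hosts, the loading path above collapses to a single call. A minimal usage sketch (the plugin directory path here is hypothetical):

#include "paddle/fluid/framework/custom_kernel.h"

void InitPlugins() {
  // Scans the directory for *.so files, dlopen()s each with
  // RTLD_NOW | RTLD_LOCAL, resolves the PD_GetOpKernelInfoMap symbol, and
  // registers every kernel found into pten::KernelFactory.
  paddle::framework::LoadCustomKernel("/path/to/paddle-plugins");
}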
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/api/ext/op_kernel_info.h"
namespace paddle {
namespace framework {
// Load custom kernel libs from the given path
void LoadCustomKernel(const std::string& libs_path);
void LoadCustomKernelLib(const std::string& dso_lib_path);
// Load custom kernel API: register kernels from a user-compiled library
void LoadOpKernelInfoAndRegister(const std::string& dso_name);
// Register custom kernel API: register kernels directly
void RegisterKernelWithMetaInfoMap(
const paddle::OpKernelInfoMap& op_kernel_info_map);
// Interface for selectively registering custom kernels.
void RegisterKernelWithMetaInfo(
const std::vector<OpKernelInfo>& op_kernel_infos);
} // namespace framework
} // namespace paddle
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
#include "paddle/fluid/framework/custom_kernel.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/extension.h"
#include "paddle/fluid/framework/op_kernel_info_helper.h"
#include "paddle/pten/api/lib/utils/allocator.h"
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_context.h"
#include "paddle/pten/core/kernel_factory.h"
#include "paddle/pten/infermeta/binary.h"
#include "paddle/utils/small_vector.h"
#ifdef _LINUX
// user kernel function
namespace custom_kernel {
// Here we use dot <CPU, ANY, UINT8> for test
// This test will fail when these two kernels are aupported in framework
// input 3: two Tensors and one std::vector<Tensor>
// attribute 11: fake_attributes
// output 2: one Tensor* and one std::vector<Tensor*>
template <typename T>
void FakeDot(const paddle::CPUContext& dev_ctx, const paddle::Tensor& x,
const paddle::Tensor& y,
const std::vector<paddle::Tensor>& fake_input_vec,
bool fake_attr_bool, int fake_attr_int, float fake_attr_float,
double fake_attr_double, int64_t fake_attr_int64,
pten::dtype::float16 fake_attr_f16, pten::DataType fake_attr_dtype,
const pten::Scalar& fake_attr_scalar,
const pten::ScalarArray& fake_attr_scalar_array,
const std::vector<int64_t>& fake_attr_int64_vec,
const std::vector<int>& fake_attr_int_vec, paddle::Tensor* out,
std::vector<paddle::Tensor*> fake_out_vec) {
// print param info
std::cout << "fake_input_vec.size: " << fake_input_vec.size() << std::endl;
std::cout << "fake_attr_bool: " << fake_attr_bool << std::endl;
std::cout << "fake_attr_int: " << fake_attr_int << std::endl;
std::cout << "fake_attr_float: " << fake_attr_float << std::endl;
std::cout << "fake_attr_double: " << fake_attr_double << std::endl;
std::cout << "fake_attr_int64: " << fake_attr_int64 << std::endl;
std::cout << "fake_attr_f16: " << fake_attr_f16 << std::endl;
std::cout << "fake_attr_dtype: " << fake_attr_dtype << std::endl;
std::cout << "fake_attr_int64_vec: " << fake_attr_int64_vec.size()
<< std::endl;
std::cout << "fake_attr_int_vec: " << fake_attr_int_vec.size() << std::endl;
std::cout << "fake_out_vec: " << fake_out_vec.size() << std::endl;
// assert check
assert(fake_input_vec.size() == 2);
assert(fake_attr_bool == false);
assert(fake_attr_int == 1);
assert(fake_attr_float == 2);
assert(fake_attr_double == 3);
assert(fake_attr_int64 == 4);
assert(fake_attr_f16 == 5);
assert(fake_attr_dtype == pten::DataType::UINT32);
assert(fake_attr_int64_vec.size() == 0);
assert(fake_attr_int_vec.size() == 0);
assert(fake_out_vec.size() == 2);
auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
auto* z = out->mutable_data<T>(paddle::PlaceType::kCPU);
auto shape = x.shape();
auto const N = x.numel();
auto const B = shape[shape.size() - 1];
for (int j = 0; j < N / B; j++) {
T ss = 0;
for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++);
z[j] = ss;
}
}
} // namespace custom_kernel
PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, UINT8,
custom_kernel::FakeDot<uint8_t>) {
/* define args here;
* the only usable param is OpKernelInfo* kernel */
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UINT8);
}
// The code above stores the dot kernel's info into OpKernelInfoMap
TEST(CustomKernel, custom_kernel_dot) {
std::string op_name = "dot";
pten::Backend backend = pten::Backend::CPU;
pten::DataLayout layout = pten::DataLayout::ANY;
pten::DataType dtype = pten::DataType::UINT8;
// 1.custom kernel info is parsed and stored
EXPECT_TRUE(paddle::OpKernelInfoMap::Instance().GetMap().find("dot") !=
paddle::OpKernelInfoMap::Instance().GetMap().end());
// 2.info check
EXPECT_EQ(
1, static_cast<int>(paddle::OpKernelInfoMap::Instance()["dot"].size()));
EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetBackend() ==
backend);
EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetDataLayout() ==
layout);
EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetDataType() ==
dtype);
// 3.register
EXPECT_TRUE(pten::KernelFactory::Instance().kernels().end() !=
pten::KernelFactory::Instance().kernels().find("dot"));
pten::KernelKey kernel_key(backend, layout, dtype);
EXPECT_TRUE(
pten::KernelFactory::Instance().kernels()["dot"].find(kernel_key) ==
pten::KernelFactory::Instance().kernels()["dot"].end());
paddle::framework::RegisterKernelWithMetaInfoMap(
paddle::OpKernelInfoMap::Instance());
EXPECT_TRUE(
pten::KernelFactory::Instance().kernels()["dot"].find(kernel_key) !=
pten::KernelFactory::Instance().kernels()["dot"].end());
// 4.kernel select
auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
op_name, kernel_key);
// 5.prepare parameters for kernel
const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
auto dense_x = std::make_shared<pten::DenseTensor>(
alloc.get(), pten::DenseTensorMeta(pten::DataType::UINT8,
paddle::framework::make_ddim({2, 3}),
pten::DataLayout::NCHW));
auto* dense_x_data =
dense_x->mutable_data<uint8_t>(paddle::platform::CPUPlace());
auto dense_y = std::make_shared<pten::DenseTensor>(
alloc.get(), pten::DenseTensorMeta(pten::DataType::UINT8,
paddle::framework::make_ddim({2, 3}),
pten::DataLayout::NCHW));
auto* dense_y_data =
dense_y->mutable_data<uint8_t>(paddle::platform::CPUPlace());
// fill x, y and compute the expected dot result
uint8_t sum[2] = {0, 0};
for (size_t i = 0; i < 2; ++i) {
for (size_t j = 0; j < 3; ++j) {
dense_x_data[i * 3 + j] = (i * 3 + j);
dense_y_data[i * 3 + j] = (i * 3 + j);
sum[i] += (i * 3 + j) * (i * 3 + j);
}
}
// 6.prepare kernel_context
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
auto kernel_context = pten::KernelContext(dev_ctx);
kernel_context.EmplaceBackInput(dense_x.get()); // idx:0, index:[0,1)
kernel_context.EmplaceBackInput(dense_y.get()); // idx:1, index:[1,2)
// fake_input_vec: idx:2, index:[2,4)
size_t fake_input_vec_idx = 2;
size_t fake_input_vec_index_start = 2;
size_t fake_input_vec_index_end = 4;
kernel_context.EmplaceBackInputWithoutSetRange(dense_x.get());
kernel_context.EmplaceBackInputWithoutSetRange(dense_y.get());
kernel_context.AssignInputRange(
std::make_pair(fake_input_vec_index_start, fake_input_vec_index_end),
fake_input_vec_idx);
bool fake_attr_bool = false;
int fake_attr_int = 1;
float fake_attr_float = 2.0;
double fake_attr_double = 3.0;
int64_t fake_attr_int64 = 4;
pten::dtype::float16 fake_attr_f16 = pten::dtype::float16(5);
pten::DataType fake_attr_dtype = pten::DataType::UINT32;
paddle::framework::LoDTensor tmp_tensor;
tmp_tensor.mutable_data<uint8_t>({1}, pten::TransToFluidPlace(backend));
pten::Scalar fake_attr_scalar =
paddle::experimental::MakePtenScalar(tmp_tensor);
pten::ScalarArray fake_attr_scalar_array;
std::vector<int64_t> fake_attr_int64_vec;
std::vector<int> fake_attr_int_vec;
kernel_context.EmplaceBackAttr(fake_attr_bool);
kernel_context.EmplaceBackAttr(fake_attr_int);
kernel_context.EmplaceBackAttr(fake_attr_float);
kernel_context.EmplaceBackAttr(fake_attr_double);
kernel_context.EmplaceBackAttr(fake_attr_int64);
kernel_context.EmplaceBackAttr(fake_attr_f16);
kernel_context.EmplaceBackAttr(fake_attr_dtype);
kernel_context.EmplaceBackAttr(fake_attr_scalar);
kernel_context.EmplaceBackAttr(fake_attr_scalar_array);
kernel_context.EmplaceBackAttr(fake_attr_int64_vec);
kernel_context.EmplaceBackAttr(fake_attr_int_vec);
auto out_meta = pten::DotInferMeta(dense_x->meta(), dense_y->meta());
auto dense_out = std::make_shared<pten::DenseTensor>(
pten::make_intrusive<paddle::experimental::SharedStorage>(
pten::TransToFluidPlace(backend)),
std::move(out_meta));
kernel_context.EmplaceBackOutput(dense_out.get()); // idx:0 index:[0,1)
// fake_out_vec: idx:1, index:[1,3)
size_t fake_out_vec_idx = 1;
size_t fake_out_vec_index_start = 1;
size_t fake_out_vec_index_end = 3;
kernel_context.EmplaceBackOutputWithoutSetRange(dense_out.get());
kernel_context.EmplaceBackOutputWithoutSetRange(dense_out.get());
kernel_context.AssignOutputRange(
std::make_pair(fake_out_vec_index_start, fake_out_vec_index_end),
fake_out_vec_idx);
// 7.kernel call
kernel(&kernel_context);
// 8.check result
ASSERT_EQ(dense_out->dims().size(), 2);
ASSERT_EQ(dense_out->dims()[0], 2);
ASSERT_EQ(dense_out->numel(), 2);
ASSERT_EQ(dense_out->dtype(), pten::DataType::UINT8);
ASSERT_EQ(dense_out->layout(), pten::DataLayout::NCHW);
ASSERT_EQ(dense_out->initialized(), true);
auto expect_result = sum;
auto actual_result0 = dense_out->data<uint8_t>()[0];
auto actual_result1 = dense_out->data<uint8_t>()[1];
ASSERT_EQ(expect_result[0], actual_result0);
ASSERT_EQ(expect_result[1], actual_result1);
}
// test OpKernelInfoHelper
TEST(OpKernelInfoHelper, op_kernel_info_help_getters) {
using OpKernelInfoHelper = paddle::framework::OpKernelInfoHelper;
std::string op_name = "dot";
pten::Backend backend = pten::Backend::CPU;
pten::DataLayout layout = pten::DataLayout::ANY;
pten::DataType dtype = pten::DataType::UINT8;
auto op_kernel_info = paddle::OpKernelInfoMap::Instance()[op_name][0];
EXPECT_EQ(op_name, OpKernelInfoHelper::GetOpName(op_kernel_info));
EXPECT_EQ(backend, OpKernelInfoHelper::GetBackend(op_kernel_info));
EXPECT_EQ(layout, OpKernelInfoHelper::GetDataLayout(op_kernel_info));
EXPECT_EQ(dtype, OpKernelInfoHelper::GetDataType(op_kernel_info));
EXPECT_EQ(pten::KernelKey(backend, layout, dtype),
OpKernelInfoHelper::GetKernelKey(op_kernel_info));
paddle::CustomKernelFunc kernel_fn =
PD_PT_KERNEL(custom_kernel::FakeDot<uint8_t>);
EXPECT_EQ(kernel_fn, OpKernelInfoHelper::GetKernelFn(op_kernel_info));
void* variadic_func = PD_PT_VARIADIC_KERNEL(custom_kernel::FakeDot<uint8_t>);
EXPECT_EQ(variadic_func,
OpKernelInfoHelper::GetVariadicKernelFn(op_kernel_info));
auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);
EXPECT_EQ(3, static_cast<int>(input_defs.size()));
EXPECT_EQ(2, static_cast<int>(output_defs.size()));
EXPECT_EQ(11, static_cast<int>(attribute_defs.size()));
}
#endif
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/pten/core/kernel_factory.h"
namespace paddle {
namespace framework {
class OpKernelInfoHelper {
public:
static const std::string& GetOpName(const paddle::OpKernelInfo& info) {
return info.op_name_;
}
static const pten::Backend& GetBackend(const paddle::OpKernelInfo& info) {
return info.backend_;
}
static const pten::DataLayout& GetDataLayout(
const paddle::OpKernelInfo& info) {
return info.layout_;
}
static const pten::DataType& GetDataType(const paddle::OpKernelInfo& info) {
return info.dtype_;
}
static pten::KernelKey GetKernelKey(const paddle::OpKernelInfo& info) {
return pten::KernelKey(info.backend_, info.layout_, info.dtype_);
}
static const CustomKernelFunc& GetKernelFn(const paddle::OpKernelInfo& info) {
return info.kernel_fn_;
}
static void* GetVariadicKernelFn(const paddle::OpKernelInfo& info) {
return info.variadic_kernel_fn_;
}
static const paddle::SmallVector<TensorArgDef>& GetInputDefs(
const paddle::OpKernelInfo& info) {
return info.input_defs_;
}
static const paddle::SmallVector<TensorArgDef>& GetOutputDefs(
const paddle::OpKernelInfo& info) {
return info.output_defs_;
}
static const paddle::SmallVector<AttributeArgDef>& GetAttributeDefs(
const paddle::OpKernelInfo& info) {
return info.attribute_defs_;
}
};
} // namespace framework
} // namespace paddle
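OpKernelInfoHelper lets the framework read OpKernelInfo's private registration fields without widening the public plugin API. A short usage sketch, mirroring the accesses in custom_kernel.cc above:

// Fetch the first registered "dot" kernel and derive its registration key.
const paddle::OpKernelInfo& info =
    paddle::OpKernelInfoMap::Instance()["dot"][0];
pten::KernelKey key =
    paddle::framework::OpKernelInfoHelper::GetKernelKey(info);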
@@ -30,14 +30,15 @@ cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg}
cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
set(paddle_inference_api_deps lod_tensor scope reset_tensor_array
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator custom_kernel)
if(WITH_CRYPTO)
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
else()
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
list(APPEND paddle_inference_api_deps paddle_crypto)
endif()
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS ${paddle_inference_api_deps})
if(WIN32)
target_link_libraries(paddle_inference_api gflags)
endif()
@@ -116,9 +116,12 @@ endif()
cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
# separate init from device_context to avoid cycle dependencies
cc_library(init SRCS init.cc DEPS device_context custom_kernel)
# memcpy depends on device_context; add deps individually here to
# avoid cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context)
if(WITH_XPU)
@@ -53,6 +53,8 @@ limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif
#include "paddle/fluid/framework/custom_kernel.h"
DECLARE_int32(paddle_num_threads);
PADDLE_DEFINE_EXPORTED_int32(
multiple_of_cupti_buffer_size, 1,
@@ -224,6 +226,18 @@ void InitDevices(const std::vector<int> devices) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
places.emplace_back(platform::CUDAPinnedPlace());
#endif
const char *custom_kernel_root_p = std::getenv("CUSTOM_DEVICE_ROOT");
if (!custom_kernel_root_p) {
VLOG(3) << "Env [CUSTOM_DEVICE_ROOT] is not set.";
} else {
std::string custom_kernel_root(custom_kernel_root_p);
if (!custom_kernel_root.empty()) {
LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root;
framework::LoadCustomKernel(custom_kernel_root);
} else {
VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty.";
}
}
platform::DeviceContextPool::Init(places);
#ifndef PADDLE_WITH_MKLDNN
set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool
analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator
@@ -40,6 +40,7 @@ limitations under the License. */
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/api/ext/dll_decl.h"
#include "paddle/pten/api/ext/exception.h"
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/pten/api/ext/op_meta_info.h"
#include "paddle/pten/api/ext/place.h"
#include "paddle/pten/api/ext/tensor_compat.h"
This diff is collapsed.
@@ -3,16 +3,17 @@ add_subdirectory(utils)
cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place)
if (WITH_GPU)
nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
elseif (WITH_ROCM)
hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
else()
cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
endif()
cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context kernel_factory)
cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor)
cc_library(op_kernel_info SRCS op_kernel_info.cc DEPS pten_tensor)
# forward api file
set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py)
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/pten/api/ext/op_kernel_info.h"
#include "paddle/fluid/framework/custom_kernel.h"
namespace paddle {
////////////////////// Op Kernel Info //////////////////////
OpKernelInfo& OpKernelInfo::SetKernelFn(CustomKernelFunc&& func) {
kernel_fn_ = std::forward<CustomKernelFunc>(func);
return *this;
}
OpKernelInfo& OpKernelInfo::SetVariadicKernelFn(void* func) {
variadic_kernel_fn_ = func;
return *this;
}
//////////////// Op Kernel Info Map /////////////////
std::vector<OpKernelInfo>& OpKernelInfoMap::operator[](
const std::string& name) {
return map_[name];
}
const std::unordered_map<std::string, std::vector<OpKernelInfo>>&
OpKernelInfoMap::GetMap() const {
return map_;
}
//////////////// Op Kernel Info Builder /////////////////
OpKernelInfoBuilder::OpKernelInfoBuilder(std::string&& op_name,
pten::Backend backend,
pten::DataLayout data_layout,
pten::DataType data_type) {
// 1. member assign
op_name_ = std::forward<std::string>(op_name);
backend_ = backend;
layout_ = data_layout;
dtype_ = data_type;
// 2. info parse
auto& info_vector = OpKernelInfoMap::Instance()[op_name_];
auto op_kernel_info = OpKernelInfo(op_name_, backend_, layout_, dtype_);
info_vector.emplace_back(std::move(op_kernel_info));
// 3. get current info ptr
info_ptr_ = &(info_vector.back());
}
OpKernelInfoBuilder& OpKernelInfoBuilder::SetKernelFn(CustomKernelFunc func) {
info_ptr_->SetKernelFn(std::forward<CustomKernelFunc>(func));
return *this;
}
OpKernelInfoBuilder& OpKernelInfoBuilder::SetVariadicKernelFn(void* func) {
info_ptr_->SetVariadicKernelFn(func);
return *this;
}
OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsParse(
CustomKernelArgsParseFn func) {
func(this->info_ptr_);
return *this;
}
OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsDef(CustomKernelArgsDefFn func) {
func(this->info_ptr_);
return *this;
}
/////////////////////// Op register API /////////////////////////
// For inference: compile directly with framework
// Call after PD_REGISTER_KERNEL(...)
void RegisterAllCustomKernel() {
auto& op_kernel_info_map = OpKernelInfoMap::Instance();
framework::RegisterKernelWithMetaInfoMap(op_kernel_info_map);
}
// Use this API to load a compiled custom kernel dynamic library and
// register its custom kernels
void LoadCustomKernelLib(const std::string& dso_name) {
framework::LoadCustomKernelLib(dso_name);
}
} // namespace paddle
#ifdef __cplusplus
extern "C" {
#endif
// C-API to get global OpKernelInfoMap.
paddle::OpKernelInfoMap& PD_GetOpKernelInfoMap() {
return paddle::OpKernelInfoMap::Instance();
}
#ifdef __cplusplus
} // end extern "C"
#endif
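For readers tracing the registration flow: PD_REGISTER_KERNEL (defined in op_kernel_info.h, whose diff is collapsed on this page) ultimately drives the builder above. The following hand-written expansion is a hypothetical sketch — the generated variable name, and whether the setters are chained exactly like this, are assumptions, not the macro's literal output:

// Hypothetical expansion sketch: a static OpKernelInfoBuilder files the
// kernel into OpKernelInfoMap when the plugin library is loaded.
static paddle::OpKernelInfoBuilder g_dot_cpu_builder =
    paddle::OpKernelInfoBuilder("dot", pten::Backend::CPU,
                                pten::DataLayout::ANY, pten::DataType::UINT8)
        .SetKernelFn(PD_PT_KERNEL(custom_kernel::FakeDot<uint8_t>))
        .SetVariadicKernelFn(
            PD_PT_VARIADIC_KERNEL(custom_kernel::FakeDot<uint8_t>));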
@@ -26,7 +26,7 @@ class KernelSignature;
class ArgumentMappingContext;
class InferMetaContext;
using KernelFn = void (*)(KernelContext* ctx);
using KernelFn = std::function<void(KernelContext* ctx)>;
using KernelArgsDefFn = void (*)(Kernel* kernel);
using KernelArgsParseFn = void (*)(const KernelKey& default_key,
KernelArgsDef* args_def);
@@ -49,8 +49,6 @@ using DataLayout = paddle::experimental::DataLayout;
class KernelContext;
using KernelFn = void (*)(KernelContext* ctx);
class KernelKey {
public:
KernelKey() = default;
# for paddle test case
if(WITH_TESTING)
cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags)
endif()
cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags)
@@ -371,6 +371,17 @@ if load_noavx:
raise e
def set_paddle_custom_device_lib_path(lib_path):
if os.environ.get('CUSTOM_DEVICE_ROOT', None) is not None:
# use the environment value that is already set
return
if os.path.exists(lib_path):
# set CUSTOM_DEVICE_ROOT default path
os.environ['CUSTOM_DEVICE_ROOT'] = os.path.normpath(lib_path)
else:
os.environ['CUSTOM_DEVICE_ROOT'] = ''
# set paddle lib path
def set_paddle_lib_path():
site_dirs = site.getsitepackages() if hasattr(
@@ -380,11 +391,15 @@ def set_paddle_lib_path():
lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
return
if hasattr(site, 'USER_SITE'):
lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
if os.path.exists(lib_dir):
_set_paddle_lib_path(lib_dir)
set_paddle_custom_device_lib_path(
os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
set_paddle_lib_path()
@@ -9,5 +9,6 @@ endforeach()
add_subdirectory(unittests)
add_subdirectory(book)
add_subdirectory(custom_op)
add_subdirectory(custom_kernel)
set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120)
py_test(test_custom_kernel_dot SRCS test_custom_kernel_dot.py)
py_test(test_custom_kernel_load SRCS test_custom_kernel_load.py)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/extension.h"
namespace paddle {
namespace custom_kernel {
// Here we use dot <CPU, ANY, INT8> for test
// This test will fail when this kernel is supported in framework
template <typename T>
void Dot(const paddle::CPUContext& dev_ctx,
const paddle::Tensor& x,
const paddle::Tensor& y,
paddle::Tensor* out) {
auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
auto* z = out->mutable_data<T>(paddle::PlaceType::kCPU);
// Loop over the total N elements of both operands while sum-reducing every
// B pairs along the way where B is the dimension of the least ordered axis
auto shape = x.shape();
auto const N = x.numel();
auto const B = shape[shape.size() - 1];
for (int j = 0; j < N / B; j++) {
T ss = 0;
for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++);
z[j] = ss;
}
}
} // namespace custom_kernel
} // namespace paddle
PD_REGISTER_KERNEL(
dot, CPU, ALL_LAYOUT, INT8, paddle::custom_kernel::Dot<int8_t>) {
/* define args here;
* the only usable param is OpKernelInfo* kernel */
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::INT8);
}
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from paddle.fluid import core
from distutils.sysconfig import get_python_lib
from distutils.core import setup, Extension
# cc flags
paddle_extra_compile_args = ['-std=c++14', '-shared', '-fPIC']
if core.is_compiled_with_npu():
paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
# include path
site_packages_path = get_python_lib()
paddle_custom_kernel_include = [
os.path.join(site_packages_path, 'paddle', 'include'),
]
# libs path
paddle_custom_kernel_library_dir = [
os.path.join(site_packages_path, 'paddle', 'fluid'),
]
# libs
libs = [':core_avx.so']
if not core.has_avx_core and core.has_noavx_core:
libs = [':core_noavx.so']
custom_kernel_dot_module = Extension(
'custom_kernel_dot',
sources=['custom_kernel_dot.cc'],
include_dirs=paddle_custom_kernel_include,
library_dirs=paddle_custom_kernel_library_dir,
libraries=libs,
extra_compile_args=paddle_extra_compile_args)
setup(
name='custom_kernel_dot',
version='1.0',
description='custom kernel for compiling',
ext_modules=[custom_kernel_dot_module])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import site
import unittest
import numpy as np
# use dot <CPU, ANY, INT8> as test case.
class TestCustomKernelDot(unittest.TestCase):
def setUp(self):
# compile the .so and place it in the current path
cur_dir = os.path.dirname(os.path.abspath(__file__))
# --inplace places the output .so file in the current dir
cmd = 'cd {} && {} custom_kernel_dot_setup.py build_ext --inplace'.format(
cur_dir, sys.executable)
os.system(cmd)
# set environment for loading and registering compiled custom kernels
# only valid in current process
os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir
def test_custom_kernel_dot_run(self):
# test dot run
x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
result = np.sum(x_data * y_data, axis=1).reshape([2, 1])
import paddle
paddle.set_device('cpu')
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
out = paddle.dot(x, y)
self.assertTrue(
np.array_equal(out.numpy(), result),
"custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
result))
def tearDown(self):
del os.environ['CUSTOM_DEVICE_ROOT']
if __name__ == '__main__':
if os.name == 'nt' or sys.platform.startswith('darwin'):
# only supports Linux for now
exit()
unittest.main()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import site
import unittest
import numpy as np
class TestCustomKernelLoad(unittest.TestCase):
def setUp(self):
# compile the .so and place it in the current path
cur_dir = os.path.dirname(os.path.abspath(__file__))
# --inplace places the output .so file in the current dir
cmd = 'cd {} && {} custom_kernel_dot_setup.py build_ext --inplace'.format(
cur_dir, sys.executable)
os.system(cmd)
# get the paddle lib path and place the .so there
paddle_lib_path = ''
site_dirs = site.getsitepackages() if hasattr(
site, 'getsitepackages'
) else [x for x in sys.path if 'site-packages' in x]
for site_dir in site_dirs:
lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
if os.path.exists(lib_dir):
paddle_lib_path = lib_dir
break
if paddle_lib_path == '':
if hasattr(site, 'USER_SITE'):
lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
if os.path.exists(lib_dir):
paddle_lib_path = lib_dir
self.default_path = os.path.sep.join(
[paddle_lib_path, '..', '..', 'paddle-plugins'])
# copy the .so to the default path
cmd = 'mkdir -p {} && cp ./*.so {}'.format(self.default_path,
self.default_path)
os.system(cmd) # wait
def test_custom_kernel_dot_load(self):
# test dot load
x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
result = np.sum(x_data * y_data, axis=1).reshape([2, 1])
import paddle
paddle.set_device('cpu')
x = paddle.to_tensor(x_data)
y = paddle.to_tensor(y_data)
out = paddle.dot(x, y)
self.assertTrue(
np.array_equal(out.numpy(), result),
"custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
result))
def tearDown(self):
cmd = 'rm -rf {}'.format(self.default_path)
os.system(cmd)
if __name__ == '__main__':
if os.name == 'nt' or sys.platform.startswith('darwin'):
# only supports Linux for now
exit()
unittest.main()
@@ -573,7 +573,8 @@ headers = (
list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) + # pten common headers
# For paddle uew custom op, only copy data type headers from `paddle/fluid/platform`
# to `paddle/pten/api/ext`,
['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'])
['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] +
['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'])
if '${WITH_MKLDNN}' == 'ON':
headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn