From a8879215aa58a5c93c86ac78ac247b6d50bf31c1 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Thu, 27 Jan 2022 15:19:35 +0800 Subject: [PATCH] [PluggableDevice] Add custom kernel support based on pten kernel management (#38848) * [Demo] custom kernel based on pten kernel * merge and npu custom work well * del comments * delete other code * fix CUDAContext * fix not found small_vector.h * support NPU * fix NPUContext * fix DeviceContext support * add UT * fix call * add UT * fix * fix for comments and ut * add MACRO control * fix multi input output * support env CUSTOM_DEVICE_ROOT * deal with special cases * fix for Windows * try coverage with test_custom_kernel_dot.py * fix test_custom_kernel_dot * fix test_custom_kernel_dot * fix merge * fix merge * fix CI * update * merge and fix * remove WITH_CUSTOM_KERNEL * fix merge * merge and fix * fix ut * fix ut for mac * add more UT * add more UT * fix --- paddle/fluid/framework/CMakeLists.txt | 6 +- paddle/fluid/framework/custom_kernel.cc | 411 +++++++++++ paddle/fluid/framework/custom_kernel.h | 38 + paddle/fluid/framework/custom_kernel_test.cc | 283 ++++++++ .../fluid/framework/op_kernel_info_helper.h | 71 ++ paddle/fluid/inference/api/CMakeLists.txt | 11 +- paddle/fluid/platform/CMakeLists.txt | 5 +- paddle/fluid/platform/init.cc | 14 + paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/pten/api/all.h | 1 + paddle/pten/api/ext/op_kernel_info.h | 663 ++++++++++++++++++ paddle/pten/api/lib/CMakeLists.txt | 7 +- paddle/pten/api/lib/op_kernel_info.cc | 114 +++ paddle/pten/core/kernel_def.h | 2 +- paddle/pten/core/kernel_factory.h | 2 - paddle/testing/CMakeLists.txt | 2 +- python/paddle/fluid/core.py | 15 + python/paddle/fluid/tests/CMakeLists.txt | 1 + .../fluid/tests/custom_kernel/CMakeLists.txt | 2 + .../fluid/tests/custom_kernel/__init__.py | 13 + .../tests/custom_kernel/custom_kernel_dot.cc | 53 ++ .../custom_kernel/custom_kernel_dot_setup.py | 53 ++ .../custom_kernel/test_custom_kernel_dot.py | 62 ++ .../custom_kernel/test_custom_kernel_load.py | 80 +++ python/setup.py.in | 3 +- 25 files changed, 1897 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/framework/custom_kernel.cc create mode 100644 paddle/fluid/framework/custom_kernel.h create mode 100644 paddle/fluid/framework/custom_kernel_test.cc create mode 100644 paddle/fluid/framework/op_kernel_info_helper.h create mode 100644 paddle/pten/api/ext/op_kernel_info.h create mode 100644 paddle/pten/api/lib/op_kernel_info.cc create mode 100644 python/paddle/fluid/tests/custom_kernel/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/custom_kernel/__init__.py create mode 100644 python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc create mode 100644 python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py create mode 100644 python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py create mode 100644 python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 27ba88b56f0..de3a957df08 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -437,11 +437,12 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api) - +cc_library(custom_kernel SRCS custom_kernel.cc DEPS + tensor attribute framework_proto op_registry 
operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api)
 #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} )
 #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 
-set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
+set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator custom_kernel)
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 
@@ -451,3 +452,4 @@ endif()
 
 cc_test(scope_guard_test SRCS scope_guard_test.cc)
 cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils)
+cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor)
diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc
new file mode 100644
index 00000000000..a5498623941
--- /dev/null
+++ b/paddle/fluid/framework/custom_kernel.cc
@@ -0,0 +1,411 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#if defined _WIN32 || defined __APPLE__
+#else
+#define _LINUX
+#endif
+
+#include "paddle/fluid/framework/custom_kernel.h"
+#include <dirent.h>
+#include <regex>
+#include "paddle/fluid/framework/op_kernel_info_helper.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/api/ext/op_kernel_info.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+DECLARE_bool(run_pten_kernel);
+
+namespace paddle {
+
+namespace framework {
+
+// Set pten::Kernel's args_def_ from OpKernelInfo, because we cannot set it
+// on the pten::Kernel directly without exposing pten::KernelArgsDef while
+// parsing the custom user function.
+static void ParseArgs(const OpKernelInfo& op_kernel_info,
+                      pten::KernelArgsDef* args_def) {
+  auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
+  auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
+  auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);
+
+  for (auto& input : input_defs) {
+    args_def->AppendInput(input.backend, input.layout, input.dtype);
+  }
+  for (auto& output : output_defs) {
+    args_def->AppendOutput(output.backend, output.layout, output.dtype);
+  }
+  for (auto& attr : attribute_defs) {
+    args_def->AppendAttribute(attr.type_index);
+  }
+}
+
+// Definition of the call function for custom pten kernels.
+static void RunKernelFunc(pten::KernelContext* ctx,
+                          const OpKernelInfo& op_kernel_info) {
+  VLOG(3) << "[CUSTOM KERNEL] RunKernelFunc begin...";
+
+  // input and output sizes are the actual numbers of Tensors held by the
+  // context, not the numbers of kernel parameters
+  size_t input_size = ctx->InputsSize();
+  size_t output_size = ctx->OutputsSize();
+  size_t attr_size = ctx->AttrsSize();
+
+  // parameter numbers of the unified user kernel function
+  auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info);
+  auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info);
+  auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info);
+
+  PADDLE_ENFORCE_GE(input_size, input_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of ctx inputs (%d) must be no less than "
+                        "the size of kernel input_defs (%d).",
+                        input_size, input_defs.size()));
+
+  PADDLE_ENFORCE_GE(output_size, output_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of ctx outputs (%d) must be no less than "
+                        "the size of kernel output_defs (%d).",
+                        output_size, output_defs.size()));
+
+  PADDLE_ENFORCE_EQ(attr_size, attribute_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of ctx attributes (%d) must be equal to "
+                        "the size of kernel attribute_defs (%d).",
+                        attr_size, attribute_defs.size()));
+
+  VLOG(3) << "[CUSTOM KERNEL] Input num: " << input_defs.size()
+          << "[tensor size:" << input_size << "]"
+          << " Attribute num: " << attribute_defs.size()
+          << " Output num: " << output_defs.size()
+          << "[tensor size:" << output_size << "].";
+
+  // Inputs mapping
+  std::vector<paddle::experimental::Tensor> custom_ins;
+  std::vector<std::vector<paddle::experimental::Tensor>> custom_vec_ins;
+  for (size_t in_idx = 0; in_idx < input_defs.size(); ++in_idx) {
+    VLOG(3) << "Mapping Input[" << in_idx << "]";
+    const std::pair<int, int> range = ctx->InputRangeAt(in_idx);
+
+    // is_vector tells if this Input is a Tensor or a std::vector<Tensor>
+    if (!input_defs.at(in_idx).is_vector) {
+      paddle::experimental::Tensor custom_t;
+      auto& ctx_tensor = ctx->InputAt<pten::DenseTensor>(range.first);
+      custom_t.set_impl(std::make_shared<pten::DenseTensor>(ctx_tensor));
+      custom_ins.emplace_back(custom_t);
+    } else {
+      std::vector<paddle::experimental::Tensor> custom_vec_in;
+      auto ctx_tensor_vec =
+          ctx->MoveInputsBetween<pten::DenseTensor>(range.first, range.second);
+      for (auto& ctx_tensor : ctx_tensor_vec) {
+        paddle::experimental::Tensor custom_t;
+        custom_t.set_impl(std::make_shared<pten::DenseTensor>(ctx_tensor));
+        custom_vec_in.emplace_back(custom_t);
+      }
+      custom_vec_ins.emplace_back(custom_vec_in);
+    }
+    VLOG(3) << "Mapped Input[" << in_idx << "] with range[" << range.first
+            << "," << range.second << ").";
+  }
+
+  // Attributes mapping
+  std::vector<paddle::any> custom_attrs;
+  for (size_t attr_idx = 0; attr_idx < attribute_defs.size(); ++attr_idx) {
+    VLOG(3) << "Mapping Attribute[" << attr_idx << "]";
+    if (attribute_defs[attr_idx].type_index == std::type_index(typeid(bool))) {
+      bool arg = ctx->AttrAt<bool>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(int))) {
+      int arg = ctx->AttrAt<int>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(float))) {
+      float arg = ctx->AttrAt<float>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(double))) {
+      double arg = ctx->AttrAt<double>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(int64_t))) {
+      int64_t arg = ctx->AttrAt<int64_t>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(pten::dtype::float16))) {
+      pten::dtype::float16 arg = ctx->AttrAt<pten::dtype::float16>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(DataType))) {
+      DataType arg = ctx->AttrAt<DataType>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(const Scalar&))) {
+      const Scalar& arg = ctx->AttrAt<const Scalar&>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(const std::vector<int64_t>&))) {
+      const std::vector<int64_t>& arg =
+          ctx->AttrAt<const std::vector<int64_t>&>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(const ScalarArray&))) {
+      const ScalarArray& arg = ctx->AttrAt<const ScalarArray&>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else if (attribute_defs[attr_idx].type_index ==
+               std::type_index(typeid(const std::vector<int>&))) {
+      const std::vector<int>& arg =
+          ctx->AttrAt<const std::vector<int>&>(attr_idx);
+      custom_attrs.emplace_back(arg);
+    } else {
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported attribute attribute_defs[%d].type_index", attr_idx));
+    }
+    VLOG(3) << "Mapped Attribute[" << attr_idx << "]";
+  }
+
+  // Outputs mapping
+  std::vector<paddle::experimental::Tensor*> custom_outs;
+  std::vector<std::vector<paddle::experimental::Tensor*>> custom_vec_outs;
+  std::vector<std::shared_ptr<pten::DenseTensor>> custom_outs_ptr;
+  std::vector<std::vector<std::shared_ptr<pten::DenseTensor>>>
+      custom_vec_outs_ptr;
+
+  for (size_t out_idx = 0; out_idx < output_defs.size(); ++out_idx) {
+    VLOG(3) << "Mapping Output[" << out_idx << "]";
+    const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
+
+    // is_vector tells if this Output is a Tensor* or a std::vector<Tensor*>
+    if (!output_defs.at(out_idx).is_vector) {
+      auto* ctx_tensor = ctx->MutableOutputAt<pten::DenseTensor>(range.first);
+      auto* custom_t = new paddle::experimental::Tensor();
+      auto custom_t_ptr = std::make_shared<pten::DenseTensor>(*ctx_tensor);
+      custom_t->set_impl(custom_t_ptr);
+      custom_outs.emplace_back(custom_t);
+      custom_outs_ptr.emplace_back(custom_t_ptr);
+    } else {
+      std::vector<paddle::experimental::Tensor*> custom_vec_out;
+      std::vector<std::shared_ptr<pten::DenseTensor>> custom_vec_out_ptr;
+      auto ctx_tensor_vec = ctx->MutableOutputBetween<pten::DenseTensor>(
+          range.first, range.second);
+      for (auto ctx_tensor : ctx_tensor_vec) {
+        auto* custom_t = new paddle::experimental::Tensor();
+        auto custom_t_ptr = std::make_shared<pten::DenseTensor>(*ctx_tensor);
+        custom_t->set_impl(custom_t_ptr);
+        custom_vec_out.emplace_back(custom_t);
+        custom_vec_out_ptr.emplace_back(custom_t_ptr);
+      }
+      custom_vec_outs.emplace_back(custom_vec_out);
+      custom_vec_outs_ptr.emplace_back(custom_vec_out_ptr);
+    }
+    VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
+            << "," << range.second << ").";
+  }
+
+  // DeviceContext
+  // In pten, the first parameter XXContext is decided at registration time
+  // through a template parameter, but custom kernel functions take a unified
+  // DeviceContext as the first parameter of user_kernel_fn, so we use the
+  // backend from OpKernelInfo to decide the XXContext. In this temporary
+  // simple DeviceContext we only set the necessary info on dev_ctx (such as
+  // the stream in NPUContext); more related work should be done once
+  // pten::DeviceContext is exposed externally.
+  DeviceContext dev_ctx;
+  auto& backend = OpKernelInfoHelper::GetBackend(op_kernel_info);
+  if (backend == pten::Backend::CPU) {
+    // do nothing
+  } else {
+    LOG(ERROR) << "[CUSTOM KERNEL] Unsupported kernel backend: " << backend
+               << " with compiled Paddle.";
+    return;
+  }
+
+  auto& user_kernel_fn = OpKernelInfoHelper::GetKernelFn(op_kernel_info);
+  // call user function
+  user_kernel_fn(dev_ctx, custom_ins, custom_vec_ins, custom_attrs,
+                 &custom_outs, &custom_vec_outs);
+
+  VLOG(3) << "[CUSTOM KERNEL] finished calling the user kernel function.";
+
+  // NOTE: Map back the output tensors with the stored shared_ptrs.
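+  // The user function wrote its results through the paddle::Tensor wrappers
+  // created above, whose impls are the shared pten::DenseTensor copies held
+  // in custom_outs_ptr and custom_vec_outs_ptr, so those results are copied
+  // back into the KernelContext's output DenseTensors here. The loops run
+  // backwards because each step pops the matching shared_ptr off the back.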
+  for (int out_idx = output_defs.size() - 1; out_idx >= 0; --out_idx) {
+    VLOG(3) << "Mapping Back Output[" << out_idx << "]";
+    const std::pair<int, int> range = ctx->OutputRangeAt(out_idx);
+
+    // is_vector tells if this Output is a Tensor* or a std::vector<Tensor*>
+    if (!output_defs.at(out_idx).is_vector) {
+      auto* ctx_tensor = ctx->MutableOutputAt<pten::DenseTensor>(range.first);
+      *ctx_tensor = *(custom_outs_ptr.back().get());
+      custom_outs_ptr.pop_back();
+    } else {
+      auto ctx_tensor_vec = ctx->MutableOutputBetween<pten::DenseTensor>(
+          range.first, range.second);
+      auto custom_vec_ptr_out = custom_vec_outs_ptr.back();
+      for (int idx = ctx_tensor_vec.size() - 1; idx >= 0; --idx) {
+        *(ctx_tensor_vec[idx]) = *(custom_vec_ptr_out.back().get());
+        custom_vec_ptr_out.pop_back();
+      }
+      custom_vec_outs_ptr.pop_back();
+    }
+    VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first
+            << "," << range.second << "].";
+  }
+
+  // delete the paddle::Tensor objects new-ed for outputs before calling the
+  // user kernel function
+  for (size_t i = 0; i < custom_outs.size(); ++i) {
+    delete custom_outs[i];
+  }
+  for (size_t i = 0; i < custom_vec_outs.size(); ++i) {
+    for (size_t j = 0; j < custom_vec_outs[i].size(); ++j) {
+      delete custom_vec_outs[i][j];
+    }
+  }
+}
+
+void RegisterKernelWithMetaInfo(
+    const std::vector<OpKernelInfo>& op_kernel_infos) {
+  PADDLE_ENFORCE_EQ(FLAGS_run_pten_kernel, true,
+                    platform::errors::Unimplemented(
+                        "Custom Kernel depends on pten kernels being "
+                        "enabled."));
+
+  for (size_t i = 0; i < op_kernel_infos.size(); ++i) {
+    auto& kernel_info = op_kernel_infos[i];
+    auto op_type = OpKernelInfoHelper::GetOpName(kernel_info);
+    auto kernel_key = OpKernelInfoHelper::GetKernelKey(kernel_info);
+
+    VLOG(3) << "[CUSTOM KERNEL] registering [" << op_type << "]" << kernel_key;
+
+    // 1.Check whether this kernel is valid for a specific operator
+    PADDLE_ENFORCE_EQ(
+        pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type), true,
+        platform::errors::InvalidArgument(
+            "[CUSTOM KERNEL] %s is not ready for custom kernel registering.",
+            op_type));
+
+    // 2.Check whether this kernel_key has already been registered
+    PADDLE_ENFORCE_EQ(
+        pten::KernelFactory::Instance().kernels()[op_type].find(kernel_key),
+        pten::KernelFactory::Instance().kernels()[op_type].end(),
+        platform::errors::InvalidArgument(
+            "[CUSTOM KERNEL] The operator <%s>'s kernel: %s already exists "
+            "in Paddle; please contribute a PR if the kernel code needs to "
+            "be optimized. Custom kernels do NOT support replacing existing "
+            "kernels in Paddle.",
+            op_type, kernel_key));
+
+    // pten::KernelFn
+    pten::KernelFn kernel_fn = [kernel_info](pten::KernelContext* ctx) {
+      VLOG(3) << "[CUSTOM KERNEL] run custom PTEN kernel func in lambda.";
+      RunKernelFunc(ctx, kernel_info);
+    };
+    // variadic_kernel_fn
+    void* variadic_kernel_fn =
+        OpKernelInfoHelper::GetVariadicKernelFn(kernel_info);
+    pten::Kernel kernel(kernel_fn, variadic_kernel_fn);
+    // args info
+    ParseArgs(kernel_info, kernel.mutable_args_def());
+    // register custom kernel to pten::KernelFactory
+    pten::KernelFactory::Instance().kernels()[op_type][kernel_key] = kernel;
+    VLOG(3) << "[CUSTOM KERNEL] Succeeded in registering operator <" << op_type
+            << ">'s kernel " << kernel_key << " to Paddle. "
+            << "It will be used like native ones.";
+  }
+}
+
+void RegisterKernelWithMetaInfoMap(
+    const paddle::OpKernelInfoMap& op_kernel_info_map) {
+  auto& kernel_info_map = op_kernel_info_map.GetMap();
+  VLOG(3) << "[CUSTOM KERNEL] size of op_kernel_info_map: "
+          << kernel_info_map.size();
+
+  // pair: {op_type, OpKernelInfo}
+  for (auto& pair : kernel_info_map) {
+    VLOG(3) << "[CUSTOM KERNEL] pair first -> op name: " << pair.first;
+    RegisterKernelWithMetaInfo(pair.second);
+  }
+}
+
+void LoadCustomKernelLib(const std::string& dso_lib_path) {
+#ifdef _LINUX
+  void* dso_handle = nullptr;
+  int dynload_flags = RTLD_NOW | RTLD_LOCAL;
+  dso_handle = dlopen(dso_lib_path.c_str(), dynload_flags);
+
+  // dso_lib_path MUST be valid
+  PADDLE_ENFORCE_NOT_NULL(
+      dso_handle,
+      platform::errors::InvalidArgument(
+          "Failed to open library: %s with error: %s", dso_lib_path,
+          dlerror()));
+
+  typedef OpKernelInfoMap& get_op_kernel_info_map_t();
+  auto* func = reinterpret_cast<get_op_kernel_info_map_t*>(
+      dlsym(dso_handle, "PD_GetOpKernelInfoMap"));
+
+  if (func == nullptr) {
+    LOG(INFO) << "Skipped lib [" << dso_lib_path << "]: failed to find the "
+              << "PD_GetOpKernelInfoMap symbol in this lib.";
+    return;
+  }
+  auto& op_kernel_info_map = func();
+  RegisterKernelWithMetaInfoMap(op_kernel_info_map);
+  LOG(INFO) << "Succeeded in loading custom kernels in lib: " << dso_lib_path;
+#else
+  VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux.";
+#endif
+  return;
+}
+
+// List all libs under the given path
+std::vector<std::string> ListAllLib(const std::string& libs_path) {
+  DIR* dir = nullptr;
+  dir = opendir(libs_path.c_str());
+
+  // libs_path MUST be valid
+  PADDLE_ENFORCE_NOT_NULL(dir, platform::errors::InvalidArgument(
+                                   "Failed to open path: %s", libs_path));
+
+  dirent* ptr = nullptr;
+  std::vector<std::string> libs;
+  std::regex express(".*\\.so");
+  std::match_results<std::string::iterator> results;
+  while ((ptr = readdir(dir)) != nullptr) {
+    std::string filename(ptr->d_name);
+    if (std::regex_match(filename.begin(), filename.end(), results, express)) {
+      libs.emplace_back(libs_path + '/' + filename);
+      LOG(INFO) << "Found lib [" << filename << "]";
+    } else {
+      VLOG(3) << "Skipped file [" << filename << "] without the .so suffix";
+    }
+  }
+  closedir(dir);
+  return libs;
+}
+
+// Load custom kernels from the given path
+void LoadCustomKernel(const std::string& libs_path) {
+  VLOG(3) << "Try loading custom libs from: [" << libs_path << "]";
+  std::vector<std::string> libs = ListAllLib(libs_path);
+  for (auto& lib_path : libs) {
+    LoadCustomKernelLib(lib_path);
+  }
+  LOG(INFO) << "Finished in LoadCustomKernel with libs_path: [" << libs_path
+            << "]";
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/custom_kernel.h b/paddle/fluid/framework/custom_kernel.h
new file mode 100644
index 00000000000..0c12bdfa8cb
--- /dev/null
+++ b/paddle/fluid/framework/custom_kernel.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. 
*/
+
+#pragma once
+
+#include "paddle/pten/api/ext/op_kernel_info.h"
+
+namespace paddle {
+namespace framework {
+
+// Load custom kernel libs from the given path
+void LoadCustomKernel(const std::string& libs_path);
+
+void LoadCustomKernelLib(const std::string& dso_lib_path);
+
+// Load custom kernel API: register kernels after the user has compiled them
+void LoadOpKernelInfoAndRegister(const std::string& dso_name);
+
+// Register custom kernel API: register kernels directly
+void RegisterKernelWithMetaInfoMap(
+    const paddle::OpKernelInfoMap& op_kernel_info_map);
+
+// Interface for selectively registering custom kernels.
+void RegisterKernelWithMetaInfo(
+    const std::vector<OpKernelInfo>& op_kernel_infos);
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/custom_kernel_test.cc b/paddle/fluid/framework/custom_kernel_test.cc
new file mode 100644
index 00000000000..708b7bbe8a5
--- /dev/null
+++ b/paddle/fluid/framework/custom_kernel_test.cc
@@ -0,0 +1,283 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#if defined _WIN32 || defined __APPLE__
+#else
+#define _LINUX
+#endif
+
+#include "paddle/fluid/framework/custom_kernel.h"
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include "paddle/extension.h"
+#include "paddle/fluid/framework/op_kernel_info_helper.h"
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/core/kernel_factory.h"
+#include "paddle/pten/infermeta/binary.h"
+#include "paddle/utils/small_vector.h"
+
+#ifdef _LINUX
+// user kernel function
+namespace custom_kernel {
+
+// Here we use dot for the test
+// This test will fail when these two kernels are supported in the framework
+// input 3: two Tensors and one std::vector<Tensor>
+// attribute 11: fake_attributes
+// output 2: one Tensor* and one std::vector<Tensor*>
+template <typename T>
+void FakeDot(const paddle::CPUContext& dev_ctx, const paddle::Tensor& x,
+             const paddle::Tensor& y,
+             const std::vector<paddle::Tensor>& fake_input_vec,
+             bool fake_attr_bool, int fake_attr_int, float fake_attr_float,
+             double fake_attr_double, int64_t fake_attr_int64,
+             pten::dtype::float16 fake_attr_f16, pten::DataType fake_attr_dtype,
+             const pten::Scalar& fake_attr_scalar,
+             const pten::ScalarArray& fake_attr_scalar_array,
+             const std::vector<int64_t>& fake_attr_int64_vec,
+             const std::vector<int>& fake_attr_int_vec, paddle::Tensor* out,
+             std::vector<paddle::Tensor*> fake_out_vec) {
+  // print param info
+  std::cout << "fake_input_vec.size: " << fake_input_vec.size() << std::endl;
+  std::cout << "fake_attr_bool: " << fake_attr_bool << std::endl;
+  std::cout << "fake_attr_int: " << fake_attr_int << std::endl;
+  std::cout << "fake_attr_float: " << fake_attr_float << std::endl;
+  std::cout << "fake_attr_double: " << fake_attr_double << std::endl;
+  std::cout << "fake_attr_int64: " << fake_attr_int64 << std::endl;
+  std::cout << "fake_attr_f16: " << fake_attr_f16 << std::endl;
+  std::cout << "fake_attr_dtype: " << fake_attr_dtype << std::endl;
+  std::cout << "fake_attr_int64_vec: " << fake_attr_int64_vec.size()
+            << std::endl;
+  std::cout << "fake_attr_int_vec: " << fake_attr_int_vec.size() << std::endl;
+  std::cout << "fake_out_vec: " << fake_out_vec.size() << std::endl;
+
+  // assert check
+  assert(fake_input_vec.size() == 2);
+  assert(fake_attr_bool == false);
+  assert(fake_attr_int == 1);
+  assert(fake_attr_float == 2);
+  assert(fake_attr_double == 3);
+  assert(fake_attr_int64 == 4);
+  assert(fake_attr_f16 == 5);
+  assert(fake_attr_dtype == pten::DataType::UINT32);
+  assert(fake_attr_int64_vec.size() == 0);
+  assert(fake_attr_int_vec.size() == 0);
+  assert(fake_out_vec.size() == 2);
+
+  auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
+  auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
+  auto* z = out->mutable_data<T>(paddle::PlaceType::kCPU);
+  auto shape = x.shape();
+  auto const N = x.numel();
+  auto const B = shape[shape.size() - 1];
+  for (int j = 0; j < N / B; j++) {
+    T ss = 0;
+    for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++);
+    z[j] = ss;
+  }
+}
+}  // namespace custom_kernel
+
+PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, UINT8,
+                   custom_kernel::FakeDot<uint8_t>) {
+  /* some arg definitions can be done here;
+   * the only usable parameter is OpKernelInfo* kernel */
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UINT8);
+}
+
+// The code above stores the dot kernel's info into OpKernelInfoMap
+TEST(CustomKernel, custom_kernel_dot) {
+  std::string op_name = "dot";
+  pten::Backend backend = pten::Backend::CPU;
+  pten::DataLayout layout = pten::DataLayout::ANY;
+  pten::DataType dtype = pten::DataType::UINT8;
+
+  // 1.custom kernel info parsed and stored
+  EXPECT_TRUE(paddle::OpKernelInfoMap::Instance().GetMap().find("dot") !=
+              paddle::OpKernelInfoMap::Instance().GetMap().end());
+
+  // 2.info check
+  EXPECT_EQ(
+      1, static_cast<int>(paddle::OpKernelInfoMap::Instance()["dot"].size()));
+  EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetBackend() ==
+              backend);
+  EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetDataLayout() ==
+              layout);
+  EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()["dot"][0].GetDataType() ==
+              dtype);
+
+  // 3.register
+  EXPECT_TRUE(pten::KernelFactory::Instance().kernels().end() !=
+              pten::KernelFactory::Instance().kernels().find("dot"));
+
+  pten::KernelKey kernel_key(backend, layout, dtype);
+  EXPECT_TRUE(
+      pten::KernelFactory::Instance().kernels()["dot"].find(kernel_key) ==
+      pten::KernelFactory::Instance().kernels()["dot"].end());
+
+  paddle::framework::RegisterKernelWithMetaInfoMap(
+      paddle::OpKernelInfoMap::Instance());
+
+  EXPECT_TRUE(
+      pten::KernelFactory::Instance().kernels()["dot"].find(kernel_key) !=
+      pten::KernelFactory::Instance().kernels()["dot"].end());
+
+  // 4.kernel select
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      op_name, kernel_key);
+
+  // 5.prepare parameters for kernel
+  const auto alloc = std::make_unique<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc.get(), pten::DenseTensorMeta(pten::DataType::UINT8,
+                                         paddle::framework::make_ddim({2, 3}),
+                                         pten::DataLayout::NCHW));
+  auto* dense_x_data =
+      dense_x->mutable_data<uint8_t>(paddle::platform::CPUPlace());
+
+  auto dense_y = std::make_shared<pten::DenseTensor>(
+      alloc.get(), pten::DenseTensorMeta(pten::DataType::UINT8,
+                                         paddle::framework::make_ddim({2, 3}),
+                                         pten::DataLayout::NCHW));
+  auto* dense_y_data =
+      dense_y->mutable_data<uint8_t>(paddle::platform::CPUPlace());
+
+  // dot x,y and result
+  uint8_t sum[2] = {0, 0};
+  for (size_t i = 0; i < 2; ++i) {
+    for 
(size_t j = 0; j < 3; ++j) { + dense_x_data[i * 3 + j] = (i * 3 + j); + dense_y_data[i * 3 + j] = (i * 3 + j); + sum[i] += (i * 3 + j) * (i * 3 + j); + } + } + + // 6.prepare kernel_context + auto& pool = paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + auto kernel_context = pten::KernelContext(dev_ctx); + kernel_context.EmplaceBackInput(dense_x.get()); // idx:0, index:[0,1) + kernel_context.EmplaceBackInput(dense_y.get()); // idx:1, index:[1,2) + + // fake_input_vec: idx:2, index:[2,4) + size_t fake_input_vec_idx = 2; + size_t fake_input_vec_index_start = 2; + size_t fake_input_vec_index_end = 4; + kernel_context.EmplaceBackInputWithoutSetRange(dense_x.get()); + kernel_context.EmplaceBackInputWithoutSetRange(dense_y.get()); + kernel_context.AssignInputRange( + std::make_pair(fake_input_vec_index_start, fake_input_vec_index_end), + fake_input_vec_idx); + + bool fake_attr_bool = false; + int fake_attr_int = 1; + float fake_attr_float = 2.0; + double fake_attr_double = 3.0; + int64_t fake_attr_int64 = 4; + pten::dtype::float16 fake_attr_f16 = pten::dtype::float16(5); + pten::DataType fake_attr_dtype = pten::DataType::UINT32; + paddle::framework::LoDTensor tmp_tensor; + tmp_tensor.mutable_data({1}, pten::TransToFluidPlace(backend)); + pten::Scalar fake_attr_scalar = + paddle::experimental::MakePtenScalar(tmp_tensor); + pten::ScalarArray fake_attr_scalar_array; + std::vector fake_attr_int64_vec; + std::vector fake_attr_int_vec; + + kernel_context.EmplaceBackAttr(fake_attr_bool); + kernel_context.EmplaceBackAttr(fake_attr_int); + kernel_context.EmplaceBackAttr(fake_attr_float); + kernel_context.EmplaceBackAttr(fake_attr_double); + kernel_context.EmplaceBackAttr(fake_attr_int64); + kernel_context.EmplaceBackAttr(fake_attr_f16); + kernel_context.EmplaceBackAttr(fake_attr_dtype); + kernel_context.EmplaceBackAttr(fake_attr_scalar); + kernel_context.EmplaceBackAttr(fake_attr_scalar_array); + kernel_context.EmplaceBackAttr(fake_attr_int64_vec); + kernel_context.EmplaceBackAttr(fake_attr_int_vec); + + auto out_meta = pten::DotInferMeta(dense_x->meta(), dense_y->meta()); + auto dense_out = std::make_shared( + pten::make_intrusive( + pten::TransToFluidPlace(backend)), + std::move(out_meta)); + kernel_context.EmplaceBackOutput(dense_out.get()); // idx:0 index:[0,1) + + // fake_input_vec: idx:1, index:[1,3) + size_t fake_out_vec_idx = 1; + size_t fake_out_vec_index_start = 1; + size_t fake_out_vec_index_end = 3; + kernel_context.EmplaceBackOutputWithoutSetRange(dense_out.get()); + kernel_context.EmplaceBackOutputWithoutSetRange(dense_out.get()); + kernel_context.AssignOutputRange( + std::make_pair(fake_out_vec_index_start, fake_out_vec_index_end), + fake_out_vec_idx); + + // 7.kernel call + kernel(&kernel_context); + + // 8.check result + ASSERT_EQ(dense_out->dims().size(), 2); + ASSERT_EQ(dense_out->dims()[0], 2); + ASSERT_EQ(dense_out->numel(), 2); + ASSERT_EQ(dense_out->dtype(), pten::DataType::UINT8); + ASSERT_EQ(dense_out->layout(), pten::DataLayout::NCHW); + ASSERT_EQ(dense_out->initialized(), true); + + auto expect_result = sum; + auto actual_result0 = dense_out->data()[0]; + auto actual_result1 = dense_out->data()[1]; + ASSERT_EQ(expect_result[0], actual_result0); + ASSERT_EQ(expect_result[1], actual_result1); +} + +// test OpKernelInfoHelper +TEST(OpKernelInfoHelper, op_kernel_info_help_getters) { + using OpKernelInfoHelper = paddle::framework::OpKernelInfoHelper; + std::string op_name = "dot"; + pten::Backend backend = 
pten::Backend::CPU; + pten::DataLayout layout = pten::DataLayout::ANY; + pten::DataType dtype = pten::DataType::UINT8; + + auto op_kernel_info = paddle::OpKernelInfoMap::Instance()[op_name][0]; + + EXPECT_EQ(op_name, OpKernelInfoHelper::GetOpName(op_kernel_info)); + EXPECT_EQ(backend, OpKernelInfoHelper::GetBackend(op_kernel_info)); + EXPECT_EQ(layout, OpKernelInfoHelper::GetDataLayout(op_kernel_info)); + EXPECT_EQ(dtype, OpKernelInfoHelper::GetDataType(op_kernel_info)); + + EXPECT_EQ(pten::KernelKey(backend, layout, dtype), + OpKernelInfoHelper::GetKernelKey(op_kernel_info)); + + paddle::CustomKernelFunc kernel_fn = + PD_PT_KERNEL(custom_kernel::FakeDot); + EXPECT_EQ(kernel_fn, OpKernelInfoHelper::GetKernelFn(op_kernel_info)); + + void* variadic_func = PD_PT_VARIADIC_KERNEL(custom_kernel::FakeDot); + EXPECT_EQ(variadic_func, + OpKernelInfoHelper::GetVariadicKernelFn(op_kernel_info)); + + auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); + auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); + auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); + EXPECT_EQ(3, static_cast(input_defs.size())); + EXPECT_EQ(2, static_cast(output_defs.size())); + EXPECT_EQ(11, static_cast(attribute_defs.size())); +} +#endif diff --git a/paddle/fluid/framework/op_kernel_info_helper.h b/paddle/fluid/framework/op_kernel_info_helper.h new file mode 100644 index 00000000000..271ac04bb19 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_info_helper.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include "paddle/pten/api/ext/op_kernel_info.h"
+#include "paddle/pten/core/kernel_factory.h"
+
+namespace paddle {
+namespace framework {
+
+class OpKernelInfoHelper {
+ public:
+  static const std::string& GetOpName(const paddle::OpKernelInfo& info) {
+    return info.op_name_;
+  }
+
+  static const pten::Backend& GetBackend(const paddle::OpKernelInfo& info) {
+    return info.backend_;
+  }
+
+  static const pten::DataLayout& GetDataLayout(
+      const paddle::OpKernelInfo& info) {
+    return info.layout_;
+  }
+
+  static const pten::DataType& GetDataType(const paddle::OpKernelInfo& info) {
+    return info.dtype_;
+  }
+
+  static pten::KernelKey GetKernelKey(const paddle::OpKernelInfo& info) {
+    return pten::KernelKey(info.backend_, info.layout_, info.dtype_);
+  }
+
+  static const CustomKernelFunc& GetKernelFn(const paddle::OpKernelInfo& info) {
+    return info.kernel_fn_;
+  }
+
+  static void* GetVariadicKernelFn(const paddle::OpKernelInfo& info) {
+    return info.variadic_kernel_fn_;
+  }
+
+  static const paddle::SmallVector<TensorArgDef>& GetInputDefs(
+      const paddle::OpKernelInfo& info) {
+    return info.input_defs_;
+  }
+
+  static const paddle::SmallVector<TensorArgDef>& GetOutputDefs(
+      const paddle::OpKernelInfo& info) {
+    return info.output_defs_;
+  }
+
+  static const paddle::SmallVector<AttributeArgDef>& GetAttributeDefs(
+      const paddle::OpKernelInfo& info) {
+    return info.attribute_defs_;
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 53b92c13363..6c465e62780 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -30,14 +30,15 @@ cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg}
 cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor)
 cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
 
+set(paddle_inference_api_deps lod_tensor scope reset_tensor_array
+    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator custom_kernel)
+
 if(WITH_CRYPTO)
-  cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator)
-else()
-  cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array
-    analysis_config paddle_infer_contrib zero_copy_tensor trainer_desc_proto custom_operator)
+  list(APPEND paddle_inference_api_deps paddle_crypto)
 endif()
 
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS ${paddle_inference_api_deps})
+
 if(WIN32)
   target_link_libraries(paddle_inference_api gflags)
 endif()
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index eb7057bcd50..a151c824a22 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -116,9 +116,12 @@ endif()
 
 cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
 
+# separate init from device_context to avoid cycle dependencies
+cc_library(init SRCS init.cc DEPS device_context custom_kernel)
+
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
-cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
+cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
    place 
pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context) if(WITH_XPU) diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index e9d2f8e901e..f7a86e5aac7 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -53,6 +53,8 @@ limitations under the License. */ #include "paddle/fluid/platform/device/ipu/ipu_info.h" #endif +#include "paddle/fluid/framework/custom_kernel.h" + DECLARE_int32(paddle_num_threads); PADDLE_DEFINE_EXPORTED_int32( multiple_of_cupti_buffer_size, 1, @@ -224,6 +226,18 @@ void InitDevices(const std::vector devices) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) places.emplace_back(platform::CUDAPinnedPlace()); #endif + const char *custom_kernel_root_p = std::getenv("CUSTOM_DEVICE_ROOT"); + if (!custom_kernel_root_p) { + VLOG(3) << "Env [CUSTOM_DEVICE_ROOT] is not set."; + } else { + std::string custom_kernel_root(custom_kernel_root_p); + if (!custom_kernel_root.empty()) { + LOG(INFO) << "ENV [CUSTOM_DEVICE_ROOT]=" << custom_kernel_root; + framework::LoadCustomKernel(custom_kernel_root); + } else { + VLOG(3) << "ENV [CUSTOM_DEVICE_ROOT] is empty."; + } + } platform::DeviceContextPool::Init(places); #ifndef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 08ca575c2b9..a6e155f70e6 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,4 +1,4 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune +set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_wrapper prune feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator diff --git a/paddle/pten/api/all.h b/paddle/pten/api/all.h index 4451a5c372b..5744b18c4d2 100644 --- a/paddle/pten/api/all.h +++ b/paddle/pten/api/all.h @@ -40,6 +40,7 @@ limitations under the License. */ #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/api/ext/dll_decl.h" #include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/api/ext/op_kernel_info.h" #include "paddle/pten/api/ext/op_meta_info.h" #include "paddle/pten/api/ext/place.h" #include "paddle/pten/api/ext/tensor_compat.h" diff --git a/paddle/pten/api/ext/op_kernel_info.h b/paddle/pten/api/ext/op_kernel_info.h new file mode 100644 index 00000000000..bcfff61bc6f --- /dev/null +++ b/paddle/pten/api/ext/op_kernel_info.h @@ -0,0 +1,663 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/pten/api/ext/dll_decl.h" +#include "paddle/pten/api/ext/exception.h" +#include "paddle/pten/api/ext/op_meta_info.h" +#include "paddle/pten/api/include/tensor.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/common/scalar_array.h" +#include "paddle/utils/any.h" +#include "paddle/utils/small_vector.h" + +/** + * Custom Kernel Info Define. + * + * Used to maintain custom kernel core information before registering. + * Pten is working on exposing headers, custom kernel depends on them, and + * we prefer outer users following pten-kernel-function-style and registering + * macro. So, we have to re-implement some structs or class and functions to + * make sure users' custom kernel functions can be registered to pten. + * + * TODO(Aganlengzi): We should upgrade following pten. + */ + +namespace paddle { +namespace framework { +class PADDLE_API OpKernelInfoHelper; +} // namespace framework + +// TODO(Aganlengzi): Simple DeviceContext temporarily for stream getting +// before pten::DeviceContext is exposed. +class DeviceContext { + public: + DeviceContext() { stream_ = nullptr; } + void set_stream(void* stream) { stream_ = stream; } + void* stream() const { return stream_; } + + private: + void* stream_; +}; +class CPUContext : public DeviceContext {}; + +// TODO(Aganlengzi): Use paddle::Tensor before DenseTensor is exposed +using Tensor = paddle::experimental::Tensor; +using Scalar = pten::Scalar; +using ScalarArray = pten::ScalarArray; + +// Record custom kernel core information +// We can not use pten::KernelFn directly, so users' custom kernel function +// is signatured to `CustomKernelFunc', notice that the first parameter is +// fixed to `const DeviceContext&'. +using CustomKernelFunc = + void (*)(const DeviceContext& dev_ctx, + const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + std::vector* outputs, + std::vector>* vec_outputs); + +////////////////////// Kernel Function (PD_PT_KERNEL) //////////////////////// +#define PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(device_ctx) \ + template \ + struct CustomComputeCallHelper { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... pargs) { \ + static_assert(in_idx == 0, \ + "Kernel's DeviceContext should appear before Inputs."); \ + static_assert(vec_in_idx == 0, \ + "Kernel's DeviceContext should appear before Inputs."); \ + static_assert( \ + attr_idx == 0, \ + "Kernel's DeviceContext should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's DeviceContext should appear before Outputs."); \ + static_assert(vec_out_idx == 0, \ + "Kernel's DeviceContext should appear before Outputs."); \ + const device_ctx& arg = static_cast(dev_ctx); \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type) \ + template \ + struct CustomComputeCallHelper { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... 
pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + static_assert(vec_out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const Tensor& arg = inputs[in_idx]; \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ + template \ + struct CustomComputeCallHelper&, Tail...> { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... pargs) { \ + static_assert(attr_idx == 0, \ + "Kernel's Input should appear before Attributes."); \ + static_assert(out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + static_assert(vec_out_idx == 0, \ + "Kernel's Input should appear before Outputs."); \ + const std::vector& arg = vec_inputs[vec_in_idx]; \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ + template \ + struct CustomComputeCallHelper { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... pargs) { \ + static_assert(out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + static_assert(vec_out_idx == 0, \ + "Kernel's Attributes should appear before Outputs."); \ + try { \ + attr_type arg = paddle::any_cast(attrs[attr_idx]); \ + return CustomComputeCallHelper::template Compute< \ + dev_ctx_idx, \ + in_idx, \ + vec_in_idx, \ + attr_idx + 1, \ + out_idx, \ + vec_out_idx>(dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } catch (paddle::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator. Expected " #attr_type \ + " value."); \ + } \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type) \ + template \ + struct CustomComputeCallHelper { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... pargs) { \ + tensor_type* arg = (*outputs)[out_idx]; \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +#define PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(tensor_type) \ + template \ + struct CustomComputeCallHelper, Tail...> { \ + template \ + static void Compute(const DeviceContext& dev_ctx, \ + const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + std::vector* outputs, \ + std::vector>* vec_outputs, \ + PreviousArgs... 
pargs) { \ + std::vector arg = (*vec_outputs)[vec_out_idx]; \ + CustomComputeCallHelper::template Compute( \ + dev_ctx, \ + inputs, \ + vec_inputs, \ + attrs, \ + outputs, \ + vec_outputs, \ + pargs..., \ + arg); \ + } \ + } + +template +struct PtenTypeTag {}; + +template +struct CustomKernelFuncImpl; + +template +struct CustomKernelFuncImpl { + static void Compute(const DeviceContext& dev_ctx, + const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + std::vector* outputs, + std::vector>* vec_outputs) { + CustomComputeCallHelper>:: + template Compute<0, 0, 0, 0, 0, 0>( + dev_ctx, inputs, vec_inputs, attrs, outputs, vec_outputs); + } + + // NOTE: Tensor in args is paddle::Tensor but not DenseTensor + static void VariadicCompute(const DeviceContext& dev_ctx, Args... args) { + return impl_fn(static_cast(dev_ctx), std::forward(args)...); + } + + private: + template + struct CustomComputeCallHelper; + + /* DeviceContext Helpers */ + PD_SPECIALIZE_KernelCallHelper_FOR_DEV_CONTEXT(CPUContext); + + /* Input Helpers */ + PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(Tensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(Tensor); + + /* Attribute Helpers */ + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(pten::dtype::float16); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PD_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + + /* Output Helpers */ + PD_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(Tensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(Tensor); + + // End: base template + template + struct CustomComputeCallHelper> { + template + static void Compute(const DeviceContext& dev_ctx, + const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + std::vector* outputs, + std::vector>* vec_outputs, + DevCtx device_ctx, + Args... args) { + return impl_fn(device_ctx, args...); + } + }; +}; + +#define PD_PT_KERNEL(...) \ + ::paddle::CustomKernelFuncImpl::Compute + +#define PD_PT_VARIADIC_KERNEL(...) \ + reinterpret_cast( \ + &::paddle::CustomKernelFuncImpl::VariadicCompute) + +////////////////////// Op Kernel Info depended structs ////////////////////// +// TODO(Aganlengzi): Re-define TensorArgDef and AttributeArgDef temporarily. +// TensorArgDef follows pten::TensorArgDef in kernel_factory.h, the +// difference is that custom_kernel needs extra `is_vector' to ensure we can +// deal with case like vector with only one element. 
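+// For example, a kernel parameter of type `const std::vector<Tensor>&` is
+// parsed into a single TensorArgDef with is_vector=true (see FakeDot in
+// custom_kernel_test.cc), while a plain `const Tensor&` keeps the default
+// is_vector=false.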
+struct TensorArgDef { + pten::Backend backend; + pten::DataLayout layout; + pten::DataType dtype; + bool is_vector{false}; + + TensorArgDef(pten::Backend in_backend, + pten::DataLayout in_layout, + pten::DataType in_dtype, + bool is_vector = false) + : backend(in_backend), + layout(in_layout), + dtype(in_dtype), + is_vector(is_vector) {} + + TensorArgDef& SetBackend(pten::Backend in_backend) { + backend = in_backend; + return *this; + } + + TensorArgDef& SetDataLayout(pten::DataLayout in_layout) { + layout = in_layout; + return *this; + } + + TensorArgDef& SetDataType(pten::DataType in_dtype) { + dtype = in_dtype; + return *this; + } +}; + +// AttributeArgDef follows pten::AttributeArgDef in kernel_factory.h +struct AttributeArgDef { + std::type_index type_index; + + explicit AttributeArgDef(std::type_index type_index) + : type_index(type_index) {} +}; + +////////////////////// Op Kernel Info ////////////////////// +// OpKernelInfo stores all info parsed from user kernel function, includes: +// 0. op_name and kernel key(backend, data_layout and data_type) +// 1. unified custom kernel function +// 2. variadic kernel function(use paddle::Tensor) +// 3. args info and user defined change for specific arg +class PADDLE_API OpKernelInfo { + public: + explicit OpKernelInfo(const std::string& op_name, + pten::Backend backend, + pten::DataLayout data_layout, + pten::DataType data_type) + : op_name_(op_name), + backend_(backend), + layout_(data_layout), + dtype_(data_type) {} + + // format: PD_PT_KERNEL(...) + OpKernelInfo& SetKernelFn(CustomKernelFunc&& func); + // format: PD_PT_VARIADIC_KERNEL(...) + OpKernelInfo& SetVariadicKernelFn(void* func); + + // for Args parsing and storing + void AppendInput(pten::Backend backend, + pten::DataLayout layout, + pten::DataType dtype, + bool is_vector = false) { + input_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); + } + + void AppendOutput(pten::Backend backend, + pten::DataLayout layout, + pten::DataType dtype, + bool is_vector = false) { + output_defs_.emplace_back(TensorArgDef(backend, layout, dtype, is_vector)); + } + + void AppendAttribute(std::type_index type_index) { + attribute_defs_.emplace_back(AttributeArgDef(type_index)); + } + + // for Args user-def function + TensorArgDef& InputAt(size_t idx) { return input_defs_.at(idx); } + TensorArgDef& OutputAt(size_t idx) { return output_defs_.at(idx); } + + const pten::Backend& GetBackend() const { return backend_; } + const pten::DataLayout& GetDataLayout() const { return layout_; } + const pten::DataType& GetDataType() const { return dtype_; } + + private: + friend class framework::OpKernelInfoHelper; + + // 1. op info + std::string op_name_; + + // 2. kernel key info + pten::Backend backend_{pten::Backend::UNDEFINED}; + pten::DataLayout layout_{pten::DataLayout::UNDEFINED}; + pten::DataType dtype_{pten::DataType::UNDEFINED}; + + // 3. args info + paddle::SmallVector input_defs_{{}}; + paddle::SmallVector output_defs_{{}}; + paddle::SmallVector attribute_defs_{{}}; + + // 4. 
func info + CustomKernelFunc kernel_fn_{nullptr}; + void* variadic_kernel_fn_{nullptr}; +}; + +////////////////////// Op Kernel Args Parser ////////////////////// +// Define CustomKernelArgsParseFunctor for args parsing +// We have to store parsed info into OpKernelInfo before +// mapping to pten::KernelArgsDef in pten::Kernel +template +struct CustomKernelArgsParseFunctor; + +template +struct CustomKernelArgsParseFunctor { + using Args = std::tuple; + enum : std::size_t { Arity = sizeof...(Args_) }; + using Indices = std::make_index_sequence; + template + using Arg = typename std::tuple_element::type; + + static void Parse(OpKernelInfo* op_kernel_info) { + const pten::Backend& backend = op_kernel_info->GetBackend(); + const pten::DataLayout& layout = op_kernel_info->GetDataLayout(); + const pten::DataType& dtype = op_kernel_info->GetDataType(); + + auto default_tensor_layout = pten::DataLayout::NCHW; + if (layout != pten::DataLayout::ANY) { + default_tensor_layout = layout; + } + auto args_type = ParseArgType(Indices{}); + for (auto arg_type : args_type) { + if (arg_type == std::type_index(typeid(const CPUContext&))) { + // do nothing, skip context arg now + } else if (arg_type == std::type_index(typeid(const Tensor&))) { + op_kernel_info->AppendInput(backend, default_tensor_layout, dtype); + } else if (arg_type == + std::type_index(typeid(const std::vector&))) { + op_kernel_info->AppendInput( + backend, default_tensor_layout, dtype, true); + } else if (arg_type == std::type_index(typeid(Tensor*))) { + op_kernel_info->AppendOutput(backend, default_tensor_layout, dtype); + } else if (arg_type == std::type_index(typeid(std::vector))) { + op_kernel_info->AppendOutput( + backend, default_tensor_layout, dtype, true); + } else { + op_kernel_info->AppendAttribute(arg_type); + } + } + } + + private: + template + static std::vector ParseArgType( + std::index_sequence) { + return {std::type_index(typeid(Arg))...}; + } +}; + +#define PD_PT_ARGS_PARSE(...) \ + ::paddle::CustomKernelArgsParseFunctor::Parse + +//////////////// Op Kernel Info Map ///////////////// +// all user custom kernels information are stored in this map +class PADDLE_API OpKernelInfoMap { + public: + static OpKernelInfoMap& Instance() { + static OpKernelInfoMap g_custom_kernel_info_map; + return g_custom_kernel_info_map; + } + + std::vector& operator[](const std::string& name); + + const std::unordered_map>& GetMap() + const; + + private: + OpKernelInfoMap() = default; + std::unordered_map> map_; + + PD_DISABLE_COPY_AND_ASSIGN(OpKernelInfoMap); +}; + +//////////////// Op Kernel Info Builder ///////////////// +// format: PD_PT_ARGS_PARSE(...) 
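+// CustomKernelArgsParseFn fills an OpKernelInfo's arg defs from the user
+// function's signature; CustomKernelArgsDefFn then lets user code adjust
+// individual args, e.g. `kernel->OutputAt(0).SetDataType(...)` inside the
+// PD_REGISTER_KERNEL body.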
+using CustomKernelArgsParseFn = void (*)(OpKernelInfo* op_kernel_info); +using CustomKernelArgsDefFn = void (*)(OpKernelInfo* kernel); + +class PADDLE_API OpKernelInfoBuilder { + public: + explicit OpKernelInfoBuilder(std::string&& op_name, + pten::Backend backend, + pten::DataLayout data_layout, + pten::DataType data_type); + + OpKernelInfoBuilder& SetKernelFn(CustomKernelFunc func); + OpKernelInfoBuilder& SetVariadicKernelFn(void* func); + OpKernelInfoBuilder& ArgsParse(CustomKernelArgsParseFn func); + OpKernelInfoBuilder& ArgsDef(CustomKernelArgsDefFn func); + + private: + // op name + std::string op_name_; + + // kernel key info + pten::Backend backend_{pten::Backend::UNDEFINED}; + pten::DataLayout layout_{pten::DataLayout::UNDEFINED}; + pten::DataType dtype_{pten::DataType::UNDEFINED}; + + // ref current info ptr + OpKernelInfo* info_ptr_; +}; +/////////////////////// Custom kernel register API ///////////////////////// +// For inference: compile directly with framework +// Call after PD_REGISTER_KERNEL(...) +void RegisterAllCustomKernel(); + +// Using this api to load compiled custom kernel's dynamic library and +// register custom kernels +void LoadCustomKernelLib(const std::string& dso_name); + +//////////////// Custom kernel register macro ///////////////// +#define PD_BACKEND(arg__) pten::Backend::arg__ +#define PD_DATALAYOUT(arg__) pten::DataLayout::arg__ +#define PD_DATATYPE(arg__) pten::DataType::arg__ + +#define PD_REGISTER_KERNEL(name, backend, layout, dtype, func) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_kernel__##name##_##backend##_##layout##_##dtype, \ + "PD_REGISTER_KERNEL must be called in global namespace."); \ + void __PD_USER_args_def_##name##_##backend##_##layout_##dtype( \ + ::paddle::OpKernelInfo* op_kernel_info); \ + static ::paddle::OpKernelInfoBuilder \ + __op_kernel_info_##name##_##backend##_##layout##_##dtype = \ + ::paddle::OpKernelInfoBuilder(#name, \ + PD_BACKEND(backend), \ + PD_DATALAYOUT(layout), \ + PD_DATATYPE(dtype)) \ + .SetKernelFn(PD_PT_KERNEL(func)) \ + .SetVariadicKernelFn(PD_PT_VARIADIC_KERNEL(func)) \ + .ArgsParse(PD_PT_ARGS_PARSE(func)) \ + .ArgsDef( \ + &__PD_USER_args_def_##name##_##backend##_##layout_##dtype); \ + void __PD_USER_args_def_##name##_##backend##_##layout_##dtype( \ + ::paddle::OpKernelInfo* kernel) + +} // namespace paddle diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt index d3088c44834..3fe4baca773 100644 --- a/paddle/pten/api/lib/CMakeLists.txt +++ b/paddle/pten/api/lib/CMakeLists.txt @@ -3,16 +3,17 @@ add_subdirectory(utils) cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) if (WITH_GPU) - nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) + nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api) elseif (WITH_ROCM) - hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) + hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api) else() - cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce) + cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api) endif() cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor 
+
+}  // namespace paddle
diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt
index d3088c44834..3fe4baca773 100644
--- a/paddle/pten/api/lib/CMakeLists.txt
+++ b/paddle/pten/api/lib/CMakeLists.txt
@@ -3,16 +3,17 @@ add_subdirectory(utils)
 cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place)
 
 if (WITH_GPU)
-  nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
+  nv_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
 elseif (WITH_ROCM)
-  hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
+  hip_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
 else()
-  cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce)
+  cc_library(pten_tensor SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce utils_api pten_function_api)
 endif()
 
 cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context kernel_factory)
 cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor)
+cc_library(op_kernel_info SRCS op_kernel_info.cc DEPS pten_tensor)
 
 # forward api file
 set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py)
diff --git a/paddle/pten/api/lib/op_kernel_info.cc b/paddle/pten/api/lib/op_kernel_info.cc
new file mode 100644
index 00000000000..db474d457c3
--- /dev/null
+++ b/paddle/pten/api/lib/op_kernel_info.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/ext/op_kernel_info.h"
+#include "paddle/fluid/framework/custom_kernel.h"
+
+namespace paddle {
+
+////////////////////// Op Kernel Info //////////////////////
+
+OpKernelInfo& OpKernelInfo::SetKernelFn(CustomKernelFunc&& func) {
+  kernel_fn_ = std::forward<CustomKernelFunc>(func);
+  return *this;
+}
+
+OpKernelInfo& OpKernelInfo::SetVariadicKernelFn(void* func) {
+  variadic_kernel_fn_ = func;
+  return *this;
+}
+
+//////////////// Op Kernel Info Map /////////////////
+
+std::vector<OpKernelInfo>& OpKernelInfoMap::operator[](
+    const std::string& name) {
+  return map_[name];
+}
+
+const std::unordered_map<std::string, std::vector<OpKernelInfo>>&
+OpKernelInfoMap::GetMap() const {
+  return map_;
+}
+
+//////////////// Op Kernel Info Builder /////////////////
+
+OpKernelInfoBuilder::OpKernelInfoBuilder(std::string&& op_name,
+                                         pten::Backend backend,
+                                         pten::DataLayout data_layout,
+                                         pten::DataType data_type) {
+  // 1. member assign
+  op_name_ = std::forward<std::string>(op_name);
+  backend_ = backend;
+  layout_ = data_layout;
+  dtype_ = data_type;
+
+  // 2. info parse
+  auto& info_vector = OpKernelInfoMap::Instance()[op_name_];
+  auto op_kernel_info = OpKernelInfo(op_name_, backend_, layout_, dtype_);
+  info_vector.emplace_back(std::move(op_kernel_info));
+
+  // 3. get current info ptr
+  info_ptr_ = &(info_vector.back());
+}
+
+OpKernelInfoBuilder& OpKernelInfoBuilder::SetKernelFn(CustomKernelFunc func) {
+  info_ptr_->SetKernelFn(std::forward<CustomKernelFunc>(func));
+  return *this;
+}
+
+OpKernelInfoBuilder& OpKernelInfoBuilder::SetVariadicKernelFn(void* func) {
+  info_ptr_->SetVariadicKernelFn(func);
+  return *this;
+}
+
+OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsParse(
+    CustomKernelArgsParseFn func) {
+  func(this->info_ptr_);
+  return *this;
+}
+
+OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsDef(CustomKernelArgsDefFn func) {
+  func(this->info_ptr_);
+  return *this;
+}
+
+/////////////////////// Op register API /////////////////////////
+
+// For inference: compiled directly with the framework.
+// Call after PD_REGISTER_KERNEL(...)
+void RegisterAllCustomKernel() {
+  auto& op_kernel_info_map = OpKernelInfoMap::Instance();
+  framework::RegisterKernelWithMetaInfoMap(op_kernel_info_map);
+}
+
+// Use this API to load a compiled custom kernel dynamic library and
+// register its custom kernels
+void LoadCustomKernelLib(const std::string& dso_name) {
+  framework::LoadCustomKernelLib(dso_name);
+}
+
+}  // namespace paddle
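+
+// Hedged note (assumption, not stated in this file): the loader side is
+// expected to resolve the exported symbol from the plugin .so roughly as
+//   auto get_map = reinterpret_cast<paddle::OpKernelInfoMap& (*)()>(
+//       dlsym(handle, "PD_GetOpKernelInfoMap"));
+//   auto& info_map = get_map();
+// which is why the accessor below must be exported with C linkage.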
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// C-API to get the global OpKernelInfoMap.
+paddle::OpKernelInfoMap& PD_GetOpKernelInfoMap() {
+  return paddle::OpKernelInfoMap::Instance();
+}
+
+#ifdef __cplusplus
+}  // end extern "C"
+#endif
diff --git a/paddle/pten/core/kernel_def.h b/paddle/pten/core/kernel_def.h
index 3884bb55e47..9b91720d86f 100644
--- a/paddle/pten/core/kernel_def.h
+++ b/paddle/pten/core/kernel_def.h
@@ -26,7 +26,7 @@ class KernelSignature;
 class ArgumentMappingContext;
 class InferMetaContext;
 
-using KernelFn = void (*)(KernelContext* ctx);
+using KernelFn = std::function<void(KernelContext* ctx)>;
 using KernelArgsDefFn = void (*)(Kernel* kernel);
 using KernelArgsParseFn = void (*)(const KernelKey& default_key,
                                    KernelArgsDef* args_def);
diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h
index 25e3439a640..b21c71f3fa1 100644
--- a/paddle/pten/core/kernel_factory.h
+++ b/paddle/pten/core/kernel_factory.h
@@ -49,8 +49,6 @@ using DataLayout = paddle::experimental::DataLayout;
 
 class KernelContext;
 
-using KernelFn = void (*)(KernelContext* ctx);
-
 class KernelKey {
  public:
   KernelKey() = default;
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index cd8ce07f800..4208132b980 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -1,6 +1,6 @@
 # for paddle test case
 if(WITH_TESTING)
-  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags)
+  cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init device_context memory gtest gflags)
 endif()
 
 cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags)
diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py
index 03b9ad7fc2d..5e023e9248c 100644
--- a/python/paddle/fluid/core.py
+++ b/python/paddle/fluid/core.py
@@ -371,6 +371,17 @@ if load_noavx:
         raise e
 
 
+def set_paddle_custom_device_lib_path(lib_path):
+    if os.environ.get('CUSTOM_DEVICE_ROOT', None) is not None:
+        # use the environment value that is already set
+        return
+    if os.path.exists(lib_path):
+        # set the CUSTOM_DEVICE_ROOT default path
+        os.environ['CUSTOM_DEVICE_ROOT'] = os.path.normpath(lib_path)
+    else:
+        os.environ['CUSTOM_DEVICE_ROOT'] = ''
+
+
 # set paddle lib path
 def set_paddle_lib_path():
     site_dirs = site.getsitepackages() if hasattr(
@@ -380,11 +391,15 @@ def set_paddle_lib_path():
             lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
             if os.path.exists(lib_dir):
                 _set_paddle_lib_path(lib_dir)
+                set_paddle_custom_device_lib_path(
+                    os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
                 return
     if hasattr(site, 'USER_SITE'):
         lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
         if os.path.exists(lib_dir):
             _set_paddle_lib_path(lib_dir)
+            set_paddle_custom_device_lib_path(
+                os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']))
 
 set_paddle_lib_path()
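+
+# Hedged usage note (assumption, not part of this patch): the net effect is
+# that <site-packages>/paddle-plugins is probed by default, and a user can
+# still redirect plugin discovery before importing paddle:
+#   os.environ['CUSTOM_DEVICE_ROOT'] = '/path/to/plugin/dir'
+#   import paddle  # .so files found there are loaded and their kernels registered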
diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt
index d73c4e3acb9..587d4aee34c 100644
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
@@ -9,5 +9,6 @@ endforeach()
 add_subdirectory(unittests)
 add_subdirectory(book)
 add_subdirectory(custom_op)
+add_subdirectory(custom_kernel)
 
 set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt b/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt
new file mode 100644
index 00000000000..b2bdfac9080
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/CMakeLists.txt
@@ -0,0 +1,2 @@
+py_test(test_custom_kernel_dot SRCS test_custom_kernel_dot.py)
+py_test(test_custom_kernel_load SRCS test_custom_kernel_load.py)
diff --git a/python/paddle/fluid/tests/custom_kernel/__init__.py b/python/paddle/fluid/tests/custom_kernel/__init__.py
new file mode 100644
index 00000000000..97043fd7ba6
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc
new file mode 100644
index 00000000000..e61b7314ef6
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc
@@ -0,0 +1,53 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/extension.h"
+
+namespace paddle {
+
+namespace custom_kernel {
+
+// Here we use dot as the test case.
+// This test will fail once an int8 dot kernel is supported in the framework.
+template <typename T>
+void Dot(const paddle::CPUContext& dev_ctx,
+         const paddle::Tensor& x,
+         const paddle::Tensor& y,
+         paddle::Tensor* out) {
+  auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
+  auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
+  auto* z = out->mutable_data<T>(paddle::PlaceType::kCPU);
+
+  // Loop over the total N elements of both operands while sum-reducing every
+  // B pairs along the way where B is the dimension of the least ordered axis
+  auto shape = x.shape();
+  auto const N = x.numel();
+  auto const B = shape[shape.size() - 1];
+
+  for (int j = 0; j < N / B; j++) {
+    T ss = 0;
+    for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++);
+    z[j] = ss;
+  }
+}
+
+}  // namespace custom_kernel
+}  // namespace paddle
+
+PD_REGISTER_KERNEL(
+    dot, CPU, ALL_LAYOUT, INT8, paddle::custom_kernel::Dot<int8_t>) {
+  /* do some args define here
+   * the only param that can be used is OpKernelInfo* kernel */
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::INT8);
+}
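+
+// Worked example (comment only): for inputs of shape [2, 10], B = 10 and
+// N = 20, so the loop writes z[0] = sum of the first 10 products and z[1] =
+// sum of the next 10, i.e. the row-wise inner product that the Python test
+// checks against np.sum(x_data * y_data, axis=1).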
diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
new file mode 100644
index 00000000000..5e3bd2f8ed9
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py
@@ -0,0 +1,53 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from paddle.fluid import core
+from distutils.sysconfig import get_python_lib
+from distutils.core import setup, Extension
+
+# cc flags
+paddle_extra_compile_args = ['-std=c++14', '-shared', '-fPIC']
+if core.is_compiled_with_npu():
+    paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0']
+
+# include path
+site_packages_path = get_python_lib()
+paddle_custom_kernel_include = [
+    os.path.join(site_packages_path, 'paddle', 'include'),
+]
+
+# libs path
+paddle_custom_kernel_library_dir = [
+    os.path.join(site_packages_path, 'paddle', 'fluid'),
+]
+
+# libs
+libs = [':core_avx.so']
+if not core.has_avx_core and core.has_noavx_core:
+    libs = [':core_noavx.so']
+
+custom_kernel_dot_module = Extension(
+    'custom_kernel_dot',
+    sources=['custom_kernel_dot.cc'],
+    include_dirs=paddle_custom_kernel_include,
+    library_dirs=paddle_custom_kernel_library_dir,
+    libraries=libs,
+    extra_compile_args=paddle_extra_compile_args)
+
+setup(
+    name='custom_kernel_dot',
+    version='1.0',
+    description='custom kernel for compiling',
+    ext_modules=[custom_kernel_dot_module])
diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
new file mode 100644
index 00000000000..13d8a29e71b
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_dot.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import site
+import unittest
+import numpy as np
+
+
+# use dot as the test case
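+# dot computes, for the 2-D inputs used here, the row-wise inner product
+#   out[i] = sum_j x[i, j] * y[i, j]
+# which the numpy reference below reproduces via np.sum(x_data * y_data, axis=1)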
+class TestCustomKernelDot(unittest.TestCase):
+    def setUp(self):
+        # compile the .so and place it under the current path
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+        # --inplace places the output .so file in the current dir
+        cmd = 'cd {} && {} custom_kernel_dot_setup.py build_ext --inplace'.format(
+            cur_dir, sys.executable)
+        os.system(cmd)
+
+        # set environment for loading and registering compiled custom kernels,
+        # only valid in the current process
+        os.environ['CUSTOM_DEVICE_ROOT'] = cur_dir
+
+    def test_custom_kernel_dot_run(self):
+        # test dot run
+        x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        result = np.sum(x_data * y_data, axis=1).reshape([2, 1])
+
+        import paddle
+        paddle.set_device('cpu')
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        out = paddle.dot(x, y)
+
+        self.assertTrue(
+            np.array_equal(out.numpy(), result),
+            "custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
+                                                                    result))
+
+    def tearDown(self):
+        del os.environ['CUSTOM_DEVICE_ROOT']
+
+
+if __name__ == '__main__':
+    if os.name == 'nt' or sys.platform.startswith('darwin'):
+        # only Linux is supported for now
+        exit()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py
new file mode 100644
index 00000000000..1d7b29e8511
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py
@@ -0,0 +1,80 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
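+
+# Hedged path example (assumption, derived from the setUp code below): on a
+# typical install the default plugin dir resolves to
+#   <site-packages>/paddle/libs/../../paddle-plugins == <site-packages>/paddle-plugins
+# which matches the default probed by set_paddle_custom_device_lib_path()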
+
+import os
+import sys
+import site
+import unittest
+import numpy as np
+
+
+class TestCustomKernelLoad(unittest.TestCase):
+    def setUp(self):
+        # compile the .so and place it under the current path
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+        # --inplace places the output .so file in the current dir
+        cmd = 'cd {} && {} custom_kernel_dot_setup.py build_ext --inplace'.format(
+            cur_dir, sys.executable)
+        os.system(cmd)
+
+        # get the paddle lib path for placing the .so
+        paddle_lib_path = ''
+        site_dirs = site.getsitepackages() if hasattr(
+            site, 'getsitepackages'
+        ) else [x for x in sys.path if 'site-packages' in x]
+        for site_dir in site_dirs:
+            lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs'])
+            if os.path.exists(lib_dir):
+                paddle_lib_path = lib_dir
+                break
+        if paddle_lib_path == '':
+            if hasattr(site, 'USER_SITE'):
+                lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs'])
+                if os.path.exists(lib_dir):
+                    paddle_lib_path = lib_dir
+        self.default_path = os.path.sep.join(
+            [paddle_lib_path, '..', '..', 'paddle-plugins'])
+        # copy the .so to the default path
+        cmd = 'mkdir -p {} && cp ./*.so {}'.format(self.default_path,
+                                                   self.default_path)
+        os.system(cmd)  # os.system waits for the copy to finish
+
+    def test_custom_kernel_dot_load(self):
+        # test dot load
+        x_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        y_data = np.random.uniform(1, 5, [2, 10]).astype(np.int8)
+        result = np.sum(x_data * y_data, axis=1).reshape([2, 1])
+
+        import paddle
+        paddle.set_device('cpu')
+        x = paddle.to_tensor(x_data)
+        y = paddle.to_tensor(y_data)
+        out = paddle.dot(x, y)
+
+        self.assertTrue(
+            np.array_equal(out.numpy(), result),
+            "custom kernel dot out: {},\n numpy dot out: {}".format(out.numpy(),
+                                                                    result))
+
+    def tearDown(self):
+        cmd = 'rm -rf {}'.format(self.default_path)
+        os.system(cmd)
+
+
+if __name__ == '__main__':
+    if os.name == 'nt' or sys.platform.startswith('darwin'):
+        # only Linux is supported for now
+        exit()
+    unittest.main()
diff --git a/python/setup.py.in b/python/setup.py.in
index e8cc2914521..d1c0157c2b3 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -573,7 +573,8 @@ headers = (
     list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/pten/common')) +  # pten common headers
     # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform`
     # to `paddle/pten/api/ext`,
-    ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'])
+    ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h'] +
+    ['@PADDLE_SOURCE_DIR@/paddle/utils/small_vector.h'])
 
 if '${WITH_MKLDNN}' == 'ON':
     headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include'))    # mkldnn
-- 
GitLab