diff --git a/paddle/extension.h b/paddle/extension.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c64b92c5a3746b6f7f240c87bfb03515d80ca16
--- /dev/null
+++ b/paddle/extension.h
@@ -0,0 +1,18 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// All Paddle APIs in the C++ frontend
+#include "paddle/fluid/extension/include/all.h"
diff --git a/paddle/fluid/extension/include/all.h b/paddle/fluid/extension/include/all.h
new file mode 100644
index 0000000000000000000000000000000000000000..5aa61f8203e75320cfdf11ed34fe9a7462548c60
--- /dev/null
+++ b/paddle/fluid/extension/include/all.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#if !defined(_MSC_VER) && __cplusplus < 201103L
+#error C++11 or later compatible compiler is required to use Paddle.
+#endif
+
+#include "paddle/fluid/extension/include/dispatch.h"
+#include "paddle/fluid/extension/include/dtype.h"
+#include "paddle/fluid/extension/include/op_meta_info.h"
+#include "paddle/fluid/extension/include/place.h"
+#include "paddle/fluid/extension/include/tensor.h"
diff --git a/paddle/fluid/extension/include/dispatch.h b/paddle/fluid/extension/include/dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..c22971039521c364f0f280a8d90e38b0b768e926
--- /dev/null
+++ b/paddle/fluid/extension/include/dispatch.h
@@ -0,0 +1,168 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/extension/include/dtype.h"
+
+namespace paddle {
+
+///////// Basic Macro ///////////
+
+#define PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \
+  case enum_type: { \
+    using HINT = type; \
+    __VA_ARGS__(); \
+    break; \
+  }
+
+#define PD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \
+  PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, data_t, __VA_ARGS__)
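+
+// For reference, one expanded case of the macro above looks like this
+// (illustrative expansion only, not code generated by this patch):
+//
+//   case ::paddle::DataType::FLOAT32: {
+//     using data_t = float;
+//     __VA_ARGS__();
+//     break;
+//   }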
+
+///////// Floating Dispatch Macro ///////////
+
+#define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
+  [&] { \
+    const auto& __dtype__ = TYPE; \
+    switch (__dtype__) { \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \
+                           __VA_ARGS__) \
+      default: \
+        throw std::runtime_error("function " #NAME \
+                                 " not implemented for data type `" + \
+                                 ::paddle::ToString(__dtype__) + "`"); \
+    } \
+  }()
+
+///////// Integral Dispatch Macro ///////////
+
+#define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
+  [&] { \
+    const auto& __dtype__ = TYPE; \
+    switch (__dtype__) { \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \
+                           __VA_ARGS__) \
+      default: \
+        throw std::runtime_error("function " #NAME \
+                                 " not implemented for data type `" + \
+                                 ::paddle::ToString(__dtype__) + "`"); \
+    } \
+  }()
+
+///////// Complex Dispatch Macro ///////////
+
+#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \
+  [&] { \
+    const auto& __dtype__ = TYPE; \
+    switch (__dtype__) { \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \
+                           ::paddle::complex64, __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \
+                           ::paddle::complex128, __VA_ARGS__) \
+      default: \
+        throw std::runtime_error("function " #NAME \
+                                 " not implemented for data type `" + \
+                                 ::paddle::ToString(__dtype__) + "`"); \
+    } \
+  }()
+
+///////// Floating and Integral Dispatch Macro ///////////
+
+#define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \
+  [&] { \
+    const auto& __dtype__ = TYPE; \
+    switch (__dtype__) { \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \
+                           __VA_ARGS__) \
+      default: \
+        throw std::runtime_error("function " #NAME \
+                                 " not implemented for data type `" + \
+                                 ::paddle::ToString(__dtype__) + "`"); \
+    } \
+  }()
+
+///////// Floating and Complex Dispatch Macro ///////////
+
+#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \
+  [&] { \
+    const auto& __dtype__ = TYPE; \
+    switch (__dtype__) { \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \
+                           ::paddle::complex64, __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \
+                           ::paddle::complex128, __VA_ARGS__) \
+      default: \
+        throw std::runtime_error("function " #NAME \
+                                 " not implemented for data type `" + \
+                                 ::paddle::ToString(__dtype__) + "`"); \
+    } \
+  }()
+
+///////// Floating, Integral and Complex Dispatch Macro ///////////
+
+#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) \
+  [&] { \
+    const auto& __dtype__ = TYPE; \
+    switch (__dtype__) { \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \
+                           __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \
+                           ::paddle::complex64, __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \
+                           ::paddle::complex128, __VA_ARGS__) \
+      default: \
+        throw std::runtime_error("function " #NAME \
+                                 " not implemented for data type `" + \
+                                 ::paddle::ToString(__dtype__) + "`"); \
+    } \
+  }()
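+
+// Illustrative usage sketch of the dispatch macros. `relu_cpu_kernel` and
+// `relu_cpu_forward` below are hypothetical user-side functions, not part of
+// this patch; `data_t` is the type hint injected by PD_PRIVATE_CASE_TYPE:
+//
+//   template <typename data_t>
+//   void relu_cpu_kernel(const data_t* x, data_t* out, int64_t numel) {
+//     for (int64_t i = 0; i < numel; ++i) {
+//       out[i] = x[i] > static_cast<data_t>(0) ? x[i]
+//                                              : static_cast<data_t>(0);
+//     }
+//   }
+//
+//   void relu_cpu_forward(const paddle::Tensor& x, paddle::Tensor* out) {
+//     PD_DISPATCH_FLOATING_TYPES(x.type(), "relu_cpu_kernel", ([&] {
+//       relu_cpu_kernel<data_t>(x.data<data_t>(),
+//                               out->mutable_data<data_t>(), x.size());
+//     }));
+//   }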
+
+// TODO(chenweihang): Add more macros in the future if needed
+
+}  // namespace paddle
diff --git a/paddle/fluid/extension/include/dtype.h b/paddle/fluid/extension/include/dtype.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5d2e0f820555a5d4c0da33a35475a0fce44ec64
--- /dev/null
+++ b/paddle/fluid/extension/include/dtype.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdexcept>
+#include <string>
+
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex128.h"
+#include "paddle/fluid/platform/complex64.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+
+using float16 = paddle::platform::float16;
+using bfloat16 = paddle::platform::bfloat16;
+using complex64 = paddle::platform::complex64;
+using complex128 = paddle::platform::complex128;
+
+enum DataType {
+  BOOL,
+  INT8,
+  UINT8,
+  INT16,
+  INT32,
+  INT64,
+  FLOAT16,
+  BFLOAT16,
+  FLOAT32,
+  FLOAT64,
+  COMPLEX64,
+  COMPLEX128,
+  // TODO(JiabinYang): support more data types if needed.
+};
+
+inline std::string ToString(DataType dtype) {
+  switch (dtype) {
+    case DataType::BOOL:
+      return "bool";
+    case DataType::INT8:
+      return "int8_t";
+    case DataType::UINT8:
+      return "uint8_t";
+    case DataType::INT16:
+      return "int16_t";
+    case DataType::INT32:
+      return "int32_t";
+    case DataType::INT64:
+      return "int64_t";
+    case DataType::FLOAT16:
+      return "float16";
+    case DataType::BFLOAT16:
+      return "bfloat16";
+    case DataType::FLOAT32:
+      return "float";
+    case DataType::FLOAT64:
+      return "double";
+    case DataType::COMPLEX64:
+      return "complex64";
+    case DataType::COMPLEX128:
+      return "complex128";
+    default:
+      throw std::runtime_error("Unsupported paddle enum data type.");
+  }
+}
+
+#define PD_FOR_EACH_DATA_TYPE(_) \
+  _(bool, DataType::BOOL) \
+  _(int8_t, DataType::INT8) \
+  _(uint8_t, DataType::UINT8) \
+  _(int16_t, DataType::INT16) \
+  _(int, DataType::INT32) \
+  _(int64_t, DataType::INT64) \
+  _(float16, DataType::FLOAT16) \
+  _(bfloat16, DataType::BFLOAT16) \
+  _(float, DataType::FLOAT32) \
+  _(double, DataType::FLOAT64) \
+  _(complex64, DataType::COMPLEX64) \
+  _(complex128, DataType::COMPLEX128)
+
+template <paddle::DataType T>
+struct DataTypeToCPPType;
+
+#define PD_SPECIALIZE_DataTypeToCPPType(cpp_type, data_type) \
+  template <> \
+  struct DataTypeToCPPType<data_type> { \
+    using type = cpp_type; \
+  };
+
+PD_FOR_EACH_DATA_TYPE(PD_SPECIALIZE_DataTypeToCPPType)
+
+#undef PD_SPECIALIZE_DataTypeToCPPType
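+
+// A minimal compile-time sanity check of the mapping (illustrative only;
+// assumes <type_traits> is available for std::is_same):
+//
+//   static_assert(
+//       std::is_same<DataTypeToCPPType<DataType::FLOAT32>::type,
+//                    float>::value,
+//       "each DataType enum value maps to its C++ type");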
+ * + */ + +namespace paddle { +namespace framework { +class OpMetaInfoHelper; +} // namespace framework + +using Tensor = paddle::Tensor; + +#define PD_DISABLE_COPY_AND_ASSIGN(classname) \ + private: \ + classname(const classname&) = delete; \ + classname(classname&&) = delete; \ + classname& operator=(const classname&) = delete; \ + classname& operator=(classname&&) = delete + +///////////////// Util Define and Function //////////////// + +inline std::string Grad(const std::string& var_name) { + std::string result; + result.reserve(var_name.size() + 5U); + result += var_name; + result += "@GRAD"; + return result; +} + +////////////////////// Kernel Function (PD_KERNEL) //////////////////////// + +// Record Op kernel core function +using KernelFunc = std::vector (*)(std::vector inputs, + std::vector attrs); + +template +struct TypeTag {}; + +template +struct KernelFuncImpl; + +template +struct KernelFuncImpl { + static Return Compute(std::vector inputs, + std::vector attrs) { + return ComputeCallHelper>::template Compute<0, 0>( + inputs, attrs); + } + + private: + template + struct ComputeCallHelper; + + // for Tensor input + template + struct ComputeCallHelper { + template + static Return Compute(std::vector inputs, + std::vector attrs, + const PreviousArgs&... pargs) { + static_assert(attr_idx == 0, + "Input tensor should appear before attributes."); + const Tensor& arg = inputs[in_idx]; + return ComputeCallHelper::template Compute( + inputs, attrs, pargs..., arg); + } + }; + + // TODO(chenweihang): add support for attribute input + // int attribute input (not used now) + template + struct ComputeCallHelper { + template + static Return Compute(std::vector inputs, + std::vector attrs, + const PreviousArgs&... pargs) { + try { + int arg = boost::any_cast(attrs[attr_idx]); + return ComputeCallHelper::template Compute( + inputs, attrs, pargs..., arg); + } catch (boost::bad_any_cast&) { + throw std::runtime_error( + "Attribute cast error in custom operator. Expected int value."); + } + } + }; + + // end: base template + template + struct ComputeCallHelper> { + template + static Return Compute(std::vector inputs, + std::vector attrs, const Args&... args) { + return impl_fn(args...); + } + }; +}; + +#define PD_KERNEL(...) \ + ::paddle::KernelFuncImpl::Compute + +/////////////// InferShape Function (PD_INFER_SHAPE) /////////////// + +// Record Op infershape core function +using InferShapeFunc = std::vector> (*)( + std::vector> input_shapes); + +template +struct InferShapeFuncImpl; + +template +struct InferShapeFuncImpl { + static Return InferShape(std::vector> input_shapes) { + return InferShapeCallHelper>::template InferShape<0>( + input_shapes); + } + + private: + template + struct InferShapeCallHelper; + + // only one type input: std::vector + template + struct InferShapeCallHelper, Tail...> { + template + static Return InferShape(std::vector> input_shapes, + const PreviousArgs&... pargs) { + std::vector arg = input_shapes[in_idx]; + return InferShapeCallHelper::template InferShape( + input_shapes, pargs..., arg); + } + }; + + // end: base template + template + struct InferShapeCallHelper> { + template + static Return InferShape(std::vector> input_shapes, + const Args&... args) { + return impl_fn(args...); + } + }; +}; + +#define PD_INFER_SHAPE(...) 
+
+/////////////// InferShape Function (PD_INFER_SHAPE) ///////////////
+
+// Record Op infershape core function
+using InferShapeFunc = std::vector<std::vector<int64_t>> (*)(
+    std::vector<std::vector<int64_t>> input_shapes);
+
+template <typename F, F f>
+struct InferShapeFuncImpl;
+
+template <typename Return, typename... Args, Return (*impl_fn)(Args...)>
+struct InferShapeFuncImpl<Return (*)(Args...), impl_fn> {
+  static Return InferShape(std::vector<std::vector<int64_t>> input_shapes) {
+    return InferShapeCallHelper<Args..., TypeTag<int>>::template InferShape<0>(
+        input_shapes);
+  }
+
+ private:
+  template <typename... RemainingArgs>
+  struct InferShapeCallHelper;
+
+  // only one type input: std::vector<int64_t>
+  template <typename... Tail>
+  struct InferShapeCallHelper<std::vector<int64_t>, Tail...> {
+    template <int in_idx, typename... PreviousArgs>
+    static Return InferShape(std::vector<std::vector<int64_t>> input_shapes,
+                             const PreviousArgs&... pargs) {
+      std::vector<int64_t> arg = input_shapes[in_idx];
+      return InferShapeCallHelper<Tail...>::template InferShape<in_idx + 1>(
+          input_shapes, pargs..., arg);
+    }
+  };
+
+  // end: base template
+  template <typename T>
+  struct InferShapeCallHelper<TypeTag<T>> {
+    template <int in_idx>
+    static Return InferShape(std::vector<std::vector<int64_t>> input_shapes,
+                             const Args&... args) {
+      return impl_fn(args...);
+    }
+  };
+};
+
+#define PD_INFER_SHAPE(...) \
+  ::paddle::InferShapeFuncImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::InferShape
+
+/////////////// InferDataType Function (PD_INFER_DTYPE) ///////////////
+
+// Record Op Infer dtype core function
+using InferDtypeFunc =
+    std::vector<DataType> (*)(std::vector<DataType> input_dtypes);
+
+template <typename F, F f>
+struct InferDtypeFuncImpl;
+
+template <typename Return, typename... Args, Return (*impl_fn)(Args...)>
+struct InferDtypeFuncImpl<Return (*)(Args...), impl_fn> {
+  static Return InferDtype(std::vector<DataType> input_dtypes) {
+    return InferDtypeCallHelper<Args..., TypeTag<int>>::template InferDtype<0>(
+        input_dtypes);
+  }
+
+ private:
+  template <typename... RemainingArgs>
+  struct InferDtypeCallHelper;
+
+  // Only one type input now: DataType
+  template <typename... Tail>
+  struct InferDtypeCallHelper<DataType, Tail...> {
+    template <int in_idx, typename... PreviousArgs>
+    static Return InferDtype(std::vector<DataType> input_dtypes,
+                             const PreviousArgs&... pargs) {
+      DataType arg = input_dtypes[in_idx];
+      return InferDtypeCallHelper<Tail...>::template InferDtype<in_idx + 1>(
+          input_dtypes, pargs..., arg);
+    }
+  };
+
+  // end: base template
+  template <typename T>
+  struct InferDtypeCallHelper<TypeTag<T>> {
+    template <int in_idx>
+    static Return InferDtype(std::vector<DataType> input_dtypes,
+                             const Args&... args) {
+      return impl_fn(args...);
+    }
+  };
+};
+
+#define PD_INFER_DTYPE(...) \
+  ::paddle::InferDtypeFuncImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::InferDtype
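+
+// Illustrative user-side sketch for the two wrappers above (hypothetical
+// functions, not part of this patch). Each parameter is filled positionally
+// from input_shapes[i] / input_dtypes[i]:
+//
+//   std::vector<std::vector<int64_t>> ReluInferShape(
+//       std::vector<int64_t> x_shape) {
+//     return {x_shape};
+//   }
+//
+//   std::vector<paddle::DataType> ReluInferDtype(paddle::DataType x_dtype) {
+//     return {x_dtype};
+//   }
+//
+//   // then: PD_INFER_SHAPE(ReluInferShape), PD_INFER_DTYPE(ReluInferDtype)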
+
+////////////////////// Op Meta Info //////////////////////
+
+class OpMetaInfo {
+ public:
+  explicit OpMetaInfo(const std::string& op_name) : name_(op_name) {}
+  OpMetaInfo& Inputs(std::vector<std::string>&& inputs);
+  OpMetaInfo& Outputs(std::vector<std::string>&& outputs);
+  OpMetaInfo& SetKernelFn(KernelFunc&& func);
+  OpMetaInfo& SetInferShapeFn(InferShapeFunc&& func);
+  OpMetaInfo& SetInferDtypeFn(InferDtypeFunc&& func);
+
+ private:
+  friend class framework::OpMetaInfoHelper;
+
+  // 1. desc info
+  std::string name_;
+  std::vector<std::string> inputs_;
+  std::vector<std::string> outputs_;
+  std::vector<std::string> attrs_;
+
+  // 2. func info
+  KernelFunc kernel_fn_;
+  InferShapeFunc infer_shape_fn_;
+  InferDtypeFunc infer_dtype_fn_;
+};
+
+//////////////// Op Meta Info Map /////////////////
+
+class OpMetaInfoMap {
+ public:
+  // This function's implementation must stay in the header file:
+  // if it were moved into the .cc file, meta info could not be
+  // added into this map.
+  static OpMetaInfoMap& Instance() {
+    static OpMetaInfoMap g_custom_op_meta_info_map;
+    return g_custom_op_meta_info_map;
+  }
+
+  std::vector<OpMetaInfo>& operator[](const std::string& name);
+
+  const std::unordered_map<std::string, std::vector<OpMetaInfo>>& GetMap()
+      const;
+
+ private:
+  OpMetaInfoMap() = default;
+  std::unordered_map<std::string, std::vector<OpMetaInfo>> map_;
+
+  PD_DISABLE_COPY_AND_ASSIGN(OpMetaInfoMap);
+};
+
+//////////////// Op Meta Info Builder /////////////////
+
+class OpMetaInfoBuilder {
+ public:
+  explicit OpMetaInfoBuilder(std::string&& name);
+  OpMetaInfoBuilder& Inputs(std::vector<std::string>&& inputs);
+  OpMetaInfoBuilder& Outputs(std::vector<std::string>&& outputs);
+  OpMetaInfoBuilder& SetKernelFn(KernelFunc&& func);
+  OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc&& func);
+  OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc&& func);
+  OpMetaInfoBuilder& SetBackwardOp(const std::string& bwd_op_name);
+
+ private:
+  // Forward Op name
+  std::string name_;
+  // Pointer to the op meta info currently being constructed
+  OpMetaInfo* info_ptr_;
+};
+
+/////////////////////// Op register API /////////////////////////
+
+// For inference: compile directly with framework.
+// Call after PD_BUILD_OP(...).
+void RegisterAllCustomOperator();
+
+// Use this API to load a compiled custom operator's dynamic library
+// and register the custom operators in it into the framework.
+void LoadCustomOperatorLib(const std::string& dso_name);
+
+/////////////////////// Op register Macro /////////////////////////
+
+#define PD_BUILD_OP_WITH_COUNTER(op_name, counter) \
+  static ::paddle::OpMetaInfoBuilder __op_meta_info_##counter##__ = \
+      ::paddle::OpMetaInfoBuilder(op_name)
+
+#define PD_BUILD_OP_INNER(op_name, counter) \
+  PD_BUILD_OP_WITH_COUNTER(op_name, counter)
+
+#define PD_BUILD_OP(op_name) PD_BUILD_OP_INNER(op_name, __COUNTER__)
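+
+// Illustrative registration sketch for a custom relu op. All Relu* functions
+// are hypothetical user-defined functions, not part of this patch:
+//
+//   PD_BUILD_OP("custom_relu")
+//       .Inputs({"X"})
+//       .Outputs({"Out"})
+//       .SetKernelFn(PD_KERNEL(ReluForward))
+//       .SetInferShapeFn(PD_INFER_SHAPE(ReluInferShape))
+//       .SetInferDtypeFn(PD_INFER_DTYPE(ReluInferDtype))
+//       .SetBackwardOp("custom_relu_grad")
+//       .Inputs({"X", "Out", paddle::Grad("Out")})
+//       .Outputs({paddle::Grad("X")})
+//       .SetKernelFn(PD_KERNEL(ReluBackward));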
+
+}  // namespace paddle
+
+///////////////////// C API ///////////////////
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// C-API to get global OpMetaInfoMap.
+paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap();
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/paddle/fluid/extension/include/place.h b/paddle/fluid/extension/include/place.h
new file mode 100644
index 0000000000000000000000000000000000000000..91d4f41c2135145062fb98b2df77b14ec4e1a32b
--- /dev/null
+++ b/paddle/fluid/extension/include/place.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+
+// TODO(yangjiabin): Add other place support in next PR
+enum class PlaceType { kUNK = -1, kCPU, kGPU };
+
+}  // namespace paddle
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5ce0d1a5858b0422e6187bf2ca0e7198b87ed57
--- /dev/null
+++ b/paddle/fluid/extension/include/tensor.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "paddle/fluid/extension/include/dtype.h"
+#include "paddle/fluid/extension/include/place.h"
+
+namespace paddle {
+namespace framework {
+class CustomTensorUtils;
+}  // namespace framework
+class Tensor {
+ public:
+  /// \brief Construct a Tensor on the target place for a custom op.
+  /// Generally it's only used for users to create a Tensor.
+  explicit Tensor(const PlaceType& place);
+  /// \brief Reset the shape of the tensor.
+  /// Generally it's only used for the input tensor.
+  /// reshape must be called before calling
+  /// mutable_data() or copy_to(const PlaceType& place).
+  /// \param shape The shape to set.
+  void reshape(const std::vector<int>& shape);
+
+  /// \brief Get the memory pointer in CPU or GPU with a
+  /// specific data type.
+  /// Please reshape the tensor first before calling this.
+  /// It's usually used to get the input data pointer.
+  /// \param place The place of the tensor; this overrides
+  /// the original place of the current tensor.
+  template <typename T>
+  T* mutable_data(const PlaceType& place);
+
+  /// \brief Get the memory pointer in CPU or GPU with a
+  /// specific data type. Please reshape the tensor
+  /// first before calling this. It's usually used to get
+  /// the input data pointer.
+  template <typename T>
+  T* mutable_data();
+
+  /// \brief Get the memory pointer directly.
+  /// It's usually used to get the output data pointer.
+  /// \return The tensor data buffer pointer.
+  template <typename T>
+  T* data() const;
+
+  /// \brief Copy the host memory to the tensor data.
+  /// It's usually used to set the input tensor data.
+  /// \param place The target place the tensor will be copied to.
+  template <typename T>
+  Tensor copy_to(const PlaceType& place) const;
+
+  /// \brief Return the shape of the Tensor.
+  std::vector<int> shape() const;
+
+  /// \brief Return the data type of the tensor.
+  /// It's usually used to get the output tensor data type.
+  /// \return The data type of the tensor.
+  DataType type() const;
+
+  /// \brief Get the size (number of elements) of the current tensor.
+  /// \return int64_t.
+  int64_t size() const;
+
+  /// \brief Get the place of the current tensor.
+  /// \return Place.
+  const PlaceType& place() const;
+
+  /// \brief Cast the data type from one to another.
+  Tensor cast(const DataType& target_type) const;
+
+ private:
+  friend class framework::CustomTensorUtils;
+  mutable std::shared_ptr<void> tensor_;
+  mutable PlaceType place_;
+};
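+
+// Illustrative usage sketch of this Tensor API (the GPU lines assume a
+// CUDA-enabled build):
+//
+//   paddle::Tensor t(paddle::PlaceType::kCPU);
+//   t.reshape({2, 3});                       // set the shape first
+//   float* data = t.mutable_data<float>();   // then allocate and get buffer
+//   for (int64_t i = 0; i < t.size(); ++i) data[i] = 1.0f;
+//   paddle::Tensor on_gpu = t.copy_to<float>(paddle::PlaceType::kGPU);
+//   paddle::Tensor as_f64 = t.cast(paddle::DataType::FLOAT64);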
+
+}  // namespace paddle
diff --git a/paddle/fluid/extension/src/op_meta_info.cc b/paddle/fluid/extension/src/op_meta_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f31723e5ac83675884f950c1c4e8917c220bc474
--- /dev/null
+++ b/paddle/fluid/extension/src/op_meta_info.cc
@@ -0,0 +1,123 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/extension/include/op_meta_info.h"
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/custom_operator.h"
+
+namespace paddle {
+
+////////////////////// Op Meta Info //////////////////////
+
+OpMetaInfo& OpMetaInfo::Inputs(std::vector<std::string>&& inputs) {
+  inputs_ = std::forward<std::vector<std::string>>(inputs);
+  return *this;
+}
+OpMetaInfo& OpMetaInfo::Outputs(std::vector<std::string>&& outputs) {
+  outputs_ = std::forward<std::vector<std::string>>(outputs);
+  return *this;
+}
+OpMetaInfo& OpMetaInfo::SetKernelFn(KernelFunc&& func) {
+  kernel_fn_ = std::forward<KernelFunc>(func);
+  return *this;
+}
+OpMetaInfo& OpMetaInfo::SetInferShapeFn(InferShapeFunc&& func) {
+  infer_shape_fn_ = std::forward<InferShapeFunc>(func);
+  return *this;
+}
+OpMetaInfo& OpMetaInfo::SetInferDtypeFn(InferDtypeFunc&& func) {
+  infer_dtype_fn_ = std::forward<InferDtypeFunc>(func);
+  return *this;
+}
+
+//////////////// Op Meta Info Map /////////////////
+
+std::vector<OpMetaInfo>& OpMetaInfoMap::operator[](const std::string& name) {
+  return map_[name];
+}
+
+const std::unordered_map<std::string, std::vector<OpMetaInfo>>&
+OpMetaInfoMap::GetMap() const {
+  return map_;
+}
+
+//////////////// Op Meta Info Builder /////////////////
+
+OpMetaInfoBuilder::OpMetaInfoBuilder(std::string&& name) {
+  name_ = std::forward<std::string>(name);
+  auto& info_vector = OpMetaInfoMap::Instance()[name_];
+  auto op_meta = OpMetaInfo(name_);
+  info_vector.emplace_back(std::move(op_meta));
+  info_ptr_ = &(info_vector.back());
+}
+
+OpMetaInfoBuilder& OpMetaInfoBuilder::Inputs(
+    std::vector<std::string>&& inputs) {
+  info_ptr_->Inputs(std::forward<std::vector<std::string>>(inputs));
+  return *this;
+}
+
+OpMetaInfoBuilder& OpMetaInfoBuilder::Outputs(
+    std::vector<std::string>&& outputs) {
+  info_ptr_->Outputs(std::forward<std::vector<std::string>>(outputs));
+  return *this;
+}
+
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc&& func) {
+  info_ptr_->SetKernelFn(std::forward<KernelFunc>(func));
+  return *this;
+}
+
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc&& func) {
+  info_ptr_->SetInferShapeFn(std::forward<InferShapeFunc>(func));
+  return *this;
+}
+
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc&& func) {
+  info_ptr_->SetInferDtypeFn(std::forward<InferDtypeFunc>(func));
+  return *this;
+}
+
+OpMetaInfoBuilder& OpMetaInfoBuilder::SetBackwardOp(
+    const std::string& bwd_op_name) {
+  auto& info_vector = OpMetaInfoMap::Instance()[name_];
+  auto op_meta = OpMetaInfo(bwd_op_name);
+  info_vector.emplace_back(std::move(op_meta));
+  info_ptr_ = &(info_vector.back());
+  return *this;
+}
+
+/////////////////////// Op register API /////////////////////////
+
+void RegisterAllCustomOperator() {
+  auto& op_meta_info_map = OpMetaInfoMap::Instance();
+  framework::RegisterOperatorWithMetaInfoMap(op_meta_info_map);
+}
+
+void LoadCustomOperatorLib(const std::string& dso_name) {
+  paddle::framework::LoadOpMetaInfoAndRegisterOp(dso_name);
+}
+}  // namespace paddle
+
+extern "C" {
+
+paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap() {
+  return paddle::OpMetaInfoMap::Instance();
+}
+
+}  // end extern "C"
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..11d505a5aab4f4d33926162445cffd3f5ca4db32
--- /dev/null
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -0,0 +1,383 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/extension/include/tensor.h"
+#include <cstring>
+#include <utility>
+#include "paddle/fluid/framework/custom_tensor_utils.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+
+template <typename InType, typename OutType>
+struct CastDataTypeFunctor {
+  HOSTDEVICE inline OutType operator()(InType in) const {
+    return static_cast<OutType>(in);
+  }
+};
+
+template <typename InType>
+struct CastDataType {
+  CastDataType(const framework::Tensor &in, framework::Tensor *out,
+               const platform::DeviceContext *ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+  const framework::Tensor in_;
+  framework::Tensor *out_;
+  const platform::DeviceContext *ctx_;
+
+  template <typename OutType>
+  void apply() {
+    auto *in_begin = in_.data<InType>();
+    auto *in_end = in_begin + in_.numel();
+    auto *out_begin = out_->mutable_data<OutType>(in_.place());
+
+    if (platform::is_cpu_place(in_.place())) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      auto *context = static_cast<const platform::CPUDeviceContext *>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+#ifdef __NVCC__
+    } else if (platform::is_gpu_place(in_.place())) {
+      platform::Transform<platform::CUDADeviceContext> trans;
+      auto *context = static_cast<const platform::CUDADeviceContext *>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+      context->Wait();
+#endif
+    } else {
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Place type is not supported when casting data type."));
+    }
+  }
+};
+
+template <typename T>
+void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc,
+             int64_t ele_size) {
+#ifdef PADDLE_WITH_CUDA
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  int device_num = paddle::platform::GetCurrentDeviceId();
+  platform::CUDAPlace gpu_place(device_num);
+  auto *dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+  if ((src_plc == PlaceType::kGPU) && (dst_plc == PlaceType::kCPU)) {
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(dst), gpu_place,
+                 src, ele_size, dev_ctx->stream());
+  } else if ((src_plc == PlaceType::kGPU) && (dst_plc == PlaceType::kGPU)) {
+    memory::Copy(gpu_place, static_cast<void *>(dst), gpu_place, src, ele_size,
+                 dev_ctx->stream());
+  } else if ((src_plc == PlaceType::kCPU) && (dst_plc == PlaceType::kGPU)) {
+    memory::Copy(gpu_place, static_cast<void *>(dst), platform::CPUPlace(),
+                 src, ele_size, dev_ctx->stream());
+  } else {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Only GPU related Copy can reach this func."));
+  }
+  cudaStreamSynchronize(dev_ctx->stream());
+#endif
+}
+
+#define GET_CASTED_TENSOR \
+  if (!tensor_) { \
+    tensor_ = std::make_shared<framework::LoDTensor>(); \
+  } \
+  auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());
+
+void Tensor::reshape(const std::vector<int> &shape) {
+  GET_CASTED_TENSOR
+  tensor->Resize(framework::make_ddim(shape));
+}
+
+Tensor::Tensor(const PlaceType &place)
+    : tensor_(std::make_shared<framework::LoDTensor>()), place_(place) {}
+
+template <typename T>
+T *Tensor::mutable_data(const PlaceType &place) {
+  place_ = place;
+  return mutable_data<T>();
+}
+
+template <typename T>
+T *Tensor::mutable_data() {
+  GET_CASTED_TENSOR
+  PADDLE_ENFORCE_GT(
+      tensor->numel(), 0,
+      platform::errors::PreconditionNotMet(
+          "You should call Tensor::reshape(const std::vector<int> &shape) "
+          "function before retrieving mutable_data from input tensor."));
+  switch (static_cast<int>(place_)) {
+    case static_cast<int>(PlaceType::kCPU): {
+      return tensor->mutable_data<T>(platform::CPUPlace());
+    }
+#ifdef PADDLE_WITH_CUDA
+    case static_cast<int>(PlaceType::kGPU): {
+      int device_num = platform::GetCurrentDeviceId();
+      return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
+    }
+#endif
+    default:
+      PADDLE_THROW(platform::errors::Unavailable(
+          "Custom operator unsupported place id(%d)",
+          static_cast<int>(place_)));
+  }
+}
+
+template <typename T>
+T *Tensor::data() const {
+  GET_CASTED_TENSOR;
+  auto *res = tensor->data<T>();
+  return res;
+}
+
+DataType Tensor::type() const {
+  GET_CASTED_TENSOR;
+  auto type = tensor->type();
+  if (type == framework::proto::VarType::FP32) {
+    return DataType::FLOAT32;
+  } else if (type == framework::proto::VarType::INT64) {
+    return DataType::INT64;
+  } else if (type == framework::proto::VarType::INT32) {
+    return DataType::INT32;
+  } else if (type == framework::proto::VarType::INT16) {
+    return DataType::INT16;
+  } else if (type == framework::proto::VarType::INT8) {
+    return DataType::INT8;
+  } else if (type == framework::proto::VarType::UINT8) {
+    return DataType::UINT8;
+  } else if (type == framework::proto::VarType::FP64) {
+    return DataType::FLOAT64;
+  } else if (type == framework::proto::VarType::BF16) {
+    return DataType::BFLOAT16;
+  } else if (type == framework::proto::VarType::FP16) {
+    return DataType::FLOAT16;
+  } else if (type == framework::proto::VarType::COMPLEX64) {
+    return DataType::COMPLEX64;
+  } else if (type == framework::proto::VarType::COMPLEX128) {
+    return DataType::COMPLEX128;
+  } else if (type == framework::proto::VarType::BOOL) {
+    return DataType::BOOL;
+  }
+  return DataType::FLOAT32;
+}
+
+template <typename T>
+Tensor Tensor::copy_to(const PlaceType &target_place) const {
+  GET_CASTED_TENSOR;
+  PADDLE_ENFORCE_GE(tensor->numel(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "You should call Tensor::reshape(const "
+                        "std::vector<int> &shape) "
+                        "function before copying data from CPU."));
+  size_t ele_size = tensor->numel() * sizeof(T);
+  auto *p_src_data = tensor->data<T>();
+  auto src_place = place();
+  Tensor target = Tensor(target_place);
+  target.reshape(shape());
+  auto *p_target_data = target.template mutable_data<T>();
+
+  if ((src_place == PlaceType::kCPU) && (target_place == PlaceType::kCPU)) {
+    std::memcpy(static_cast<void *>(p_target_data), p_src_data, ele_size);
+  } else if ((src_place == PlaceType::kGPU) &&
+             (target_place == PlaceType::kCPU)) {
+    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
+  } else if ((src_place == PlaceType::kCPU) &&
+             (target_place == PlaceType::kGPU)) {
+    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
+  } else if ((src_place == PlaceType::kGPU) &&
+             (target_place == PlaceType::kGPU)) {
+    GpuCopy<T>(p_src_data, p_target_data, src_place, target_place, ele_size);
+  } else {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not supported place transform of place: %d to place: %d",
+        static_cast<int>(src_place), static_cast<int>(target_place)));
+  }
+  return target;
+}
+
+template Tensor Tensor::copy_to<paddle::platform::float16>(
+    const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<paddle::platform::bfloat16>(
+    const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<paddle::platform::complex128>(
+    const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<paddle::platform::complex64>(
+    const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<float>(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<double>(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<int64_t>(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<int32_t>(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<uint8_t>(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<int8_t>(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<int16_t>(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to<bool>(const PlaceType &target_place) const;
+
+template float *Tensor::data<float>() const;
+template double *Tensor::data<double>() const;
+template int64_t *Tensor::data<int64_t>() const;
+template int32_t *Tensor::data<int32_t>() const;
+template uint8_t *Tensor::data<uint8_t>() const;
+template int8_t *Tensor::data<int8_t>() const;
+template paddle::platform::float16 *
+Tensor::data<paddle::platform::float16>() const;
+template paddle::platform::bfloat16 *
+Tensor::data<paddle::platform::bfloat16>() const;
+template paddle::platform::complex128 *
+Tensor::data<paddle::platform::complex128>() const;
+template paddle::platform::complex64 *
+Tensor::data<paddle::platform::complex64>() const;
+template int16_t *Tensor::data<int16_t>() const;
+template bool *Tensor::data<bool>() const;
+
+template float *Tensor::mutable_data<float>();
+template double *Tensor::mutable_data<double>();
+template int64_t *Tensor::mutable_data<int64_t>();
+template int32_t *Tensor::mutable_data<int32_t>();
+template uint8_t *Tensor::mutable_data<uint8_t>();
+template int8_t *Tensor::mutable_data<int8_t>();
+template paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>();
+template paddle::platform::bfloat16 *
+Tensor::mutable_data<paddle::platform::bfloat16>();
+template paddle::platform::complex128 *
+Tensor::mutable_data<paddle::platform::complex128>();
+template paddle::platform::complex64 *
+Tensor::mutable_data<paddle::platform::complex64>();
+template int16_t *Tensor::mutable_data<int16_t>();
+template bool *Tensor::mutable_data<bool>();
+
+template float *Tensor::mutable_data<float>(const PlaceType &place);
+template double *Tensor::mutable_data<double>(const PlaceType &place);
+template int64_t *Tensor::mutable_data<int64_t>(const PlaceType &place);
+template int32_t *Tensor::mutable_data<int32_t>(const PlaceType &place);
+template uint8_t *Tensor::mutable_data<uint8_t>(const PlaceType &place);
+template int8_t *Tensor::mutable_data<int8_t>(const PlaceType &place);
+template paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
+template paddle::platform::bfloat16 *
+Tensor::mutable_data<paddle::platform::bfloat16>(const PlaceType &place);
+template paddle::platform::complex128 *
+Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
+template paddle::platform::complex64 *
+Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
+template int16_t *Tensor::mutable_data<int16_t>(const PlaceType &place);
+template bool *Tensor::mutable_data<bool>(const PlaceType &place);
+
+std::vector<int> Tensor::shape() const {
+  GET_CASTED_TENSOR
+  return framework::vectorize<int>(tensor->dims());
+}
+
+const PlaceType &Tensor::place() const {
+  GET_CASTED_TENSOR;
+  if (platform::is_cpu_place(tensor->place())) {
+    place_ = PlaceType::kCPU;
+  } else if (platform::is_gpu_place(tensor->place())) {
+    place_ = PlaceType::kGPU;
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Current Tensor holds an unsupported place type; please initialize "
+        "it using Tensor::mutable_data<T>(PlaceType), where the place is "
+        "either PlaceType::kCPU or PlaceType::kGPU."));
+  }
+  return place_;
+}
+
+Tensor Tensor::cast(const DataType &target_type) const {
+  GET_CASTED_TENSOR;
+  Tensor rlt = Tensor(place());
+  rlt.reshape(this->shape());
+  auto rlt_tensor_ = static_cast<framework::LoDTensor *>(rlt.tensor_.get());
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto ctx = pool.Get(tensor->place());
+  auto src_type = tensor->type();
+  auto dst_type =
+      framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType(target_type);
+  switch (src_type) {
+    case framework::proto::VarType::FP16:
+      framework::VisitDataType(
+          dst_type,
+          CastDataType<platform::float16>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::BF16:
+      framework::VisitDataType(
+          dst_type,
+          CastDataType<platform::bfloat16>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::FP32:
+      framework::VisitDataType(dst_type,
+                               CastDataType<float>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::FP64:
+      framework::VisitDataType(
+          dst_type, CastDataType<double>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::INT32:
+      framework::VisitDataType(dst_type,
+                               CastDataType<int>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::INT64:
+      framework::VisitDataType(
+          dst_type, CastDataType<int64_t>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::BOOL:
+      framework::VisitDataType(dst_type,
+                               CastDataType<bool>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::INT16:
+      framework::VisitDataType(
+          dst_type, CastDataType<int16_t>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::UINT8:
+      framework::VisitDataType(
+          dst_type, CastDataType<uint8_t>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::COMPLEX64:
+      framework::VisitDataType(
+          dst_type,
+          CastDataType<platform::complex64>(*tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::COMPLEX128:
+      framework::VisitDataType(
+          dst_type,
+          CastDataType<platform::complex128>(*tensor, rlt_tensor_, ctx));
+      break;
+    default:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type (%s) is not supported when casting data type.",
+          framework::DataTypeToString(src_type)));
+  }
+  return rlt;
+}
+
+int64_t Tensor::size() const {
+  GET_CASTED_TENSOR;
+  return tensor->numel();
+}
+
+namespace framework {
+
+void CustomTensorUtils::ShareDataTo(const paddle::Tensor &src, void *dst) {
+  static_cast<framework::LoDTensor *>(dst)->ShareDataWith(
+      *static_cast<framework::LoDTensor *>(src.tensor_.get()));
+}
+
+void CustomTensorUtils::ShareDataFrom(const void *src,
+                                      const paddle::Tensor &dst) {
+  if (!dst.tensor_) {
+    dst.tensor_ = std::make_shared<framework::LoDTensor>();
+  }
+  auto *tensor = static_cast<framework::LoDTensor *>(dst.tensor_.get());
+  tensor->ShareDataWith(*static_cast<const framework::LoDTensor *>(src));
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 6caa352f7ad66dd3b8225d15e8b3aa067f33ebe7..2f4dcf465de780ac963680140b9d02a0324531b1 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -321,11 +321,17 @@ message(STATUS "branch: ${PADDLE_BRANCH}")
 
 configure_file(commit.h.in commit.h)
 
-set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer)
+cc_library(custom_tensor SRCS ../extension/src/tensor.cc DEPS lod_tensor)
+cc_library(op_meta_info SRCS ../extension/src/op_meta_info.cc DEPS custom_tensor)
+cc_library(custom_operator SRCS custom_operator.cc DEPS operator op_registry device_context dynamic_loader custom_tensor op_meta_info)
+cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog)
+
+set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator)
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 
 cc_library(paddle_framework_shared
-  SHARED SRCS executor.cc operator.cc
+  SHARED SRCS executor.cc operator.cc custom_operator.cc ../extension/src/tensor.cc
+  ../extension/src/op_meta_info.cc
   ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc
   ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc
   DEPS ${FLUID_FRAMEWORK_MODULES})
diff --git a/paddle/fluid/framework/custom_operator.cc
b/paddle/fluid/framework/custom_operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..1e2a77e915dea4e19046c68e176ba49637ece9ac --- /dev/null +++ b/paddle/fluid/framework/custom_operator.cc @@ -0,0 +1,534 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/custom_operator.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/extension/include/tensor.h" +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/c/c_api.h" +#include "paddle/fluid/framework/custom_tensor_utils.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { + +namespace detail { + +// dynamic lib load func +template +static T* DynLoad(void* handle, std::string name) { + T* func = reinterpret_cast(dlsym(handle, name.c_str())); +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 + PADDLE_ENFORCE_NOT_NULL( + func, platform::errors::NotFound( + "Failed to load dynamic operator library, error message(%s).", + errorno)); + return func; +} + +inline bool IsGradVar(const std::string& var_name) { + std::string suffix = kGradVarSuffix; + return var_name.rfind(suffix) != std::string::npos; +} + +inline std::string NoGrad(const std::string& var_name) { + std::string suffix = kGradVarSuffix; + return var_name.substr(0, var_name.size() - kGradVarSuffixSize); +} + +inline bool IsMemberOf(const std::vector& vec, + const std::string& name) { + return std::find(vec.cbegin(), vec.cend(), name) != vec.cend(); +} + +} // namespace detail + +////////////////// Kernel Define //////////////////// + +// custom op kernel call function define +static void RunKernelFunc(const framework::ExecutionContext& ctx, + const paddle::KernelFunc& func, + const std::vector& inputs, + const std::vector& outputs) { + VLOG(1) << "Custom Operator: Start run KernelFunc."; + std::vector custom_ins; + for (auto& in_name : inputs) { + VLOG(1) << "Custom Operator: input name - " << in_name; + auto* x = ctx.Input(in_name); + PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound( + "Input tensor (%s) is nullptr.", in_name)); + PADDLE_ENFORCE_EQ(x->IsInitialized(), true, + platform::errors::InvalidArgument( + "Input tensor (%s) is not initialized.")); + auto custom_in = paddle::Tensor( + CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); + CustomTensorUtils::ShareDataFrom(static_cast(x), custom_in); + custom_ins.emplace_back(custom_in); + } + + std::vector attrs; + + VLOG(1) << "Run ComputeFunc."; + auto outs = func(custom_ins, attrs); + + VLOG(1) << "Custom 
Operator: Share outputs into ExecutionContext."; + for (size_t i = 0; i < outputs.size(); ++i) { + auto* true_out = ctx.Output(outputs[i]); + CustomTensorUtils::ShareDataTo(outs.at(i), true_out); + } +} + +//////////////////// Operator Define ///////////////// + +class CustomOperator : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + // Dummy infershape + // Because it is a pure virtual function, it must be implemented + void InferShape(framework::InferShapeContext* ctx) const override { + VLOG(1) << "Custom Operator: Dummy infer shape of custom operator."; + } + + /** + * NOTE: [Skip the Kernel Selection] + * Custom Op only registers one Op kernel on each device, so that the + * data type selection and promotion that depends on GetExpectedKernelType, + * as well as the adaptation of various other special situations, + * need users to implement, to avoid users needs to implement + * GetExpectedKernelType function when expanding other cases. + * The RAW type is used here as the data type, indicating that + * it can only be determined at runtime. + */ + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType(proto::VarType::RAW, ctx.GetPlace()); + } + + /** + * NOTE: [Skip Input Variable Cast for DataType] + * Because the kernel data type is RAW, we should skip the cast for + * data type difference when PrepareData. + */ + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) { + return OpKernelType(expected_kernel_type.data_type_, + expected_kernel_type.place_, tensor.layout()); + } +}; + +class CustomOpMaker : public OpProtoAndCheckerMaker { + public: + explicit CustomOpMaker(const std::vector& inputs, + const std::vector& outputs, + const std::vector& attrs) + : inputs_(inputs), outputs_(outputs), attrs_(attrs) {} + + void Make() override { + for (auto& in_name : inputs_) { + AddInput(in_name, "The input " + in_name + "of Custom operator."); + } + for (auto& out_name : outputs_) { + AddOutput(out_name, "The output " + out_name + "of Custom Operator."); + } + // TODO(chenweihang): support attrs in later PR + AddComment(R"DOC( +Custom Operator. + +According to the Tensor operation function implemented by the user +independently of the framework, it is encapsulated into a framework +operator to adapt to various execution scenarios such as dynamic graph, +mode static graph mode, and inference mode. 
+ +)DOC"); + } + + private: + std::vector inputs_; + std::vector outputs_; + std::vector attrs_; +}; + +template +class CustomGradOpMaker; + +template <> +class CustomGradOpMaker : public SingleGradOpMaker { + public: + explicit CustomGradOpMaker( + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block, const std::string& name, + const std::vector& inputs, + const std::vector& outputs) + : SingleGradOpMaker(fwd_op, no_grad_set, grad_to_var, grad_block), + name_(name), + inputs_(inputs), + outputs_(outputs) {} + + protected: + void Apply(GradOpPtr grad_op) const override { + grad_op->SetType(name_); + + auto fwd_op_inputs = this->InputNames(); + auto fwd_op_outputs = this->OutputNames(); + + for (auto& in_name : inputs_) { + VLOG(1) << "Custom Operator: GradOpDescMaker - input: " << in_name; + if (!detail::IsGradVar(in_name)) { + if (detail::IsMemberOf(fwd_op_inputs, in_name)) { + grad_op->SetInput(in_name, this->Input(in_name)); + } else if (detail::IsMemberOf(fwd_op_outputs, in_name)) { + grad_op->SetInput(in_name, this->Output(in_name)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The input tensor name `%s` is invalid, expected it is the input " + "or output of forward operator.", + in_name)); + } + } else { + grad_op->SetInput(in_name, this->OutputGrad(detail::NoGrad(in_name))); + } + } + for (auto& out_name : outputs_) { + VLOG(1) << "Custom Operator: GradOpDescMaker - output: " << out_name; + grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + } + // TODO(chenweihang): support attrs in later PR + } + + private: + std::string name_; + std::vector inputs_; + std::vector outputs_; +}; + +template <> +class CustomGradOpMaker + : public SingleGradOpMaker { + public: + explicit CustomGradOpMaker( + const std::string& type, + const imperative::NameVarBaseMap& var_base_map_in, + const imperative::NameVarBaseMap& var_base_map_out, + const AttributeMap& attrs, + const std::map& inplace_map, + const std::string& name, const std::vector& inputs, + const std::vector& outputs) + : SingleGradOpMaker( + type, var_base_map_in, var_base_map_out, attrs, inplace_map), + name_(name), + inputs_(inputs), + outputs_(outputs) {} + + protected: + // TODO(chenweihang): The code is duplicated with the previous one, because + // ere OpMaker's Input, Output and other methods are protected. Putting the + // function implementation outside the class will cause the method to be + // uncallable, + // so it is still implemented in the class for the time being. 
+ void Apply(GradOpPtr grad_op) const override { + grad_op->SetType(name_); + + auto fwd_op_inputs = this->InputNames(); + auto fwd_op_outputs = this->OutputNames(); + + for (auto& in_name : inputs_) { + VLOG(1) << "Custom Operator: GradOpBaseMaker - input: " << in_name; + if (!detail::IsGradVar(in_name)) { + if (detail::IsMemberOf(fwd_op_inputs, in_name)) { + grad_op->SetInput(in_name, this->Input(in_name)); + } else if (detail::IsMemberOf(fwd_op_outputs, in_name)) { + grad_op->SetInput(in_name, this->Output(in_name)); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The input tensor name `%s` is invalid, expected it is the input " + "or output of forward operator.", + in_name)); + } + } else { + grad_op->SetInput(in_name, this->OutputGrad(detail::NoGrad(in_name))); + } + } + for (auto& out_name : outputs_) { + VLOG(1) << "Custom Operator: GradOpBaseMaker - output: " << out_name; + grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + } + // TODO(chenweihang): support attrs in later PR + } + + private: + std::string name_; + std::vector inputs_; + std::vector outputs_; +}; + +//////////// Operator and Kernel Register ////////////// + +void RegisterOperatorKernelWithPlace(const std::string& name, + const paddle::KernelFunc& kernel_func, + const proto::VarType::Type type, + const PlaceType& place, + const std::vector& inputs, + const std::vector& outputs) { + OpKernelType key(type, + CustomTensorUtils::ConvertEnumPlaceToInnerPlace(place)); + VLOG(1) << "Custom Operator: op kernel key: " << key; + OperatorWithKernel::AllOpKernels()[name][key] = + [kernel_func, inputs, outputs](const framework::ExecutionContext& ctx) { + VLOG(1) << "Custom Operator: run custom kernel func in lambda."; + RunKernelFunc(ctx, kernel_func, inputs, outputs); + }; +} + +void RegisterOperatorKernel(const std::string& name, + const paddle::KernelFunc& kernel_func, + const std::vector& inputs, + const std::vector& outputs) { + VLOG(1) << "Custom Operator: op name in kernel: " << name; + // NOTE [ Dummy Op Kernel Key ] + // TODO(chenweihang): Because execute engine need get device context based + // op_kernel_key.place_, so we should register kernel for each + // device. But this is not entirely correct, if user only give a cpu kernel, + // but call api in gpu device, it will cause error. 
+  RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
+                                  PlaceType::kCPU, inputs, outputs);
+  RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
+                                  PlaceType::kGPU, inputs, outputs);
+}
+
+void RegisterOperatorWithMetaInfo(
+    const std::vector<OpMetaInfo>& op_meta_infos) {
+  /* Op register */
+  OpInfo info;
+
+  auto& base_op_meta = op_meta_infos.front();
+
+  auto op_name = OpMetaInfoHelper::GetOpName(base_op_meta);
+  auto& op_inputs = OpMetaInfoHelper::GetInputs(base_op_meta);
+  auto& op_outputs = OpMetaInfoHelper::GetOutputs(base_op_meta);
+  auto& op_attrs = OpMetaInfoHelper::GetAttrs(base_op_meta);
+  auto& kernel_fn = OpMetaInfoHelper::GetKernelFn(base_op_meta);
+  auto& infer_shape_func = OpMetaInfoHelper::GetInferShapeFn(base_op_meta);
+  auto& infer_dtype_func = OpMetaInfoHelper::GetInferDtypeFn(base_op_meta);
+
+  VLOG(1) << "Custom Operator: forward, op name: " << op_name;
+  VLOG(1) << "Custom Operator: forward, op inputs: "
+          << string::join_strings(op_inputs, ',');
+  VLOG(1) << "Custom Operator: forward, op outputs: "
+          << string::join_strings(op_outputs, ',');
+
+  // Op
+  info.creator_ = [](const std::string& op_name, const VariableNameMap& inputs,
+                     const VariableNameMap& outputs,
+                     const AttributeMap& attrs) {
+    return new CustomOperator(op_name, inputs, outputs, attrs);
+  };
+
+  // OpMaker
+  info.proto_ = new proto::OpProto;
+  info.proto_->set_type(op_name);
+
+  info.checker_ = new OpAttrChecker();
+  CustomOpMaker custom_maker(op_inputs, op_outputs, op_attrs);
+  custom_maker(info.proto_, info.checker_);
+  PADDLE_ENFORCE_EQ(
+      info.proto_->IsInitialized(), true,
+      platform::errors::PreconditionNotMet(
+          "Fail to initialize %s's OpProto, because %s is not initialized.",
+          op_name, info.proto_->InitializationErrorString()));
+
+  // InferShape
+  PADDLE_ENFORCE_NOT_NULL(
+      infer_shape_func,
+      platform::errors::PreconditionNotMet(
+          "InferShapeFn is nullptr. Need to set the InferShapeFn of custom "
+          "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))"));
+  info.infer_shape_ = [op_inputs, op_outputs,
+                       infer_shape_func](InferShapeContext* ctx) {
+    std::vector<std::vector<int64_t>> input_shapes;
+
+    VLOG(1) << "Custom Operator: InferShape - get input ddim.";
+    for (auto& in_name : op_inputs) {
+      OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom");
+      auto ddim = ctx->GetInputDim(in_name);
+      input_shapes.emplace_back(framework::vectorize(ddim));
+    }
+
+    VLOG(1) << "Custom Operator: InferShape - calc output ddim.";
+    auto output_shapes = infer_shape_func(input_shapes);
+
+    VLOG(1) << "Custom Operator: InferShape - set output ddim.";
+    for (size_t i = 0; i < op_outputs.size(); ++i) {
+      ctx->SetOutputDim(op_outputs[i], framework::make_ddim(output_shapes[i]));
+    }
+  };
+
+  // Infer Dtype
+  PADDLE_ENFORCE_NOT_NULL(
+      infer_dtype_func,
+      platform::errors::PreconditionNotMet(
+          "InferDtypeFn is nullptr. Need to set the InferDtypeFn of custom "
+          "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))"));
+  info.infer_var_type_ = [op_inputs, op_outputs,
+                          infer_dtype_func](InferVarTypeContext* ctx) {
+    std::vector<DataType> input_dtypes;
+
+    VLOG(1) << "Custom Operator: InferDtype - get input dtype.";
+    for (auto& in_name : op_inputs) {
+      auto dtype = ctx->GetInputDataType(in_name);
+      input_dtypes.emplace_back(
+          CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype));
+    }
+
+    VLOG(1) << "Custom Operator: InferDtype - infer output dtype.";
+    auto output_dtypes = infer_dtype_func(input_dtypes);
+
+    VLOG(1) << "Custom Operator: InferDtype - set output dtype.";
+    for (size_t i = 0; i < op_outputs.size(); ++i) {
+      ctx->SetOutputDataType(
+          op_outputs[i],
+          CustomTensorUtils::ConvertEnumDTypeToInnerDType(output_dtypes[i]));
+    }
+  };
+
+  // Kernel func
+  RegisterOperatorKernel(op_name, kernel_fn, op_inputs, op_outputs);
+
+  // If grad op or double grad op exists
+  std::string cur_op_name = op_name;
+  for (size_t i = 1; i < op_meta_infos.size(); ++i) {
+    auto& cur_grad_op = op_meta_infos[i];
+
+    auto& grad_op_name = OpMetaInfoHelper::GetOpName(cur_grad_op);
+    auto& grad_op_inputs = OpMetaInfoHelper::GetInputs(cur_grad_op);
+    auto& grad_op_outputs = OpMetaInfoHelper::GetOutputs(cur_grad_op);
+    auto& grad_kernel_fn = OpMetaInfoHelper::GetKernelFn(cur_grad_op);
+
+    VLOG(1) << "Custom Operator: backward, op name: " << grad_op_name;
+    VLOG(1) << "Custom Operator: backward, op inputs: "
+            << string::join_strings(grad_op_inputs, ',');
+    VLOG(1) << "Custom Operator: backward, op outputs: "
+            << string::join_strings(grad_op_outputs, ',');
+
+    // GradOpDescMaker
+    info.grad_op_maker_ = [grad_op_name, grad_op_inputs, grad_op_outputs](
+        const OpDesc& fwd_op,
+        const std::unordered_set<std::string>& no_grad_set,
+        std::unordered_map<std::string, std::string>* grad_to_var,
+        const std::vector<BlockDesc*>& grad_block) {
+      CustomGradOpMaker<paddle::framework::OpDesc> maker(
+          fwd_op, no_grad_set, grad_to_var, grad_block, grad_op_name,
+          grad_op_inputs, grad_op_outputs);
+      return maker();
+    };
+
+    // GradOpBaseMaker
+    info.dygraph_grad_op_maker_ = [grad_op_name, grad_op_inputs,
+                                   grad_op_outputs](
+        const std::string& type,
+        const imperative::NameVarBaseMap& var_base_map_in,
+        const imperative::NameVarBaseMap& var_base_map_out,
+        const framework::AttributeMap& attrs,
+        const std::map<std::string, std::string>& inplace_map) {
+      CustomGradOpMaker<paddle::imperative::OpBase> maker(
+          type, var_base_map_in, var_base_map_out, attrs, inplace_map,
+          grad_op_name, grad_op_inputs, grad_op_outputs);
+      return maker();
+    };
+
+    /* Grad op register */
+    OpInfo grad_info;
+
+    // Grad Op
+    grad_info.creator_ = [](
+        const std::string& type, const VariableNameMap& inputs,
+        const VariableNameMap& outputs, const AttributeMap& attrs) {
+      return new CustomOperator(type, inputs, outputs, attrs);
+    };
+
+    // Grad InferShape (gradient's shape is same with forward input default)
+    grad_info.infer_shape_ = [grad_op_outputs](InferShapeContext* ctx) {
+      for (auto& out_name : grad_op_outputs) {
+        ctx->ShareDim(detail::NoGrad(out_name), out_name);
+      }
+    };
+
+    // Kernel func
+    RegisterOperatorKernel(grad_op_name, grad_kernel_fn, grad_op_inputs,
+                           grad_op_outputs);
+
+    // update current info
+    OpInfoMap::Instance().Insert(cur_op_name, info);
+    cur_op_name = grad_op_name;
+    info = grad_info;
+  }
+  // insert last info
+  OpInfoMap::Instance().Insert(cur_op_name, info);
+}
+
+void RegisterOperatorWithMetaInfoMap(
+    const paddle::OpMetaInfoMap& op_meta_info_map) {
+  auto& meta_info_map = op_meta_info_map.GetMap();
+
+  PADDLE_ENFORCE_EQ(meta_info_map.empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "No custom operator that needs to be registered."));
+  VLOG(1) << "Custom Operator: size of op meta info map - "
+          << meta_info_map.size();
+  // pair: {op_type, OpMetaInfo}
+  for (auto& pair : meta_info_map) {
+    VLOG(1) << "Custom Operator: pair first -> op name: " << pair.first;
+    RegisterOperatorWithMetaInfo(pair.second);
+  }
+}
+
+////////////////////// User APIs ///////////////////////
+
+// load op api
+void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name) {
+  void* handle = paddle::platform::dynload::GetOpDsoHandle(dso_name);
+
+  typedef OpMetaInfoMap& get_op_meta_info_map_t();
+  auto* get_op_meta_info_map =
+      detail::DynLoad<get_op_meta_info_map_t>(handle, "PD_GetOpMetaInfoMap");
+  auto& op_meta_info_map = get_op_meta_info_map();
+
+  RegisterOperatorWithMetaInfoMap(op_meta_info_map);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2f97e5e5822a291d404f1c4f5883e50be01fe1d
--- /dev/null
+++ b/paddle/fluid/framework/custom_operator.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/extension/include/op_meta_info.h"
+
+namespace paddle {
+namespace framework {
+
+// Load custom op api: register op after user compiled
+void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name);
+
+// Register custom op api: register op directly
+void RegisterOperatorWithMetaInfoMap(
+    const paddle::OpMetaInfoMap& op_meta_info_map);
+
+}  // namespace framework
+}  // namespace paddle
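`LoadOpMetaInfoAndRegisterOp` above is the C++ entry point that the Python side calls into. A minimal sketch of that call, assuming an extension library already compiled with `PD_BUILD_OP`; the `.so` name is illustrative, and `load_op_meta_info_and_register_op` is the pybind binding added to pybind.cc later in this diff:

import paddle.fluid as fluid

# Hedged sketch: load a user-compiled extension library and register every
# operator recorded in its OpMetaInfoMap; the file name is illustrative.
fluid.core.load_op_meta_info_and_register_op('custom_relu2.so')
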
+ +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/extension/include/all.h" +#include "paddle/fluid/framework/custom_tensor_utils.h" +#include "paddle/fluid/framework/lod_tensor.h" + +template +paddle::Tensor InitCPUTensorForTest() { + std::vector tensor_shape{5, 5}; + auto t1 = paddle::Tensor(paddle::PlaceType::kCPU); + t1.reshape(tensor_shape); + auto* p_data_ptr = t1.mutable_data(paddle::PlaceType::kCPU); + for (int64_t i = 0; i < t1.size(); i++) { + p_data_ptr[i] = T(5); + } + return t1; +} + +template +void TestCopyTensor() { + auto t1 = InitCPUTensorForTest(); + auto t1_cpu_cp = t1.template copy_to(paddle::PlaceType::kCPU); + CHECK((paddle::PlaceType::kCPU == t1_cpu_cp.place())); + for (int64_t i = 0; i < t1.size(); i++) { + CHECK_EQ(t1_cpu_cp.template data()[i], T(5)); + } +#ifdef PADDLE_WITH_CUDA + VLOG(2) << "Do GPU copy test"; + auto t1_gpu_cp = t1_cpu_cp.template copy_to(paddle::PlaceType::kGPU); + CHECK((paddle::PlaceType::kGPU == t1_gpu_cp.place())); + auto t1_gpu_cp_cp = t1_gpu_cp.template copy_to(paddle::PlaceType::kGPU); + CHECK((paddle::PlaceType::kGPU == t1_gpu_cp_cp.place())); + auto t1_gpu_cp_cp_cpu = + t1_gpu_cp.template copy_to(paddle::PlaceType::kCPU); + CHECK((paddle::PlaceType::kCPU == t1_gpu_cp_cp_cpu.place())); + for (int64_t i = 0; i < t1.size(); i++) { + CHECK_EQ(t1_gpu_cp_cp_cpu.template data()[i], T(5)); + } +#endif +} + +void TestAPIPlace() { + std::vector tensor_shape = {5, 5}; +#ifdef PADDLE_WITH_CUDA + auto t1 = paddle::Tensor(paddle::PlaceType::kGPU); + t1.reshape(tensor_shape); + t1.mutable_data(); + CHECK((paddle::PlaceType::kGPU == t1.place())); +#endif + auto t2 = paddle::Tensor(paddle::PlaceType::kCPU); + t2.reshape(tensor_shape); + t2.mutable_data(); + CHECK((paddle::PlaceType::kCPU == t2.place())); +} + +void TestAPISizeAndShape() { + std::vector tensor_shape = {5, 5}; + auto t1 = paddle::Tensor(paddle::PlaceType::kCPU); + t1.reshape(tensor_shape); + CHECK_EQ(t1.size(), 25); + CHECK(t1.shape() == tensor_shape); +} + +template +paddle::DataType TestDtype() { + std::vector tensor_shape = {5, 5}; + auto t1 = paddle::Tensor(paddle::PlaceType::kCPU); + t1.reshape(tensor_shape); + t1.template mutable_data(); + return t1.type(); +} + +template +void TestCast(paddle::DataType data_type) { + std::vector tensor_shape = {5, 5}; + auto t1 = paddle::Tensor(paddle::PlaceType::kCPU); + t1.reshape(tensor_shape); + t1.template mutable_data(); + auto t2 = t1.cast(data_type); + CHECK_EQ(t2.type(), data_type); +} + +void GroupTestCopy() { + VLOG(2) << "Float cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "Double cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "BF16 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "int cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "int64 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "int16 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "int8 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); +} + +void GroupTestCast() { + VLOG(2) << "int cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "int32 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "int64 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "double cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "bfloat16 cast"; + 
TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "float16 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "bool cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "uint8 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "float cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "complex64 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "complex128 cast"; + TestCast(paddle::DataType::FLOAT32); +} + +void GroupTestDtype() { + CHECK(TestDtype() == paddle::DataType::FLOAT32); + CHECK(TestDtype() == paddle::DataType::FLOAT64); + CHECK(TestDtype() == paddle::DataType::FLOAT16); + CHECK(TestDtype() == paddle::DataType::BFLOAT16); + CHECK(TestDtype() == + paddle::DataType::COMPLEX128); + CHECK(TestDtype() == + paddle::DataType::COMPLEX64); + CHECK(TestDtype() == paddle::DataType::INT32); + CHECK(TestDtype() == paddle::DataType::INT64); + CHECK(TestDtype() == paddle::DataType::INT16); + CHECK(TestDtype() == paddle::DataType::INT8); + CHECK(TestDtype() == paddle::DataType::UINT8); +} + +void GroupTestDtypeConvert() { + // enum -> proto + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::COMPLEX128) == + paddle::framework::proto::VarType::COMPLEX128); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::COMPLEX64) == + paddle::framework::proto::VarType::COMPLEX64); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::FLOAT64) == + paddle::framework::proto::VarType::FP64); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::FLOAT32) == + paddle::framework::proto::VarType::FP32); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::FLOAT16) == + paddle::framework::proto::VarType::FP16); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::BFLOAT16) == + paddle::framework::proto::VarType::BF16); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::UINT8) == + paddle::framework::proto::VarType::UINT8); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::INT8) == paddle::framework::proto::VarType::INT8); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::INT32) == + paddle::framework::proto::VarType::INT32); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::INT64) == + paddle::framework::proto::VarType::INT64); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::INT16) == + paddle::framework::proto::VarType::INT16); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::BOOL) == paddle::framework::proto::VarType::BOOL); + // proto -> enum + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::COMPLEX128) == + paddle::DataType::COMPLEX128); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::COMPLEX64) == + paddle::DataType::COMPLEX64); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::FP64) == + paddle::DataType::FLOAT64); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::FP32) == + 
paddle::DataType::FLOAT32); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::FP16) == + paddle::DataType::FLOAT16); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::BF16) == + paddle::DataType::BFLOAT16); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::INT64) == + paddle::DataType::INT64); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::INT32) == + paddle::DataType::INT32); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::INT8) == paddle::DataType::INT8); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::UINT8) == + paddle::DataType::UINT8); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::INT16) == + paddle::DataType::INT16); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL); +} + +TEST(CustomTensor, copyTest) { + VLOG(2) << "TestCopy"; + GroupTestCopy(); + VLOG(2) << "TestDtype"; + GroupTestDtype(); + VLOG(2) << "TestShape"; + TestAPISizeAndShape(); + VLOG(2) << "TestPlace"; + TestAPIPlace(); + VLOG(2) << "TestCast"; + GroupTestCast(); + VLOG(2) << "TestDtypeConvert"; + GroupTestDtypeConvert(); +} diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..4b465d3911df19eb61a65aa411c31a2841a10efb --- /dev/null +++ b/paddle/fluid/framework/custom_tensor_utils.h @@ -0,0 +1,145 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/extension/include/tensor.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +class CustomTensorUtils { + public: + /// \brief Share data TO another tensor. + /// Use this to pass tensor from op to op + /// \return void. + static void ShareDataTo(const paddle::Tensor& src, void* dst); + + /// \brief Share data FROM another tensor. + /// Use this to pass tensor from op to op + /// \return void. 
+ static void ShareDataFrom(const void* src, const Tensor& dst); + + static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType( + const paddle::DataType& dtype) { + switch (dtype) { + case paddle::DataType::COMPLEX128: + return framework::proto::VarType::COMPLEX128; + case paddle::DataType::COMPLEX64: + return framework::proto::VarType::COMPLEX64; + case paddle::DataType::FLOAT64: + return framework::proto::VarType::FP64; + case paddle::DataType::FLOAT32: + return framework::proto::VarType::FP32; + case paddle::DataType::FLOAT16: + return framework::proto::VarType::FP16; + case paddle::DataType::BFLOAT16: + return framework::proto::VarType::BF16; + case paddle::DataType::UINT8: + return framework::proto::VarType::UINT8; + case paddle::DataType::INT8: + return framework::proto::VarType::INT8; + case paddle::DataType::INT32: + return framework::proto::VarType::INT32; + case paddle::DataType::INT64: + return framework::proto::VarType::INT64; + case paddle::DataType::INT16: + return framework::proto::VarType::INT16; + case paddle::DataType::BOOL: + return framework::proto::VarType::BOOL; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type code(%d) when casting enum data type into " + "paddle data type.", + static_cast(dtype))); + } + } + + static paddle::DataType ConvertInnerDTypeToEnumDType( + const framework::proto::VarType::Type& dtype) { + switch (dtype) { + case framework::proto::VarType::COMPLEX128: + return paddle::DataType::COMPLEX128; + case framework::proto::VarType::COMPLEX64: + return paddle::DataType::COMPLEX64; + case framework::proto::VarType::FP64: + return paddle::DataType::FLOAT64; + case framework::proto::VarType::FP32: + return paddle::DataType::FLOAT32; + case framework::proto::VarType::FP16: + return paddle::DataType::FLOAT16; + case framework::proto::VarType::BF16: + return paddle::DataType::BFLOAT16; + case framework::proto::VarType::INT64: + return paddle::DataType::INT64; + case framework::proto::VarType::INT32: + return paddle::DataType::INT32; + case framework::proto::VarType::INT8: + return paddle::DataType::INT8; + case framework::proto::VarType::UINT8: + return paddle::DataType::UINT8; + case framework::proto::VarType::INT16: + return paddle::DataType::INT16; + case framework::proto::VarType::BOOL: + return paddle::DataType::BOOL; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type `%s` when casting paddle data type into " + "enum data type.", + DataTypeToString(dtype))); + } + } + + // PaddlePlace <-> platform::Place + static platform::Place ConvertEnumPlaceToInnerPlace(const PlaceType& pc) { + if (pc == PlaceType::kCPU) { + return platform::Place(platform::CPUPlace()); + } else if (pc == PlaceType::kGPU) { +#ifdef PADDLE_WITH_CUDA + return platform::Place( + platform::CUDAPlace(platform::GetCurrentDeviceId())); +#endif + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported place type code(%d) when " + "casting enum place to paddle place.", + static_cast(pc))); + } + return platform::Place(); + } + + static PlaceType ConvertInnerPlaceToEnumPlace(const platform::Place& pc) { + if (platform::is_cpu_place(pc)) { + return PlaceType::kCPU; + } else if (platform::is_gpu_place(pc)) { +#ifdef PADDLE_WITH_CUDA + return PlaceType::kGPU; +#endif + } else { + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported place type `%s` when " + "casting paddle place to enum place.", + pc)); + } + return PlaceType::kUNK; + } +}; + +} // namespace framework +} // namespace paddle diff 
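The dtype tables in `CustomTensorUtils` are what keep a tensor's dtype lossless as it crosses the custom-op boundary. A minimal Python check of that round trip, assuming the `dispatch_test_op.cc` sources added later in this diff sit next to the script (the op simply copies its input):

import numpy as np
import paddle
from paddle.utils.cpp_extension import load

# Hedged sketch: jit-compile the dispatch test op added below, then verify
# that an int32 input comes back as int32 through the enum<->proto mapping.
dispatch_op = load(name='dispatch_op', sources=['dispatch_test_op.cc'])

x = paddle.to_tensor(np.ones([2, 2], dtype='int32'))
out = dispatch_op.dispatch_test_integer(x)
assert out.numpy().dtype == np.int32
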
diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc
index 0959a06051502ad4ac0a840021ce7424327c4b97..05d3541d6a9f8501d985a575ba50b0b8a048c08f 100644
--- a/paddle/fluid/framework/data_type.cc
+++ b/paddle/fluid/framework/data_type.cc
@@ -84,6 +84,10 @@ std::string DataTypeToString(const proto::VarType::Type type) {
   if (it != gDataTypeMap().proto_to_str_.end()) {
     return it->second;
   }
+  // deal with RAW type
+  if (type == proto::VarType::RAW) {
+    return "RAW(runtime decided type)";
+  }
   PADDLE_THROW(platform::errors::Unimplemented(
       "Not support proto::VarType::Type(%d) as tensor type.",
       static_cast<int>(type)));
diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc
index 30a2ac2c6f6bebb6bb9b189bfd7aaa562764706a..084c6e6816bd567339c49b1751209b280f550f45 100644
--- a/paddle/fluid/framework/data_type_transform.cc
+++ b/paddle/fluid/framework/data_type_transform.cc
@@ -97,10 +97,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
       framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
       break;
     case proto::VarType::INT16:
-      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      framework::VisitDataType(dst_type, CastDataType<int16_t>(in, out, ctx));
       break;
     case proto::VarType::UINT8:
-      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      framework::VisitDataType(dst_type, CastDataType<uint8_t>(in, out, ctx));
       break;
     default:
       PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/framework/op_meta_info_helper.h b/paddle/fluid/framework/op_meta_info_helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..06d9c94172df9976d5911fda4f71d0c86329e916
--- /dev/null
+++ b/paddle/fluid/framework/op_meta_info_helper.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/extension/include/op_meta_info.h"
+
+namespace paddle {
+namespace framework {
+
+class OpMetaInfoHelper {
+ public:
+  static const std::string& GetOpName(const paddle::OpMetaInfo& info) {
+    return info.name_;
+  }
+  static const std::vector<std::string>& GetInputs(
+      const paddle::OpMetaInfo& info) {
+    return info.inputs_;
+  }
+  static const std::vector<std::string>& GetOutputs(
+      const paddle::OpMetaInfo& info) {
+    return info.outputs_;
+  }
+  static const std::vector<std::string>& GetAttrs(
+      const paddle::OpMetaInfo& info) {
+    return info.attrs_;
+  }
+  static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info) {
+    return info.kernel_fn_;
+  }
+  static const InferShapeFunc& GetInferShapeFn(const paddle::OpMetaInfo& info) {
+    return info.infer_shape_fn_;
+  }
+  static const InferDtypeFunc& GetInferDtypeFn(const paddle::OpMetaInfo& info) {
+    return info.infer_dtype_fn_;
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 2615b98d30dba539fe41e5ad2f24b94429c0e338..792e181047f62bcb565f10c987e65a90e6ac4c51 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
   feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
   analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
-  gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper)
+  gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator)
 
 if (WITH_NCCL)
   set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 72b3c9645ba2d5571fd815e64b7af74e24e29948..0ebf77c4c2947ece80726ec9591fcbd3d406b7e5 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/custom_operator.h"
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
@@ -386,7 +387,7 @@ PYBIND11_MODULE(core_noavx, m) {
             PyCapsule_GetPointer(dltensor->ptr(), "dltensor"));
         PyCapsule_SetName(dltensor->ptr(), "used_dltensor");
         DLTensor dl = dmt->dl_tensor;
-        Tensor tensor;
+        framework::Tensor tensor;
 
         if (dl.ctx.device_type == kDLCPU) {
           paddle::framework::TensorFromDLPack(dl, &tensor);
@@ -524,77 +525,80 @@ PYBIND11_MODULE(core_noavx, m) {
 
   BindImperative(&m);
 
-  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
-      .def("__array__", [](Tensor &self) { return TensorToPyArray(self); })
+  py::class_<framework::Tensor>(m, "Tensor", py::buffer_protocol())
+      .def("__array__",
+           [](framework::Tensor &self) { return TensorToPyArray(self); })
       .def("_is_initialized",
-           [](const Tensor &self) { return self.IsInitialized(); })
+           [](const framework::Tensor &self) { return self.IsInitialized(); })
      .def("_get_dims",
-           [](const Tensor &self) { return vectorize(self.dims()); })
+           [](const framework::Tensor &self) { return vectorize(self.dims()); })
      .def("_set_dims",
-           [](Tensor &self, const std::vector<int64_t> &dim) {
+           [](framework::Tensor &self, const std::vector<int64_t> &dim) {
             self.Resize(make_ddim(dim));
           })
      .def("_set_layout",
-           [](Tensor &self, const std::string &layout) {
+           [](framework::Tensor &self, const std::string &layout) {
             self.set_layout(StringToDataLayout(layout));
           })
      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::CUDAPlace &place) {
+           [](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::XPUPlace &place) {
+           [](framework::Tensor &self, paddle::platform::XPUPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::CPUPlace &place) {
+           [](framework::Tensor &self, paddle::platform::CPUPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("_alloc_double",
-           [](Tensor &self, paddle::platform::CPUPlace &place) {
+           [](framework::Tensor &self, paddle::platform::CPUPlace &place) {
             self.mutable_data<double>(place);
           })
      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::CPUPlace &place) {
+           [](framework::Tensor &self, paddle::platform::CPUPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::XPUPlace &place) {
+           [](framework::Tensor &self, paddle::platform::XPUPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::CUDAPlace &place) {
+           [](framework::Tensor &self, paddle::platform::CUDAPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("_alloc_int",
-           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
+           [](framework::Tensor &self,
+              paddle::platform::CUDAPinnedPlace &place) {
             self.mutable_data<int>(place);
           })
      .def("_alloc_float",
-           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) {
+           [](framework::Tensor &self,
+              paddle::platform::CUDAPinnedPlace &place) {
             self.mutable_data<float>(place);
           })
      .def("_mutable_data",
-           [](Tensor &self, paddle::platform::CPUPlace &place,
+           [](framework::Tensor &self, paddle::platform::CPUPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
           })
      .def("_mutable_data",
-           [](Tensor &self, paddle::platform::XPUPlace &place,
+           [](framework::Tensor &self, paddle::platform::XPUPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
           })
      .def("_mutable_data",
-           [](Tensor &self, paddle::platform::CUDAPlace &place,
+           [](framework::Tensor &self, paddle::platform::CUDAPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
           })
      .def("_mutable_data",
-           [](Tensor &self, paddle::platform::CUDAPinnedPlace &place,
+           [](framework::Tensor &self, paddle::platform::CUDAPinnedPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(self.mutable_data(place, type));
           })
-      .def("_clear", &Tensor::clear)
+      .def("_clear", &framework::Tensor::clear)
      .def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
           py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
      .def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
@@ -626,7 +630,9 @@ PYBIND11_MODULE(core_noavx, m) {
                t.set(np.ndarray([5, 30]), fluid.CPUPlace())
           )DOC")
-      .def("shape", [](Tensor &self) { return vectorize(self.dims()); }, R"DOC(
+      .def("shape",
+           [](framework::Tensor &self) { return vectorize(self.dims()); },
+           R"DOC(
           Return the shape of LoDTensor.

           Returns:
@@ -644,7 +650,7 @@ PYBIND11_MODULE(core_noavx, m) {
               print(t.shape())  # [5, 30]
           )DOC")
      .def("_to_dlpack",
-           [](Tensor &self) {
+           [](framework::Tensor &self) {
             DLPackTensor dlpack_tensor(self, 1);
             DLManagedTensor *dmt =
                 dlpack_tensor.ToCudfCompatibleDLManagedTensor();
@@ -669,20 +675,22 @@ PYBIND11_MODULE(core_noavx, m) {
      .def("_get_float_element", TensorGetElement<float>)
      .def("_set_double_element", TensorSetElement<double>)
      .def("_get_double_element", TensorGetElement<double>)
-      .def("_place", [](Tensor &self) { return self.place(); })
-      .def("_dtype", [](Tensor &self) { return self.type(); })
+      .def("_place", [](framework::Tensor &self) { return self.place(); })
+      .def("_dtype", [](framework::Tensor &self) { return self.type(); })
      .def("_layout",
-           [](Tensor &self) { return DataLayoutToString(self.layout()); })
-      .def("_share_data_with", &Tensor::ShareDataWith)
+           [](framework::Tensor &self) {
+             return DataLayoutToString(self.layout());
+           })
+      .def("_share_data_with", &framework::Tensor::ShareDataWith)
      .def("__getitem__", PySliceTensor, py::return_value_policy::reference)
-      .def("__str__", [](const Tensor &self) {
+      .def("__str__", [](const framework::Tensor &self) {
        std::stringstream ostr;
        ostr << self;
        return ostr.str();
      });

  // TODO(cql): add reference: en_user_guide_lod_tensor
-  py::class_<LoDTensor>(m, "LoDTensor", R"DOC(
+  py::class_<LoDTensor, framework::Tensor>(m, "LoDTensor", R"DOC(
    LoDTensor is a Tensor with optional LoD (Level of Details) information,
    it can be used for variable-length sequences,
    see :ref:`user_guide_lod_tensor` for details.
@@ -766,7 +774,8 @@ PYBIND11_MODULE(core_noavx, m) {
        t = fluid.LoDTensor()

        )DOC")
-      .def("__array__", [](Tensor &self) { return TensorToPyArray(self); })
+      .def("__array__",
+           [](framework::Tensor &self) { return TensorToPyArray(self); })
      .def("__init__",
           [](LoDTensor &instance, const std::vector<std::vector<size_t>>
                                       &recursive_sequence_lengths) {
@@ -1724,6 +1733,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); m.def("load_op_library", framework::LoadOpLib); + m.def("load_op_meta_info_and_register_op", + framework::LoadOpMetaInfoAndRegisterOp); m.def("init_devices", []() { framework::InitDevices(); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b8de693b1fe3733e137e7660fabcb2c32c866693..e59ee7186e9d119d1ebcfdcbeb567f8efbf769d4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1988,9 +1988,13 @@ class OpProtoHolder(object): def update_op_proto(self): op_protos = get_all_op_protos() + custom_op_names = [] for proto in op_protos: if proto.type not in self.op_proto_map: self.op_proto_map[proto.type] = proto + custom_op_names.append(proto.type) + + return custom_op_names @staticmethod def generated_op_attr_names(): @@ -5699,6 +5703,9 @@ def load_op_library(lib_filename): Args: lib_filename (str): name of dynamic library. + + Returns: + list[str]: new registered custom op names. Examples: .. code-block:: python @@ -5708,7 +5715,7 @@ def load_op_library(lib_filename): """ core.load_op_library(lib_filename) - OpProtoHolder.instance().update_op_proto() + return OpProtoHolder.instance().update_op_proto() def switch_device(device): diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index bb74c37c043ebe8fddcee557ecb56423a2da1926..df1dc75a38c8362ce33af5854a5c2f3c14635609 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -20,9 +20,15 @@ set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES ${TARGET_LIBRARIES} file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -# for coverage -LIST(REMOVE_ITEM TEST_OPS test_custom_op) - foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() + +# Compiling .so will cost some time, but running process is very fast. +set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180) +set_tests_properties(test_setup_install PROPERTIES TIMEOUT 180) +set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180) +set_tests_properties(test_dispatch PROPERTIES TIMEOUT 180) + +set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250) +set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180) diff --git a/python/paddle/fluid/tests/custom_op/__init__.py b/python/paddle/fluid/tests/custom_op/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6f0ea85344b7e0c679730356928c8749cf71cd66 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e09ac2f87c80639ac92467e7b50094e7752d500f
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc
@@ -0,0 +1,138 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <vector>
+
+#include "paddle/extension.h"
+
+template <typename data_t>
+void assign_cpu_kernel(const data_t* x_data,
+                       data_t* out_data,
+                       int64_t x_numel) {
+  for (int i = 0; i < x_numel; ++i) {
+    out_data[i] = x_data[i];
+  }
+}
+
+std::vector<std::vector<int64_t>> InferShape(std::vector<int64_t> x_shape) {
+  return {x_shape};
+}
+
+std::vector<paddle::DataType> InferDType(paddle::DataType x_dtype) {
+  return {x_dtype};
+}
+
+std::vector<paddle::Tensor> DispatchTestInterger(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_INTEGRAL_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP("dispatch_test_integer")
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestInterger))
+    .SetInferShapeFn(PD_INFER_SHAPE(InferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(InferDType));
+
+std::vector<paddle::Tensor> DispatchTestComplex(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_COMPLEX_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP("dispatch_test_complex")
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestComplex))
+    .SetInferShapeFn(PD_INFER_SHAPE(InferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(InferDType));
+
+std::vector<paddle::Tensor> DispatchTestFloatAndInteger(
+    const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP("dispatch_test_float_and_integer")
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger))
+    .SetInferShapeFn(PD_INFER_SHAPE(InferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(InferDType));
+
+std::vector<paddle::Tensor> DispatchTestFloatAndComplex(
+    const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP("dispatch_test_float_and_complex")
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestFloatAndComplex))
+    .SetInferShapeFn(PD_INFER_SHAPE(InferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(InferDType));
+
+std::vector<paddle::Tensor> DispatchTestFloatAndIntegerAndComplex(
+    const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(
+      x.type(), "assign_cpu_kernel", ([&] {
+        assign_cpu_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(), x.size());
+      }));
+
+  return {out};
+}
+
+PD_BUILD_OP("dispatch_test_float_and_integer_and_complex")
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex))
+    .SetInferShapeFn(PD_INFER_SHAPE(InferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(InferDType));
diff --git a/python/paddle/fluid/tests/custom_op/relu_op3.cc b/python/paddle/fluid/tests/custom_op/relu_op3.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ace9598c586866edd4be8cf99e4d3a783e18788b
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/relu_op3.cc
@@ -0,0 +1,115 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class Relu3Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto in_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Y", in_dims);
+  }
+};
+
+class Relu3OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor.");
+    AddOutput("Y", "Output of relu_op");
+    AddComment(R"DOC(
+Relu3 Operator.
+)DOC");
+  }
+};
+
+class Relu3GradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto in_dims = ctx->GetInputDim(framework::GradVarName("Y"));
+    ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
+  }
+};
+
+template <typename T>
+class Relu3GradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("relu3_grad");
+    op->SetInput("Y", this->Output("Y"));
+    op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
+    op->SetAttrMap(this->Attrs());
+    op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
+  }
+};
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class Relu3Kernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_t = ctx.Input<Tensor>("X");
+    auto* out_t = ctx.Output<Tensor>("Y");
+    auto x = in_t->data<T>();
+    auto y = out_t->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < in_t->numel(); ++i) {
+      y[i] = std::max(static_cast<T>(0.), x[i]);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class Relu3GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto* y_t = ctx.Input<Tensor>("Y");
+    auto* dx_t = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto dy = dy_t->data<T>();
+    auto y = y_t->data<T>();
+    auto dx = dx_t->mutable_data<T>(ctx.GetPlace());
+
+    for (int i = 0; i < y_t->numel(); ++i) {
+      dx[i] = dy[i] * (y[i] > static_cast<T>(0) ? 1. : 0.);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+REGISTER_OPERATOR(relu3,
+                  ops::Relu3Op,
+                  ops::Relu3OpMaker,
+                  ops::Relu3GradMaker<paddle::framework::OpDesc>,
+                  ops::Relu3GradMaker<paddle::imperative::OpBase>);
+REGISTER_OPERATOR(relu3_grad, ops::Relu3GradOp);
+REGISTER_OP_CPU_KERNEL(relu3,
+                       ops::Relu3Kernel<CPU, float>,
+                       ops::Relu3Kernel<CPU, double>);
+REGISTER_OP_CPU_KERNEL(relu3_grad,
+                       ops::Relu3GradKernel<CPU, float>,
+                       ops::Relu3GradKernel<CPU, double>);
diff --git a/python/paddle/fluid/tests/custom_op/relu_op3.cu b/python/paddle/fluid/tests/custom_op/relu_op3.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8a229cafebb1d028059af9f63f28ded8117b1d12
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/relu_op3.cu
@@ -0,0 +1,87 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+__global__ void KeRelu3(const T* x, const int num, T* y) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    y[i] = max(x[i], static_cast<T>(0.));
+  }
+}
+
+template <typename DeviceContext, typename T>
+class Relu3CUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_t = ctx.Input<Tensor>("X");
+    auto* out_t = ctx.Output<Tensor>("Y");
+    auto x = in_t->data<T>();
+    auto y = out_t->mutable_data<T>(ctx.GetPlace());
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    int num = in_t->numel();
+    int block = 512;
+    int grid = (num + block - 1) / block;
+    KeRelu3<T><<<grid, block, 0, dev_ctx.stream()>>>(x, num, y);
+  }
+};
+
+template <typename T>
+__global__ void KeRelu3Grad(const T* y, const T* dy, const int num, T* dx) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
+  }
+}
+
+template <typename DeviceContext, typename T>
+class Relu3GradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dy_t = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto* y_t = ctx.Input<Tensor>("Y");
+    auto* dx_t = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto dy = dy_t->data<T>();
+    auto y = y_t->data<T>();
+    auto dx = dx_t->mutable_data<T>(ctx.GetPlace());
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    int num = dy_t->numel();
+    int block = 512;
+    int grid = (num + block - 1) / block;
+    KeRelu3Grad<T><<<grid, block, 0, dev_ctx.stream()>>>(y, dy, num, dx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+using CUDA = paddle::platform::CUDADeviceContext;
+REGISTER_OP_CUDA_KERNEL(relu3,
+                        paddle::operators::Relu3CUDAKernel<CUDA, float>,
+                        paddle::operators::Relu3CUDAKernel<CUDA, double>);
+
+REGISTER_OP_CUDA_KERNEL(relu3_grad,
+                        paddle::operators::Relu3GradCUDAKernel<CUDA, float>,
+                        paddle::operators::Relu3GradCUDAKernel<CUDA, double>);
diff --git a/python/paddle/fluid/tests/custom_op/relu_op3_simple.cc b/python/paddle/fluid/tests/custom_op/relu_op3_simple.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec64bce18736b7b3ef67b5ba58f9429913b3a92e
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/relu_op3_simple.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/extension.h"
+
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x);
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& grad_out);
+
+std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x);
+
+std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
+                                         const paddle::Tensor& out,
+                                         const paddle::Tensor& grad_out);
+
+std::vector<std::vector<int64_t>> ReluInferShape(std::vector<int64_t> x_shape);
+
+std::vector<paddle::DataType> ReluInferDType(paddle::DataType x_dtype);
+
+// Reuse codes in `relu_op_simple.cc/cu` to register another custom operator
+// to test jointly compile multi operators at same time.
+PD_BUILD_OP("relu3")
+    .Inputs({"X"})
+    .Outputs({"Out", "Fake_float64", "ZFake_int32"})
+    .SetKernelFn(PD_KERNEL(ReluForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(ReluInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(ReluInferDType))
+    .SetBackwardOp("relu3_grad")
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(ReluBackward));
diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b02ecba6826fa0b9dc4bcc6db07b9b8717f834aa
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
@@ -0,0 +1,136 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <iostream>
+#include <vector>
+
+#include "paddle/extension.h"
+
+template <typename data_t>
+void fill_constant_cpu_kernel(data_t* out_data, int64_t x_numel, data_t value) {
+  for (int i = 0; i < x_numel; ++i) {
+    out_data[i] = value;
+  }
+}
+
+template <typename data_t>
+void relu_cpu_forward_kernel(const data_t* x_data,
+                             data_t* out_data,
+                             int64_t x_numel) {
+  for (int i = 0; i < x_numel; ++i) {
+    out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_backward_kernel(const data_t* grad_out_data,
+                              const data_t* out_data,
+                              data_t* grad_x_data,
+                              int64_t out_numel) {
+  for (int i = 0; i < out_numel; ++i) {
+    grad_x_data[i] =
+        grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
+  out.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cpu_forward", ([&] {
+        relu_cpu_forward_kernel<data_t>(
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
+      }));
+  // fake multi output: Fake_float64 with float64 dtype
+  auto fake_float64 = paddle::Tensor(paddle::PlaceType::kCPU);
+  fake_float64.reshape(x.shape());
+
+  fill_constant_cpu_kernel<double>(
+      fake_float64.mutable_data<double>(x.place()), x.size(), 0.);
+
+  // fake multi output: ZFake_int32 with int32 dtype
+  auto zfake_int32 = paddle::Tensor(paddle::PlaceType::kCPU);
+  zfake_int32.reshape(x.shape());
+
+  fill_constant_cpu_kernel<int32_t>(
+      zfake_int32.mutable_data<int32_t>(x.place()), x.size(), 1);
+
+  return {out, fake_float64, zfake_int32};
+}
+
+std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
+                                              const paddle::Tensor& out,
+                                              const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU);
+  grad_x.reshape(x.shape());
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
+                               relu_cpu_backward_kernel<data_t>(
+                                   grad_out.data<data_t>(),
+                                   out.data<data_t>(),
+                                   grad_x.mutable_data<data_t>(x.place()),
+                                   out.size());
+                             }));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x);
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& grad_out);
+
+std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
+  // TODO(chenweihang): Check Input
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_forward(x);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_forward(x);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
+                                         const paddle::Tensor& out,
+                                         const paddle::Tensor& grad_out) {
+  // TODO(chenweihang): Check Input
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_backward(x, out, grad_out);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_backward(x, out, grad_out);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
+}
+
+std::vector<std::vector<int64_t>> ReluInferShape(std::vector<int64_t> x_shape) {
+  return {x_shape, x_shape, x_shape};
+}
+
+std::vector<paddle::DataType> ReluInferDType(paddle::DataType x_dtype) {
+  return {x_dtype, paddle::DataType::FLOAT64, paddle::DataType::INT32};
+}
+
+PD_BUILD_OP("relu2")
+    .Inputs({"X"})
+    .Outputs({"Out", "Fake_float64", "ZFake_int32"})
+    .SetKernelFn(PD_KERNEL(ReluForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(ReluInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(ReluInferDType))
+    .SetBackwardOp("relu2_grad")
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(ReluBackward));
diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cu b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2ef6a5c1451e7409faca951cf9542c20cef59466
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
@@ -0,0 +1,93 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/extension.h"
+
+template <typename data_t>
+__global__ void fill_constant_cuda_kernel(data_t* y,
+                                          const int num,
+                                          data_t value) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    y[i] = value;
+  }
+}
+
+template <typename data_t>
+__global__ void relu_cuda_forward_kernel(const data_t* x,
+                                         data_t* y,
+                                         const int num) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    y[i] = max(x[i], static_cast<data_t>(0.));
+  }
+}
+
+template <typename data_t>
+__global__ void relu_cuda_backward_kernel(const data_t* dy,
+                                          const data_t* y,
+                                          data_t* dx,
+                                          const int num) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+    dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kGPU);
+  out.reshape(x.shape());
+
+  int numel = x.size();
+  int block = 512;
+  int grid = (numel + block - 1) / block;
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cuda_forward_kernel", ([&] {
+        relu_cuda_forward_kernel<data_t><<<grid, block>>>(
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
+      }));
+  // fake multi output: Fake_1
+  auto fake_float64 = paddle::Tensor(paddle::PlaceType::kGPU);
+  fake_float64.reshape(x.shape());
+  fill_constant_cuda_kernel<double><<<grid, block>>>(
+      fake_float64.mutable_data<double>(x.place()), numel, 0.);
+  // fake multi output: ZFake_1
+  auto zfake_int32 = paddle::Tensor(paddle::PlaceType::kGPU);
+  zfake_int32.reshape(x.shape());
+  fill_constant_cuda_kernel<int32_t><<<grid, block>>>(
+      zfake_int32.mutable_data<int32_t>(x.place()), numel, 1);
+
+  return {out, fake_float64, zfake_int32};
+}
+
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU);
+  grad_x.reshape(x.shape());
+
+  int numel = out.size();
+  int block = 512;
+  int grid = (numel + block - 1) / block;
+  PD_DISPATCH_FLOATING_TYPES(
+      out.type(), "relu_cuda_backward_kernel", ([&] {
+        relu_cuda_backward_kernel<data_t><<<grid, block>>>(
+            grad_out.data<data_t>(),
+            out.data<data_t>(),
+            grad_x.mutable_data<data_t>(x.place()),
+            numel);
+      }));
+
+  return {grad_x};
+}
diff --git a/python/paddle/fluid/tests/custom_op/setup_build.py b/python/paddle/fluid/tests/custom_op/setup_build.py
new file mode 100644
index 0000000000000000000000000000000000000000..16a747793079e37dca6cc133093fb5ae7d824d73
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_op/setup_build.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os + +from utils import paddle_includes, extra_compile_args +from paddle.utils.cpp_extension import CppExtension, CUDAExtension, BuildExtension, setup +from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method + +# switch to old custom op method +use_new_custom_op_load_method(False) + +file_dir = os.path.dirname(os.path.abspath(__file__)) + +setup( + name='librelu2_op_from_setup', + ext_modules=[ + CUDAExtension( + sources=['relu_op3.cc', 'relu_op3.cu', 'relu_op.cc', + 'relu_op.cu'], # test for multi ops + include_dirs=paddle_includes, + extra_compile_args=extra_compile_args) + ], + cmdclass={ + 'build_ext': BuildExtension.with_options( + no_python_abi_suffix=True, output_dir=file_dir) # for unittest + }) diff --git a/python/paddle/fluid/tests/custom_op/setup_install.py b/python/paddle/fluid/tests/custom_op/setup_install.py new file mode 100644 index 0000000000000000000000000000000000000000..18fbfbaf8b64b328be108a837c54c55492bebe9e --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/setup_install.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +from utils import paddle_includes, extra_compile_args +from paddle.utils.cpp_extension import CUDAExtension, setup +from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method + +# switch to old custom op method +use_new_custom_op_load_method(False) + +setup( + name='custom_relu2', + ext_modules=CUDAExtension( # test for not specific name here. + sources=['relu_op.cc', 'relu_op.cu', 'relu_op3.cc', + 'relu_op3.cu'], # test for multi ops + include_dirs=paddle_includes, + extra_compile_args=extra_compile_args)) diff --git a/python/paddle/fluid/tests/custom_op/setup_install_simple.py b/python/paddle/fluid/tests/custom_op/setup_install_simple.py new file mode 100644 index 0000000000000000000000000000000000000000..ed236ccbd4c59c830eaa04761b23135fb8bc16d2 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/setup_install_simple.py @@ -0,0 +1,27 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from utils import paddle_includes, extra_compile_args +from paddle.utils.cpp_extension import CUDAExtension, setup + +setup( + name='simple_setup_relu2', + ext_modules=CUDAExtension( # test for not specific name here. 
+ sources=[ + 'relu_op_simple.cc', 'relu_op_simple.cu', 'relu_op3_simple.cc' + ], # test for multi ops + include_dirs=paddle_includes, + extra_compile_args=extra_compile_args)) diff --git a/python/paddle/fluid/tests/custom_op/test_check_abi.py b/python/paddle/fluid/tests/custom_op/test_check_abi.py new file mode 100644 index 0000000000000000000000000000000000000000..b171fca2076ac1395543558b23c4d990fb331d10 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_check_abi.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import os +import warnings + +import paddle.utils.cpp_extension.extension_utils as utils + + +class TestABIBase(unittest.TestCase): + def test_environ(self): + compiler = 'gcc' + for flag in ['1', 'True', 'true']: + os.environ['PADDLE_SKIP_CHECK_ABI'] = flag + self.assertTrue(utils.check_abi_compatibility(compiler)) + + def del_environ(self): + key = 'PADDLE_SKIP_CHECK_ABI' + if key in os.environ: + del os.environ[key] + + +class TestCheckLinux(TestABIBase): + def test_expected_compiler(self): + if utils.OS_NAME.startswith('linux'): + gt = ['gcc', 'g++', 'gnu-c++', 'gnu-cc'] + self.assertListEqual(utils._expected_compiler_current_platform(), + gt) + + def test_gcc_version(self): + # clear environ + self.del_environ() + compiler = 'g++' + if utils.OS_NAME.startswith('linux'): + # all CI gcc version > 5.4.0 + self.assertTrue( + utils.check_abi_compatibility( + compiler, verbose=True)) + + def test_wrong_compiler_warning(self): + # clear environ + self.del_environ() + compiler = 'nvcc' # fake wrong compiler + if utils.OS_NAME.startswith('linux'): + with warnings.catch_warnings(record=True) as error: + flag = utils.check_abi_compatibility(compiler, verbose=True) + # check return False + self.assertFalse(flag) + # check Compiler Compatibility WARNING + self.assertTrue(len(error) == 1) + self.assertTrue( + "Compiler Compatibility WARNING" in str(error[0].message)) + + def test_exception(self): + # clear environ + self.del_environ() + compiler = 'python' # fake command + if utils.OS_NAME.startswith('linux'): + # to skip _expected_compiler_current_platform + def fake(): + return [compiler] + + # mock a fake function + raw_func = utils._expected_compiler_current_platform + utils._expected_compiler_current_platform = fake + with warnings.catch_warnings(record=True) as error: + flag = utils.check_abi_compatibility(compiler, verbose=True) + # check return False + self.assertFalse(flag) + # check ABI Compatibility WARNING + self.assertTrue(len(error) == 1) + self.assertTrue("Failed to check compiler version for" in + str(error[0].message)) + + # restore + utils._expected_compiler_current_platform = raw_func + + +class TestCheckMacOs(TestABIBase): + def test_expected_compiler(self): + if utils.OS_NAME.startswith('darwin'): + gt = ['clang', 'clang++'] + self.assertListEqual(utils._expected_compiler_current_platform(), + gt) + + def test_gcc_version(self): + # clear 
environ + self.del_environ() + + if utils.OS_NAME.startswith('darwin'): + # clang has no version limitation. + self.assertTrue(utils.check_abi_compatibility()) + + +class TestCheckWindows(TestABIBase): + def test_gcc_version(self): + # clear environ + self.del_environ() + + if utils.IS_WINDOWS: + # we skip windows now + self.assertTrue(utils.check_abi_compatibility()) + + +class TestJITCompilerException(unittest.TestCase): + def test_exception(self): + with self.assertRaisesRegexp(RuntimeError, + "Failed to check Python interpreter"): + file_path = os.path.abspath(__file__) + utils._jit_compile(file_path, interpreter='fake_cmd', verbose=True) + + +class TestRunCMDException(unittest.TestCase): + def test_exception(self): + for verbose in [True, False]: + with self.assertRaisesRegexp(RuntimeError, "Failed to run command"): + cmd = "fake cmd" + utils.run_cmd(cmd, verbose) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_op.py b/python/paddle/fluid/tests/custom_op/test_custom_op.py index c9f7d0b7c966ad1f99160de4b879f09b013bc513..1c0db0be154d585e0a6d1514558e116ee83c9701 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_op.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_op.py @@ -20,11 +20,16 @@ import contextlib import paddle import paddle.fluid as fluid - paddle.enable_static() -file_dir = os.path.dirname(os.path.abspath(__file__)) -fluid.load_op_library(os.path.join(file_dir, 'librelu2_op.so')) + +def load_so(so_name): + """ + Load .so file and parse custom op into OpInfoMap. + """ + file_dir = os.path.dirname(os.path.abspath(__file__)) + fluid.load_op_library(os.path.join(file_dir, so_name)) + from paddle.fluid.layer_helper import LayerHelper @@ -111,4 +116,5 @@ class CustomOpTest(unittest.TestCase): if __name__ == '__main__': + load_so(so_name='librelu2_op.so') unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch.py b/python/paddle/fluid/tests/custom_op/test_dispatch.py new file mode 100644 index 0000000000000000000000000000000000000000..1766a6042f395f34a39fc6da8d93646ca6b50597 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_dispatch.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
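+# Each `dispatch_test_*` op below is expected to copy its input to its output,
+# so the checks verify both the value and the dtype of the result, i.e. that
+# the dtype switch selected the matching kernel instantiation. A minimal
+# sketch of the pattern exercised by run_dispatch_test:
+#
+#   x = paddle.to_tensor(np.ones([2, 2]).astype('int32'))
+#   out = dispatch_op.dispatch_test_integer(x)  # resolves the int32 branch
+#   assert np.array_equal(x.numpy(), out.numpy())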
+ +import os +import unittest +import paddle +import numpy as np +from paddle.utils.cpp_extension import load +from utils import paddle_includes, extra_compile_args + +dispatch_op = load( + name='dispatch_op', + sources=['dispatch_test_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cflags=extra_compile_args) # add for Coverage CI + + +class TestJitDispatch(unittest.TestCase): + def setUp(self): + paddle.set_device('cpu') + + def run_dispatch_test(self, func, dtype): + np_x = np.ones([2, 2]).astype(dtype) + x = paddle.to_tensor(np_x) + out = func(x) + np_x = x.numpy() + np_out = out.numpy() + self.assertTrue(dtype in str(np_out.dtype)) + self.assertTrue( + np.array_equal(np_x, np_out), + "custom op x: {},\n custom op out: {}".format(np_x, np_out)) + + def test_dispatch_integer(self): + dtypes = ["int32", "int64", "int8", "uint8", "int16"] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_integer, dtype) + + def test_dispatch_complex(self): + dtypes = ["complex64", "complex128"] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_complex, dtype) + + def test_dispatch_float_and_integer(self): + dtypes = [ + "float32", "float64", "int32", "int64", "int8", "uint8", "int16" + ] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_float_and_integer, + dtype) + + def test_dispatch_float_and_complex(self): + dtypes = ["float32", "float64", "complex64", "complex128"] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_float_and_complex, + dtype) + + def test_dispatch_float_and_integer_and_complex(self): + dtypes = [ + "float32", "float64", "int32", "int64", "int8", "uint8", "int16", + "complex64", "complex128" + ] + for dtype in dtypes: + self.run_dispatch_test( + dispatch_op.dispatch_test_float_and_integer_and_complex, dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_jit_load.py b/python/paddle/fluid/tests/custom_op/test_jit_load.py new file mode 100644 index 0000000000000000000000000000000000000000..222c69f5edcc56747bf3819ac4bfd5b1915e3ded --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_jit_load.py @@ -0,0 +1,51 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import paddle +import numpy as np +from paddle.utils.cpp_extension import load +from utils import paddle_includes, extra_compile_args +from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method + +# switch to old custom op method +use_new_custom_op_load_method(False) + +# Compile and load custom op Just-In-Time. 
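+# Under the hood, `load` writes a setup.py into the build directory, runs
+# `python setup.py build` in a subprocess via _jit_compile, and then imports
+# the resulting shared library, returning a module that exposes one python
+# API per registered operator (here: relu2 and relu3).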
+custom_module = load( + name='custom_relu2', + sources=['relu_op.cc', 'relu_op.cu', 'relu_op3.cc', 'relu_op3.cu'], + interpreter='python', # add for unittest + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cflags=extra_compile_args, # add for Coverage CI + verbose=True # add for unittest +) + + +class TestJITLoad(unittest.TestCase): + def test_api(self): + raw_data = np.array([[-1, 1, 0], [1, -1, -1]]).astype('float32') + gt_data = np.array([[0, 1, 0], [1, 0, 0]]).astype('float32') + x = paddle.to_tensor(raw_data, dtype='float32') + # use custom api + out = custom_module.relu2(x) + out3 = custom_module.relu3(x) + + self.assertTrue(np.array_equal(out.numpy(), gt_data)) + self.assertTrue(np.array_equal(out3.numpy(), gt_data)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_setup_build.py b/python/paddle/fluid/tests/custom_op/test_setup_build.py new file mode 100644 index 0000000000000000000000000000000000000000..1ef14c2e3aaa3c9412660b7afcc14e22aa6402ea --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_setup_build.py @@ -0,0 +1,69 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np +from test_custom_op import CustomOpTest, load_so +import paddle +from paddle.utils.cpp_extension.extension_utils import run_cmd +from paddle.fluid.layer_helper import LayerHelper +from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method + +# switch to old custom op method +use_new_custom_op_load_method(False) + + +def compile_so(): + """ + Compile .so file by running setup.py config. + """ + # build .so with setup.py + file_dir = os.path.dirname(os.path.abspath(__file__)) + cmd = 'cd {} && python setup_build.py build'.format(file_dir) + run_cmd(cmd) + + +# `setup.py build` only produce .so file containing multi operators. +# Python Interface should be added manually. 
`relu2` api is in `test_custom_op.py` +def relu3(x, name=None): + helper = LayerHelper("relu3", **locals()) + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False) + helper.append_op(type="relu3", inputs={"X": x}, outputs={"Y": out}) + return out + + +class TestCompileMultiOp(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + def test_relu3(self): + raw_data = np.array([[-1, 1, 0], [1, -1, -1]]).astype('float32') + x = paddle.to_tensor(raw_data, dtype='float32') + # use custom api + out = relu3(x) + + self.assertTrue( + np.array_equal(out.numpy(), + np.array([[0, 1, 0], [1, 0, 0]]).astype('float32'))) + + def tearDown(self): + paddle.enable_static() + + +if __name__ == '__main__': + compile_so() + load_so(so_name='librelu2_op_from_setup.so') + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_setup_install.py b/python/paddle/fluid/tests/custom_op/test_setup_install.py new file mode 100644 index 0000000000000000000000000000000000000000..1fd7b8a06f952336274a78b253d096644fbdf08f --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_setup_install.py @@ -0,0 +1,65 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import site +import unittest +import paddle +import subprocess +import numpy as np +from paddle.utils.cpp_extension.extension_utils import run_cmd +from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method + +# switch to old custom op method +use_new_custom_op_load_method(False) + + +class TestSetUpInstall(unittest.TestCase): + def setUp(self): + cur_dir = os.path.dirname(os.path.abspath(__file__)) + # compile, install the custom op egg into site-packages under background + cmd = 'cd {} && python setup_install.py install'.format(cur_dir) + run_cmd(cmd) + + # NOTE(Aurelius84): Normally, it's no need to add following codes for users. + # But we simulate to pip install in current process, so interpreter don't snap + # sys.path has been updated. So we update it manually. + + # See: https://stackoverflow.com/questions/56974185/import-runtime-installed-module-using-pip-in-python-3 + site_dir = site.getsitepackages()[0] + custom_egg_path = [ + x for x in os.listdir(site_dir) if 'custom_relu2' in x + ] + assert len(custom_egg_path) == 1, "Matched egg number is %d." 
% len( + custom_egg_path) + sys.path.append(os.path.join(site_dir, custom_egg_path[0])) + + def test_api(self): + # usage: import the package directly + import custom_relu2 + + raw_data = np.array([[-1, 1, 0], [1, -1, -1]]).astype('float32') + gt_data = np.array([[0, 1, 0], [1, 0, 0]]).astype('float32') + x = paddle.to_tensor(raw_data, dtype='float32') + # use custom api + out = custom_relu2.relu2(x) + out3 = custom_relu2.relu3(x) + + self.assertTrue(np.array_equal(out.numpy(), gt_data)) + self.assertTrue(np.array_equal(out3.numpy(), gt_data)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..2c0dc1a4ca6a119c1dc9dd0bf8add15e677aaf43 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py @@ -0,0 +1,125 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import paddle +import numpy as np +from paddle.utils.cpp_extension import load +from utils import paddle_includes, extra_compile_args +from test_simple_custom_op_setup import relu2_dynamic, relu2_static + +# Compile and load custom op Just-In-Time. 
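+# Unlike test_jit_load.py, this test keeps the new custom op method (ops
+# registered with PD_BUILD_OP), so no use_new_custom_op_load_method(False)
+# switch is needed. A sketch of what the loaded multi-output op provides:
+#
+#   out, zero_float64, one_int32 = custom_module.relu2(x)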
+custom_module = load( + name='simple_jit_relu2', + sources=['relu_op_simple.cc', 'relu_op_simple.cu', 'relu_op3_simple.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cflags=extra_compile_args) # add for Coverage CI + + +class TestJITLoad(unittest.TestCase): + def setUp(self): + self.custom_ops = [custom_module.relu2, custom_module.relu3] + self.dtypes = ['float32', 'float64'] + self.devices = ['cpu', 'gpu'] + + def test_static(self): + for device in self.devices: + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + for custom_op in self.custom_ops: + out = relu2_static(custom_op, device, dtype, x) + pd_out = relu2_static(custom_op, device, dtype, x, False) + self.assertTrue( + np.array_equal(out, pd_out), + "custom op out: {},\n paddle api out: {}".format( + out, pd_out)) + + def test_dynamic(self): + for device in self.devices: + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + for custom_op in self.custom_ops: + out, x_grad = relu2_dynamic(custom_op, device, dtype, x) + pd_out, pd_x_grad = relu2_dynamic(custom_op, device, dtype, + x, False) + self.assertTrue( + np.array_equal(out, pd_out), + "custom op out: {},\n paddle api out: {}".format( + out, pd_out)) + self.assertTrue( + np.array_equal(x_grad, pd_x_grad), + "custom op x grad: {},\n paddle api x grad: {}".format( + x_grad, pd_x_grad)) + + +class TestMultiOutputDtypes(unittest.TestCase): + def setUp(self): + self.custom_op = custom_module.relu2 + self.dtypes = ['float32', 'float64'] + self.devices = ['cpu', 'gpu'] + + def test_static(self): + paddle.enable_static() + for device in self.devices: + for dtype in self.dtypes: + res = self.run_static(device, dtype) + self.check_multi_outputs(res) + paddle.disable_static() + + def test_dynamic(self): + for device in self.devices: + for dtype in self.dtypes: + paddle.set_device(device) + x_data = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + x = paddle.to_tensor(x_data) + outs = self.custom_op(x) + + self.assertTrue(len(outs) == 3) + self.check_multi_outputs(outs, True) + + def check_multi_outputs(self, outs, is_dynamic=False): + out, zero_float64, one_int32 = outs + if is_dynamic: + zero_float64 = zero_float64.numpy() + one_int32 = one_int32.numpy() + # Fake_float64 + self.assertTrue('float64' in str(zero_float64.dtype)) + self.assertTrue( + np.array_equal(zero_float64, np.zeros([4, 8]).astype('float64'))) + # ZFake_int32 + self.assertTrue('int32' in str(one_int32.dtype)) + self.assertTrue( + np.array_equal(one_int32, np.ones([4, 8]).astype('int32'))) + + def run_static(self, device, dtype): + paddle.set_device(device) + x_data = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + + with paddle.static.scope_guard(paddle.static.Scope()): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name='X', shape=[None, 8], dtype=dtype) + outs = self.custom_op(x) + + exe = paddle.static.Executor() + exe.run(paddle.static.default_startup_program()) + res = exe.run(paddle.static.default_main_program(), + feed={'X': x_data}, + fetch_list=outs) + + return res + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..cfa2db0ba24a49a20b825e47d2b90077c3b6d463 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py @@ -0,0 +1,160 @@ +# Copyright 
(c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import site
+import unittest
+import paddle
+import paddle.static as static
+import subprocess
+import numpy as np
+from paddle.utils.cpp_extension.extension_utils import run_cmd
+
+
+def relu2_dynamic(func, device, dtype, np_x, use_func=True):
+    paddle.set_device(device)
+
+    t = paddle.to_tensor(np_x)
+    t.stop_gradient = False
+
+    out = func(t)[0] if use_func else paddle.nn.functional.relu(t)
+    out.stop_gradient = False
+
+    out.backward()
+
+    return out.numpy(), t.grad
+
+
+def relu2_static(func, device, dtype, np_x, use_func=True):
+    paddle.enable_static()
+    paddle.set_device(device)
+
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name='X', shape=[None, 8], dtype=dtype)
+            x.stop_gradient = False
+            # out, fake_float64, fake_int32
+            out = func(x)[0] if use_func else paddle.nn.functional.relu(x)
+            static.append_backward(out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+            # in static mode, x data has been overwritten by out
+            out_v = exe.run(static.default_main_program(),
+                            feed={'X': np_x},
+                            fetch_list=[out.name])
+
+    paddle.disable_static()
+    return out_v
+
+
+def relu2_static_pe(func, device, dtype, np_x, use_func=True):
+    paddle.enable_static()
+    paddle.set_device(device)
+
+    places = static.cpu_places() if device == 'cpu' else static.cuda_places()
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name='X', shape=[None, 8], dtype=dtype)
+            x.stop_gradient = False
+            out = func(x)[0] if use_func else paddle.nn.functional.relu(x)
+            static.append_backward(out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+
+            # in static mode, x data has been overwritten by out
+            compiled_prog = static.CompiledProgram(static.default_main_program(
+            )).with_data_parallel(
+                loss_name=out.name, places=places)
+            out_v = exe.run(compiled_prog,
+                            feed={'X': np_x},
+                            fetch_list=[out.name])
+
+    paddle.disable_static()
+    return out_v
+
+
+class TestNewCustomOpSetUpInstall(unittest.TestCase):
+    def setUp(self):
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+        # compile and install the custom op egg into site-packages in the background
+        cmd = 'cd {} && python setup_install_simple.py install'.format(cur_dir)
+        run_cmd(cmd)
+
+        # NOTE(Aurelius84): Normally, users do not need the following code. But we
+        # simulate a pip install inside the current process, so the interpreter does
+        # not know that sys.path has been updated. Therefore we update it manually.
+
+        # See: https://stackoverflow.com/questions/56974185/import-runtime-installed-module-using-pip-in-python-3
+        site_dir = site.getsitepackages()[0]
+        custom_egg_path = [
+            x for x in os.listdir(site_dir) if 'simple_setup_relu2' in x
+        ]
+        assert len(custom_egg_path) == 1, "Matched egg number is %d."
% len( + custom_egg_path) + sys.path.append(os.path.join(site_dir, custom_egg_path[0])) + + # usage: import the package directly + import simple_setup_relu2 + self.custom_ops = [simple_setup_relu2.relu2, simple_setup_relu2.relu3] + + self.dtypes = ['float32', 'float64'] + self.devices = ['cpu', 'gpu'] + + def test_static(self): + for device in self.devices: + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + for custom_op in self.custom_ops: + out = relu2_static(custom_op, device, dtype, x) + pd_out = relu2_static(custom_op, device, dtype, x, False) + self.assertTrue( + np.array_equal(out, pd_out), + "custom op out: {},\n paddle api out: {}".format( + out, pd_out)) + + def test_static_pe(self): + for device in self.devices: + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + for custom_op in self.custom_ops: + out = relu2_static_pe(custom_op, device, dtype, x) + pd_out = relu2_static_pe(custom_op, device, dtype, x, False) + self.assertTrue( + np.array_equal(out, pd_out), + "custom op out: {},\n paddle api out: {}".format( + out, pd_out)) + + def test_dynamic(self): + for device in self.devices: + for dtype in self.dtypes: + x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) + for custom_op in self.custom_ops: + out, x_grad = relu2_dynamic(custom_op, device, dtype, x) + pd_out, pd_x_grad = relu2_dynamic(custom_op, device, dtype, + x, False) + self.assertTrue( + np.array_equal(out, pd_out), + "custom op out: {},\n paddle api out: {}".format( + out, pd_out)) + self.assertTrue( + np.array_equal(x_grad, pd_x_grad), + "custom op x grad: {},\n paddle api x grad: {}".format( + x_grad, pd_x_grad)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/utils.py b/python/paddle/fluid/tests/custom_op/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f293c751942cda432400ce1786326eb14cf6a9b2 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/utils.py @@ -0,0 +1,33 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import six +from distutils.sysconfig import get_python_lib +from paddle.utils.cpp_extension.extension_utils import IS_WINDOWS + +site_packages_path = get_python_lib() +# Note(Aurelius84): We use `add_test` in Cmake to config how to run unittest in CI. +# `PYTHONPATH` will be set as `build/python/paddle` that will make no way to find +# paddle include directory. Because the following path is generated after insalling +# PaddlePaddle whl. So here we specific `include_dirs` to avoid errors in CI. +paddle_includes = [ + os.path.join(site_packages_path, 'paddle/include'), + os.path.join(site_packages_path, 'paddle/include/third_party') +] + +# TODO(Aurelius84): Memory layout is different if build paddle with PADDLE_WITH_MKLDNN=ON, +# and will lead to ABI problem on Coverage CI. We will handle it in next PR. 
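+# These two values are imported by the tests in this directory and forwarded
+# to the build, e.g. (sketch):
+#
+#   load(name='op', sources=['op.cc'], extra_include_paths=paddle_includes,
+#        extra_cflags=extra_compile_args)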
+extra_compile_args = ['-DPADDLE_WITH_MKLDNN' + ] if six.PY2 and not IS_WINDOWS else [] diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index faf0fd4984d7ca0c1d1c2457cf9b6f6186852eb9..1db1b66426c8313f66e16081a55e420a2e34138b 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -25,6 +25,8 @@ from ..fluid.framework import require_version from . import download +from . import cpp_extension + __all__ = ['dump_config', 'deprecated', 'download', 'run_check'] #TODO: define new api under this directory diff --git a/python/paddle/utils/cpp_extension/__init__.py b/python/paddle/utils/cpp_extension/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..024fbb6bf7c4ed1a156322c83f34aed59e8da25b --- /dev/null +++ b/python/paddle/utils/cpp_extension/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .cpp_extension import CUDAExtension +from .cpp_extension import CppExtension +from .cpp_extension import BuildExtension +from .cpp_extension import load, setup + +from .extension_utils import parse_op_info +from .extension_utils import get_build_directory +from .extension_utils import load_op_meta_info_and_register_op + +from . import cpp_extension +from . import extension_utils + +__all__ = [ + 'CppExtension', 'CUDAExtension', 'BuildExtension', 'load', 'setup', + 'get_build_directory' +] diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py new file mode 100644 index 0000000000000000000000000000000000000000..121c1626125af9974519e30ac87d8130c7466f25 --- /dev/null +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -0,0 +1,471 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
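+# Overview: this module provides two ways to build custom operators: ahead of
+# time via `setup` in a user's setup.py, and just in time via `load`. A minimal
+# sketch, assuming relu_op.cc/relu_op.cu define the operator:
+#
+#   from paddle.utils.cpp_extension import CUDAExtension, setup, load
+#
+#   # AOT: run `python setup.py install`, then `import custom_module`
+#   setup(name='custom_module',
+#         ext_modules=CUDAExtension(sources=['relu_op.cc', 'relu_op.cu']))
+#
+#   # JIT: compile, link and import in one call
+#   custom_module = load(name='custom_module',
+#                        sources=['relu_op.cc', 'relu_op.cu'])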
+
+import os
+import six
+import sys
+import textwrap
+import copy
+
+import setuptools
+from setuptools.command.easy_install import easy_install
+from setuptools.command.build_ext import build_ext
+
+from .extension_utils import find_cuda_home, normalize_extension_kwargs, add_compile_flag, bootstrap_context
+from .extension_utils import is_cuda_file, prepare_unix_cflags, add_std_without_repeat, get_build_directory
+from .extension_utils import _import_module_from_library, CustomOpInfo, _write_setup_file, _jit_compile, parse_op_name_from
+from .extension_utils import check_abi_compatibility, log_v, IS_WINDOWS
+from .extension_utils import use_new_custom_op_load_method
+
+CUDA_HOME = find_cuda_home()
+
+
+def setup(**attr):
+    """
+    Wraps the setuptools.setup function to validate the `build_ext` command and
+    implement paddle api code injection by switching the `write_stub`
+    function in bdist_egg to `custom_write_stub`.
+
+    Its usage is almost the same as `setuptools.setup` except for the `ext_modules`
+    argument. To compile multiple custom operators, all necessary source files
+    can be included in just one Extension (CppExtension/CUDAExtension).
+    Moreover, only one `name` argument is required in `setup`; there is no need
+    to specify `name` in the Extension.
+
+    Example:
+
+        >> from paddle.utils.cpp_extension import CUDAExtension, setup
+        >> setup(name='custom_module',
+                 ext_modules=CUDAExtension(
+                     sources=['relu_op.cc', 'relu_op.cu'],
+                     include_dirs=[],        # user-defined include dirs
+                     extra_compile_args=[])) # user-defined compile arguments
+    """
+    cmdclass = attr.get('cmdclass', {})
+    assert isinstance(cmdclass, dict)
+    # if cmdclass is not specified in setup, add it automatically.
+    if 'build_ext' not in cmdclass:
+        cmdclass['build_ext'] = BuildExtension.with_options(
+            no_python_abi_suffix=True)
+        attr['cmdclass'] = cmdclass
+
+    error_msg = """
+    The `name` argument is required in paddle.utils.cpp_extension.setup.
+    It's used as `import XXX` when you want to install and import your custom operators.\n
+    For Example:
+        # setup.py file
+        from paddle.utils.cpp_extension import CUDAExtension, setup
+        setup(name='custom_module',
+              ext_modules=CUDAExtension(
+                  sources=['relu_op.cc', 'relu_op.cu']))
+
+        # After running `python setup.py install`
+        from custom_module import relu2
+    """
+    # name argument is required
+    if 'name' not in attr:
+        raise ValueError(error_msg)
+
+    ext_modules = attr.get('ext_modules', [])
+    if not isinstance(ext_modules, list):
+        ext_modules = [ext_modules]
+    assert len(
+        ext_modules
+    ) == 1, "Only one Extension is supported, but received {}. If you want to compile multiple operators, you can include all necessary source files in one Extension.".format(
+        len(ext_modules))
+    # replace Extension.name with attr['name'] to keep it consistent with the package name.
+    for ext_module in ext_modules:
+        ext_module.name = attr['name']
+
+    attr['ext_modules'] = ext_modules
+
+    # Add rename .so hook in easy_install
+    assert 'easy_install' not in cmdclass
+    cmdclass['easy_install'] = EasyInstallCommand
+
+    # Always set zip_safe=False for compatibility with PY2 and PY3
+    # See http://peak.telecommunity.com/DevCenter/setuptools#setting-the-zip-safe-flag
+    attr['zip_safe'] = False
+
+    # switch `write_stub` to inject paddle api in .egg
+    with bootstrap_context():
+        setuptools.setup(**attr)
+
+
+def CppExtension(sources, *args, **kwargs):
+    """
+    Returns a setuptools.Extension instance for setup.py to make it easy
+    to specify compile flags while building a C++ custom op kernel.
+
+    Args:
+        sources(list[str]): The C++/CUDA source file names
+        args(list[options]): list of config options used to compile the shared library
+        kwargs(dict[option]): dict of config options used to compile the shared library
+
+    Returns:
+        Extension: An instance of setuptools.Extension
+    """
+    kwargs = normalize_extension_kwargs(kwargs, use_cuda=False)
+    # Note(Aurelius84): When using `setup` and `jit`, the Extension `name` will
+    # be replaced by `setup.name` to keep it consistent with the package name,
+    # because we allow users to omit `name` in the Extension.
+    # See `paddle.utils.cpp_extension.setup` for details.
+    name = kwargs.get('name', None)
+    if name is None:
+        name = _generate_extension_name(sources)
+
+    return setuptools.Extension(name, sources, *args, **kwargs)
+
+
+def CUDAExtension(sources, *args, **kwargs):
+    """
+    Returns a setuptools.Extension instance for setup.py to make it easy
+    to specify compile flags while building a CUDA custom op kernel.
+
+    Args:
+        sources(list[str]): The C++/CUDA source file names
+        args(list[options]): list of config options used to compile the shared library
+        kwargs(dict[option]): dict of config options used to compile the shared library
+
+    Returns:
+        Extension: An instance of setuptools.Extension
+    """
+    kwargs = normalize_extension_kwargs(kwargs, use_cuda=True)
+    # Note(Aurelius84): When using `setup` and `jit`, the Extension `name` will
+    # be replaced by `setup.name` to keep it consistent with the package name,
+    # because we allow users to omit `name` in the Extension.
+    # See `paddle.utils.cpp_extension.setup` for details.
+    name = kwargs.get('name', None)
+    if name is None:
+        name = _generate_extension_name(sources)
+
+    return setuptools.Extension(name, sources, *args, **kwargs)
+
+
+def _generate_extension_name(sources):
+    """
+    Generate the extension name from the source files.
+    """
+    assert len(sources) > 0, "source file list is empty"
+    file_prefix = []
+    for source in sources:
+        source = os.path.basename(source)
+        filename, _ = os.path.splitext(source)
+        # Use a list to keep a deterministic order.
+        if filename not in file_prefix:
+            file_prefix.append(filename)
+
+    return '_'.join(file_prefix)
+
+
+class BuildExtension(build_ext, object):
+    """
+    Inherited from setuptools.command.build_ext to customize the compilation
+    process for shared libraries.
+    """
+
+    @classmethod
+    def with_options(cls, **options):
+        """
+        Returns a BuildExtension subclass containing user-defined options.
+        """
+
+        class cls_with_options(cls):
+            def __init__(self, *args, **kwargs):
+                kwargs.update(options)
+                cls.__init__(self, *args, **kwargs)
+
+        return cls_with_options
+
+    def __init__(self, *args, **kwargs):
+        """
+        Attributes are initialized in the following order:
+
+            1. super(self).__init__()
+            2. initialize_options(self)
+            3. the rest of the current __init__()
+            4. finalize_options(self)
+
+        So it is recommended to set attribute values in `finalize_options`.
+        """
+        super(BuildExtension, self).__init__(*args, **kwargs)
+        self.no_python_abi_suffix = kwargs.get("no_python_abi_suffix", True)
+        self.output_dir = kwargs.get("output_dir", None)
+        # for compatibility with the two custom op definition methods
+        use_new_custom_op_load_method(
+            kwargs.get("use_new_method", use_new_custom_op_load_method()))
+
+    def initialize_options(self):
+        super(BuildExtension, self).initialize_options()
+
+    def finalize_options(self):
+        super(BuildExtension, self).finalize_options()
+        # NOTE(Aurelius84): Set the location of the compiled shared library.
+        # Be careful when modifying this, because the `setup.py build/install`
+        # and `load` interfaces rely on this attribute.
+        if self.output_dir is not None:
+            self.build_lib = self.output_dir
+
+    def build_extensions(self):
+        self._check_abi()
+        for extension in self.extensions:
+            # check settings of compiler
+            if isinstance(extension.extra_compile_args, dict):
+                for compiler in ['cxx', 'nvcc']:
+                    if compiler not in extension.extra_compile_args:
+                        extension.extra_compile_args[compiler] = []
+            # add the required compile flags
+            add_compile_flag(extension, '-std=c++11')
+
+        # Consider .cu, .cu.cc as valid source extensions.
+        self.compiler.src_extensions += ['.cu', '.cu.cc']
+        # Save the original _compile method for later.
+        if self.compiler.compiler_type == 'msvc' or IS_WINDOWS:
+            raise NotImplementedError("MSVC is not supported currently.")
+        else:
+            original_compile = self.compiler._compile
+
+        def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs,
+                                        pp_opts):
+            """
+            Monkey-patch mechanism that replaces the inner compiler with a
+            custom compile process on Unix platforms.
+            """
+            # use abspath to avoid warnings; do not remove the deepcopy, because
+            # modifying dict-typed parameters in place is dangerous.
+            src = os.path.abspath(src)
+            cflags = copy.deepcopy(extra_postargs)
+            try:
+                original_compiler = self.compiler.compiler_so
+                # nvcc compiles CUDA source
+                if is_cuda_file(src):
+                    assert CUDA_HOME is not None
+                    nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc')
+                    self.compiler.set_executable('compiler_so', nvcc_cmd)
+                    # {'nvcc': {}, 'cxx': {}}
+                    if isinstance(cflags, dict):
+                        cflags = cflags['nvcc']
+                    else:
+                        cflags = prepare_unix_cflags(cflags)
+                # cxx compiles C++ source
+                elif isinstance(cflags, dict):
+                    cflags = cflags['cxx']
+
+                add_std_without_repeat(
+                    cflags, self.compiler.compiler_type, use_std14=False)
+                original_compile(obj, src, ext, cc_args, cflags, pp_opts)
+            finally:
+                # restore the original compiler
+                self.compiler.compiler_so = original_compiler
+
+        def object_filenames_with_cuda(original_func, build_directory):
+            """
+            Decorates the function to add a customized naming mechanism.
+            Originally, both .cc and .cu produce .o object outputs, which
+            causes file-overwrite problems. Use .cu.o as the CUDA object suffix.
+            """
+
+            def wrapper(source_filenames, strip_dir=0, output_dir=''):
+                try:
+                    objects = original_func(source_filenames, strip_dir,
+                                            output_dir)
+                    for i, source in enumerate(source_filenames):
+                        # modify xx.o -> xx.cu.o
+                        if is_cuda_file(source):
+                            old_obj = objects[i]
+                            objects[i] = old_obj[:-1] + 'cu.o'
+                    # if the user set build_directory, output objects there.
+                    if build_directory is not None:
+                        objects = [
+                            os.path.join(build_directory, os.path.basename(obj))
+                            for obj in objects
+                        ]
+                    # ensure to use abspath
+                    objects = [os.path.abspath(obj) for obj in objects]
+                finally:
+                    self.compiler.object_filenames = original_func
+
+                return objects
+
+            return wrapper
+
+        # customized compile process
+        self.compiler._compile = unix_custom_single_compiler
+        self.compiler.object_filenames = object_filenames_with_cuda(
+            self.compiler.object_filenames, self.build_lib)
+
+        self._record_op_info()
+
+        print("Compiling user custom op, it will cost a few seconds.....")
+        build_ext.build_extensions(self)
+
+    def get_ext_filename(self, fullname):
+        # for example: custom_extension.cpython-37m-x86_64-linux-gnu.so
+        ext_name = super(BuildExtension, self).get_ext_filename(fullname)
+        if self.no_python_abi_suffix and six.PY3:
+            split_str = '.'
+            name_items = ext_name.split(split_str)
+            assert len(
+                name_items
+            ) > 2, "Expected len(name_items) > 2, but received {}".format(
+                len(name_items))
+            name_items.pop(-2)
+            # custom_extension.so
+            ext_name = split_str.join(name_items)
+
+        return ext_name
+
+    def _check_abi(self):
+        """
+        Check ABI compatibility.
+        """
+        if hasattr(self.compiler, 'compiler_cxx'):
+            compiler = self.compiler.compiler_cxx[0]
+        elif IS_WINDOWS:
+            compiler = os.environ.get('CXX', 'cl')
+            raise NotImplementedError("Windows is not supported currently.")
+        else:
+            compiler = os.environ.get('CXX', 'c++')
+
+        check_abi_compatibility(compiler)
+
+    def _record_op_info(self):
+        """
+        Record custom op information.
+        """
+        # parse the shared library abs path
+        outputs = self.get_outputs()
+        assert len(outputs) == 1
+        # multiple operators are built into one .so file
+        so_path = os.path.abspath(outputs[0])
+        so_name = os.path.basename(so_path)
+
+        for i, extension in enumerate(self.extensions):
+            sources = [os.path.abspath(s) for s in extension.sources]
+            op_names = parse_op_name_from(sources)
+
+            for op_name in op_names:
+                CustomOpInfo.instance().add(op_name,
+                                            so_name=so_name,
+                                            so_path=so_path)
+
+
+class EasyInstallCommand(easy_install, object):
+    """
+    Extend the easy_install command to control how the shared library file
+    is named.
+
+    NOTE(Aurelius84): This is a hook subclass of the easy_install command, used
+    to rename the shared library file after extracting egg-info into site-packages.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super(EasyInstallCommand, self).__init__(*args, **kwargs)
+
+    # NOTE(Aurelius84): Add args and kwargs for compatibility with PY2/PY3
+    def run(self, *args, **kwargs):
+        super(EasyInstallCommand, self).run(*args, **kwargs)
+        # NOTE: To avoid importing the .so file instead of the python file
+        # (they share the same name), we rename the .so shared library.
+        for egg_file in self.outputs:
+            filename, ext = os.path.splitext(egg_file)
+            if ext == '.so':
+                new_so_path = filename + "_pd_" + ext
+                if not os.path.exists(new_so_path):
+                    os.rename(r'%s' % egg_file, r'%s' % new_so_path)
+                    assert os.path.exists(new_so_path)
+
+
+def load(name,
+         sources,
+         extra_cflags=None,
+         extra_cuda_cflags=None,
+         extra_ldflags=None,
+         extra_include_paths=None,
+         build_directory=None,
+         interpreter=None,
+         verbose=False):
+    """
+    An interface to compile C++/CUDA source files Just-In-Time and return a
+    callable python function, like other Paddle layer APIs. It registers the
+    user-defined custom op in the background.
+
+    This function performs the compiling, linking, api generation and module
+    loading processes for users. It does not require a CMake or Ninja environment,
+    only g++/nvcc on Linux and clang++ on MacOS. Moreover, ABI compatibility is
+    checked to ensure that the compiler version on the local machine is compatible
+    with the pre-installed Paddle whl in python site-packages. For example, if
+    Paddle is built with GCC 5.4, the user's local compiler should satisfy
+    GCC >= 5.4. Otherwise, a fatal error will occur because of ABI incompatibility.
+
+    Args:
+        name(str): generated shared library file name.
+        sources(list[str]): custom op source file names with .cc/.cu suffix.
+        extra_cflags(list[str]): additional flags used to compile CPP files. By default
+                               all basic and framework related flags have been included.
+                               If your pre-installed Paddle supports MKLDNN, please add
+                               '-DPADDLE_WITH_MKLDNN'. Default None.
+        extra_cuda_cflags(list[str]): additional flags used to compile CUDA files.
See + https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + for details. Default None. + extra_ldflags(list[str]): additonal flags used to link shared library. See + https://gcc.gnu.org/onlinedocs/gcc/Link-Options.html for details. + Default None. + extra_include_paths(list[str]): additional include path used to search header files. + Default None. + build_directory(str): specific directory path to put shared library file. If set None, + it will use `PADDLE_EXTENSION_DIR` from os.environ. Use + `paddle.utils.cpp_extension.get_build_directory()` to see the location. + interpreter(str): alias or full interpreter path to specific which one to use if have installed multiple. + If set None, will use `python` as default interpreter. + verbose(bool): whether to verbose compiled log information + + Returns: + custom api: A callable python function with same signature as CustomOp Kernel defination. + + Example: + + >> from paddle.utils.cpp_extension import load + >> relu2 = load(name='relu2', + sources=['relu_op.cc', 'relu_op.cu']) + >> x = paddle.rand([4, 10]], dtype='float32') + >> out = relu2(x) + """ + + if build_directory is None: + build_directory = get_build_directory(verbose) + + # ensure to use abs path + build_directory = os.path.abspath(build_directory) + log_v("build_directory: {}".format(build_directory), verbose) + + file_path = os.path.join(build_directory, "setup.py") + sources = [os.path.abspath(source) for source in sources] + + # TODO(Aurelius84): split cflags and cuda_flags + if extra_cflags is None: extra_cflags = [] + if extra_cuda_cflags is None: extra_cuda_cflags = [] + compile_flags = extra_cflags + extra_cuda_cflags + log_v("additonal compile_flags: [{}]".format(' '.join(compile_flags)), + verbose) + + # write setup.py file and compile it + _write_setup_file(name, sources, file_path, extra_include_paths, + compile_flags, extra_ldflags, verbose) + _jit_compile(file_path, interpreter, verbose) + + # import as callable python api + custom_op_api = _import_module_from_library(name, build_directory, verbose) + + return custom_op_api diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..52c17d77bd4771ce44f2282adfd9a25394ce97ea --- /dev/null +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -0,0 +1,722 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import six +import sys +import copy +import glob +import logging +import collections +import textwrap +import warnings +import subprocess + +from contextlib import contextmanager +from setuptools.command import bdist_egg + +from .. 
import load_op_library +from ...fluid import core +from ...fluid.framework import OpProtoHolder +from ...sysconfig import get_include, get_lib + +logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO) +logger = logging.getLogger("utils.cpp_extension") + +OS_NAME = sys.platform +IS_WINDOWS = OS_NAME.startswith('win') +NVCC_COMPILE_FLAGS = [ + '-ccbin', 'cc', '-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU', '-DPADDLE_USE_DSO', + '-Xcompiler', '-fPIC', '-w', '--expt-relaxed-constexpr', '-O3', '-DNVCC' +] + +GCC_MINI_VERSION = (5, 4, 0) +# Give warning if using wrong compiler +WRONG_COMPILER_WARNING = ''' + ************************************* + * Compiler Compatibility WARNING * + ************************************* + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +Found that your compiler ({user_compiler}) is not compatible with the compiler +built Paddle for this platform, which is {paddle_compiler} on {platform}. Please +use {paddle_compiler} to compile your custom op. Or you may compile Paddle from +source using {user_compiler}, and then also use it compile your custom op. + +See https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/2.0/install/compile/linux-compile.html +for help with compiling Paddle from source. + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +''' +# Give warning if used compiler version is incompatible +ABI_INCOMPATIBILITY_WARNING = ''' + ********************************** + * ABI Compatibility WARNING * + ********************************** + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + +Found that your compiler ({user_compiler} == {version}) may be ABI-incompatible with pre-installed Paddle! +Please use compiler that is ABI-compatible with GCC >= 5.4 (Recommended 8.2). + +See https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html for ABI Compatibility +information + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +''' +USING_NEW_CUSTOM_OP_LOAD_METHOD = True + + +# NOTE(chenweihang): In order to be compatible with +# the two custom op define method, after removing +# old method, we can remove them together +def use_new_custom_op_load_method(*args): + global USING_NEW_CUSTOM_OP_LOAD_METHOD + if len(args) == 0: + return USING_NEW_CUSTOM_OP_LOAD_METHOD + else: + assert len(args) == 1 and isinstance(args[0], bool) + USING_NEW_CUSTOM_OP_LOAD_METHOD = args[0] + + +@contextmanager +def bootstrap_context(): + """ + Context to manage how to write `__bootstrap__` code in .egg + """ + origin_write_stub = bdist_egg.write_stub + bdist_egg.write_stub = custom_write_stub + yield + + bdist_egg.write_stub = origin_write_stub + + +def load_op_meta_info_and_register_op(lib_filename): + if USING_NEW_CUSTOM_OP_LOAD_METHOD: + core.load_op_meta_info_and_register_op(lib_filename) + else: + core.load_op_library(lib_filename) + return OpProtoHolder.instance().update_op_proto() + + +def custom_write_stub(resource, pyfile): + """ + Customized write_stub function to allow us to inject generated python + api codes into egg python file. 
+ """ + _stub_template = textwrap.dedent(""" + import os + import sys + import types + import paddle + + def inject_ext_module(module_name, api_names): + if module_name in sys.modules: + return sys.modules[module_name] + + new_module = types.ModuleType(module_name) + for api_name in api_names: + setattr(new_module, api_name, eval(api_name)) + + return new_module + + def __bootstrap__(): + cur_dir = os.path.dirname(os.path.abspath(__file__)) + so_path = os.path.join(cur_dir, "{resource}") + + assert os.path.exists(so_path) + + # load custom op shared library with abs path + new_custom_ops = paddle.utils.cpp_extension.load_op_meta_info_and_register_op(so_path) + m = inject_ext_module(__name__, new_custom_ops) + + __bootstrap__() + + {custom_api} + """).lstrip() + + # Parse registerring op information + _, op_info = CustomOpInfo.instance().last() + so_path = op_info.so_path + + new_custom_ops = load_op_meta_info_and_register_op(so_path) + assert len( + new_custom_ops + ) > 0, "Required at least one custom operators, but received len(custom_op) = %d" % len( + new_custom_ops) + + # NOTE: To avoid importing .so file instead of python file because they have same name, + # we rename .so shared library to another name, see EasyInstallCommand. + filename, ext = os.path.splitext(resource) + resource = filename + "_pd_" + ext + + api_content = [] + for op_name in new_custom_ops: + api_content.append(_custom_api_content(op_name)) + + with open(pyfile, 'w') as f: + f.write( + _stub_template.format( + resource=resource, custom_api='\n\n'.join(api_content))) + + +OpInfo = collections.namedtuple('OpInfo', ['so_name', 'so_path']) + + +class CustomOpInfo: + """ + A global Singleton map to record all compiled custom ops information. + """ + + @classmethod + def instance(cls): + if not hasattr(cls, '_instance'): + cls._instance = cls() + return cls._instance + + def __init__(self): + assert not hasattr( + self.__class__, + '_instance'), 'Please use `instance()` to get CustomOpInfo object!' + # NOTE(Aurelius84): Use OrderedDict to save more order information + self.op_info_map = collections.OrderedDict() + + def add(self, op_name, so_name, so_path=None): + self.op_info_map[op_name] = OpInfo(so_name, so_path) + + def last(self): + """ + Return the lastest insert custom op info. + """ + assert len(self.op_info_map) > 0 + return next(reversed(self.op_info_map.items())) + + +def prepare_unix_cflags(cflags): + """ + Prepare all necessary compiled flags for nvcc compiling CUDA files. + """ + cflags = NVCC_COMPILE_FLAGS + cflags + get_cuda_arch_flags(cflags) + + return cflags + + +def add_std_without_repeat(cflags, compiler_type, use_std14=False): + """ + Append -std=c++11/14 in cflags if without specific it before. + """ + cpp_flag_prefix = '/std:' if compiler_type == 'msvc' else '-std=' + if not any(cpp_flag_prefix in flag for flag in cflags): + suffix = 'c++14' if use_std14 else 'c++11' + cpp_flag = cpp_flag_prefix + suffix + cflags.append(cpp_flag) + + +def get_cuda_arch_flags(cflags): + """ + For an arch, say "6.1", the added compile flag will be + ``-gencode=arch=compute_61,code=sm_61``. + For an added "+PTX", an additional + ``-gencode=arch=compute_xx,code=compute_xx`` is added. + """ + # TODO(Aurelius84): + return [] + + +def normalize_extension_kwargs(kwargs, use_cuda=False): + """ + Normalize include_dirs, library_dir and other attributes in kwargs. 
+ """ + assert isinstance(kwargs, dict) + # append necessary include dir path of paddle + include_dirs = kwargs.get('include_dirs', []) + include_dirs.extend(find_paddle_includes(use_cuda)) + kwargs['include_dirs'] = include_dirs + + # append necessary lib path of paddle + library_dirs = kwargs.get('library_dirs', []) + library_dirs.extend(find_paddle_libraries(use_cuda)) + kwargs['library_dirs'] = library_dirs + + # add runtime library dirs + runtime_library_dirs = kwargs.get('runtime_library_dirs', []) + runtime_library_dirs.extend(find_paddle_libraries(use_cuda)) + kwargs['runtime_library_dirs'] = runtime_library_dirs + + # append compile flags + extra_compile_args = kwargs.get('extra_compile_args', []) + extra_compile_args.extend(['-g', '-w']) # diable warnings + kwargs['extra_compile_args'] = extra_compile_args + + # append link flags + extra_link_args = kwargs.get('extra_link_args', []) + extra_link_args.append('-lpaddle_framework') + if use_cuda: + extra_link_args.append('-lcudart') + + kwargs['extra_link_args'] = extra_link_args + + kwargs['language'] = 'c++' + return kwargs + + +def find_paddle_includes(use_cuda=False): + """ + Return Paddle necessary include dir path. + """ + # pythonXX/site-packages/paddle/include + paddle_include_dir = get_include() + third_party_dir = os.path.join(paddle_include_dir, 'third_party') + + include_dirs = [paddle_include_dir, third_party_dir] + + return include_dirs + + +def find_cuda_includes(): + + cuda_home = find_cuda_home() + if cuda_home is None: + raise ValueError( + "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it." + ) + + return [os.path.join(cuda_home, 'lib64')] + + +def find_cuda_home(): + """ + Use heuristic method to find cuda path + """ + # step 1. find in $CUDA_HOME or $CUDA_PATH + cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') + + # step 2. find path by `which nvcc` + if cuda_home is None: + which_cmd = 'where' if IS_WINDOWS else 'which' + try: + with open(os.devnull, 'w') as devnull: + nvcc_path = subprocess.check_output( + [which_cmd, 'nvcc'], stderr=devnull) + if six.PY3: + nvcc_path = nvcc_path.decode() + nvcc_path = nvcc_path.rstrip('\r\n') + # for example: /usr/local/cuda/bin/nvcc + cuda_home = os.path.dirname(os.path.dirname(nvcc_path)) + except: + if IS_WINDOWS: + # search from default NVIDIA GPU path + candidate_paths = glob.glob( + 'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*') + if len(candidate_paths) > 0: + cuda_home = candidate_paths[0] + else: + cuda_home = "/usr/local/cuda" + # step 3. check whether path is valid + if not os.path.exists(cuda_home) and core.is_compiled_with_cuda(): + cuda_home = None + warnings.warn( + "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." + ) + + return cuda_home + + +def find_paddle_libraries(use_cuda=False): + """ + Return Paddle necessary library dir path. 
+ """ + # pythonXX/site-packages/paddle/libs + paddle_lib_dirs = [get_lib()] + if use_cuda: + cuda_dirs = find_cuda_includes() + paddle_lib_dirs.extend(cuda_dirs) + return paddle_lib_dirs + + +def add_compile_flag(extension, flag): + extra_compile_args = copy.deepcopy(extension.extra_compile_args) + if isinstance(extra_compile_args, dict): + for args in extra_compile_args.values(): + args.append(flag) + else: + extra_compile_args.append(flag) + + extension.extra_compile_args = extra_compile_args + + +def is_cuda_file(path): + + cuda_suffix = set(['.cu']) + items = os.path.splitext(path) + assert len(items) > 1 + return items[-1] in cuda_suffix + + +def get_build_directory(verbose=False): + """ + Return paddle extension root directory, default specific by `PADDLE_EXTENSION_DIR` + """ + root_extensions_directory = os.environ.get('PADDLE_EXTENSION_DIR') + if root_extensions_directory is None: + dir_name = "paddle_extensions" + if OS_NAME.startswith('linux'): + root_extensions_directory = os.path.join( + os.path.expanduser('~/.cache'), dir_name) + else: + # TODO(Aurelius84): consider wind32/macOs + raise NotImplementedError("Only support Linux now.") + + log_v("$PADDLE_EXTENSION_DIR is not set, using path: {} by default.". + format(root_extensions_directory), verbose) + + if not os.path.exists(root_extensions_directory): + os.makedirs(root_extensions_directory) + + return root_extensions_directory + + +def parse_op_info(op_name): + """ + Parse input names and outpus detail information from registered custom op + from OpInfoMap. + """ + from paddle.fluid.framework import OpProtoHolder + if op_name not in OpProtoHolder.instance().op_proto_map: + raise ValueError( + "Please load {} shared library file firstly by `paddle.utils.cpp_extension.load_op_meta_info_and_register_op(...)`". + format(op_name)) + op_proto = OpProtoHolder.instance().get_op_proto(op_name) + + in_names = [x.name for x in op_proto.inputs] + out_names = [x.name for x in op_proto.outputs] + + return in_names, out_names + + +def _import_module_from_library(module_name, build_directory, verbose=False): + """ + Load .so shared library and import it as callable python module. + """ + # TODO(Aurelius84): Consider file suffix is .dll on Windows Platform. 
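+    # The flow here: resolve the .so path, register the operators it contains
+    # via load_op_meta_info_and_register_op, then generate and import a thin
+    # python wrapper module with one API function per operator (see
+    # _generate_python_module and _custom_api_content below).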
+ ext_path = os.path.join(build_directory, module_name + '.so') + if not os.path.exists(ext_path): + raise FileNotFoundError("Extension path: {} does not exist.".format( + ext_path)) + + # load custom op_info and kernels from .so shared library + log_v('loading shared library from: {}'.format(ext_path), verbose) + op_names = load_op_meta_info_and_register_op(ext_path) + + # generate Python api in ext_path + return _generate_python_module(module_name, op_names, build_directory, + verbose) + + +def _generate_python_module(module_name, + op_names, + build_directory, + verbose=False): + """ + Automatically generate python file to allow import or load into as module + """ + api_file = os.path.join(build_directory, module_name + '.py') + log_v("generate api file: {}".format(api_file), verbose) + + # write into .py file + api_content = [_custom_api_content(op_name) for op_name in op_names] + with open(api_file, 'w') as f: + f.write('\n\n'.join(api_content)) + + # load module + custom_module = _load_module_from_file(api_file, verbose) + return custom_module + + +def _custom_api_content(op_name): + params_str, ins_str, outs_str = _get_api_inputs_str(op_name) + + API_TEMPLATE = textwrap.dedent(""" + from paddle.fluid.layer_helper import LayerHelper + + def {op_name}({inputs}): + helper = LayerHelper("{op_name}", **locals()) + + # prepare inputs and output + ins = {ins} + outs = {{}} + out_names = {out_names} + for out_name in out_names: + # Set 'float32' temporarily, and the actual dtype of output variable will be inferred + # in runtime. + outs[out_name] = helper.create_variable(dtype='float32') + + helper.append_op(type="{op_name}", inputs=ins, outputs=outs) + + res = [outs[out_name] for out_name in out_names] + + return res[0] if len(res)==1 else res + """).lstrip() + + # generate python api file + api_content = API_TEMPLATE.format( + op_name=op_name, inputs=params_str, ins=ins_str, out_names=outs_str) + + return api_content + + +def _load_module_from_file(api_file_path, verbose=False): + """ + Load module from python file. + """ + if not os.path.exists(api_file_path): + raise FileNotFoundError("File : {} does not exist.".format( + api_file_path)) + + # Unique readable module name to place custom api. + log_v('import module from file: {}'.format(api_file_path), verbose) + ext_name = "_paddle_cpp_extension_" + if six.PY2: + import imp + module = imp.load_source(ext_name, api_file_path) + else: + from importlib import machinery + loader = machinery.SourceFileLoader(ext_name, api_file_path) + module = loader.load_module() + + return module + + +def _get_api_inputs_str(op_name): + """ + Returns string of api parameters and inputs dict. + """ + in_names, out_names = parse_op_info(op_name) + # e.g: x, y, z + params_str = ','.join([p.lower() for p in in_names]) + # e.g: {'X': x, 'Y': y, 'Z': z} + ins_str = "{%s}" % ','.join( + ["'{}' : {}".format(in_name, in_name.lower()) for in_name in in_names]) + # e.g: ['Out', 'Index'] + outs_str = "[%s]" % ','.join(["'{}'".format(name) for name in out_names]) + return params_str, ins_str, outs_str + + +def _write_setup_file(name, + sources, + file_path, + include_dirs, + compile_flags, + link_args, + verbose=False): + """ + Automatically generate setup.py and write it into build directory. 
+ """ + template = textwrap.dedent(""" + import os + from paddle.utils.cpp_extension import CppExtension, CUDAExtension, BuildExtension, setup + from paddle.utils.cpp_extension import get_build_directory + setup( + name='{name}', + ext_modules=[ + {prefix}Extension( + sources={sources}, + include_dirs={include_dirs}, + extra_compile_args={extra_compile_args}, + extra_link_args={extra_link_args})], + cmdclass={{"build_ext" : BuildExtension.with_options( + output_dir=get_build_directory(), + no_python_abi_suffix=True, + use_new_method={use_new_method}) + }})""").lstrip() + + with_cuda = False + if any([is_cuda_file(source) for source in sources]): + with_cuda = True + log_v("with_cuda: {}".format(with_cuda), verbose) + + content = template.format( + name=name, + prefix='CUDA' if with_cuda else 'Cpp', + sources=list2str(sources), + include_dirs=list2str(include_dirs), + extra_compile_args=list2str(compile_flags), + extra_link_args=list2str(link_args), + use_new_method=use_new_custom_op_load_method()) + + log_v('write setup.py into {}'.format(file_path), verbose) + with open(file_path, 'w') as f: + f.write(content) + + +def list2str(args): + """ + Convert list[str] into string. For example: [x, y] -> "['x', 'y']" + """ + if args is None: return '[]' + assert isinstance(args, (list, tuple)) + args = ["'{}'".format(arg) for arg in args] + return '[' + ','.join(args) + ']' + + +def _jit_compile(file_path, interpreter=None, verbose=False): + """ + Build shared library in subprocess + """ + ext_dir = os.path.dirname(file_path) + setup_file = os.path.basename(file_path) + + if interpreter is None: + interpreter = 'python' + try: + py_path = subprocess.check_output(['which', interpreter]) + py_version = subprocess.check_output([interpreter, '-V']) + if six.PY3: + py_path = py_path.decode() + py_version = py_version.decode() + log_v("Using Python interpreter: {}, version: {}".format( + py_path.strip(), py_version.strip()), verbose) + except Exception: + _, error, _ = sys.exc_info() + raise RuntimeError( + 'Failed to check Python interpreter with `{}`, errors: {}'.format( + interpreter, error)) + + compile_cmd = 'cd {} && {} {} build'.format(ext_dir, interpreter, + setup_file) + print("Compiling user custom op, it will cost a few seconds.....") + run_cmd(compile_cmd, verbose) + + +def parse_op_name_from(sources): + """ + Parse registerring custom op name from sources. + """ + + def regex(content): + if USING_NEW_CUSTOM_OP_LOAD_METHOD: + pattern = re.compile(r'PD_BUILD_OP\(([^,\)]+)\)') + else: + pattern = re.compile(r'REGISTER_OPERATOR\(([^,]+),') + + content = re.sub(r'\s|\t|\n', '', content) + op_name = pattern.findall(content) + op_name = set([re.sub('_grad', '', name) for name in op_name]) + + return op_name + + op_names = set() + for source in sources: + with open(source, 'r') as f: + content = f.read() + op_names |= regex(content) + + return list(op_names) + + +def run_cmd(command, verbose=False): + """ + Execute command with subprocess. 
+ """ + # logging + log_v("execute command: {}".format(command), verbose) + try: + from subprocess import DEVNULL # py3 + except ImportError: + DEVNULL = open(os.devnull, 'wb') + + # execute command + try: + if verbose: + return subprocess.check_call( + command, shell=True, stderr=subprocess.STDOUT) + else: + return subprocess.check_call(command, shell=True, stdout=DEVNULL) + except Exception: + _, error, _ = sys.exc_info() + raise RuntimeError("Failed to run command: {}, errors: {}".format( + compile, error)) + + +def check_abi_compatibility(compiler, verbose=False): + """ + Check whether GCC version on user local machine is compatible with Paddle in + site-packages. + """ + # TODO(Aurelius84): After we support windows, remove IS_WINDOWS in following code. + if os.environ.get('PADDLE_SKIP_CHECK_ABI') in ['True', 'true', '1' + ] or IS_WINDOWS: + return True + + cmd_out = subprocess.check_output( + ['which', compiler], stderr=subprocess.STDOUT) + compiler_path = os.path.realpath(cmd_out.decode() + if six.PY3 else cmd_out).strip() + # step 1. if not found any suitable compiler, raise error + if not any(name in compiler_path + for name in _expected_compiler_current_platform()): + warnings.warn( + WRONG_COMPILER_WARNING.format( + user_compiler=compiler, + paddle_compiler=_expected_compiler_current_platform()[0], + platform=OS_NAME)) + return False + + # clang++ have no ABI compatibility problem + if OS_NAME.startswith('darwin'): + return True + try: + if OS_NAME.startswith('linux'): + version_info = subprocess.check_output( + [compiler, '-dumpfullversion']) + if six.PY3: + version_info = version_info.decode() + version = version_info.strip().split('.') + assert len(version) == 3 + # check version compatibility + if tuple(map(int, version)) >= GCC_MINI_VERSION: + return True + else: + warnings.warn( + ABI_INCOMPATIBILITY_WARNING.format( + user_compiler=compiler, version=version_info.strip())) + # TODO(Aurelius84): check version compatibility on windows + elif IS_WINDOWS: + warnings.warn("We don't support Windows now.") + except Exception: + _, error, _ = sys.exc_info() + warnings.warn('Failed to check compiler version for {}: {}'.format( + compiler, error)) + + return False + + +def _expected_compiler_current_platform(): + """ + Returns supported compiler string on current platform + """ + expect_compilers = ['clang', 'clang++'] if OS_NAME.startswith( + 'darwin') else ['gcc', 'g++', 'gnu-c++', 'gnu-cc'] + return expect_compilers + + +def log_v(info, verbose): + """ + Print log information on stdout. + """ + if verbose: + logging.info(info) diff --git a/python/setup.py.in b/python/setup.py.in index 652b81b25c8307d77f7404085bceeaa55d3fffdd..f662e21a7be2dc6b1f8faa6cf1f08c49c14ee095 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -139,6 +139,7 @@ write_distributed_training_mode_py(filename='@PADDLE_BINARY_DIR@/python/paddle/f packages=['paddle', 'paddle.libs', 'paddle.utils', + 'paddle.utils.cpp_extension', 'paddle.dataset', 'paddle.reader', 'paddle.distributed', @@ -378,6 +379,8 @@ def find_files(pattern, root): yield os.path.join(dirpath, filename) headers = ( + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) + + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension')) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/framework')) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/imperative')) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/memory')) +