From c58c4ede1eecb8de9416f9c76134a3312722a4e0 Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Fri, 3 Dec 2021 08:27:23 +0800
Subject: [PATCH] =?UTF-8?q?=E3=80=90PTen=E3=80=91C++=20API=20Code-Generati?=
 =?UTF-8?q?on=20(#37668)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* add code-gen file

* add api-gen in cmake

* adjust the code format

* temp save the code

* add gen-api module into pten

* merge the develop code

* fix merge conflict

* fix code conflict with develop

* support reduce_mean/sum

* fix the CI requirement

* fix requirement problem of CI

* remove original api code

* fix bug caused by removing original api
---
 .gitignore                                    |   1 +
 paddle/pten/api/CMakeLists.txt                |   2 +-
 paddle/pten/api/all.h                         |   5 +-
 paddle/pten/api/include/creation.h            |  49 --
 paddle/pten/api/include/linalg.h              |  30 --
 paddle/pten/api/include/manipulation.h        |  28 --
 paddle/pten/api/include/math.h                |  48 --
 paddle/pten/api/lib/CMakeLists.txt            |  25 +-
 paddle/pten/api/lib/creation.cc               | 135 ------
 paddle/pten/api/lib/kernel_dispatch.cc        |  41 ++
 paddle/pten/api/lib/kernel_dispatch.h         |  20 +
 paddle/pten/api/lib/linalg.cc                 | 120 -----
 paddle/pten/api/lib/manipulation.cc           | 140 ------
 paddle/pten/api/lib/math.cc                   | 319 ------------
 paddle/pten/api/lib/tensor.cc                 |   4 +-
 paddle/pten/tests/api/CMakeLists.txt          |   4 +-
 paddle/pten/tests/api/test_cast_api.cc        |   3 +-
 paddle/pten/tests/api/test_dot_api.cc         |   2 +-
 paddle/pten/tests/api/test_elementwise_api.cc |   2 +-
 paddle/pten/tests/api/test_fill_api.cc        |   2 +-
 paddle/pten/tests/api/test_flatten_api.cc     |   2 +-
 paddle/pten/tests/api/test_matmul_api.cc      |   2 +-
 paddle/pten/tests/api/test_mean_api.cc        |   2 +-
 paddle/pten/tests/api/test_reshape_api.cc     |   2 +-
 paddle/pten/tests/api/test_scale_api.cc       |   3 +-
 paddle/pten/tests/api/test_slice_api.cc       |   2 +-
 paddle/pten/tests/api/test_sum_api.cc         |   4 +-
 paddle/scripts/musl_build/build_inside.sh     |   1 +
 python/paddle/utils/code_gen/api.yaml         | 153 ++++++
 python/paddle/utils/code_gen/api_gen.py       | 452 ++++++++++++++++++
 30 files changed, 708 insertions(+), 895 deletions(-)
 delete mode 100644 paddle/pten/api/include/creation.h
 delete mode 100644 paddle/pten/api/include/linalg.h
 delete mode 100644 paddle/pten/api/include/manipulation.h
 delete mode 100644 paddle/pten/api/include/math.h
 delete mode 100644 paddle/pten/api/lib/creation.cc
 delete mode 100644 paddle/pten/api/lib/linalg.cc
 delete mode 100644 paddle/pten/api/lib/manipulation.cc
 delete mode 100644 paddle/pten/api/lib/math.cc
 create mode 100644 python/paddle/utils/code_gen/api.yaml
 create mode 100644 python/paddle/utils/code_gen/api_gen.py

diff --git a/.gitignore b/.gitignore
index c246a56cf15..6be36bf8c24 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ paddle/fluid/API_DEV.spec
 paddle/fluid/API_PR.spec
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
+paddle/pten/api/*/api*
 
 *.DS_Store
 *.vs
diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt
index 09df2c01fd9..a454ae807bc 100644
--- a/paddle/pten/api/CMakeLists.txt
+++ b/paddle/pten/api/CMakeLists.txt
@@ -1,3 +1,3 @@
 add_subdirectory(lib)
 
-cc_library(pten_api SRCS all.cc DEPS linalg_api math_api creation_api manipulation_api utils_api)
+cc_library(pten_api SRCS all.cc DEPS pten_function_api utils_api)
diff --git a/paddle/pten/api/all.h b/paddle/pten/api/all.h
index 2c647786379..e853ae331e4 100644
--- a/paddle/pten/api/all.h
+++ b/paddle/pten/api/all.h
@@ -25,10 +25,7 @@ limitations under the License. */
 #endif
 
 // new pten apis
-#include "paddle/pten/api/include/creation.h"
-#include "paddle/pten/api/include/linalg.h"
-#include "paddle/pten/api/include/manipulation.h"
-#include "paddle/pten/api/include/math.h"
+#include "paddle/pten/api/include/api.h"
 #include "paddle/pten/api/include/tensor.h"
 #include "paddle/pten/api/include/utils.h"
 
diff --git a/paddle/pten/api/include/creation.h b/paddle/pten/api/include/creation.h
deleted file mode 100644
index b4e4bd0fd05..00000000000
--- a/paddle/pten/api/include/creation.h
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/pten/api/include/tensor.h"
-#include "paddle/pten/common/backend.h"
-#include "paddle/pten/common/data_type.h"
-#include "paddle/pten/common/scalar.h"
-#include "paddle/pten/common/scalar_array.h"
-
-namespace paddle {
-namespace experimental {
-
-PD_DLL_DECL Tensor full(const ScalarArray& shape,
-                        const Scalar& value,
-                        DataType dtype = DataType::FLOAT32,
-                        Backend backend = Backend::CPU,
-                        DataLayout layout = DataLayout::NCHW);
-
-PD_DLL_DECL Tensor full_like(const Tensor& x,
-                             const Scalar& value,
-                             DataType dtype = DataType::UNDEFINED,
-                             Backend backend = Backend::UNDEFINED,
-                             DataLayout layout = DataLayout::UNDEFINED);
-
-PD_DLL_DECL Tensor ones_like(const Tensor& x,
-                             DataType dtype = DataType::UNDEFINED,
-                             Backend backend = Backend::UNDEFINED,
-                             DataLayout layout = DataLayout::UNDEFINED);
-
-PD_DLL_DECL Tensor zeros_like(const Tensor& x,
-                              DataType dtype = DataType::UNDEFINED,
-                              Backend backend = Backend::UNDEFINED,
-                              DataLayout layout = DataLayout::UNDEFINED);
-
-}  // namespace experimental
-}  // namespace paddle
diff --git a/paddle/pten/api/include/linalg.h b/paddle/pten/api/include/linalg.h
deleted file mode 100644
index 259cf664932..00000000000
--- a/paddle/pten/api/include/linalg.h
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/pten/api/include/tensor.h"
-
-namespace paddle {
-namespace experimental {
-
-PD_DLL_DECL Tensor dot(const Tensor& x, const Tensor& y);
-
-PD_DLL_DECL Tensor matmul(const Tensor& x,
-                          const Tensor& y,
-                          bool transpose_x = false,
-                          bool transpose_y = false);
-
-}  // namespace experimental
-}  // namespace paddle
diff --git a/paddle/pten/api/include/manipulation.h b/paddle/pten/api/include/manipulation.h
deleted file mode 100644
index 579fa5cdf94..00000000000
--- a/paddle/pten/api/include/manipulation.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/pten/api/include/tensor.h"
-
-namespace paddle {
-namespace experimental {
-
-PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis);
-
-PD_DLL_DECL Tensor cast(const Tensor& x, DataType out_dtype);
-
-PD_DLL_DECL Tensor reshape(const Tensor& x, const std::vector<int64_t>& shape);
-}  // namespace experimental
-}  // namespace paddle
diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h
deleted file mode 100644
index 700af6d2d59..00000000000
--- a/paddle/pten/api/include/math.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/pten/api/include/tensor.h"
-#include "paddle/pten/common/scalar.h"
-
-namespace paddle {
-namespace experimental {
-
-PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y);
-
-PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y);
-
-PD_DLL_DECL Tensor divide(const Tensor& x, const Tensor& y);
-
-PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y);
-
-// TODO(chenweihang): move mean API into stat.h/cc
-PD_DLL_DECL Tensor mean(const Tensor& x,
-                        const std::vector<int64_t>& axis,
-                        bool keep_dim);
-
-PD_DLL_DECL Tensor sum(const Tensor& x,
-                       const std::vector<int64_t>& axis,
-                       DataType dtype,
-                       bool keep_dim);
-
-// TODO(chenweihang): Follow-up discussion on the handling of `act` argument
-PD_DLL_DECL Tensor scale(const Tensor& x,
-                         const Scalar& scale,
-                         float bias,
-                         bool bias_after_scale);
-
-}  // namespace experimental
-}  // namespace paddle
diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt
index f30a3c89eb6..ed2ad801283 100644
--- a/paddle/pten/api/lib/CMakeLists.txt
+++ b/paddle/pten/api/lib/CMakeLists.txt
@@ -14,8 +14,25 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor device_conte
 
 cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor)
 
-cc_library(math_api SRCS math.cc DEPS pten_tensor pten kernel_dispatch)
-cc_library(linalg_api SRCS linalg.cc DEPS pten_tensor pten kernel_dispatch)
-cc_library(creation_api SRCS creation.cc DEPS pten_tensor pten kernel_dispatch)
-cc_library(manipulation_api SRCS manipulation.cc DEPS pten_tensor pten kernel_dispatch)
+set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py)
+set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
+
+set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/include/api.h)
+set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc)
+set(api_header_file_tmp ${api_header_file}.tmp)
+set(api_source_file_tmp ${api_source_file}.tmp)
+
+add_custom_command(  # generate the pten C++ API header/source from api.yaml
+  OUTPUT ${api_header_file} ${api_source_file}
+  COMMAND python ${api_gen_file}
+                 --api_yaml_path ${api_yaml_file}
+                 --api_header_path ${api_header_file_tmp}  # generator writes *.tmp files first
+                 --api_source_path ${api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp} ${api_header_file}  # real outputs replaced only when content differs
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp} ${api_source_file}
+  COMMENT "copy_if_different ${api_header_file} ${api_source_file}"
+  DEPENDS ${api_yaml_file} ${api_gen_file}  # rerun when either the yaml spec or the generator script changes
+  VERBATIM)
+
 cc_library(utils_api SRCS utils.cc DEPS pten_tensor pten kernel_dispatch)
+cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch)
diff --git a/paddle/pten/api/lib/creation.cc b/paddle/pten/api/lib/creation.cc
deleted file mode 100644
index 40054b5d272..00000000000
--- a/paddle/pten/api/lib/creation.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/pten/api/include/creation.h"
-
-#include <memory>
-
-#include "glog/logging.h"
-
-#include "paddle/pten/api/lib/api_registry.h"
-#include "paddle/pten/api/lib/kernel_dispatch.h"
-#include "paddle/pten/api/lib/utils/allocator.h"
-#include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/include/core.h"
-#include "paddle/pten/include/infermeta.h"
-
-PT_DECLARE_MODULE(CreationCPU);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_MODULE(CreationCUDA);
-#endif
-
-namespace paddle {
-namespace experimental {
-
-PD_DLL_DECL Tensor full(const ScalarArray& shape,
-                        const Scalar& value,
-                        DataType dtype,
-                        Backend backend,
-                        DataLayout layout) {
-  // 1. Get kernel signature and kernel
-  pten::KernelKey kernel_key{backend, layout, dtype};
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "fill_constant", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  kernel_context.EmplaceBackAttr(pten::ScalarArray(shape));
-  kernel_context.EmplaceBackAttr(pten::Scalar(value));
-
-  // 4. InferMeta
-  auto out_meta = pten::FullInferMeta(shape, dtype, layout);
-
-  // 5. Prepare outputs
-  const auto allocator =
-      std::make_shared<paddle::experimental::DefaultAllocator>(
-          pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  Tensor out;
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor full_like(const Tensor& x,
-                             const Scalar& value,
-                             DataType dtype,
-                             Backend backend,
-                             DataLayout layout) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-
-  DataType kernel_data_type =
-      dtype == DataType::UNDEFINED ? kernel_key.dtype() : dtype;
-  Backend kernel_backend =
-      backend == Backend::UNDEFINED ? kernel_key.backend() : backend;
-  DataLayout kernel_layout =
-      layout == DataLayout::UNDEFINED ? kernel_key.layout() : layout;
-
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "fill_any_like", {kernel_backend, kernel_layout, kernel_data_type});
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackAttr(pten::Scalar(value));
-
-  // 4. InferMeta
-  auto out_meta = FullLikeInferMeta(dense_x->meta(), dtype, layout);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator =
-      std::make_shared<paddle::experimental::DefaultAllocator>(
-          pten::TransToFluidPlace(kernel_backend));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor ones_like(const Tensor& x,
-                             DataType dtype,
-                             Backend backend,
-                             DataLayout layout) {
-  return full_like(x, 1, dtype, backend, layout);
-}
-
-PD_DLL_DECL Tensor zeros_like(const Tensor& x,
-                              DataType dtype,
-                              Backend backend,
-                              DataLayout layout) {
-  return full_like(x, 0, dtype, backend, layout);
-}
-
-}  // namespace experimental
-}  // namespace paddle
-
-PT_REGISTER_API(Creation);
diff --git a/paddle/pten/api/lib/kernel_dispatch.cc b/paddle/pten/api/lib/kernel_dispatch.cc
index 0205a0d53c3..97b3bf281fc 100644
--- a/paddle/pten/api/lib/kernel_dispatch.cc
+++ b/paddle/pten/api/lib/kernel_dispatch.cc
@@ -57,5 +57,46 @@ paddle::platform::DeviceContext* GetDeviceContextByBackend(
   return pool.Get(pten::TransToFluidPlace(backend));
 }
 
+DataType ParseDataType(DataType dtype) { return dtype; }  // identity overload
+DataType ParseDataType(const Tensor& tensor) { return tensor.type(); }  // tensor's own dtype
+DataType ParseDataType(const std::vector<Tensor>& tensors) {  // common dtype of a tensor list
+  if (tensors.empty()) {
+    return DataType::UNDEFINED;  // empty list: nothing to infer from
+  }
+  DataType dtype = tensors[0].type();  // first tensor is the reference
+  auto n = tensors.size();
+  for (size_t i = 1; i < n; ++i) {
+    if (tensors[i].type() != dtype) {  // every element must match tensors[0]
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "The data_type of input tensor in list isn't consistent, "
+          "the first tensor is %s, but %dth tensor is %s.",
+          dtype,  // dtype of tensors[0]
+          i,      // 0-based index of the first mismatching tensor
+          tensors[i].type()));
+    }
+  }
+  return dtype;
+}
+
+DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) {
+  return dtype != DataType::UNDEFINED ? dtype : ParseDataType(tensor);  // explicit dtype wins; UNDEFINED falls back to the tensor
+}
+
+Backend ParseBackend(Backend backend) { return backend; }  // identity overload
+Backend ParseBackend(const Tensor& tensor) {
+  return pten::TransToPtenBackend(tensor.inner_place());  // tensor's place converted to a Backend
+}
+
+Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) {
+  return backend != Backend::UNDEFINED ? backend : ParseBackend(tensor);  // explicit backend wins; UNDEFINED falls back to the tensor
+}
+
+DataLayout ParseLayout(DataLayout layout) { return layout; }  // identity overload
+DataLayout ParseLayout(const Tensor& tensor) { return tensor.layout(); }  // tensor's own layout
+
+DataLayout ParseLayoutWithInputOrder(DataLayout layout, const Tensor& tensor) {
+  return layout != DataLayout::UNDEFINED ? layout : ParseLayout(tensor);  // explicit layout wins; UNDEFINED falls back to the tensor
+}
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/lib/kernel_dispatch.h b/paddle/pten/api/lib/kernel_dispatch.h
index 2dba88d07eb..e78e79f27c2 100644
--- a/paddle/pten/api/lib/kernel_dispatch.h
+++ b/paddle/pten/api/lib/kernel_dispatch.h
@@ -129,5 +129,25 @@ KernelKeySet ParseKernelKeyByInputArgs(const Args&... args) {
   return detail::KernelKeyParser().apply(args...).key_set;
 }
 
+DataType ParseDataType(DataType dtype);  // returns dtype unchanged
+DataType ParseDataType(const Tensor& tensor);  // returns tensor.type()
+DataType ParseDataType(const std::vector<Tensor>& tensors);  // shared dtype; UNDEFINED if empty, throws on mismatch
+DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor);  // dtype unless UNDEFINED, else the tensor's
+
+Backend ParseBackend(Backend backend);  // returns backend unchanged
+Backend ParseBackend(const Tensor& tensor);  // backend derived from tensor.inner_place()
+template <typename T, typename... Args>  // multi-argument form: combines the head with the result for the tail
+Backend ParseBackend(T t, Args... args) {
+  auto backend_set =  // union of the first argument's backend and the recursive result
+      BackendSet(ParseBackend(t)) | BackendSet(ParseBackend(args...));
+  return static_cast<Backend>(64 -  // NOTE(review): presumably the highest set bit of a 64-bit bitset - confirm
+                              detail::CountLeadingZeros(backend_set.bitset()));
+}
+Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor);  // backend unless UNDEFINED, else the tensor's
+
+DataLayout ParseLayout(DataLayout layout);  // returns layout unchanged
+DataLayout ParseLayout(const Tensor& tensor);  // returns tensor.layout()
+DataLayout ParseLayoutWithInputOrder(DataLayout layout, const Tensor& tensor);  // layout unless UNDEFINED, else the tensor's
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/lib/linalg.cc b/paddle/pten/api/lib/linalg.cc
deleted file mode 100644
index 8eae16d9018..00000000000
--- a/paddle/pten/api/lib/linalg.cc
+++ /dev/null
@@ -1,120 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/pten/api/include/linalg.h"
-
-#include <memory>
-
-#include "glog/logging.h"
-
-#include "paddle/pten/api/lib/api_registry.h"
-#include "paddle/pten/api/lib/kernel_dispatch.h"
-#include "paddle/pten/api/lib/utils/allocator.h"
-#include "paddle/pten/core/convert_utils.h"
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/kernel_context.h"
-#include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/include/core.h"
-#include "paddle/pten/include/infermeta.h"
-
-PT_DECLARE_MODULE(LinalgCPU);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_MODULE(LinalgCUDA);
-#endif
-
-namespace paddle {
-namespace experimental {
-
-PD_DLL_DECL Tensor dot(const Tensor& x, const Tensor& y) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "dot", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
-  kernel_context.EmplaceBackInput(dense_y);
-  // TODO(chenweihang): add transform impl
-
-  // 4. InferMeta
-  auto out_meta = DotInferMeta(dense_x->meta(), dense_y->meta());
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor matmul(const Tensor& x,
-                          const Tensor& y,
-                          bool transpose_x,
-                          bool transpose_y) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "matmul_v2", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  kernel_context.EmplaceBackInput(dense_y);
-  kernel_context.EmplaceBackAttr(transpose_x);
-  kernel_context.EmplaceBackAttr(transpose_y);
-  // TODO(chenweihang): add transform impl
-
-  // 4. InferMeta
-  auto out_meta = MatmulInferMeta(
-      dense_x->meta(), dense_y->meta(), transpose_x, transpose_y);
-
-  // 5. Prepare outputs
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-
-  Tensor out;
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-}  // namespace experimental
-}  // namespace paddle
-
-PT_REGISTER_API(Linalg);
diff --git a/paddle/pten/api/lib/manipulation.cc b/paddle/pten/api/lib/manipulation.cc
deleted file mode 100644
index 51a7702d9fc..00000000000
--- a/paddle/pten/api/lib/manipulation.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/pten/api/include/manipulation.h"
-
-#include <memory>
-
-#include "glog/logging.h"
-#include "paddle/pten/api/lib/api_registry.h"
-#include "paddle/pten/api/lib/kernel_dispatch.h"
-#include "paddle/pten/api/lib/utils/allocator.h"
-#include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/include/core.h"
-#include "paddle/pten/infermeta/unary.h"
-
-PT_DECLARE_MODULE(ManipulationCPU);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_MODULE(ManipulationCUDA);
-#endif
-
-namespace paddle {
-namespace experimental {
-
-PD_DLL_DECL Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "flatten_contiguous_range", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  kernel_context.EmplaceBackAttr(start_axis);
-  kernel_context.EmplaceBackAttr(stop_axis);
-
-  // 4. InferMeta
-  auto out_meta = FlattenInferMeta(dense_x->meta(), start_axis, stop_axis);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor cast(const Tensor& x, DataType out_dtype) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "cast", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  kernel_context.EmplaceBackAttr(out_dtype);
-  kernel_context.EmplaceBackAttr(dense_x->meta().dtype);
-
-  // 4. InferMeta
-  auto out_meta = CastInferMeta(dense_x->meta(), out_dtype);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor reshape(const Tensor& x, const std::vector<int64_t>& shape) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "reshape2", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  kernel_context.EmplaceBackAttr(shape);
-
-  // 4. InferMeta
-  auto out_meta = InferMetaFromVecValue(dense_x->meta(), shape);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-}  // namespace experimental
-}  // namespace paddle
-
-PT_REGISTER_API(Manipulation);
diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc
deleted file mode 100644
index a97d78b5a9d..00000000000
--- a/paddle/pten/api/lib/math.cc
+++ /dev/null
@@ -1,319 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/pten/api/include/math.h"
-
-#include <memory>
-
-#include "glog/logging.h"
-
-#include "paddle/pten/api/lib/api_registry.h"
-#include "paddle/pten/api/lib/kernel_dispatch.h"
-#include "paddle/pten/api/lib/utils/allocator.h"
-#include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/include/core.h"
-#include "paddle/pten/include/infermeta.h"
-#include "paddle/pten/infermeta/unary.h"
-
-PT_DECLARE_MODULE(MathCPU);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_MODULE(MathCUDA);
-#endif
-
-namespace paddle {
-namespace experimental {
-
-PD_DLL_DECL Tensor mean(const Tensor& x,
-                        const std::vector<int64_t>& axis,
-                        bool keep_dim) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "reduce_mean", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-
-  // The real value of reduce_all will be get in kernel
-  // so use default value(false) is OK.
-  bool reduce_all = false;
-
-  DataType out_dtype = DataType::UNDEFINED;
-
-  kernel_context.EmplaceBackAttr(axis);
-  kernel_context.EmplaceBackAttr(keep_dim);
-  kernel_context.EmplaceBackAttr(reduce_all);
-  kernel_context.EmplaceBackAttr(dense_x->dtype());
-  kernel_context.EmplaceBackAttr(out_dtype);
-
-  // 4. InferShape
-  auto out_meta = ReduceInferMeta(dense_x->meta(), axis, keep_dim);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator =
-      std::make_shared<paddle::experimental::DefaultAllocator>(
-          pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor sum(const Tensor& x,
-                       const std::vector<int64_t>& axis,
-                       DataType dtype,
-                       bool keep_dim) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "reduce_sum", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-
-  // The real value of reduce_all will be get in kernel
-  // so use default value(false) is OK.
-  bool reduce_all = false;
-
-  DataType out_dtype = DataType::UNDEFINED;
-  if (dense_x->dtype() == DataType::BOOL ||
-      dense_x->dtype() == DataType::INT32 ||
-      dense_x->dtype() == DataType::INT64) {
-    out_dtype = DataType::INT64;
-  }
-
-  kernel_context.EmplaceBackAttr(axis);
-  kernel_context.EmplaceBackAttr(keep_dim);
-  kernel_context.EmplaceBackAttr(reduce_all);
-  kernel_context.EmplaceBackAttr(dense_x->dtype());
-  kernel_context.EmplaceBackAttr(out_dtype);
-
-  // 4. InferMeta
-  auto out_meta = ReduceInferMeta(dense_x->meta(), axis, keep_dim);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator =
-      std::make_shared<paddle::experimental::DefaultAllocator>(
-          pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "elementwise_add", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
-  kernel_context.EmplaceBackInput(dense_y);
-  kernel_context.EmplaceBackAttr(-1);
-
-  // 4. InferMeta
-  auto out_meta = ElementwiseInferMeta(dense_x->meta(), dense_y->meta(), -1);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "elementwise_sub", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
-  kernel_context.EmplaceBackInput(dense_y);
-  kernel_context.EmplaceBackAttr(-1);
-
-  // 4. InferMeta
-  auto out_meta = ElementwiseInferMeta(dense_x->meta(), dense_y->meta(), -1);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor divide(const Tensor& x, const Tensor& y) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "elementwise_div", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
-  kernel_context.EmplaceBackInput(dense_y);
-  kernel_context.EmplaceBackAttr(-1);
-
-  // 4. InferMeta
-  auto out_meta = ElementwiseInferMeta(dense_x->meta(), dense_y->meta(), -1);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x, y);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "elementwise_mul", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
-  kernel_context.EmplaceBackInput(dense_y);
-  kernel_context.EmplaceBackAttr(-1);
-
-  // 4. InferMeta
-  auto out_meta = ElementwiseInferMeta(dense_x->meta(), dense_y->meta(), -1);
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-PD_DLL_DECL Tensor scale(const Tensor& x,
-                         const Scalar& scale,
-                         float bias,
-                         bool bias_after_scale) {
-  // 1. Get kernel signature and kernel
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "scale", kernel_key);
-
-  // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = pten::KernelContext(dev_ctx);
-
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x);
-  kernel_context.EmplaceBackAttr(pten::Scalar(scale));
-  kernel_context.EmplaceBackAttr(bias);
-  kernel_context.EmplaceBackAttr(bias_after_scale);
-
-  // 4. InferMeta
-  auto out_meta = UnchangedInferMeta(dense_x->meta());
-
-  // 5. Prepare outputs
-  Tensor out;
-  const auto allocator = std::make_shared<DefaultAllocator>(
-      pten::TransToFluidPlace(kernel_key.backend()));
-  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
-  kernel_context.EmplaceBackOutput(dense_out);
-  out.set_impl(dense_out);
-
-  // 6. Call kernel
-  kernel(&kernel_context);
-
-  return out;
-}
-
-}  // namespace experimental
-}  // namespace paddle
-
-PT_REGISTER_API(Math);
diff --git a/paddle/pten/api/lib/tensor.cc b/paddle/pten/api/lib/tensor.cc
index 3f0966d369d..6b4a3b1950a 100644
--- a/paddle/pten/api/lib/tensor.cc
+++ b/paddle/pten/api/lib/tensor.cc
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <vector>
 
 #include "glog/logging.h"
-#include "paddle/pten/api/include/manipulation.h"
 #include "paddle/pten/api/include/utils.h"
 #include "paddle/pten/api/lib/ext_compat_utils.h"
 #include "paddle/pten/api/lib/utils/allocator.h"
@@ -67,6 +66,9 @@ inline bool IsDenseTensor(
 
 }  // namespace detail
 
+// Forward declaration of the cast API.
+Tensor cast(const Tensor &x, DataType out_dtype);
+
 /////// Tensor Methods ////////
 
 /* Part 1: Construction and destruction methods */
diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt
index c670d094810..46f2ef8be7c 100644
--- a/paddle/pten/tests/api/CMakeLists.txt
+++ b/paddle/pten/tests/api/CMakeLists.txt
@@ -1,7 +1,7 @@
 if(WITH_ROCM)
-  hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor utils_api manipulation_api glog)
+  hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api utils_api glog)
 else()
-  cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor utils_api manipulation_api glog)
+  cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api utils_api glog)
 endif()
 
 cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest)
diff --git a/paddle/pten/tests/api/test_cast_api.cc b/paddle/pten/tests/api/test_cast_api.cc
index ef110e8e33c..c2660a1f800 100644
--- a/paddle/pten/tests/api/test_cast_api.cc
+++ b/paddle/pten/tests/api/test_cast_api.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/creation.h"
-#include "paddle/pten/api/include/manipulation.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/tests/api/test_dot_api.cc b/paddle/pten/tests/api/test_dot_api.cc
index 972e065596e..41c03f8f262 100644
--- a/paddle/pten/tests/api/test_dot_api.cc
+++ b/paddle/pten/tests/api/test_dot_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/linalg.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc
index 44033f1c611..e5971aae551 100644
--- a/paddle/pten/tests/api/test_elementwise_api.cc
+++ b/paddle/pten/tests/api/test_elementwise_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/math.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/tests/api/test_fill_api.cc b/paddle/pten/tests/api/test_fill_api.cc
index 1ebfc8e6746..e87d094eec9 100644
--- a/paddle/pten/tests/api/test_fill_api.cc
+++ b/paddle/pten/tests/api/test_fill_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/creation.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/tests/api/test_flatten_api.cc b/paddle/pten/tests/api/test_flatten_api.cc
index 2fcf00efc60..93c8a50f02a 100644
--- a/paddle/pten/tests/api/test_flatten_api.cc
+++ b/paddle/pten/tests/api/test_flatten_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/manipulation.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/tests/api/test_matmul_api.cc b/paddle/pten/tests/api/test_matmul_api.cc
index d3652db54ec..01ca4aad642 100644
--- a/paddle/pten/tests/api/test_matmul_api.cc
+++ b/paddle/pten/tests/api/test_matmul_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/linalg.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/tests/api/test_mean_api.cc b/paddle/pten/tests/api/test_mean_api.cc
index 59d91672f96..a8c4c5306dc 100644
--- a/paddle/pten/tests/api/test_mean_api.cc
+++ b/paddle/pten/tests/api/test_mean_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/math.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc
index 643551ec1cb..b6179f11b10 100644
--- a/paddle/pten/tests/api/test_reshape_api.cc
+++ b/paddle/pten/tests/api/test_reshape_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/manipulation.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/tests/api/test_scale_api.cc b/paddle/pten/tests/api/test_scale_api.cc
index 2c0cd5cc71d..3541e3b85cc 100644
--- a/paddle/pten/tests/api/test_scale_api.cc
+++ b/paddle/pten/tests/api/test_scale_api.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/creation.h"
-#include "paddle/pten/api/include/math.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/tests/api/test_slice_api.cc b/paddle/pten/tests/api/test_slice_api.cc
index 31a96c392dc..004c085af06 100644
--- a/paddle/pten/tests/api/test_slice_api.cc
+++ b/paddle/pten/tests/api/test_slice_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/creation.h"
+#include "paddle/pten/api/include/api.h"
 #include "paddle/pten/api/include/tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
 
diff --git a/paddle/pten/tests/api/test_sum_api.cc b/paddle/pten/tests/api/test_sum_api.cc
index 4656f404639..d1b7ea33e8b 100644
--- a/paddle/pten/tests/api/test_sum_api.cc
+++ b/paddle/pten/tests/api/test_sum_api.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/pten/api/include/math.h"
+#include "paddle/pten/api/include/api.h"
 
 #include "paddle/pten/api/lib/utils/allocator.h"
 #include "paddle/pten/core/dense_tensor.h"
@@ -50,7 +50,7 @@ TEST(API, sum) {
   std::vector<int64_t> axis = {0, 1};
 
   // 2. test API
-  auto out = paddle::experimental::sum(x, axis, DataType::UNDEFINED, false);
+  auto out = paddle::experimental::sum(x, axis, false);
   // 3. check result
   ASSERT_EQ(out.dims().size(), 1);
   ASSERT_EQ(out.dims()[0], 1);
diff --git a/paddle/scripts/musl_build/build_inside.sh b/paddle/scripts/musl_build/build_inside.sh
index 04dea2086a6..4c7fa804de5 100755
--- a/paddle/scripts/musl_build/build_inside.sh
+++ b/paddle/scripts/musl_build/build_inside.sh
@@ -51,6 +51,7 @@ if [ "$pip_index" ]; then
 fi
 
 if [ "$WITH_REQUIREMENT" ]; then
+    echo "pyyaml" >> $WITH_REQUIREMENT
     echo ">>> install python requirement: $WITH_REQUIREMENT";
     pip install $PIP_ARGS -r "$WITH_REQUIREMENT";
 fi
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
new file mode 100644
index 00000000000..581aaef62a7
--- /dev/null
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -0,0 +1,153 @@
+- api : add
+  args : (const Tensor& x, const Tensor& y)
+  output : Tensor
+  infer_meta : 
+    func : ElementwiseInferMeta
+    param : [x, y, -1]
+  kernel :
+    func : elementwise_add
+    param : [x, y, -1]
+
+- api : cast
+  args : (const Tensor& x, DataType out_dtype)
+  output : Tensor
+  infer_meta : 
+    func : CastInferMeta
+  kernel :
+    func : cast
+    param : [x, out_dtype, x.dtype()]
+    data_type : x
+
+- api : divide
+  args : (const Tensor& x, const Tensor& y)
+  output : Tensor
+  infer_meta : 
+    func : ElementwiseInferMeta
+    param : [x, y, -1]
+  kernel :
+    func : elementwise_div
+    param : [x, y, -1]
+
+- api : dot
+  args : (const Tensor& x, const Tensor& y)
+  output : Tensor
+  infer_meta : 
+    func : DotInferMeta
+  kernel : 
+    func : dot
+
+- api : flatten
+  args : (const Tensor& x, int start_axis, int stop_axis)
+  output : Tensor
+  infer_meta : 
+    func : FlattenInferMeta
+  kernel : 
+    func : flatten_contiguous_range
+
+- api : full
+  args : (const ScalarArray& shape, const Scalar& value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW)
+  output: Tensor
+  infer_meta : 
+    func : FullInferMeta
+    param : [shape, dtype, layout]
+  kernel : 
+    func : fill_constant
+    param : [shape, value]
+    data_type : dtype
+    backend : place
+    layout : layout
+  
+- api : full_like
+  args : (const Tensor& x, const Scalar& value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED)
+  output: Tensor
+  infer_meta : 
+    func : FullLikeInferMeta
+    param : [x, dtype, layout]
+  kernel : 
+    func : fill_any_like
+    param : [x, value]
+    data_type : dtype > x
+    backend : place > x
+    layout : layout > x
+
+- api : matmul
+  args : (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false)
+  output : Tensor
+  infer_meta : 
+    func : MatmulInferMeta
+  kernel : 
+    func : matmul_v2
+
+- api : mean
+  args : (const Tensor& x, const std::vector<int64_t>& axis, bool keep_dim)
+  output : Tensor
+  infer_meta : 
+    func : ReduceInferMeta
+  kernel : 
+    func : reduce_mean
+    param : [x, axis, keep_dim, false, x.dtype(), DataType::UNDEFINED]
+
+- api : multiply
+  args : (const Tensor& x, const Tensor& y)
+  output : Tensor
+  infer_meta : 
+    func : ElementwiseInferMeta
+    param : [x, y, -1]
+  kernel :
+    func : elementwise_mul
+    param : [x, y, -1]
+
+- api : ones_like
+  args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED)
+  output : Tensor
+  invoke : full_like(x, 1, dtype, place, layout)
+
+- api : reshape
+  args : (const Tensor& x, const std::vector<int64_t>& shape)
+  output : Tensor
+  infer_meta : 
+    func : InferMetaFromVecValue
+  kernel : 
+    func : reshape2
+
+- api : scale
+  args : (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale)
+  output : Tensor
+  infer_meta : 
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : scale
+
+- api : subtract
+  args : (const Tensor& x, const Tensor& y)
+  output : Tensor
+  infer_meta : 
+    func : ElementwiseInferMeta
+    param : [x, y, -1]
+  kernel :
+    func : elementwise_sub
+    param : [x, y, -1]
+
+- api : sum
+  args : (const Tensor& x, const std::vector<int64_t>& axis, bool keep_dim)
+  output : Tensor
+  infer_meta : 
+    func : ReduceInferMeta
+  kernel : 
+    func : reduce_sum
+    param : [x, axis, keep_dim, false, x.dtype(), DataType::UNDEFINED]
+
+- api : zeros_like
+  args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED)
+  output : Tensor
+  invoke : full_like(x, 0, dtype, place, layout)
+
+# - api : full_like
+#   args : (const Tensor& x, const Scalar& value, DataType dtype, Backend place)->Tensor
+#   output: {Tensor : dtype}
+#   kernel : fill_any_like
+#   T : [dtype, x]
+#   backend : [place, x]
+#   layout : []
+#   InferMeta : UnchangedInferMeta(x)
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
new file mode 100644
index 00000000000..cd81d001b8f
--- /dev/null
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -0,0 +1,452 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import yaml
+import argparse
+
+
+class API:
+    prefix_tensor_name = 'dense_'
+
+    def __init__(self, api_item_yaml):
+        """Build the generator model of one api.yaml entry.
+
+        An entry with an 'invoke' key stores that expression and skips the
+        kernel setup; any other entry must supply 'kernel' and 'infer_meta'.
+        """
+        # parse_args() formats self.api into its error messages, so the
+        # name has to be stored before the argument string is parsed.
+        self.api = api_item_yaml['api']
+        # self.args layout:
+        #   inputs: {'names': [...]}, input tensor names
+        #   attrs:  {'names': [...], 'attr_info': {name: (type, default)}}
+        #   args_declare / args_define: C++ parameter list text
+        self.args = self.parse_args(api_item_yaml['args'])
+        self.output = api_item_yaml['output']
+        self.is_base_api = 'invoke' not in api_item_yaml
+        if not self.is_base_api:
+            self.invoke = api_item_yaml['invoke']
+            return
+
+        # Missing or empty optional settings are normalised to None so the
+        # gene_* methods can test them with `is not None`.
+        self.kernel = api_item_yaml['kernel']
+        for option in ('backend', 'layout', 'data_type', 'param'):
+            if option not in self.kernel or len(self.kernel[option]) == 0:
+                self.kernel[option] = None
+
+        self.infer_meta = api_item_yaml['infer_meta']
+        if 'param' not in self.infer_meta or len(self.infer_meta['param']) == 0:
+            self.infer_meta['param'] = None
+
+    def parse_args(self, args_str):
+        """Parse the C++-style 'args' declaration of an api.yaml entry.
+
+        Returns a dict with 'inputs' ({'names': [...]}), 'attrs' ({'names':
+        [...], 'attr_info': {name: (type, default or None)}}), 'args_declare'
+        (parameter list with defaults) and 'args_define' (without defaults).
+        Raises AssertionError when the declaration is not parenthesised, a
+        name is missing, or an argument type is not supported.
+        """
+        inputs = {'names': []}
+        attrs = {'names': [], 'attr_info': {}}
+        args_str = args_str.strip()
+        assert args_str.startswith('(') and args_str.endswith(')'), \
+            f"Args declaration should start with '(' and end with ')', please check the args of {self.api} in api.yaml."
+        input_types = ['const Tensor&', 'const Tensor &']
+        attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \
+                      'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \
+                      'const std::vector<int64_t>&', 'Backend', 'DataLayout', 'DataType']
+        # Types are matched by prefix, so the longest spelling must be tried
+        # first; otherwise 'int' would claim 'int32_t x' and 'int64_t x'.
+        known_types = sorted(input_types + attr_types, key=len, reverse=True)
+        declare_parts = []
+        define_parts = []
+        for item in args_str[1:-1].split(','):
+            item = item.strip()
+            if not item:
+                # "()" and trailing commas yield empty items.
+                continue
+            arg_type = next((t for t in known_types if item.startswith(t)), None)
+            assert arg_type is not None, \
+                f"Unsupported argument '{item}'. Please check the args of {self.api} in api.yaml."
+            rest = item[len(arg_type):].strip()
+            if arg_type in input_types:
+                assert len(rest) > 0, \
+                    f"The input tensor name should not be empty. Please check the args of {self.api} in api.yaml."
+                inputs['names'].append(rest)
+                declare_parts.append(arg_type + ' ' + rest)
+                define_parts.append(arg_type + ' ' + rest)
+                continue
+            assert len(rest) > 0, \
+                f"The attribute name should not be empty. Please check the args of {self.api} in api.yaml."
+            # partition() keeps everything after the first '=' as the default.
+            attr_name, has_default, default_value = rest.partition('=')
+            attr_name = attr_name.strip()
+            default_value = default_value.strip() if has_default else None
+            default_str = '' if default_value is None else '=' + default_value
+            declare_parts.append(arg_type + ' ' + attr_name + default_str)
+            define_parts.append(arg_type + ' ' + attr_name)
+            attrs['names'].append(attr_name)
+            attrs['attr_info'][attr_name] = (arg_type, default_value)
+
+        return {
+            'inputs': inputs,
+            'attrs': attrs,
+            'args_declare': ', '.join(declare_parts),
+            'args_define': ', '.join(define_parts)
+        }
+
+    def gene_api_declaration(self):
+        """Return the C++ prototype for the header, wrapped in newlines."""
+        signature = f"{self.output} {self.api}({self.args['args_declare']})"
+        return "\nPD_DLL_DECL " + signature + ";\n"
+
+    def gene_kernel_select(self, input_names, attrs, kernel):
+        """Generate the C++ statements that choose the kernel for this API.
+
+        Args:
+            input_names: names of the input tensors, in declaration order.
+            attrs: the 'attrs' dict produced by parse_args().
+            kernel: the yaml 'kernel' mapping with 'func', 'backend',
+                'layout' and 'data_type' (each setting a string or None).
+
+        Returns:
+            C++ source text that defines kernel_backend, kernel_layout and
+            kernel_data_type, fills any still-UNDEFINED one from the kernel
+            key parsed from the input tensors, and selects `kernel`.
+        Raises:
+            AssertionError: if the yaml settings do not fit the arguments.
+        """
+        kernel_key_item_init = """
+  Backend kernel_backend = Backend::UNDEFINED;
+  DataLayout kernel_layout = DataLayout::UNDEFINED;
+  DataType kernel_data_type = DataType::UNDEFINED;
+"""
+        # Check the tensor options
+        attr_backend_count = 0
+        attr_layout_count = 0
+        attr_data_type_count = 0
+        for attr_name in attrs['names']:
+            if attrs['attr_info'][attr_name][0] == 'Backend':
+                assert kernel['backend'] is not None, \
+                    f"{self.api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
+                attr_backend_count = attr_backend_count + 1
+            if attrs['attr_info'][attr_name][0] == 'DataLayout':
+                assert kernel['layout'] is not None, \
+                    f"{self.api} api: When there is a parameter with 'DataLayout' type in attributes, you must set layout of kernel manually."
+                attr_layout_count = attr_layout_count + 1
+            if attrs['attr_info'][attr_name][0] == 'DataType':
+                assert kernel['data_type'] is not None, \
+                    f"{self.api} api: When there is a parameter with 'DataType' type in attributes, you must set data_type of kernel manually."
+                attr_data_type_count = attr_data_type_count + 1
+
+        # preprocess kernel configures
+        kernel_select_code = ""
+        if kernel['backend'] is not None:
+            if '>' in kernel['backend']:
+                vars_list = kernel['backend'].split('>')
+                assert len(vars_list) == 2, \
+                    f"{self.api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
+                assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \
+                    f"{self.api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
+"""
+
+            else:
+                args_str = ', '.join(ele.strip() for ele in kernel['backend'].split(','))
+                kernel_select_code = kernel_select_code + f"""
+  kernel_backend = ParseBackend({args_str});
+"""
+
+        if kernel['layout'] is not None:
+            if '>' in kernel['layout']:
+                vars_list = kernel['layout'].split('>')
+                assert len(vars_list) == 2, \
+                    f"{self.api} api: The number of params to set layout with '>' only allows 2, but received {len(vars_list)}."
+                assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataLayout', \
+                    f"{self.api} api: When use '>' to set kernel layout, the first param should be a attribute with DataLayout type."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_layout = ParseLayoutWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
+"""
+
+            else:
+                vars_list = kernel['layout'].split(',')
+                assert len(vars_list) == 1, \
+                    f"{self.api} api: The number of params to set layout must be 1, but received {len(vars_list)}."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_layout = ParseLayout({vars_list[0].strip()});
+"""
+
+        if kernel['data_type'] is not None:
+            if '>' in kernel['data_type']:
+                vars_list = kernel['data_type'].split('>')
+                assert len(vars_list) == 2, \
+                    f"{self.api} api: The number of params to set data_type with '>' only allows 2, but received {len(vars_list)}."
+                assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataType', \
+                    f"{self.api} api: When use '>' to set kernel data_type, the first param should be a attribute with DataType type."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_data_type = ParseDataTypeWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
+"""
+
+            else:
+                vars_list = kernel['data_type'].split(',')
+                assert len(vars_list) == 1, \
+                    f"{self.api} api: The number of params to set data_type must be 1, but received {len(vars_list)}."
+                kernel_select_code = kernel_select_code + f"""
+  kernel_data_type = ParseDataType({vars_list[0].strip()});
+"""
+
+        if len(input_names) == 0:
+            assert attr_backend_count > 0 and attr_layout_count > 0 and attr_data_type_count > 0, \
+                f"{self.api} api: When there is no input tensor, the args must have 'Backend', 'DataLayout' and 'DataType'."
+
+        kernel_select_args = ", ".join(input_names)
+        kernel_select_code = kernel_key_item_init + kernel_select_code
+        if len(input_names) > 0:
+            kernel_select_code = kernel_select_code + f"""
+  if (kernel_backend == Backend::UNDEFINED 
+        || kernel_layout == DataLayout::UNDEFINED
+        || kernel_data_type == DataType::UNDEFINED ) {{
+    auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args});
+    auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+    if (kernel_backend == Backend::UNDEFINED) {{
+      kernel_backend = kernel_key.backend();
+    }}
+    if (kernel_layout == DataLayout::UNDEFINED) {{
+      kernel_layout = kernel_key.layout();
+    }}
+    if (kernel_data_type == DataType::UNDEFINED) {{
+      kernel_data_type = kernel_key.dtype();
+    }}
+  }}"""
+
+        kernel_select_code = kernel_select_code + f"""
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "{kernel['func']}", {{kernel_backend, kernel_layout, kernel_data_type}});
+  VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
+  VLOG(6) << "{self.api} API kernel: " << kernel;"""
+
+        return kernel_select_code
+
+    def gene_infer_meta(self, input_names, attr_names, infer_meta) -> str:
+        """Return the C++ statement that computes out_meta via InferMeta.
+
+        The arguments are infer_meta['param'] or, when that is None, every
+        input followed by every attribute.
+        """
+        params = infer_meta['param']
+        if params is None:
+            params = input_names + attr_names
+        def render(param):
+            if param in input_names:
+                return self.prefix_tensor_name + param + "->meta()"
+            if param in attr_names:
+                return param
+            if isinstance(param, str):
+                return "\"" + param + "\""
+            # bool is tested on its own: str(True) is not a C++ literal.
+            return str(param).lower() if isinstance(param, bool) else str(param)
+        call_args = ", ".join(render(param) for param in params)
+        return f"\n  auto out_meta = pten::{infer_meta['func']}({call_args});\n"
+
+    def gene_kernel_context(self, input_names, attrs, infer_meta, kernel_param):
+        """Generate the C++ code that fills the KernelContext and output.
+
+        kernel_param=None means all inputs then all attributes; entries that
+        are neither are emitted as literal attribute values.
+        """
+        attr_names = attrs['names']
+        params = input_names + attr_names if kernel_param is None else kernel_param
+        input_lines = []
+        attr_lines = []
+        for param in params:
+            if param in input_names:
+                dense = self.prefix_tensor_name + param
+                input_lines.append(f"""
+  auto {dense} = std::dynamic_pointer_cast<pten::DenseTensor>({param}.impl());
+  kernel_context.EmplaceBackInput({dense});""")
+                continue
+            if param in attr_names:
+                # 'ScalarArray' is tested first because it contains 'Scalar'.
+                attr_type = attrs['attr_info'][param][0]
+                value = param
+                if 'ScalarArray' in attr_type:
+                    value = 'pten::ScalarArray(' + param + ')'
+                elif 'Scalar' in attr_type:
+                    value = 'pten::Scalar(' + param + ')'
+            else:
+                value = str(param).lower() if isinstance(param, bool) else param
+            attr_lines.append(f"""
+  kernel_context.EmplaceBackAttr({value});""")
+        input_code_str = "".join(input_lines)
+        attr_code_str = "".join(attr_lines)
+        return f"""
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+  auto kernel_context = pten::KernelContext(dev_ctx);
+{input_code_str}
+{attr_code_str}
+{self.gene_infer_meta(input_names, attr_names, infer_meta)}
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          pten::TransToFluidPlace(kernel_backend));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+
+  Tensor out;
+  out.set_impl(dense_out);"""
+
+    def gene_api_code(self):
+        """Return the C++ definition of this API for the generated source."""
+        header = f"\nPD_DLL_DECL {self.output} {self.api}({self.args['args_define']}) {{"
+        if not self.is_base_api:
+            # Forwarding API: the body is only the yaml 'invoke' expression.
+            return header + f"\n  return {self.invoke};\n}}\n"
+        input_names = self.args['inputs']['names']
+        attrs = self.args['attrs']
+        select_code = self.gene_kernel_select(input_names, attrs, self.kernel)
+        context_code = self.gene_kernel_context(input_names, attrs, self.infer_meta, self.kernel['param'])
+        return header + f"""
+{select_code}
+{context_code}
+
+  kernel(&kernel_context);
+  return out;
+}}
+"""
+
+
+def header_include():
+    """Return the #include lines for the generated API header."""
+    headers = ("paddle/pten/api/include/tensor.h",
+               "paddle/pten/common/scalar.h",
+               "paddle/pten/common/scalar_array.h")
+    return "\n" + "".join(f'#include "{path}"\n' for path in headers)
+
+
+def source_include(header_file_path):
+    """Return the #include block of the generated source, own header first."""
+    own_header = f'\n#include "{header_file_path}"\n'
+    return own_header + """#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/pten/api/lib/api_registry.h"
+#include "paddle/pten/api/lib/kernel_dispatch.h"
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/infermeta.h"
+"""
+
+
+def module_declare():
+    """Return the PT_DECLARE_MODULE lines for the generated source file.
+
+    The CUDA declarations are wrapped in a PADDLE_WITH_CUDA/HIP guard.
+    """
+    modules = ("Creation", "Linalg", "Manipulation", "Math")
+
+    def declare(device):
+        # One declaration per module, each terminated by a newline.
+        return "".join(f"PT_DECLARE_MODULE({name}{device});\n" for name in modules)
+
+    return ("\n" + declare("CPU") +
+            "\n#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)\n" +
+            declare("CUDA") + "#endif\n")
+
+
+def api_register():
+    """Return the PT_REGISTER_API lines for the generated source file."""
+    modules = ("Creation", "Linalg", "Manipulation", "Math")
+    # One registration per module, each on its own line, after a leading
+    # blank line.
+    lines = [f"PT_REGISTER_API({name});\n" for name in modules]
+    return "\n" + "".join(lines)
+
+
+def api_namespace():
+    """Return the (opening, closing) namespace text for generated files."""
+    names = ("paddle", "experimental")
+    # Namespaces open outermost first and close innermost first; blank
+    # lines separate the braces from the code placed in between.
+    opening = "\n" + "".join(f"namespace {name} {{\n" for name in names) + "\n"
+    closing = "\n\n" + "".join(
+        f"}}  // namespace {name}\n" for name in reversed(names))
+
+    return (opening, closing)
+
+
+def generate_api(api_yaml_path, header_file_path, source_file_path):
+    """Generate the C++ API header and source file from an api.yaml file.
+
+    All three arguments are file paths: the yaml input and the two outputs.
+    """
+    with open(api_yaml_path, 'r') as f:
+        # An empty yaml document loads as None; treat it as "no APIs".
+        apis = yaml.load(f, Loader=yaml.FullLoader) or []
+    namespace = api_namespace()
+    # The source file includes the header by this fixed path.
+    include_header_file = "paddle/pten/api/include/api.h"
+    # `with` closes (and flushes) both outputs even if an API entry fails
+    # validation half-way through generation.
+    with open(header_file_path, 'w') as header_file, \
+            open(source_file_path, 'w') as source_file:
+        header_file.write("#pragma once\n")
+        header_file.write(header_include())
+        header_file.write(namespace[0])
+        source_file.write(source_include(include_header_file))
+        source_file.write(module_declare())
+        source_file.write(namespace[0])
+        for api in apis:
+            api_code = API(api)
+            # NOTE(review): looks like leftover debug output; confirm.
+            print(api_code.gene_api_declaration())
+            header_file.write(api_code.gene_api_declaration())
+            source_file.write(api_code.gene_api_code())
+        header_file.write(namespace[1])
+        source_file.write(namespace[1])
+        source_file.write(api_register())
+
+
+def main():
+    """Parse the command-line options and generate the C++ API files."""
+    parser = argparse.ArgumentParser(
+        description='Generate PaddlePaddle C++ API files')
+    # The yaml option is passed to open() by generate_api(), so its help
+    # text describes a file rather than a directory.
+    parser.add_argument(
+        '--api_yaml_path',
+        help='path to api yaml file',
+        default='python/paddle/utils/code_gen/api.yaml')
+    parser.add_argument(
+        '--api_header_path',
+        help='output of generated api header code file',
+        default='paddle/pten/api/include/api.h')
+
+    parser.add_argument(
+        '--api_source_path',
+        help='output of generated api source code file',
+        default='paddle/pten/api/lib/api.cc')
+
+    options = parser.parse_args()
+
+    generate_api(options.api_yaml_path, options.api_header_path,
+                 options.api_source_path)
+
+
+if __name__ == '__main__':
+    main()
-- 
GitLab