diff --git a/.gitignore b/.gitignore
index 708126b3bb070f0ce3b4e751b8732b77af8b36c4..e905833cae7a60f46f6d8fddf5403d46808873f3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,8 @@ paddle/fluid/API_DEV.spec
 paddle/fluid/API_PR.spec
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
-paddle/pten/api/*/api*
+paddle/pten/api/*/api.*
+paddle/pten/api/*/backward*
 paddle/pten/include/*
 paddle/pten/extension.h
 
diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt
index a454ae807bcaaebc90494db804e17d5791bfcc91..0491363eda78e3cd4c7001981db6e09828f2a34a 100644
--- a/paddle/pten/api/CMakeLists.txt
+++ b/paddle/pten/api/CMakeLists.txt
@@ -1,3 +1,3 @@
 add_subdirectory(lib)
 
-cc_library(pten_api SRCS all.cc DEPS pten_function_api utils_api)
+cc_library(pten_api SRCS all.cc DEPS pten_function_api pten_bw_function_api utils_api)
diff --git a/paddle/pten/api/backward/README.md b/paddle/pten/api/backward/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..bc934a975f53a01159e46a115363ea58086f358a
--- /dev/null
+++ b/paddle/pten/api/backward/README.md
@@ -0,0 +1 @@
+The code files in this directory (paddle/pten/api/backward) are auto-generated when building PaddlePaddle.
diff --git a/paddle/pten/api/include/kernel_signature.h b/paddle/pten/api/include/kernel_signature.h
index d750b47ef864b404b47551a8501acdaee833bde7..b8e7b0d75bc6cb5d8458c4e0663bc4ff1cd1a732 100644
--- a/paddle/pten/api/include/kernel_signature.h
+++ b/paddle/pten/api/include/kernel_signature.h
@@ -115,4 +115,14 @@ using conj_kernel = void (*)(const DeviceContext&,
                              const DenseTensor&,
                              DenseTensor*);
 
+/* -------------- Grad Kernel ----------------- */
+using matmul_grad_kernel = void (*)(const DeviceContext&,
+                                    const DenseTensor&,  // assumed: x
+                                    const DenseTensor&,  // assumed: y
+                                    const DenseTensor&,  // assumed: out_grad
+                                    bool,                // assumed: transpose_x
+                                    bool,                // assumed: transpose_y
+                                    DenseTensor*,        // assumed: x_grad
+                                    DenseTensor*);       // assumed: y_grad
+
 }  // namespace pten
diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt
index 1c2b3823920d6e7877842ab48d66f48f4a9af076..1e645a68edfdfa8b09216860cb905a171a0258aa 100644
--- a/paddle/pten/api/lib/CMakeLists.txt
+++ b/paddle/pten/api/lib/CMakeLists.txt
@@ -14,18 +14,27 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor pten_context
 
 cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor)
 
+# forward api file
 set(api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_gen.py)
 set(api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api.yaml)
-
 set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/include/api.h)
 set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc)
 set(api_header_file_tmp ${api_header_file}.tmp)
 set(api_source_file_tmp ${api_source_file}.tmp)
 
+# backward api codegen: generator script, yaml spec, generated header/source, and their .tmp staging paths
+set(bw_api_gen_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward_api_gen.py)
+set(bw_api_yaml_file ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/backward.yaml)
+set(bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/backward/backward_api.h)
+set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/backward_api.cc)
+set(bw_api_header_file_tmp ${bw_api_header_file}.tmp)
+set(bw_api_source_file_tmp ${bw_api_source_file}.tmp)
+
 if (NOT PYTHON_EXECUTABLE)
   find_package(PythonInterp REQUIRED)
 endif()
 
+# generate forward api
 add_custom_command(
   OUTPUT ${api_header_file} ${api_source_file}
   COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml
@@ -39,5 +48,19 @@ add_custom_command(
   DEPENDS ${api_yaml_file} ${api_gen_file}
   VERBATIM)
 
+# generate backward api: the script writes *.tmp files, which are copied over the real files only when content differs
+add_custom_command(
+  OUTPUT ${bw_api_header_file} ${bw_api_source_file} ${bw_api_header_file_tmp} ${bw_api_source_file_tmp}
+  COMMAND ${PYTHON_EXECUTABLE} ${bw_api_gen_file}
+                 --backward_yaml_path ${bw_api_yaml_file}
+                 --backward_header_path ${bw_api_header_file_tmp}
+                 --backward_source_path ${bw_api_source_file_tmp}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_header_file_tmp} ${bw_api_header_file}
+  COMMAND ${CMAKE_COMMAND} -E copy_if_different ${bw_api_source_file_tmp} ${bw_api_source_file}
+  COMMENT "copy_if_different ${bw_api_header_file} ${bw_api_source_file}"
+  DEPENDS ${bw_api_yaml_file} ${bw_api_gen_file}
+  VERBATIM)
+
 cc_library(utils_api SRCS utils.cc DEPS pten_tensor pten kernel_dispatch)
 cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch)
+cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta)
diff --git a/paddle/pten/api/lib/api_declare.h b/paddle/pten/api/lib/api_declare.h
index d29050c8ba4a825e161c92f9aa6dac7a86988154..0023170714fa6bfeed4793313833278dc2bbc373 100644
--- a/paddle/pten/api/lib/api_declare.h
+++ b/paddle/pten/api/lib/api_declare.h
@@ -17,8 +17,5 @@ limitations under the License. */
 // api symbols declare, remove in the future
 #include "paddle/pten/api/lib/api_registry.h"
 
-PT_DECLARE_API(Creation);
-PT_DECLARE_API(Linalg);
-PT_DECLARE_API(Manipulation);
 PT_DECLARE_API(Math);
 PT_DECLARE_API(Utils);
diff --git a/paddle/pten/api/lib/api_utils.h b/paddle/pten/api/lib/api_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..f3e7d74db1e89fddc27af6272c47a8e9e05af8bb
--- /dev/null
+++ b/paddle/pten/api/lib/api_utils.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/api/lib/utils/storage.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+/* ------------------ for input ----------------------- */
+// Returns tensor.impl() downcast to DenseTensor (null if the cast fails).
+inline std::shared_ptr<pten::DenseTensor> TensorToDenseTensor(
+    const Tensor& tensor) {
+  return std::dynamic_pointer_cast<pten::DenseTensor>(tensor.impl());
+}
+// Copies each Tensor's DenseTensor impl into a newly allocated vector.
+inline std::unique_ptr<std::vector<pten::DenseTensor>> TensorToDenseTensor(
+    const std::vector<Tensor>& tensors) {
+  auto pt_tensors = std::make_unique<std::vector<pten::DenseTensor>>();
+  pt_tensors->reserve(tensors.size());
+  // NOTE(review): assumes each impl is a DenseTensor; a failed cast is null.
+  for (const auto& t : tensors) {
+    pt_tensors->push_back(
+        *std::dynamic_pointer_cast<pten::DenseTensor>(t.impl()));
+  }
+  // Return the local by name: std::move here would block copy elision.
+  return pt_tensors;
+}
+
+/* ----------------- for infer_meta --------------------- */
+// Forwards tensor.meta() by const reference; presumably tied to tensor's life.
+inline const pten::DenseTensorMeta& GetDenseTensorMeta(
+    const pten::DenseTensor& tensor) {
+  return tensor.meta();
+}
+// Collects a copy of every tensor's meta, in the same order as the input.
+inline std::vector<pten::DenseTensorMeta> GetDenseTensorMeta(
+    const std::vector<pten::DenseTensor>& tensors) {
+  std::vector<pten::DenseTensorMeta> result;
+  result.reserve(tensors.size());
+  for (size_t i = 0; i < tensors.size(); ++i) {
+    result.push_back(tensors[i].meta());
+  }
+  return result;
+}
+
+/* ------------------ for output ----------------------- */
+// Builds a DenseTensor (SharedStorage for `backend`) and sets it as out's impl.
+inline pten::DenseTensor* SetKernelOutput(const pten::DenseTensorMeta& meta,
+                                          Backend backend,
+                                          Tensor* out) {
+  auto dense_tensor = std::make_shared<pten::DenseTensor>(
+      pten::make_intrusive<SharedStorage>(pten::TransToFluidPlace(backend)),
+      meta);
+  out->set_impl(dense_tensor);
+  return dense_tensor.get();  // non-owning; `out` was handed the shared_ptr
+}
+// Vector form: appends one new DenseTensor-backed Tensor to *out per meta.
+inline std::vector<pten::DenseTensor*> SetKernelOutput(
+    const std::vector<pten::DenseTensorMeta>& metas,
+    Backend backend,
+    std::vector<Tensor>* out) {
+  out->reserve(metas.size());
+  std::vector<pten::DenseTensor*> raw_outputs;
+  raw_outputs.reserve(metas.size());
+  for (const auto& meta : metas) {
+    auto dense = std::make_shared<pten::DenseTensor>(
+        pten::make_intrusive<SharedStorage>(pten::TransToFluidPlace(backend)),
+        meta);
+    out->emplace_back();
+    out->back().set_impl(dense);
+    raw_outputs.push_back(dense.get());
+  }
+  return raw_outputs;
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/infermeta/CMakeLists.txt b/paddle/pten/infermeta/CMakeLists.txt
index f92727f33fb05330394226dbdce114b90f4a86ff..8e50d9d2c90d435eddd75f110ca7de38e11c9044 100644
--- a/paddle/pten/infermeta/CMakeLists.txt
+++ b/paddle/pten/infermeta/CMakeLists.txt
@@ -1 +1,2 @@
 cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils)
+cc_library(backward_infermeta SRCS backward.cc DEPS convert_utils)
diff --git a/paddle/pten/infermeta/backward.cc b/paddle/pten/infermeta/backward.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5a66e8cd2ecb384be7bd807269cce290e8f8e04e
--- /dev/null
+++ b/paddle/pten/infermeta/backward.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/infermeta/backward.h"
+
+namespace pten {
+// Gradient metas mirror the forward inputs: returns (x_meta, y_meta) as-is.
+std::tuple<DenseTensorMeta, DenseTensorMeta> MatmulGradInferMeta(
+    const DenseTensorMeta& x_meta,
+    const DenseTensorMeta& y_meta,
+    const DenseTensorMeta& out_grad_meta,  // not consulted
+    bool transpose_x,                      // not consulted
+    bool transpose_y) {                    // not consulted
+  return std::make_tuple(x_meta, y_meta);
+}
+
+}  // namespace pten
diff --git a/paddle/pten/infermeta/backward.h b/paddle/pten/infermeta/backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..03bdb3a962a96e84f7ed569c18d3b73fad145a78
--- /dev/null
+++ b/paddle/pten/infermeta/backward.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <tuple>
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+// Meta inference for matmul_grad; yields one DenseTensorMeta per gradient.
+std::tuple<DenseTensorMeta, DenseTensorMeta> MatmulGradInferMeta(
+    const DenseTensorMeta& x_meta,
+    const DenseTensorMeta& y_meta,
+    const DenseTensorMeta& out_grad_meta,
+    bool transpose_x,
+    bool transpose_y);
+
+}  // namespace pten
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index a0d7ce84f75fdec701b69deb503c671b7419fee4..f37b45eef1b80211cbb749c20b489af43cdafdee 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -1,7 +1,7 @@
 - api : add
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ElementwiseInferMeta
     param : [x, y, -1]
   kernel :
@@ -10,7 +10,7 @@
 - api : cast
   args : (const Tensor& x, DataType out_dtype)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : CastInferMeta
   kernel :
     func : cast
@@ -38,7 +38,7 @@
 - api : divide
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ElementwiseInferMeta
     param : [x, y, -1]
   kernel :
@@ -47,31 +47,31 @@
 - api : dot
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : DotInferMeta
-  kernel : 
+  kernel :
     func : dot
 
 - api : empty
   args : (const ScalarArray& shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW)
   output: Tensor
-  infer_meta : 
+  infer_meta :
     func : CreateInferMeta
     param : [shape, dtype, layout]
-  kernel : 
+  kernel :
     func : empty
     param : [shape]
     data_type : dtype
     backend : place
     layout : layout
-  
+
 - api : empty_like
   args : (const Tensor& x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED)
   output: Tensor
-  infer_meta : 
+  infer_meta :
     func : CreateLikeInferMeta
     param : [x, dtype, layout]
-  kernel : 
+  kernel :
     func : empty_like
     param : []
     data_type : dtype > x
@@ -81,31 +81,31 @@
 - api : flatten
   args : (const Tensor& x, int start_axis, int stop_axis)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : FlattenInferMeta
-  kernel : 
+  kernel :
     func : flatten
 
 - api : full
   args : (const ScalarArray& shape, const Scalar& value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU, DataLayout layout=DataLayout::NCHW)
   output: Tensor
-  infer_meta : 
+  infer_meta :
     func : CreateInferMeta
     param : [shape, dtype, layout]
-  kernel : 
+  kernel :
     func : full
     param : [shape, value]
     data_type : dtype
     backend : place
     layout : layout
-  
+
 - api : full_like
   args : (const Tensor& x, const Scalar& value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED, DataLayout layout = DataLayout::UNDEFINED)
   output: Tensor
-  infer_meta : 
+  infer_meta :
     func : CreateLikeInferMeta
     param : [x, dtype, layout]
-  kernel : 
+  kernel :
     func : full_like
     param : [value]
     data_type : dtype > x
@@ -115,24 +115,25 @@
 - api : matmul
   args : (const Tensor& x, const Tensor& y, bool transpose_x = false, bool transpose_y = false)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : MatmulInferMeta
-  kernel : 
+  kernel :
     func : matmul
+  backward : matmul_grad
 
 - api : mean
   args : (const Tensor& x, const std::vector<int64_t>& axis={}, bool keep_dim=false)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ReduceInferMeta
     param: [x, axis, keep_dim]
-  kernel : 
+  kernel :
     func : mean
 
 - api : multiply
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ElementwiseInferMeta
     param : [x, y, -1]
   kernel :
@@ -146,15 +147,15 @@
 - api : reshape
   args : (const Tensor& x, const ScalarArray& shape)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ReshapeInferMeta
-  kernel : 
+  kernel :
     func : reshape
 
 - api : scale
   args : (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : UnchangedInferMeta
     param : [x]
   kernel :
@@ -163,7 +164,7 @@
 - api : subtract
   args : (const Tensor& x, const Tensor& y)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ElementwiseInferMeta
     param : [x, y, -1]
   kernel :
@@ -172,10 +173,10 @@
 - api : sum
   args : (const Tensor& x, const std::vector<int64_t>& axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
   output : Tensor
-  infer_meta : 
+  infer_meta :
     func : ReduceInferMeta
     param: [x, axis, keep_dim, dtype]
-  kernel : 
+  kernel :
     func : sum
     param : [x, axis, keep_dim, dtype]
     data_type : x
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index c99473158524637de112289e58182cd14bea60fc..6bb02ab9d40dbe28b01bf669417a8d521c6458da 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,6 +16,8 @@ import os
 import yaml
 import argparse
 
+import gen_utils
+
 
 class API:
     prefix_tensor_name = 'dense_'
@@ -23,12 +25,12 @@ class API:
     def __init__(self, api_item_yaml):
         self.api = api_item_yaml['api']
         # args:
-        #   inputs: 
+        #   inputs:
         #     names : [], list of input names
         #   attrs:
         #     names : [], list of attribute names
-        #     attr_info : { attr_name : (type, default_values)}    
-        self.args = self.parse_args(api_item_yaml['args'])
+        #     attr_info : { attr_name : (type, default_values)}
+        self.args = gen_utils.parse_args(self.api, api_item_yaml['args'])
         self.output = api_item_yaml['output']
         self.is_base_api = True
         if 'invoke' in api_item_yaml:
@@ -50,271 +52,29 @@ class API:
             if 'param' not in self.infer_meta:
                 self.infer_meta['param'] = None
 
-    def parse_args(self, args_str):
-        inputs = {'names': []}
-        attrs = {'names': [], 'attr_info': {}}
-        args_str = args_str.strip()
-        assert args_str.startswith('(') and args_str.endswith(')'), \
-            f"Args declaration should start with '(' and end with ')', please check the args of {self.api} in api.yaml."
-        args_str = args_str[1:-1]
-        args_list = args_str.split(',')
-        input_types = [
-            'const Tensor&', 'const Tensor &', 'const std::vector<Tensor>&',
-            'const std::vector<Tensor> &'
-        ]
-        attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \
-                      'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \
-                      'const std::vector<int64_t>&', 'Backend', 'DataLayout', 'DataType']
-        args_declare_str = ""
-        args_define_str = ""
-        for item in args_list:
-            item = item.strip()
-            # match the input tensor
-            has_input = False
-            for in_type in input_types:
-                if item.startswith(in_type):
-                    input_name = item[len(in_type):].strip()
-                    assert len(input_name) > 0, \
-                        f"The input tensor name should not be empty. Please check the args of {self.api} in api.yaml."
-                    inputs['names'].append(input_name)
-                    args_declare_str = args_declare_str + in_type + ' ' + input_name + ', '
-                    args_define_str = args_define_str + in_type + ' ' + input_name + ', '
-                    has_input = True
-                    break
-            if has_input:
-                continue
-
-            # match the attribute
-            for attr_type in attr_types:
-                if item.startswith(attr_type):
-                    attr_name = item[len(attr_type):].strip()
-                    assert len(attr_name) > 0, \
-                        f"The attribute name should not be empty. Please check the args of {self.api} in api.yaml."
-                    default_value = None
-                    if '=' in attr_name:
-                        attr_infos = attr_name.split('=')
-                        attr_name = attr_infos[0].strip()
-                        default_value = attr_infos[1].strip()
-
-                    default_value_str = "" if default_value is None else '=' + default_value
-                    args_declare_str = args_declare_str + attr_type + ' ' + attr_name + default_value_str + ', '
-                    args_define_str = args_define_str + attr_type + ' ' + attr_name + ', '
-                    attrs['names'].append(attr_name)
-                    attrs['attr_info'][attr_name] = (attr_type, default_value)
-                    break
-
-        args = {
-            'inputs': inputs,
-            'attrs': attrs,
-            'args_declare': args_declare_str[:-2],
-            'args_define': args_define_str[:-2]
-        }
-        return args
-
     def gene_api_declaration(self):
         return f"""
 PADDLE_API {self.output} {self.api}({self.args['args_declare']});
 """
 
-    def gene_kernel_select(self, input_names, attrs, kernel):
-
-        kernel_key_item_init = """
-  Backend kernel_backend = Backend::UNDEFINED;
-  DataLayout kernel_layout = DataLayout::UNDEFINED;
-  DataType kernel_data_type = DataType::UNDEFINED;
-"""
-        # Check the tensor options
-        attr_backend_count = 0
-        attr_layout_count = 0
-        attr_data_type_count = 0
-        for attr_name in attrs['names']:
-            if attrs['attr_info'][attr_name][0] == 'Backend':
-                assert kernel['backend'] is not None, \
-                    f"{self.api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
-                attr_backend_count = attr_backend_count + 1
-            if attrs['attr_info'][attr_name][0] == 'DataLayout':
-                assert kernel['layout'] is not None, \
-                    f"{self.api} api: When there is a parameter with 'DataLayout' type in attributes, you must set layout of kernel manually."
-                attr_layout_count = attr_layout_count + 1
-            if attrs['attr_info'][attr_name][0] == 'DataType':
-                assert kernel['data_type'] is not None, \
-                    f"{self.api} api: When there is a parameter with 'DataType' type in attributes, you must set data_type of kernel manually."
-                attr_data_type_count = attr_data_type_count + 1
-
-        # preprocess kernel configures
-        kernel_select_code = ""
-        if kernel['backend'] is not None:
-            if '>' in kernel['backend']:
-                vars_list = kernel['backend'].split('>')
-                assert len(
-                    vars_list
-                ) == 2, f"{self.api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
-                assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \
-                    f"{self.api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
-"""
-
-            else:
-                args_str = ""
-                for ele in kernel['backend'].split(','):
-                    args_str = args_str + ele.strip() + ', '
-                kernel_select_code = kernel_select_code + f"""
-  kernel_backend = ParseBackend({args_str[:-2]});
-"""
-
-        if kernel['layout'] is not None:
-            if '>' in kernel['layout']:
-                vars_list = kernel['layout'].split('>')
-                assert len(
-                    vars_list
-                ) == 2, f"{self.api} api: The number of params to set layout with '>' only allows 2, but received {len(vars_list)}."
-                assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataLayout', \
-                    f"{self.api} api: When use '>' to set kernel layout, the first param should be a attribute with DataLayout type."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_layout = ParseLayoutWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
-"""
-
-            else:
-                vars_list = kernel['layout'].split(',')
-                assert len(
-                    vars_list
-                ) == 1, f"{self.api} api: The number of params to set layout must be 1, but received {len(vars_list)}."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_layout = ParseLayout({vars_list[0].strip()});
-"""
-
-        if kernel['data_type'] is not None:
-            if '>' in kernel['data_type']:
-                vars_list = kernel['data_type'].split('>')
-                assert len(
-                    vars_list
-                ) == 2, f"{self.api} api: The number of params to set data_type with '>' only allows 2, but received {len(vars_list)}."
-                assert vars_list[0].strip() in attrs['names'] and attrs['attr_info'][vars_list[0].strip()][0] == 'DataType', \
-                    f"{self.api} api: When use '>' to set kernel data_type, the first param should be a attribute with DataType type."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_data_type = ParseDataTypeWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
-"""
-
-            else:
-                vars_list = kernel['data_type'].split(',')
-                assert len(
-                    vars_list
-                ) == 1, f"{self.api} api: The number of params to set data_type only allows 2, but received {len(vars_list)}."
-                kernel_select_code = kernel_select_code + f"""
-  kernel_data_type = ParseDataType({vars_list[0].strip()});
-"""
-
-        if len(input_names) == 0:
-            assert attr_backend_count > 0 and attr_layout_count > 0 and attr_data_type_count > 0, \
-                f"{self.api} api: When there is no input tensor, the args must have 'Backend', 'DataLayout' and 'DataType'."
-
-        kernel_select_args = ""
-        for input_name in input_names:
-            kernel_select_args = kernel_select_args + input_name + ", "
-
-        if len(kernel_select_args) > 2:
-            kernel_select_args = kernel_select_args[:-2]
-
-        kernel_select_code = kernel_key_item_init + kernel_select_code
-
-        if len(input_names) > 0:
-            kernel_select_code = kernel_select_code + f"""
-  if (kernel_backend == Backend::UNDEFINED 
-        || kernel_layout == DataLayout::UNDEFINED
-        || kernel_data_type == DataType::UNDEFINED ) {{
-    auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args});
-    auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
-    if (kernel_backend == Backend::UNDEFINED) {{
-      kernel_backend = kernel_key.backend();
-    }}
-    if (kernel_layout == DataLayout::UNDEFINED) {{
-      kernel_layout = kernel_key.layout();
-    }}
-    if (kernel_data_type == DataType::UNDEFINED) {{
-      kernel_data_type = kernel_key.dtype();
-    }}
-  }}"""
-
-        kernel_select_code = kernel_select_code + f"""
-  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "{kernel['func']}", {{kernel_backend, kernel_layout, kernel_data_type}});
-  VLOG(6) << "{self.api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
-  VLOG(6) << "{self.api} API kernel: " << kernel;"""
-
-        return kernel_select_code
-
-    def gene_infer_meta(self, input_names, attr_names, infer_meta) -> str:
-        infer_meta_params = infer_meta['param'] if infer_meta[
-            'param'] is not None else input_names + attr_names
-        param_code = ""
-        for param in infer_meta_params:
-            if param in input_names:
-                param_code = param_code + "GetDenseTensorMeta(" + self.prefix_tensor_name + param + "), "
-            elif param in attr_names:
-                param_code = param_code + param + ", "
-            elif isinstance(param, str):
-                param_code = param_code + "\"" + param + "\", "
-            elif isinstance(param, bool):
-                param_code = param_code + str(param).lower() + ", "
-            else:
-                param_code = param_code + str(param) + ", "
-
-        param_code = param_code[:-2]
-        return f"""
-  auto out_meta = pten::{infer_meta['func']}({param_code});
-"""
-
-    def get_kernel_args(self, input_names, attrs, kernel_param):
-        input_tensor_code = ""
-        for input_name in input_names:
-            # set input code
-            input_tensor_code = input_tensor_code + f"""
-  auto {self.prefix_tensor_name}{input_name} = TensorToDenseTensor({input_name});"""
-
-        attr_names = attrs['names']
-        if kernel_param is None:
-            kernel_param = input_names + attr_names
-
-        kernel_args = "*dev_ctx, "
-        for param in kernel_param:
-            if param in input_names:
-                kernel_args = kernel_args + "*" + self.prefix_tensor_name + param + ", "
-            elif param in attr_names:
-                # set attr for kernel_context
-                if 'ScalarArray' in attrs['attr_info'][param][0]:
-                    param = 'pten::ScalarArray(' + param + ')'
-                elif 'Scalar' in attrs['attr_info'][param][0]:
-                    param = 'pten::Scalar(' + param + ')'
-                kernel_args = kernel_args + param + ", "
-            elif isinstance(param, bool):
-                kernel_args = kernel_args + str(param).lower() + ", "
-            else:
-                kernel_args = kernel_args + str(param) + ", "
-        return input_tensor_code, kernel_args[:-2]
-
     def gene_api_code(self):
         if self.is_base_api:
-            input_tensors, kernel_args = self.get_kernel_args(
+            input_tensors, kernel_args = gen_utils.get_kernel_args(
                 self.args['inputs']['names'], self.args['attrs'],
                 self.kernel['param'])
+            out_type, _ = gen_utils.parse_output(self.api, self.output)
+            outputs_args, output_create = gen_utils.gene_output(out_type)
             return f"""
 PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
-{self.gene_kernel_select(self.args['inputs']['names'], self.args['attrs'], self.kernel)}
+{gen_utils.gene_kernel_select(self.api, self.args['inputs']['names'], self.args['attrs'], self.kernel)}
 
   auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
 {input_tensors}
-{self.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)}
-  auto dense_out = std::make_shared<pten::DenseTensor>(
-        pten::make_intrusive<paddle::experimental::SharedStorage>(
-            pten::TransToFluidPlace(kernel_backend)),
-        std::move(out_meta));
-
-  Tensor out;
-  out.set_impl(dense_out);
+{gen_utils.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)}
+{output_create}
 
   auto* kernel_fn = kernel.GetVariadicKernelFn<pten::{self.api}_kernel>();
-  (*kernel_fn)({kernel_args}, dense_out.get());
+  (*kernel_fn)({kernel_args}, {outputs_args});
 
   return out;
 }}
@@ -330,6 +90,8 @@ PADDLE_API {self.output} {self.api}({self.args["args_define"]}) {{
 
 def header_include():
     return """
+#include <tuple>
+
 #include "paddle/pten/api/include/tensor.h"
 #include "paddle/pten/common/scalar.h"
 #include "paddle/pten/common/scalar_array.h"
@@ -345,6 +107,7 @@ def source_include(header_file_path):
 
 #include "paddle/pten/api/include/kernel_signature.h"
 #include "paddle/pten/api/lib/api_registry.h"
+#include "paddle/pten/api/lib/api_utils.h"
 #include "paddle/pten/api/lib/kernel_dispatch.h"
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -358,9 +121,6 @@ def source_include(header_file_path):
 
 def api_register():
     return """
-PT_REGISTER_API(Creation);
-PT_REGISTER_API(Linalg);
-PT_REGISTER_API(Manipulation);
 PT_REGISTER_API(Math);
 """
 
@@ -377,35 +137,6 @@ namespace experimental {
 """)
 
 
-def tensor_to_densetensor():
-    return """
-  std::shared_ptr<pten::DenseTensor> TensorToDenseTensor(const Tensor& tensor) {
-      return std::dynamic_pointer_cast<pten::DenseTensor>(tensor.impl());
-  }
-
-  std::shared_ptr<std::vector<pten::DenseTensor>> TensorToDenseTensor(const std::vector<Tensor>& tensors) {
-      std::vector<pten::DenseTensor> pt_tensors;
-
-      for(auto & t : tensors) {
-          pt_tensors.push_back(*std::dynamic_pointer_cast<pten::DenseTensor>(t.impl()));
-      }
-      return std::make_shared<std::vector<pten::DenseTensor>>(pt_tensors);
-  }
-
-   const pten::DenseTensorMeta GetDenseTensorMeta(const std::shared_ptr<pten::DenseTensor> & x) {
-       return x->meta();
-   }
-
-   const std::vector<pten::DenseTensorMeta> GetDenseTensorMeta(const std::shared_ptr<std::vector<pten::DenseTensor>>& x) {
-       std::vector<pten::DenseTensorMeta> metas;
-       for(auto& t : *x) {
-           metas.push_back(t.meta());
-       }
-       return metas;
-   }
-"""
-
-
 def generate_api(api_yaml_path, header_file_path, source_file_path):
 
     with open(api_yaml_path, 'r') as f:
@@ -422,7 +153,6 @@ def generate_api(api_yaml_path, header_file_path, source_file_path):
     include_header_file = "paddle/pten/api/include/api.h"
     source_file.write(source_include(include_header_file))
     source_file.write(namespace[0])
-    source_file.write(tensor_to_densetensor())
 
     for api in apis:
         api_code = API(api)
@@ -443,7 +173,7 @@ def main():
         description='Generate PaddlePaddle C++ API files')
     parser.add_argument(
         '--api_yaml_path',
-        help='path to yaml file directory',
+        help='path to api yaml file',
         default='python/paddle/utils/code_gen/api.yaml')
     parser.add_argument(
         '--api_header_path',
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..26da7ae2adfaceaffe90aa203ec78bd0edb14b61
--- /dev/null
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -0,0 +1,34 @@
+# Each entry describes one backward api:
+#   backward_api : name of the generated C++ function.
+#   forward      : signature of the forward api; `args` and `output` are checked against it.
+#   args         : parameters of the backward api (input tensors first, then attributes).
+#   output       : output type(s), each optionally followed by its name in parentheses.
+#   invoke       : call expression the api forwards to; used instead of kernel/infer_meta.
+#   infer_meta / kernel : function computing the output meta and kernel to launch.
+- backward_api : matmul_grad
+  forward : matmul (const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y) -> Tensor(out)
+  args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x=false, bool transpose_y=false)
+  output : Tensor(x_grad), Tensor(y_grad)
+  infer_meta :
+    func : MatmulGradInferMeta
+  kernel :
+    func : matmul_grad
+
+- backward_api : scale_grad
+  forward : scale (const Tensor& x, const Scalar& scale, float bias, bool bias_after_scale) -> Tensor(out)
+  args : (const Tensor& out_grad, const Scalar& scale, float bias=0.0, bool bias_after_scale=true)
+  output : Tensor(x_grad)
+  invoke : scale(out_grad, scale, bias, bias_after_scale)
+
+# TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future.
+#
+# - backward_api : matmul_double_grad
+#   forward : matmul_grad (const Tensor& x, const Tensor& y, const Tensor& out_grad, bool transpose_x, bool transpose_y) -> tuple<Tensor, Tensor>(dx, dy)
+#   args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, bool transpose_x, bool transpose_y)
+#   output : tuple<Tensor, Tensor, Tensor>  // d2x, d2y, dout_grad
+#   infer_meta :
+#     func : MatmulDoubleGradInferMeta
+#   kernel :
+#     func : matmul_double_grad
+
+# - backward_api : matmul_triple_grad
+#   forward : matmul_double_grad (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, bool transpose_x, bool transpose_y) -> tuple<Tensor, Tensor, Tensor>(d2x, d2y, dout_grad)
+#   args : (const Tensor& x, const Tensor& y, const Tensor& out_grad, const Tensor& dx_grad, const Tensor& dy_grad, const Tensor& d2x_grad, const Tensor& d2y_grad, const Tensor& dout_grad_grad, bool transpose_x, bool transpose_y)
+#   output : tuple<Tensor, Tensor, Tensor, Tensor, Tensor>  // d3x, d3y, d2out_grad, ddx_grad, ddy_grad
+#   infer_meta :
+#     func : MatmulTripleGradInferMeta
+#   kernel :
+#     func : matmul_triple_grad
diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cb14327f6e09092bbce0229ae26f1b456238802
--- /dev/null
+++ b/python/paddle/utils/code_gen/backward_api_gen.py
@@ -0,0 +1,251 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import yaml
+import argparse
+import re
+
+import gen_utils
+
+
+class BackwardAPI:
+    """Code generator for one entry of backward.yaml.
+
+    Parses the `forward`, `args` and `output` fields of the entry, checks that
+    they are consistent with each other, and renders the C++ declaration and
+    definition of the backward api.
+    """
+
+    def __init__(self, backward_item_yaml):
+        self.backward_api = backward_item_yaml['backward_api']
+        self.args, self.output_type, self.return_comment = self.parse_and_check_args(
+            backward_item_yaml['forward'], backward_item_yaml['args'],
+            backward_item_yaml['output'])
+
+        # An entry either forwards to another api (`invoke`) or describes a
+        # kernel call (`kernel` + `infer_meta`).
+        self.is_base_api = 'invoke' not in backward_item_yaml
+        if not self.is_base_api:
+            self.invoke = backward_item_yaml['invoke']
+            return
+
+        self.kernel = backward_item_yaml['kernel']
+        # Missing or empty optional keys are normalized to None.
+        for key in ('backend', 'layout', 'data_type', 'param'):
+            if key not in self.kernel or len(self.kernel[key]) == 0:
+                self.kernel[key] = None
+
+        self.infer_meta = backward_item_yaml['infer_meta']
+        if 'param' not in self.infer_meta or len(self.infer_meta[
+                'param']) == 0:
+            self.infer_meta['param'] = None
+
+    def parse_forward_config(self, forward_config):
+        """Split a forward signature string into its parts.
+
+        Expected form:
+            api_name (const Tensor& input, ... , int attr, ...) -> Tensor(out)
+
+        Returns:
+            (api name, inputs dict, attrs dict, list of output names), where the
+            two dicts are the 'inputs' and 'attrs' items of gen_utils.parse_args.
+
+        Raises:
+            AssertionError: when `forward_config` does not have the expected form.
+        """
+        result = re.search(
+            r"(?P<api>[a-z][a-z0-9_]+)\s*(?P<args>\([^\)]+\))\s*->[^\(]*\((?P<outputs>[^\)]+)\)",
+            forward_config)
+        assert result is not None, \
+            f"{self.backward_api} : Forward config error: can't parse '{forward_config}'. " \
+            f"Please check the forward of {self.backward_api} in yaml."
+        api = result.group('api')
+        outputs = [item.strip() for item in result.group('outputs').split(',')]
+        forward_args = gen_utils.parse_args(api, result.group('args'))
+
+        return api, forward_args['inputs'], forward_args['attrs'], outputs
+
+    def parse_and_check_args(self, forward_config, args_config, output_config):
+        """Parse the backward config and validate it against the forward one.
+
+        Returns:
+            (parsed args dict, C++ return type, comma-joined output names).
+
+        Raises:
+            AssertionError: when an input, an attribute or the output count of
+                the backward api is inconsistent with the forward api.
+        """
+        _, fw_inputs, fw_attrs, fw_outputs = self.parse_forward_config(
+            forward_config)
+        bw_args = gen_utils.parse_args(self.backward_api, args_config)
+
+        # Each backward input must be a forward input, a forward output, or
+        # the gradient (`<name>_grad`) of a forward output.
+        for input_name in bw_args['inputs']['names']:
+            if input_name in fw_inputs['names'] or input_name in fw_outputs:
+                continue
+            assert input_name.endswith('_grad') and input_name[:-5] in fw_outputs, \
+                f"{self.backward_api} : Input Tensor error: the input tensor({input_name}) of backward should be an input or output or grad of output in forward api. " \
+                f"Please check the forward of {self.backward_api} in yaml."
+
+        # Each backward attribute must exist in forward with the same type.
+        for attr in bw_args['attrs']['names']:
+            assert attr in fw_attrs['names'] and bw_args['attrs']['attr_info'][attr][0] == fw_attrs['attr_info'][attr][0], \
+                f"{self.backward_api} : Attribute error: The attribute({attr}) of backward isn't consistent with forward api. " \
+                f"Please check the args of {self.backward_api} in yaml."
+
+        # The backward api may not return more tensors than forward has inputs.
+        output_type, return_comment = gen_utils.parse_output(self.backward_api,
+                                                             output_config)
+        assert output_type.count('Tensor') <= len(fw_inputs['names']), \
+            f"{self.backward_api} : Output error: The number of outputs should not be greater than the number of inputs of forward api. " \
+            f"Please check the output of {self.backward_api} in yaml."
+
+        return bw_args, output_type, return_comment
+
+    def gene_api_declaration(self):
+        """Render the C++ declaration, preceded by a comment naming the outputs."""
+        declaration = f"{self.output_type} {self.backward_api}({self.args['args_declare']});"
+        if self.return_comment:
+            return f"""
+// {self.return_comment}
+{declaration}
+"""
+
+        return f"""
+{declaration}
+"""
+
+    def gene_api_code(self):
+        """Render the C++ definition of the backward api.
+
+        A base api selects a kernel, infers the output meta and launches the
+        kernel; an `invoke` api only returns the configured call expression.
+        """
+        if self.is_base_api:
+            input_tensors, kernel_args = gen_utils.get_kernel_args(
+                self.args['inputs']['names'], self.args['attrs'],
+                self.kernel['param'])
+            outputs_args, output_create = gen_utils.gene_output(
+                self.output_type)
+            return f"""
+// {self.return_comment}
+{self.output_type} {self.backward_api}({self.args["args_define"]}) {{
+{gen_utils.gene_kernel_select(self.backward_api, self.args['inputs']['names'], self.args['attrs'], self.kernel)}
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+{input_tensors}
+{gen_utils.gene_infer_meta(self.args['inputs']['names'], self.args['attrs']['names'], self.infer_meta)}
+{output_create}
+
+  auto* kernel_fn = kernel.GetVariadicKernelFn<pten::{self.backward_api}_kernel>();
+  (*kernel_fn)({kernel_args}, {outputs_args});
+
+  return out;
+}}
+"""
+
+        invoke_func_name = self.invoke.split('(')[0].strip()
+        invoke_code = self.invoke
+        params_code = self.args["args_define"]
+        if invoke_func_name in self.args['attrs']['names']:
+            # A parameter named like the invoked api is renamed to
+            # `<name>_val`; in the generated C++ it would otherwise hide the
+            # api. Lookarounds match the whole identifier only, skip the call
+            # itself (`name(`), and also match at the end of the string.
+            pattern = r'(?<!\w)' + re.escape(invoke_func_name) + r'(?![\w(])'
+            renamed = invoke_func_name + '_val'
+
+            invoke_code = re.sub(pattern, renamed, invoke_code)
+            params_code = re.sub(pattern, renamed, params_code)
+        return f"""
+// {self.return_comment}
+{self.output_type} {self.backward_api}({params_code}) {{
+  return {invoke_code};
+}}
+"""
+
+
+def header_include():
+    """Return the #include block of the generated backward api header."""
+    include_lines = (
+        '',
+        '#include <tuple>',
+        '',
+        '#include "paddle/pten/api/include/tensor.h"',
+        '#include "paddle/pten/common/scalar.h"',
+        '#include "paddle/pten/common/scalar_array.h"',
+        '', )
+    # The empty first and last items produce the leading and trailing newline.
+    return "\n".join(include_lines)
+
+
+def source_include(header_file_path):
+    """Return the #include block of the generated backward api source file.
+
+    Args:
+        header_file_path: include path of the generated header; it is emitted
+            as the first #include.
+    """
+    own_header = '#include "{}"'.format(header_file_path)
+    fixed_includes = [
+        '#include <memory>',
+        '',
+        '#include "glog/logging.h"',
+        '',
+        '#include "paddle/pten/api/include/kernel_signature.h"',
+        '#include "paddle/pten/api/lib/api_registry.h"',
+        '#include "paddle/pten/api/lib/api_utils.h"',
+        '#include "paddle/pten/api/lib/kernel_dispatch.h"',
+        '#include "paddle/pten/api/lib/utils/storage.h"',
+        '#include "paddle/pten/core/kernel_registry.h"',
+        '#include "paddle/pten/api/include/api.h"',
+        '#include "paddle/pten/infermeta/backward.h"',
+    ]
+    return "\n" + "\n".join([own_header] + fixed_includes) + "\n"
+
+
+def backward_api_namespace():
+    """Return the (opening, closing) C++ namespace text wrapping the generated code."""
+    namespace_open = "\nnamespace paddle {\nnamespace experimental {\n\n"
+    namespace_close = "\n\n}  // namespace experimental\n}  // namespace paddle\n"
+    return (namespace_open, namespace_close)
+
+
+def generate_backward_api(backward_yaml_path, header_file_path,
+                          source_file_path):
+    """Generate the C++ header and source file of the backward apis.
+
+    Args:
+        backward_yaml_path: path of the yaml file describing the backward apis.
+        header_file_path: path of the header file to write.
+        source_file_path: path of the source file to write.
+    """
+    with open(backward_yaml_path, 'r') as f:
+        bw_apis = yaml.load(f, Loader=yaml.FullLoader)
+
+    namespace = backward_api_namespace()
+    include_header_file = "paddle/pten/api/backward/backward_api.h"
+
+    # `with` closes both output files even when a yaml entry fails validation.
+    with open(header_file_path, 'w') as header_file, \
+            open(source_file_path, 'w') as source_file:
+        header_file.write("#pragma once\n")
+        header_file.write(header_include())
+        header_file.write(namespace[0])
+
+        source_file.write(source_include(include_header_file))
+        source_file.write(namespace[0])
+
+        # An empty yaml file loads as None; treat it as "no apis".
+        for bw_api_item in bw_apis or []:
+            bw_api = BackwardAPI(bw_api_item)
+            header_file.write(bw_api.gene_api_declaration())
+            source_file.write(bw_api.gene_api_code())
+
+        header_file.write(namespace[1])
+        source_file.write(namespace[1])
+
+
+def main():
+    """Parse the command line options and generate the backward api files."""
+    parser = argparse.ArgumentParser(
+        description='Generate PaddlePaddle C++ backward API files')
+
+    # (flag, help text, default path) of every supported option.
+    cli_options = (
+        ('--backward_yaml_path', 'path to backward yaml file',
+         'python/paddle/utils/code_gen/backward.yaml'),
+        ('--backward_header_path',
+         'output of generated backward header code file',
+         'paddle/pten/api/backward/backward_api.h'),
+        ('--backward_source_path',
+         'output of generated backward source code file',
+         'paddle/pten/api/lib/backward_api.cc'), )
+    for flag, help_text, default_path in cli_options:
+        parser.add_argument(flag, help=help_text, default=default_path)
+
+    options = parser.parse_args()
+
+    generate_backward_api(options.backward_yaml_path,
+                          options.backward_header_path,
+                          options.backward_source_path)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/utils/code_gen/gen_utils.py b/python/paddle/utils/code_gen/gen_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d368c292b7cfefb0121aba9f0c0fcdc7b0a4caf
--- /dev/null
+++ b/python/paddle/utils/code_gen/gen_utils.py
@@ -0,0 +1,338 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+PREFIX_TENSOR_NAME = 'dense_'
+
+
+def _split_arg_type(item, candidate_types):
+    """Return (type, remainder) for the first type that prefixes `item`, else None.
+
+    A type only matches at a token boundary, so that `int` does not claim
+    `int64_t axis`. Reference types ending in '&' may be followed directly by
+    the name.
+    """
+    for arg_type in candidate_types:
+        if not item.startswith(arg_type):
+            continue
+        remainder = item[len(arg_type):]
+        if arg_type.endswith('&') or remainder == '' or remainder[0].isspace():
+            return arg_type, remainder.strip()
+    return None
+
+
+def parse_args(api_name, args_str):
+    """
+    Parse the parenthesized C++ parameter list of an api.
+
+    Args:
+        api_name: api name, only used in error messages.
+        args_str: parameter list such as "(const Tensor& x, bool flag=false)".
+
+    Returns:
+       { inputs : {
+             names : [] // list of input names
+             input_info : { input_name : type }
+         }
+         attrs: {
+             names : [] // list of attribute names
+             attr_info : { attr_name : (type, default_value)}
+         }
+         args_declare : "str" // str of function params with default value. Example: (..., bool flag=false)
+         args_define : "str" // str of function params without default value. Example: (..., bool flag)
+       }
+
+    Raises:
+        AssertionError: when the list is not wrapped in parentheses, a name is
+            empty, an input tensor follows an attribute, or a parameter type
+            is not supported.
+    """
+    inputs = {'names': [], 'input_info': {}}
+    attrs = {'names': [], 'attr_info': {}}
+    args_str = args_str.strip()
+    assert args_str.startswith('(') and args_str.endswith(')'), \
+        f"Args declaration should start with '(' and end with ')', please check the args of {api_name} in yaml."
+    input_types = [
+        'const Tensor&', 'const Tensor &', 'const std::vector<Tensor>&',
+        'const std::vector<Tensor> &'
+    ]
+    attr_types = ['const Scalar&', 'const Scalar &', 'const ScalarArray&', 'const ScalarArray &', \
+                  'int', 'int32_t', 'int64_t', 'size_t', 'float', 'double', 'bool', \
+                  'const std::vector<int64_t>&', 'Backend', 'DataLayout', 'DataType']
+    declare_items = []
+    define_items = []
+
+    for item in args_str[1:-1].split(','):
+        item = item.strip()
+        if not item:
+            # "()" or a stray comma: nothing to parse.
+            continue
+
+        # match the input tensor
+        matched = _split_arg_type(item, input_types)
+        if matched is not None:
+            in_type, input_name = matched
+            assert len(input_name) > 0, \
+                f"The input tensor name should not be empty. Please check the args of {api_name} in yaml."
+            assert len(attrs['names']) == 0, \
+                f"The input Tensor should appear before attributes. please check the position of {api_name}:input({input_name}) in yaml"
+
+            inputs['names'].append(input_name)
+            inputs['input_info'][input_name] = in_type
+            declare_items.append(in_type + ' ' + input_name)
+            define_items.append(in_type + ' ' + input_name)
+            continue
+
+        # match the attribute
+        matched = _split_arg_type(item, attr_types)
+        assert matched is not None, \
+            f"Unsupported parameter '{item}'. Please check the args of {api_name} in yaml."
+        attr_type, attr_name = matched
+        assert len(attr_name) > 0, \
+            f"The attribute name should not be empty. Please check the args of {api_name} in yaml."
+        default_value = None
+        if '=' in attr_name:
+            # Only the first '=' separates the name from its default value.
+            attr_name, default_value = [
+                part.strip() for part in attr_name.split('=', 1)
+            ]
+
+        default_value_str = "" if default_value is None else '=' + default_value
+        declare_items.append(attr_type + ' ' + attr_name + default_value_str)
+        define_items.append(attr_type + ' ' + attr_name)
+        attrs['names'].append(attr_name)
+        attrs['attr_info'][attr_name] = (attr_type, default_value)
+
+    return {
+        'inputs': inputs,
+        'attrs': attrs,
+        'args_declare': ', '.join(declare_items),
+        'args_define': ', '.join(define_items)
+    }
+
+
+def parse_output(api_name, output_config):
+    """Parse the `output` field of a yaml entry.
+
+    Args:
+        api_name: api name, only used in error messages.
+        output_config: comma separated items, each written as `Type` or
+            `Type(name)`, where Type is Tensor or std::vector<Tensor>.
+
+    Returns:
+        (C++ return type, output names). For a single item these are its type
+        and its name ('out' when unnamed). For several items they are
+        "std::tuple<T0,T1,...>" and the names joined by ", ".
+
+    Raises:
+        AssertionError: when a named item has an unsupported type.
+        ValueError: when an unnamed item has an unsupported type or the name
+            in parentheses is empty.
+    """
+    allowed_output_types = ['Tensor', 'std::vector<Tensor>']
+
+    def parse_output_item(output_item):
+        if re.search(r'\(\w*\)', output_item):
+            result = re.search(
+                r"(?P<out_type>[a-zA-Z0-9_<>]+)\s*\((?P<name>\w+)\)",
+                output_item)
+            if result is None:
+                raise ValueError(
+                    "{} : Output error: can't parse the output item '{}'.".
+                    format(api_name, output_item.strip()))
+            out_type = result.group('out_type')
+            assert out_type in allowed_output_types, \
+                f"{api_name} : Output type error: the output type only support Tensor and std::vector<Tensor>, " \
+                f"but now is {out_type}."
+
+            return out_type, result.group('name')
+
+        out_type = output_item.strip()
+        if out_type not in allowed_output_types:
+            raise ValueError(
+                "{} : Output type error: the output type only support Tensor and std::vector<Tensor>, "
+                "but now is {}.".format(api_name, out_type))
+        # Unnamed outputs get the default name.
+        return out_type, 'out'
+
+    temp_list = output_config.split(',')
+
+    if len(temp_list) == 1:
+        return parse_output_item(temp_list[0])
+
+    out_type_list = []
+    out_name_list = []
+    for output_item in temp_list:
+        out_type, out_name = parse_output_item(output_item)
+        out_type_list.append(out_type)
+        out_name_list.append(out_name)
+
+    return "std::tuple<" + ",".join(out_type_list) + ">", ", ".join(
+        out_name_list)
+
+
+def _gene_kernel_key_setting(api, attrs, setting, key, attr_type, parse_func,
+                             single_param):
+    """Return the C++ statement assigning `kernel_<key>` from a yaml setting.
+
+    `setting` is either "attr > input" (the attribute is passed first to
+    `<parse_func>WithInputOrder`) or a comma separated parameter list passed
+    to `parse_func`; `single_param` restricts that list to one element.
+    """
+    if '>' in setting:
+        vars_list = [var.strip() for var in setting.split('>')]
+        assert len(vars_list) == 2, \
+            f"{api} api: The number of params to set {key} with '>' only allows 2, but received {len(vars_list)}."
+        assert vars_list[0] in attrs['names'] and attrs['attr_info'][vars_list[0]][0] == attr_type, \
+            f"{api} api: When use '>' to set kernel {key}, the first param should be a attribute with {attr_type} type."
+        return f"""
+  kernel_{key} = {parse_func}WithInputOrder({vars_list[0]}, {vars_list[1]});
+"""
+
+    vars_list = [var.strip() for var in setting.split(',')]
+    if single_param:
+        assert len(vars_list) == 1, \
+            f"{api} api: The number of params to set {key} must be 1, but received {len(vars_list)}."
+    parse_args_str = ", ".join(vars_list)
+    return f"""
+  kernel_{key} = {parse_func}({parse_args_str});
+"""
+
+
+def gene_kernel_select(api, input_names, attrs, kernel) -> str:
+    """Generate the C++ code that selects the kernel of an api.
+
+    Args:
+        api: api name, used in error messages and in the generated VLOG lines.
+        input_names: names of the input tensors of the api.
+        attrs: attribute dict as returned by parse_args ('names', 'attr_info').
+        kernel: kernel config with 'func' and the optional (possibly None)
+            'backend', 'layout' and 'data_type' settings.
+
+    Returns:
+        C++ code declaring kernel_backend / kernel_layout / kernel_data_type,
+        filling them from the settings and the input tensors, and selecting
+        `kernel` through pten::KernelFactory.
+
+    Raises:
+        AssertionError: when the kernel settings are inconsistent with the
+            attributes or inputs of the api.
+    """
+    kernel_key_item_init = """
+  Backend kernel_backend = Backend::UNDEFINED;
+  DataLayout kernel_layout = DataLayout::UNDEFINED;
+  DataType kernel_data_type = DataType::UNDEFINED;
+"""
+    # Check the tensor options: an attribute of type Backend / DataLayout /
+    # DataType requires the matching kernel setting to be given explicitly.
+    attr_backend_count = 0
+    attr_layout_count = 0
+    attr_data_type_count = 0
+    for attr_name in attrs['names']:
+        if attrs['attr_info'][attr_name][0] == 'Backend':
+            assert kernel['backend'] is not None, \
+                f"{api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
+            attr_backend_count = attr_backend_count + 1
+        if attrs['attr_info'][attr_name][0] == 'DataLayout':
+            assert kernel['layout'] is not None, \
+                f"{api} api: When there is a parameter with 'DataLayout' type in attributes, you must set layout of kernel manually."
+            attr_layout_count = attr_layout_count + 1
+        if attrs['attr_info'][attr_name][0] == 'DataType':
+            assert kernel['data_type'] is not None, \
+                f"{api} api: When there is a parameter with 'DataType' type in attributes, you must set data_type of kernel manually."
+            attr_data_type_count = attr_data_type_count + 1
+
+    # preprocess kernel configures
+    kernel_select_code = kernel_key_item_init
+    if kernel['backend'] is not None:
+        kernel_select_code = kernel_select_code + _gene_kernel_key_setting(
+            api, attrs, kernel['backend'], 'backend', 'Backend',
+            'ParseBackend', False)
+
+    if kernel['layout'] is not None:
+        kernel_select_code = kernel_select_code + _gene_kernel_key_setting(
+            api, attrs, kernel['layout'], 'layout', 'DataLayout',
+            'ParseLayout', True)
+
+    if kernel['data_type'] is not None:
+        kernel_select_code = kernel_select_code + _gene_kernel_key_setting(
+            api, attrs, kernel['data_type'], 'data_type', 'DataType',
+            'ParseDataType', True)
+
+    if len(input_names) == 0:
+        assert attr_backend_count > 0 and attr_layout_count > 0 and attr_data_type_count > 0, \
+            f"{api} api: When there is no input tensor, the args must have 'Backend', 'DataLayout' and 'DataType'."
+
+    kernel_select_args = ", ".join(input_names)
+
+    if len(input_names) > 0:
+        # Key items that are still UNDEFINED are taken from the input tensors.
+        kernel_select_code = kernel_select_code + f"""
+  if (kernel_backend == Backend::UNDEFINED 
+        || kernel_layout == DataLayout::UNDEFINED
+        || kernel_data_type == DataType::UNDEFINED ) {{
+    auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args});
+    auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+    if (kernel_backend == Backend::UNDEFINED) {{
+      kernel_backend = kernel_key.backend();
+    }}
+    if (kernel_layout == DataLayout::UNDEFINED) {{
+      kernel_layout = kernel_key.layout();
+    }}
+    if (kernel_data_type == DataType::UNDEFINED) {{
+      kernel_data_type = kernel_key.dtype();
+    }}
+  }}"""
+
+    kernel_select_code = kernel_select_code + f"""
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "{kernel['func']}", {{kernel_backend, kernel_layout, kernel_data_type}});
+  VLOG(6) << "{api} API kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]";
+  VLOG(6) << "{api} API kernel: " << kernel;"""
+
+    return kernel_select_code
+
+
+def gene_infer_meta(input_names, attr_names, infer_meta) -> str:
+    """Generate the C++ statement that computes `out_meta`.
+
+    Args:
+        input_names: names of the input tensors of the api.
+        attr_names: names of the attributes of the api.
+        infer_meta: config with 'func' (infer-meta function name) and 'param'
+            (ordered parameters, or None to pass every input followed by every
+            attribute).
+
+    Returns:
+        The statement `auto out_meta = pten::<func>(<params>);` surrounded by
+        newlines.
+    """
+    params = infer_meta['param']
+    if params is None:
+        params = input_names + attr_names
+
+    def as_cpp_argument(param):
+        if param in input_names:
+            # Inputs are passed as the meta of their dense tensor.
+            return "GetDenseTensorMeta(*" + PREFIX_TENSOR_NAME + param + ")"
+        if param in attr_names:
+            return param
+        # Anything else is a literal written in the yaml config.
+        if isinstance(param, str):
+            return "\"" + param + "\""
+        if isinstance(param, bool):
+            return str(param).lower()
+        return str(param)
+
+    param_code = ", ".join(as_cpp_argument(param) for param in params)
+    return f"""
+  auto out_meta = pten::{infer_meta['func']}({param_code});
+"""
+
+
+def get_kernel_args(input_names, attrs, kernel_param):
+    """Build the C++ snippets that feed a kernel call.
+
+    Args:
+        input_names: names of the input tensors of the api.
+        attrs: attribute dict as returned by parse_args ('names', 'attr_info').
+        kernel_param: ordered kernel parameters, or None to pass every input
+            followed by every attribute.
+
+    Returns:
+        (statements converting each input with TensorToDenseTensor,
+         comma separated kernel arguments starting with `*dev_ctx`).
+    """
+    input_tensor_code = "".join(f"""
+  auto {PREFIX_TENSOR_NAME}{name} = TensorToDenseTensor({name});"""
+                                for name in input_names)
+
+    attr_names = attrs['names']
+    params = input_names + attr_names if kernel_param is None else kernel_param
+
+    def as_kernel_argument(param):
+        if param in input_names:
+            return "*" + PREFIX_TENSOR_NAME + param
+        if param in attr_names:
+            attr_type = attrs['attr_info'][param][0]
+            # 'ScalarArray' is tested first because it also contains 'Scalar'.
+            if 'ScalarArray' in attr_type:
+                return 'pten::ScalarArray(' + param + ')'
+            if 'Scalar' in attr_type:
+                return 'pten::Scalar(' + param + ')'
+            return param
+        # Anything else is a literal written in the yaml config.
+        if isinstance(param, bool):
+            return str(param).lower()
+        return str(param)
+
+    kernel_args = ["*dev_ctx"] + [as_kernel_argument(param) for param in params]
+    return input_tensor_code, ", ".join(kernel_args)
+
+
+def gene_output(output_type):
+    """Generate the C++ code creating the api output.
+
+    Args:
+        output_type: C++ return type of the api: 'Tensor',
+            'std::vector<Tensor>' or 'std::tuple<...>'.
+
+    Returns:
+        (comma separated dense output names to pass to the kernel,
+         C++ statements declaring `out` and one SetKernelOutput call per
+         dense output).
+    """
+    dense_names = []
+    statements = [f"""
+  {output_type} out;"""]
+
+    if output_type in ('Tensor', 'std::vector<Tensor>'):
+        dense_names.append('dense_out')
+        statements.append("""
+  auto dense_out = SetKernelOutput(out_meta, kernel_backend, &out);""")
+    elif re.match(r'std::tuple<.*>$', output_type):
+        # One dense output per 'Tensor' occurrence in the tuple type.
+        for idx in range(output_type.count('Tensor')):
+            dense_names.append(f'dense_out_{idx}')
+            statements.append(f"""
+  auto dense_out_{idx} = SetKernelOutput(std::get<{idx}>(out_meta), kernel_backend, &std::get<{idx}>(out));""")
+
+    return ", ".join(dense_names), "".join(statements)