Unverified commit 7039bef3 authored by Chen Weihang, committed by GitHub

[AutoParallel] Dygraph basic impl for semi auto parallel (#55698)

* add phi forward api gen impl

* add phi backward gen code

* polish api code gen impl

* polish code gen impl

* remove auto_parallel namespace

* add dygraph forward impl

* add for_auto_parallel cond

* fix code gen errors

* add dygraph backward impl

* resolve conflict with develop

* refactor dist api gen impl

* revert origin api gen impl

* replace template for override func

* fix dnnl macro error

* revert third_party change

* add with-distributed macro

* Update grad_tensor_holder.cc details

* merge dist tensor constructor

* change test tensor to replicate

* fix typo

* resolve conflict with develop

* fix out dim error
Parent commit: fcde3991
......@@ -26,8 +26,12 @@
#include "paddle/fluid/platform/errors.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
#endif
/**
* Implementation of GradNodeBase, Edge and GradTensorHolder.
......@@ -121,6 +125,14 @@ void GradNodeBase::SetGradInMeta(const paddle::Tensor& fwd_out,
phi::SparseCsrTensor* csr_tensor =
static_cast<phi::SparseCsrTensor*>(fwd_out.impl().get());
dense_tensor = csr_tensor->mutable_non_zero_elements();
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (phi::distributed::DistTensor::classof(fwd_out.impl().get())) {
// TODO(chenweihang): DistTensor contains global and local meta; here we
// only set the local meta for now, the global meta should be set later
dense_tensor =
static_cast<phi::distributed::DistTensor*>(fwd_out.impl().get())
->mutable_value();
#endif
} else {
VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
"non-DenseTensor argument.";
......@@ -256,10 +268,28 @@ void GradNodeBase::SetGradOutMeta(const paddle::Tensor& fwd_in,
"which is illegal."));
meta.SetTensorMeta(dense_tensor->meta());
meta.SetPlace(fwd_in.place());
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (phi::distributed::DistTensor::classof(fwd_in.impl().get())) {
phi::DenseTensor* dense_tensor =
static_cast<phi::distributed::DistTensor*>(fwd_in.impl().get())
->mutable_value();
PADDLE_ENFORCE_NE(
dense_tensor->meta().dtype,
phi::DataType::UNDEFINED,
paddle::platform::errors::Fatal("Attempting to copy DenseTensorMeta "
"with phi::DataType::UNDEFINED,"
"which is illegal."));
meta.SetTensorMeta(dense_tensor->meta());
meta.SetPlace(fwd_in.place());
#endif
} else {
VLOG(7)
<< "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
"non-DenseTensor argument.";
}
} else {
VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
"non-DenseTensor argument.";
VLOG(7) << "Unable to initialize the DenseTensorMeta because the Tensor "
"is not initialized.";
}
}
......@@ -367,7 +397,8 @@ void GradNodeBase::SetGradOutMeta(const std::vector<paddle::Tensor>& fwd_in,
// Record TensorMeta
if (fwd_in_tensor.impl() && fwd_in_tensor.impl().get()) {
if (phi::DenseTensor::classof(fwd_in_tensor.impl().get())) {
// Only Copy Meta
// TODO(chenweihang): DistTensor contains global and local meta; here we
// only set the local meta for now, the global meta should be set later
phi::DenseTensor* dense_tensor =
static_cast<phi::DenseTensor*>(fwd_in_tensor.impl().get());
PADDLE_ENFORCE_NE(dense_tensor->dtype(),
......
......@@ -20,6 +20,10 @@
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
#endif
namespace egr {
......@@ -83,6 +87,23 @@ void GradTensorHolder::CopyValueFromTensor(size_t slot_id,
} else if (t.is_sparse_csr_tensor() || t.is_sparse_coo_tensor()) {
buffer_[slot_id][rank] =
paddle::experimental::sparse::full_like(t, 1, t.dtype());
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (t.is_dist_tensor()) {
VLOG(6) << "Create a new dist tensor.";
// TODO(chenweihang): we need a shard_tensor API in C++
// TODO(chenweihang): replace with a valid dist_attr later
auto temp =
paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
auto dense_temp =
std::dynamic_pointer_cast<phi::DenseTensor>(temp.impl());
auto dist_tensor = std::make_shared<phi::distributed::DistTensor>(
dense_temp,
dense_temp->meta(),
std::make_shared<
phi::distributed::auto_parallel::TensorDistAttr>());
temp.set_impl(dist_tensor);
buffer_[slot_id][rank] = temp;
#endif
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR "
......@@ -178,6 +199,10 @@ void GradTensorHolder::add(size_t slot_id,
&buffer_values);
}
}
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (t.is_dist_tensor()) {
buffer_tensor = add_ad_func(t, buffer_tensor);
#endif
} else {
// TODO(jiabin): Support Other TensorBase later
// TODO(zhanlve): Replace SelectedRowsAddTensor with add_dygraph_function
......
......@@ -297,13 +297,15 @@ void InitDistTensorWithTensor(
if (place == src.place()) {
std::shared_ptr<phi::DenseTensor> tensor =
std::static_pointer_cast<phi::DenseTensor>(src.impl());
self->tensor.set_impl(std::make_shared<DistTensor>(tensor, dist_attr));
self->tensor.set_impl(
std::make_shared<DistTensor>(tensor, tensor->meta(), dist_attr));
VLOG(4) << "Same place, do ShareDataWith for DistTensor.";
} else {
std::shared_ptr<phi::DenseTensor> tensor =
std::static_pointer_cast<phi::DenseTensor>(
src.copy_to(place, true).impl());
self->tensor.set_impl(std::make_shared<DistTensor>(tensor, dist_attr));
self->tensor.set_impl(
std::make_shared<DistTensor>(tensor, tensor->meta(), dist_attr));
VLOG(4) << "Different place, do TensorCopy for DistTensor.";
}
if (src.get_autograd_meta()) {
......
......@@ -249,6 +249,27 @@ static PyObject* tensor_method_numpy(TensorObject* self,
place,
dense_tensor->Holder()->ptr(),
dense_tensor->Holder()->size());
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (self->tensor.is_dist_tensor()) {
// TODO(chenweihang): DistTensor is handled as a local DenseTensor for now;
// if the local DenseTensor is sharded or partial, should we gather or reduce?
VLOG(6) << "Getting DistTensor's numpy value";
auto* dist_tensor =
static_cast<phi::distributed::DistTensor*>(self->tensor.impl().get());
auto& dense_tensor = dist_tensor->value();
cpu_tensor.set_meta(dense_tensor.meta());
// deep copy
auto tmp_allocation_ptr =
memory::Alloc(cpu_place, dense_tensor.Holder()->size());
cpu_tensor.ResetHolder(std::shared_ptr<phi::Allocation>(
tmp_allocation_ptr.release(), tmp_allocation_ptr.get_deleter()));
// deep copy
paddle::memory::Copy(place,
cpu_tensor.Holder()->ptr(),
place,
dense_tensor.Holder()->ptr(),
dense_tensor.Holder()->size());
#endif
} else {
VLOG(6) << "Getting DenseTensor's numpy value";
auto dense_tensor =
......@@ -290,6 +311,22 @@ static PyObject* tensor_method_numpy(TensorObject* self,
dense_tensor->Holder()->ptr(),
dense_tensor->Holder()->size(),
kind);
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (self->tensor.is_dist_tensor()) {
VLOG(6) << "Getting DistTensor's numpy value";
auto* dist_tensor =
static_cast<phi::distributed::DistTensor*>(self->tensor.impl().get());
auto& dense_tensor = dist_tensor->value();
cpu_tensor.set_meta(dense_tensor.meta());
auto tmp_allocation_ptr =
memory::Alloc(cpu_place, dense_tensor.Holder()->size());
cpu_tensor.ResetHolder(std::shared_ptr<phi::Allocation>(
tmp_allocation_ptr.release(), tmp_allocation_ptr.get_deleter()));
paddle::platform::GpuMemcpySync(cpu_tensor.Holder()->ptr(),
dense_tensor.Holder()->ptr(),
dense_tensor.Holder()->size(),
kind);
#endif
} else {
VLOG(6) << "Getting DenseTensor's numpy value";
auto dense_tensor =
......
......@@ -9,6 +9,9 @@ set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/api.h)
set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/api.cc)
set(api_header_file_tmp ${api_header_file}.tmp)
set(api_source_file_tmp ${api_source_file}.tmp)
# dist forward api file
set(dist_api_gen_file
${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/dist_api_gen.py)
# backward api file
set(bw_api_gen_file
......@@ -21,6 +24,9 @@ set(bw_api_header_file
set(bw_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/backward_api.cc)
set(bw_api_header_file_tmp ${bw_api_header_file}.tmp)
set(bw_api_source_file_tmp ${bw_api_source_file}.tmp)
# dist backward api file
set(dist_bw_api_gen_file
${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/dist_bw_api_gen.py)
# dygraph(intermediate) api file
set(im_api_gen_file
......@@ -124,19 +130,37 @@ endif()
execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml)
# generate forward api
execute_process(
COMMAND
${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file}
${legacy_api_yaml_file} --api_header_path ${api_header_file_tmp}
--api_source_path ${api_source_file_tmp})
if(WITH_DISTRIBUTE)
# generate dist forward api
execute_process(
COMMAND
${PYTHON_EXECUTABLE} ${dist_api_gen_file} --api_yaml_path
${api_yaml_file} ${legacy_api_yaml_file} --api_header_path
${api_header_file_tmp} --api_source_path ${api_source_file_tmp})
# generate backward api
execute_process(
COMMAND
${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path
${bw_api_yaml_file} ${legacy_bw_api_yaml_file} --backward_header_path
${bw_api_header_file_tmp} --backward_source_path ${bw_api_source_file_tmp})
# generate dist backward api
execute_process(
COMMAND
${PYTHON_EXECUTABLE} ${dist_bw_api_gen_file} --backward_yaml_path
${bw_api_yaml_file} ${legacy_bw_api_yaml_file} --backward_header_path
${bw_api_header_file_tmp} --backward_source_path
${bw_api_source_file_tmp})
else()
# generate forward api
execute_process(
COMMAND
${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file}
${legacy_api_yaml_file} --api_header_path ${api_header_file_tmp}
--api_source_path ${api_source_file_tmp})
# generate backward api
execute_process(
COMMAND
${PYTHON_EXECUTABLE} ${bw_api_gen_file} --backward_yaml_path
${bw_api_yaml_file} ${legacy_bw_api_yaml_file} --backward_header_path
${bw_api_header_file_tmp} --backward_source_path
${bw_api_source_file_tmp})
endif()
# generate fused_op api
execute_process(
......
......@@ -19,6 +19,13 @@ limitations under the License. */
DECLARE_bool(use_stride_kernel);
#include "glog/logging.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
#endif
namespace paddle {
namespace experimental {
......@@ -475,5 +482,26 @@ void TransStride(phi::DeviceContext* dev_ctx,
phi::SelectedRows* from,
phi::SelectedRows* to) {}
#ifdef PADDLE_WITH_DISTRIBUTE
/* ------------------ for auto parallel ----------------------- */
phi::distributed::DistTensor* SetKernelDistOutput(Tensor* out) {
if (out) {
// TODO(chenweihang): now all dist cases are nullptr
if (out->impl() == nullptr) {
auto dense_t = std::make_shared<phi::DenseTensor>();
// TODO(chenweihang): polish code, dist_attr is null now
auto dist_attr =
std::make_shared<phi::distributed::auto_parallel::TensorDistAttr>();
auto dist_t = std::make_shared<phi::distributed::DistTensor>(
dense_t, phi::DenseTensorMeta(), dist_attr);
out->set_impl(dist_t);
}
return static_cast<phi::distributed::DistTensor*>(out->impl().get());
}
return nullptr;
}
#endif
} // namespace experimental
} // namespace paddle
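For orientation, here is a minimal sketch of how generated forward code is expected to use this helper; the op name and surrounding scaffolding are illustrative assumptions, not the actual output of dist_api_gen.py:

// Illustrative sketch only, assuming a hypothetical unary API "my_op".
Tensor my_op(const Tensor& x) {
  Tensor api_output;
  // Wraps an empty DenseTensor in a DistTensor; the dist_attr is still a
  // default placeholder, matching the TODO above.
  phi::distributed::DistTensor* dist_out = SetKernelDistOutput(&api_output);
  phi::DenseTensor* dense_out = dist_out->mutable_value();
  // ... infer meta and launch the dense kernel writing into dense_out ...
  return api_output;
}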
......@@ -24,6 +24,12 @@ limitations under the License. */
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/string_tensor.h"
namespace phi {
namespace distributed {
class DistTensor;
} // namespace distributed
} // namespace phi
namespace paddle {
namespace experimental {
......@@ -127,5 +133,11 @@ void TransStride(phi::DeviceContext* dev_ctx,
phi::SelectedRows* from,
phi::SelectedRows* to);
#ifdef PADDLE_WITH_DISTRIBUTE
/* ------------------ for auto parallel ----------------------- */
phi::distributed::DistTensor* SetKernelDistOutput(Tensor* out);
#endif
} // namespace experimental
} // namespace paddle
......@@ -27,6 +27,10 @@ limitations under the License. */
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/contiguous_kernel.h"
#include "paddle/phi/kernels/transfer_layout_kernel.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
#endif
DECLARE_bool(use_stride_kernel);
namespace paddle {
......@@ -567,5 +571,46 @@ void TransDataBackend(const phi::SelectedRows* tensor,
}
}
#ifdef PADDLE_WITH_DISTRIBUTE
/* ------------------ for auto parallel ----------------------- */
std::shared_ptr<phi::distributed::DistTensor> PrepareDataForDistTensor(
const Tensor& input,
const phi::TensorArgDef& target_args_def,
const TransformFlag& transform_flag,
bool is_stride_kernel) {
const auto& tensor_in = input.impl();
if (tensor_in) {
phi::distributed::DistTensor* dist_tensor =
static_cast<phi::distributed::DistTensor*>(tensor_in.get());
phi::DenseTensor& dense_tensor = *(dist_tensor->mutable_value());
if (!transform_flag.NeedTransform() || !dense_tensor.initialized() ||
(!NeedTransformPlace(
dense_tensor.place(), target_args_def.backend, transform_flag) &&
!NeedTransformDataType(
dense_tensor.dtype(), target_args_def.dtype, transform_flag) &&
!NeedTransformLayout(dense_tensor.layout(),
target_args_def.layout,
dense_tensor.place(),
transform_flag) &&
!NeedTransform2Contiguous(is_stride_kernel,
dense_tensor.meta().is_contiguous()))) {
return std::static_pointer_cast<phi::distributed::DistTensor>(tensor_in);
}
phi::DenseTensor out = TransformData(
&dense_tensor, target_args_def, transform_flag, is_stride_kernel);
// TODO(chenweihang): The global meta in DistTensor is not changed,
// but the local meta in DenseTensor may be changed, such as a layout
// change (NCHW->NHWC), so the new DistTensor's meta may not be unified.
VLOG(6) << "PrepareDataForDistTensor return transformed dist tensor";
return std::make_shared<phi::distributed::DistTensor>(
std::make_shared<phi::DenseTensor>(std::move(out)),
dist_tensor->meta(),
dist_tensor->dist_attr());
}
return nullptr;
}
#endif
} // namespace experimental
} // namespace paddle
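A hedged sketch of the expected call pattern for this helper inside a generated kernel invocation; the names `kernel` and `kernel_result` are assumptions modeled on the existing dense PrepareData call sites:

// Illustrative only: prepare the DistTensor input, then feed its local
// DenseTensor to the dense kernel.
auto dist_input_x = PrepareDataForDistTensor(
    x, kernel.InputAt(0), {}, kernel_result.is_stride_kernel);
const phi::DenseTensor& input_x = dist_input_x->value();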
......@@ -20,6 +20,12 @@ limitations under the License. */
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace phi {
namespace distributed {
class DistTensor;
} // namespace distributed
} // namespace phi
namespace paddle {
namespace experimental {
......@@ -165,5 +171,16 @@ inline bool NeedTransformPlace(const phi::Place& src_place,
return ret;
}
#ifdef PADDLE_WITH_DISTRIBUTE
/* ------------------ for auto parallel ----------------------- */
// TODO(chenweihang): implement Reshard functions for inputs and outputs
std::shared_ptr<phi::distributed::DistTensor> PrepareDataForDistTensor(
const Tensor& input,
const phi::TensorArgDef& target_args_def,
const TransformFlag& transform_flag,
bool is_stride_kernel);
#endif
} // namespace experimental
} // namespace paddle
......@@ -173,6 +173,36 @@ struct KernelTypeParser : ArgsIterator<KernelTypeParser> {
}
};
#ifdef PADDLE_WITH_DISTRIBUTE
/* ------------------ for auto parallel ----------------------- */
struct DistTensorTypeParser : ArgsIterator<DistTensorTypeParser> {
bool result = true;
void operator()(const Tensor& x) { result &= x.is_dist_tensor(); }
void operator()(const paddle::optional<Tensor>& x) {
if (x) {
result &= x.get_ptr()->is_dist_tensor();
}
}
void operator()(const std::vector<Tensor>& x) {
if (!x.empty()) {
for (auto& t : x) {
result &= t.is_dist_tensor();
}
}
}
// skip other types of args; these args are not used in kernel selection
template <typename T>
void operator()(const T& x) {
// do nothing
}
};
#endif
} // namespace detail
template <typename... Args>
......@@ -205,5 +235,12 @@ DataLayout ParseLayout(DataLayout layout);
DataLayout ParseLayout(const Tensor& tensor);
DataLayout ParseLayoutWithInputOrder(DataLayout layout, const Tensor& tensor);
#ifdef PADDLE_WITH_DISTRIBUTE
template <typename... Args>
bool AllInputsAreDistTensor(const Args&... args) {
return detail::DistTensorTypeParser().apply(args...).result;
}
#endif
} // namespace experimental
} // namespace paddle
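Taken together with the helpers above, each generated forward API presumably opens with a branch like the following sketch (assumed shape, not verbatim generator output):

PADDLE_API Tensor add(const Tensor& x, const Tensor& y) {
#ifdef PADDLE_WITH_DISTRIBUTE
  if (AllInputsAreDistTensor(x, y)) {
    // auto-parallel path: create outputs with SetKernelDistOutput,
    // prepare local inputs with PrepareDataForDistTensor, then run the
    // dense kernel on the local DenseTensors
  }
#endif
  // regular dense path ...
}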
(This diff is collapsed.)
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import yaml
from backward_api_gen import BackwardAPI
from dist_api_gen import DistForwardAPI
######################
# Code Gen Templates #
######################
# 1. Create API Outputs
SINGLE_OUT_CREATION_TEMPLATE = """
auto dist_out = SetKernelDistOutput({});
auto dense_out = dist_out->mutable_value();
"""
INPLACE_OUT_CREATION_TEMPLATE = """
*{} = {};
"""
MULTI_SINGLE_OUT_CREATION_TEMPLATE = """
auto dist_out_{} = SetKernelDistOutput({});
auto dense_out_{} = dist_out_{}->mutable_value();
"""
class DistBackwardAPI(DistForwardAPI, BackwardAPI):
def __init__(self, backward_item_yaml):
BackwardAPI.__init__(self, backward_item_yaml)
self.init_dist_api_members()
# override DistForwardAPI's method
def generate_output_creation_code(self) -> str:
# backward api only needs to generate kernel outputs
output_num = len(self.outputs['types'])
output_creation_code = ""
if output_num == 1:
self.dist_output_args.append('dist_out')
self.dense_output_args.append('dense_out')
if self.outputs['types'][0] == 'Tensor':
output_creation_code += SINGLE_OUT_CREATION_TEMPLATE.format(
self.outputs['names'][0]
)
else:
self.vector_output_size_assertion_check()
elif output_num > 1:
for i, out_type in enumerate(self.outputs['types']):
self.dist_output_args.append(f'dist_out_{i}')
self.dense_output_args.append(f'dense_out_{i}')
if out_type == 'Tensor':
output_creation_code += (
MULTI_SINGLE_OUT_CREATION_TEMPLATE.format(
i, self.outputs['names'][i], i, i
)
)
else:
self.vector_output_size_assertion_check()
else:
raise ValueError(
"{} : Output error: the output should not be empty.".format(
self.api
)
)
return output_creation_code
# override DistForwardAPI's method
def generate_return_code(self) -> str:
return "return;"
# override BaseAPI's method
def get_api_func_name(self):
return self.api
# override BaseAPI's method
# The method lookup order is (DistBackwardAPI.__mro__):
# <class '__main__.DistBackwardAPI'>,
# <class 'dist_api_gen.DistForwardAPI'>,
# <class 'api_gen.ForwardAPI'>,
# <class 'backward_api_gen.BackwardAPI'>,
# <class 'api_base.BaseAPI'>,
# <class 'object'>
# if we don't override it, ForwardAPI's gene_output will be called
def gene_output(
self,
out_dtype_list,
out_tensor_type_list=None,
code_indent='',
inplace_flag=False,
):
return BackwardAPI.gene_output(
self,
out_dtype_list,
out_tensor_type_list,
code_indent,
inplace_flag,
)
# override BaseAPI's method
def get_return_type(self, inplace_flag=False):
return BackwardAPI.get_return_type(self)
# override BaseAPI's method
def gene_return_code(self):
return ""
# override BaseAPI's method
def gene_api_declaration(self) -> str:
return BackwardAPI.gene_api_declaration(self)
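Given these overrides, a generated dist backward API keeps the usual backward signature: a void return with pointer-typed grad outputs. An illustrative declaration (the op name is an assumption):

void relu_grad(const Tensor& out, const Tensor& out_grad, Tensor* x_grad);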
def header_include():
return """
#include <tuple>
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/utils/optional.h"
"""
def source_include(header_file_path, fw_header_file_path):
return f"""
#include "{header_file_path}"
#include <memory>
#include "glog/logging.h"
#include "gflags/gflags.h"
#include "paddle/phi/api/lib/api_custom_impl.h"
#include "paddle/phi/api/lib/api_gen_utils.h"
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "{fw_header_file_path}"
#include "paddle/phi/infermeta/backward.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/api/profiler/event_tracing.h"
#include "paddle/phi/api/profiler/supplement_tracing.h"
DECLARE_bool(conv2d_disable_cudnn);
DECLARE_int32(low_precision_op_list);
"""
def backward_api_namespace():
return (
"""
namespace paddle {
namespace experimental {
""",
"""
} // namespace experimental
} // namespace paddle
""",
)
def generate_backward_api(
backward_yaml_path,
is_fused_backward_yaml,
header_file_path,
source_file_path,
):
bw_apis = []
for each_api_yaml in backward_yaml_path:
with open(each_api_yaml, 'r') as f:
api_list = yaml.load(f, Loader=yaml.FullLoader)
if api_list:
bw_apis.extend(api_list)
header_file = open(header_file_path, 'w')
source_file = open(source_file_path, 'w')
namespace = backward_api_namespace()
header_file.write("#pragma once\n")
header_file.write(header_include())
header_file.write(namespace[0])
include_header_file = (
"paddle/phi/api/backward/fused_backward_api.h"
if is_fused_backward_yaml
else "paddle/phi/api/backward/backward_api.h"
)
include_fw_header_file = (
"paddle/phi/api/include/fused_api.h"
if is_fused_backward_yaml
else "paddle/phi/api/include/api.h"
)
source_file.write(
source_include(include_header_file, include_fw_header_file)
)
source_file.write(namespace[0])
# not all fused ops support dygraph
if is_fused_backward_yaml:
new_bw_apis = [
bw_api
for bw_api in bw_apis
if "support_dygraph_mode" in bw_api
and bw_api["support_dygraph_mode"] is True
]
bw_apis = new_bw_apis
for bw_api in bw_apis:
dist_bw_api = DistBackwardAPI(bw_api)
header_file.write(dist_bw_api.gene_api_declaration())
source_file.write(dist_bw_api.gene_api_code())
header_file.write(namespace[1])
source_file.write(namespace[1])
header_file.close()
source_file.close()
def main():
parser = argparse.ArgumentParser(
description='Generate PaddlePaddle C++ backward API files'
)
parser.add_argument(
'--backward_yaml_path',
help='path to backward yaml file',
nargs='+',
default=['paddle/phi/api/yaml/backward.yaml'],
)
parser.add_argument(
'--is_fused_backward_yaml',
help='flag of fused backward yaml',
action='store_true',
)
parser.add_argument(
'--backward_header_path',
help='output of generated backward header code file',
default='paddle/phi/api/backward/backward_api.h',
)
parser.add_argument(
'--backward_source_path',
help='output of generated backward source code file',
default='paddle/phi/api/lib/backward_api.cc',
)
options = parser.parse_args()
backward_yaml_path = options.backward_yaml_path
is_fused_backward_yaml = options.is_fused_backward_yaml
header_file_path = options.backward_header_path
source_file_path = options.backward_source_path
generate_backward_api(
backward_yaml_path,
is_fused_backward_yaml,
header_file_path,
source_file_path,
)
if __name__ == '__main__':
main()
......@@ -12,6 +12,7 @@ collect_srcs(
flags.cc
errors.cc
enforce.cc
storage_properties.cc
os_info.cc
kernel_context.cc
ddim.cc
......
......@@ -18,6 +18,9 @@
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
class DenseTensorUtils;
namespace distributed {
namespace auto_parallel {
......@@ -54,10 +57,10 @@ class DistTensor final
}
DistTensor(const std::shared_ptr<phi::DenseTensor>& dense_tensor,
const DenseTensorMeta& meta,
const std::shared_ptr<TensorDistAttr>& dist_attr)
: dist_attr_(dist_attr) {
: meta_(meta), dist_attr_(dist_attr) {
value_ = std::make_unique<DenseTensor>(*dense_tensor);
set_meta(dense_tensor->meta());
}
~DistTensor() = default;
......@@ -121,6 +124,8 @@ class DistTensor final
void set_meta(const DenseTensorMeta& meta);
private:
friend class phi::DenseTensorUtils;
DenseTensorMeta meta_;
std::shared_ptr<TensorDistAttr> dist_attr_{nullptr};
std::unique_ptr<DenseTensor> value_{nullptr};
......
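Call sites now pass the global meta explicitly alongside the local DenseTensor; a minimal construction sketch, adapted from the updated unit test later in this diff:

auto a = std::make_shared<DenseTensor>(alloc, DenseTensorMeta(dtype, dims));
auto dist_attr = std::make_shared<TensorDistAttr>();
DistTensor x(a, a->meta(), dist_attr);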
......@@ -93,6 +93,7 @@ std::shared_ptr<DistTensor> RToSReshardFunction::Eval(
return std::make_shared<DistTensor>(
std::make_shared<DenseTensor>(out_physical_tensor_cur_rank),
out_physical_tensor_cur_rank.meta(),
out_dist_attr);
}
......
......@@ -66,7 +66,9 @@ std::shared_ptr<DistTensor> SToRReshardFunction::Eval(
dev_ctx, in_physical_tensor_cur_rank, in_process_ids);
return std::make_shared<DistTensor>(
std::make_shared<DenseTensor>(out_all_gather), out_dist_attr);
std::make_shared<DenseTensor>(out_all_gather),
out_all_gather.meta(),
out_dist_attr);
}
} // namespace distributed
......
......@@ -17,11 +17,15 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/core/string_tensor_utils.h"
#include "paddle/phi/core/tensor_utils.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
#endif
namespace phi {
......@@ -84,6 +88,12 @@ void MetaTensor::set_dims(const DDim& dims) {
} else if (phi::SparseCsrTensor::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(static_cast<SparseCsrTensor*>(tensor_))
->dims = dims;
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (phi::distributed::DistTensor::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(
static_cast<distributed::DistTensor*>(tensor_))
->dims = dims;
#endif
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported setting dims for `%s`.", tensor_->type_info().name()));
......@@ -115,7 +125,12 @@ void MetaTensor::set_dtype(DataType dtype) {
} else if (phi::SparseCsrTensor::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(static_cast<SparseCsrTensor*>(tensor_))
->dtype = dtype;
// No need to set dtype
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (phi::distributed::DistTensor::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(
static_cast<distributed::DistTensor*>(tensor_))
->dtype = dtype;
#endif
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported settting dtype for `%s`.", tensor_->type_info().name()));
......@@ -146,6 +161,12 @@ void MetaTensor::set_layout(DataLayout layout) {
} else if (phi::SparseCsrTensor::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(static_cast<SparseCsrTensor*>(tensor_))
->layout = layout;
#ifdef PADDLE_WITH_DISTRIBUTE
} else if (phi::distributed::DistTensor::classof(tensor_)) {
DenseTensorUtils::GetMutableMeta(
static_cast<distributed::DistTensor*>(tensor_))
->layout = layout;
#endif
} else {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupported settting layout for `%s`.", tensor_->type_info().name()));
......@@ -156,7 +177,11 @@ void MetaTensor::share_lod(const MetaTensor& meta_tensor) {
ValidCheck(*this);
ValidCheck(meta_tensor);
if (phi::SparseCooTensor::classof(tensor_) ||
phi::SparseCsrTensor::classof(tensor_)) {
phi::SparseCsrTensor::classof(tensor_)
#ifdef PADDLE_WITH_DISTRIBUTE
|| phi::distributed::DistTensor::classof(tensor_)
#endif
) {
return;
}
if (meta_tensor.lod().empty()) {
......@@ -182,7 +207,11 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) {
if (phi::DenseTensor::classof(tensor_) ||
phi::SelectedRows::classof(tensor_) ||
phi::SparseCooTensor::classof(tensor_) ||
phi::SparseCsrTensor::classof(tensor_)) {
phi::SparseCsrTensor::classof(tensor_)
#ifdef PADDLE_WITH_DISTRIBUTE
|| phi::distributed::DistTensor::classof(tensor_)
#endif
) {
share_dims(meta_tensor);
set_dtype(meta_tensor.dtype());
set_layout(meta_tensor.layout());
......@@ -207,7 +236,12 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) {
bool is_selected_rows = phi::SelectedRows::classof(tensor_);
bool is_sparse_coo = phi::SparseCooTensor::classof(tensor_);
bool is_sparse_csr = phi::SparseCsrTensor::classof(tensor_);
if (is_dense_tensor || is_selected_rows || is_sparse_coo || is_sparse_csr) {
bool is_dist_tensor = false;
#ifdef PADDLE_WITH_DISTRIBUTE
is_dist_tensor = phi::distributed::DistTensor::classof(tensor_);
#endif
if (is_dense_tensor || is_selected_rows || is_sparse_coo || is_sparse_csr ||
is_dist_tensor) {
if (is_selected_rows) {
const auto in_tensor_base = meta_tensor.tensor();
PADDLE_ENFORCE_EQ(
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/storage_properties.h"
namespace phi {
std::unique_ptr<StorageProperties> CopyStorageProperties(
const std::unique_ptr<StorageProperties>& sp) {
if (sp) {
if (NPUStorageProperties::classof(sp.get())) {
auto result = std::make_unique<NPUStorageProperties>();
result->storage_format =
static_cast<NPUStorageProperties*>(sp.get())->storage_format;
result->storage_dims =
static_cast<NPUStorageProperties*>(sp.get())->storage_dims;
return result;
#ifdef PADDLE_WITH_DNNL
} else if (OneDNNStorageProperties::classof(sp.get())) {
auto result = std::make_unique<OneDNNStorageProperties>();
result->format = static_cast<OneDNNStorageProperties*>(sp.get())->format;
result->mem_desc =
static_cast<OneDNNStorageProperties*>(sp.get())->mem_desc;
return result;
#endif
} else {
return nullptr;
}
}
return nullptr;
}
} // namespace phi
......@@ -28,11 +28,13 @@ namespace phi {
struct StorageProperties {
public:
virtual ~StorageProperties() = default;
TypeInfo<StorageProperties> type_info() const { return type_info_; }
private:
template <typename T, typename U>
friend class TypeInfoTraits;
TypeInfo<StorageProperties> type_info_{
TypeInfo<StorageProperties>::kUnknownType};
};
......@@ -70,29 +72,7 @@ struct OneDNNStorageProperties
};
#endif
static std::unique_ptr<StorageProperties> CopyStorageProperties(
const std::unique_ptr<StorageProperties>& sp) {
if (sp) {
if (NPUStorageProperties::classof(sp.get())) {
auto result = std::make_unique<NPUStorageProperties>();
result->storage_format =
static_cast<NPUStorageProperties*>(sp.get())->storage_format;
result->storage_dims =
static_cast<NPUStorageProperties*>(sp.get())->storage_dims;
return result;
#ifdef PADDLE_WITH_DNNL
} else if (OneDNNStorageProperties::classof(sp.get())) {
auto result = std::make_unique<OneDNNStorageProperties>();
result->format = static_cast<OneDNNStorageProperties*>(sp.get())->format;
result->mem_desc =
static_cast<OneDNNStorageProperties*>(sp.get())->mem_desc;
return result;
#endif
} else {
return nullptr;
}
}
return nullptr;
}
std::unique_ptr<StorageProperties> CopyStorageProperties(
const std::unique_ptr<StorageProperties>& sp);
} // namespace phi
......@@ -21,8 +21,16 @@ limitations under the License. */
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/tensor_array.h"
#include "paddle/phi/core/tensor_meta.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
#endif
namespace phi {
// TODO(chenweihang): DenseTensorUtils has been abused during development,
// and its name no longer matches its semantics: it operates not only on
// DenseTensors but also on other tensor types, so it needs to be renamed
// or split
class DenseTensorUtils {
public:
static DenseTensorMeta* GetMutableMeta(DenseTensor* tensor) {
......@@ -37,6 +45,12 @@ class DenseTensorUtils {
return &(tensor->meta_);
}
#ifdef PADDLE_WITH_DISTRIBUTE
static DenseTensorMeta* GetMutableMeta(distributed::DistTensor* tensor) {
return &(tensor->meta_);
}
#endif
static const std::shared_ptr<phi::Allocation>& GetHolder(
const DenseTensor& tensor) {
return tensor.holder_;
......
......@@ -18,6 +18,7 @@ import numpy as np
import paddle
import paddle.distributed as dist
import paddle.nn.functional as F
class TestDistTensor(unittest.TestCase):
......@@ -52,5 +53,36 @@ class TestDistTensor(unittest.TestCase):
self.assertEqual(dist_tensor_with_tensor.dist_attr, dist_attr)
class TestDistTensorForDygraphAPI(unittest.TestCase):
def check_tensor_eq(self, a, b):
np1 = a.numpy()
np2 = b.numpy()
np.testing.assert_allclose(np1, np2, rtol=1e-05)
def create_local_and_dist_tensor_pair(self, np_array):
local_t = paddle.to_tensor(np_array, dtype='float32')
mesh = dist.ProcessMesh([0], dim_names=["x"])
dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=[None, None])
dist_t = dist.shard_tensor(np_array, dist_attr=dist_attr)
local_t.stop_gradient = False
dist_t.stop_gradient = False
return local_t, dist_t
def test_relu_api_for_dist_tensor(self):
x = np.random.random(size=[4, 4]).astype("float32")
local_in, dist_in = self.create_local_and_dist_tensor_pair(x)
local_out = F.relu(local_in)
dist_out = F.relu(dist_in)
self.check_tensor_eq(local_out, dist_out)
# test backward
local_out.backward()
dist_out.backward()
self.check_tensor_eq(local_in.grad, dist_in.grad)
if __name__ == "__main__":
unittest.main()
......@@ -47,7 +47,7 @@ TEST(dist_tensor, constructor) {
EXPECT_TRUE(x3.initialized());
auto a = std::make_shared<DenseTensor>(alloc, DenseTensorMeta(dtype, dims));
DistTensor x4(a, dist_attr);
DistTensor x4(a, a->meta(), dist_attr);
EXPECT_TRUE(x4.defined());
EXPECT_TRUE(x4.initialized());
}
......
......@@ -54,7 +54,9 @@ std::shared_ptr<DistTensor> ConstructReplicatedDistCPU(
dist_attr->set_process_mesh(mesh);
return std::make_shared<DistTensor>(
std::make_shared<DenseTensor>(input_dense), dist_attr);
std::make_shared<DenseTensor>(input_dense),
input_dense.meta(),
dist_attr);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -87,7 +89,9 @@ std::shared_ptr<DistTensor> ConstructReplicatedDistGPU(
dist_attr->set_process_mesh(mesh);
return std::make_shared<DistTensor>(
std::make_shared<DenseTensor>(input_dense_gpu), dist_attr);
std::make_shared<DenseTensor>(input_dense_gpu),
input_dense_gpu.meta(),
dist_attr);
}
#endif
......