From e32c43758e9d9b9572b0b16c7576fda8747352c9 Mon Sep 17 00:00:00 2001
From: LiYuRio <63526175+LiYuRio@users.noreply.github.com>
Date: Tue, 13 Jun 2023 21:42:34 +0800
Subject: [PATCH] Construct dist tensor (#54425)

* construct dist tensor

* move constructor to header
---
 paddle/fluid/pybind/auto_parallel_py.cc       |   9 +-
 paddle/fluid/pybind/eager.cc                  | 210 ++++++++++++++++--
 paddle/fluid/pybind/eager_method.cc           |  27 +++
 paddle/fluid/pybind/eager_properties.cc       |  22 ++
 paddle/fluid/pybind/eager_utils.cc            |  35 +++
 paddle/fluid/pybind/eager_utils.h             |  15 ++
 paddle/fluid/pybind/tensor.cc                 |  14 ++
 paddle/phi/api/include/tensor.h               |   7 +
 paddle/phi/api/lib/kernel_dispatch.h          |   4 +
 paddle/phi/api/lib/tensor.cc                  |  10 +
 paddle/phi/core/dense_tensor.h                |   6 +
 .../distributed/auto_parallel/CMakeLists.txt  |  10 +-
 .../distributed/auto_parallel/dist_tensor.cc  |  69 ++++++
 .../distributed/auto_parallel/dist_tensor.h   | 130 +++++++++++
 paddle/phi/core/utils/type_info.cc            |   8 +
 python/paddle/tensor/to_string.py             |  24 ++
 test/auto_parallel/CMakeLists.txt             |   1 +
 test/auto_parallel/test_dist_tensor.py        |  54 +++++
 18 files changed, 637 insertions(+), 18 deletions(-)
 create mode 100644 paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
 create mode 100644 paddle/phi/core/distributed/auto_parallel/dist_tensor.h
 create mode 100644 test/auto_parallel/test_dist_tensor.py
diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc
index fdac30be8f0..1b78d7bd257 100644
--- a/paddle/fluid/pybind/auto_parallel_py.cc
+++ b/paddle/fluid/pybind/auto_parallel_py.cc
@@ -43,6 +43,8 @@ using phi::distributed::auto_parallel::Machine;
 using phi::distributed::auto_parallel::ProcessMesh;
 using phi::distributed::auto_parallel::TensorDistAttr;
 
+PyTypeObject *g_tensor_dist_attr_pytype = nullptr;
+
 static inline const ProcessMesh *get_tensor_process_mesh(
     const TensorDistAttr &self) {
   if (self.process_mesh().empty()) {
@@ -225,8 +227,11 @@ void BindAutoParallel(py::module *m) {
           py::arg("memo"))
       .def("__str__", &DeviceMesh::to_string);
 
-  py::class_<TensorDistAttr>(*m, "TensorDistAttr")
-      .def(py::init<>())
+  py::class_<TensorDistAttr, std::shared_ptr<TensorDistAttr>> py_dist_attr(
+      *m, "TensorDistAttr");
+  g_tensor_dist_attr_pytype =
+      reinterpret_cast<PyTypeObject *>(py_dist_attr.ptr());
+  py_dist_attr.def(py::init<>())
       .def(py::init([](const VarDesc &var_desc) {
         auto shape =
             paddle::distributed::auto_parallel::get_tensor_shape(&var_desc);
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index a732ed0c1ec..adf62731f03 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -41,6 +41,14 @@ limitations under the License. */
 #include "paddle/fluid/pybind/exception.h"
 #include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/phi/core/string_tensor.h"
+
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+using phi::distributed::auto_parallel::DistTensor;
+using phi::distributed::auto_parallel::TensorDistAttr;
+#endif
+
 namespace paddle {
 namespace pybind {
 
@@ -60,6 +68,52 @@ PyObject* TensorNew(PyTypeObject* type, PyObject* args, PyObject* kwargs) {
   return obj;
 }
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+void EmptyDistTensorInitializer(
+    TensorObject* self,
+    const std::string& name,
+    const paddle::platform::Place& place,
+    const std::shared_ptr<TensorDistAttr>& dist_attr,
+    bool persistable = false,
+    int stop_gradient = -1,
+    framework::proto::VarType::Type dtype =
+        paddle::framework::proto::VarType::FP32,
+    const std::vector<int>& dims = {0}) {
+  auto ddims = phi::make_ddim(dims);
+  self->tensor.set_name(name);
+  auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->tensor));
+  autograd_meta->SetPersistable(persistable);
+  if (stop_gradient != -1) {
+    autograd_meta->SetStopGradient(static_cast<bool>(stop_gradient));
+  }
+
+  std::shared_ptr<DistTensor> dist_tensor = nullptr;
+  if (dims.size() == 1 && dims[0] == 0) {
+    std::shared_ptr<phi::Allocation> allocation_ptr = nullptr;
+    dist_tensor = std::make_shared<DistTensor>(
+        allocation_ptr,
+        phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype),
+                             ddims),
+        dist_attr);
+  } else {
+    dist_tensor = std::make_shared<DistTensor>(
+        std::make_shared<phi::Allocation>(),
+        phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype),
+                             ddims),
+        dist_attr);
+  }
+  self->tensor.set_impl(dist_tensor);
+
+  if (!autograd_meta->GetMutableGradNode()) {
+    autograd_meta->SetGradNode(
+        std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
+    VLOG(3) << "Tensor(" << name
+            << ") have not GradNode, add GradNodeAccumulation"
+            << autograd_meta->GradNode() << " for it.";
+  }
+}
+#endif
+
 // TODO(jiabin): Overload this once we need more constructor in Python
 void EmptyTensorInitializer(TensorObject* self,
                             const std::string& name,
@@ -82,6 +136,7 @@ void EmptyTensorInitializer(TensorObject* self,
     // TODO(jiabin): Maybe support LOD later
     std::shared_ptr<phi::DenseTensor> dense_tensor = nullptr;
     if (dims.size() == 1 && dims[0] == 0) {
+      VLOG(0) << "Create dense tensor with dims[0] equal to 0";
       std::shared_ptr<phi::Allocation> allocation_ptr = nullptr;
       dense_tensor = std::make_shared<phi::DenseTensor>(
           allocation_ptr,
@@ -129,6 +184,48 @@ void EmptyStringTensorInitializer(TensorObject* self,
   self->tensor.set_impl(string_tensor);
 }
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+void InitDistTensorWithNumpyValue(TensorObject* self,
+                                  const py::object& array,
+                                  const paddle::platform::Place& place,
+                                  bool zero_copy = false) {
+  PADDLE_ENFORCE_EQ(
+      self->tensor.defined(),
+      true,
+      paddle::platform::errors::Fatal(
+          "Calling InitDistTensorWithNumpyValue of Eager Tensor without "
+          "EmptyDistTensorInitializer is "
+          "forbidden. Please check your code and make sure you new a "
+          "eager tensor before init it with NumPy."));
+  DistTensor* dist_tensor_ptr =
+      static_cast<DistTensor*>(self->tensor.impl().get());
+  phi::DenseTensor* impl_ptr =
+      static_cast<phi::DenseTensor*>(dist_tensor_ptr->mutable_value());
+
+  if (platform::is_cpu_place(place)) {
+    SetTensorFromPyArray<platform::CPUPlace>(impl_ptr, array, place, zero_copy);
+  } else if (platform::is_xpu_place(place)) {
+    SetTensorFromPyArray<platform::XPUPlace>(impl_ptr, array, place, zero_copy);
+  } else if (platform::is_gpu_place(place)) {
+    SetTensorFromPyArray<platform::CUDAPlace>(
+        impl_ptr, array, place, zero_copy);
+  } else if (platform::is_cuda_pinned_place(place)) {
+    SetTensorFromPyArray<platform::CUDAPinnedPlace>(
+        impl_ptr, array, place, zero_copy);
+  } else if (platform::is_custom_place(place)) {
+    SetTensorFromPyArray<platform::CustomPlace>(
+        impl_ptr, array, place, zero_copy);
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Place should be one of "
+        "CPUPlace/XPUPlace/CUDAPlace/CUDAPinnedPlace/CustomPlace"));
+  }
+
+  // TODO(dev): dist_tensor meta is not equal to dense tensor meta
+  dist_tensor_ptr->set_meta(impl_ptr->meta());
+}
+#endif
+
 void InitTensorWithNumpyValue(TensorObject* self,
                               const py::object& array,
                               const paddle::platform::Place& place,
@@ -143,6 +240,7 @@ void InitTensorWithNumpyValue(TensorObject* self,
           "eager tensor before init it with NumPy."));
   phi::DenseTensor* impl_ptr =
       static_cast<phi::DenseTensor*>(self->tensor.impl().get());
+
   if (platform::is_cpu_place(place)) {
     SetTensorFromPyArray<platform::CPUPlace>(impl_ptr, array, place, zero_copy);
   } else if (platform::is_xpu_place(place)) {
@@ -186,6 +284,39 @@ void InitStringTensorWithNumpyValue(TensorObject* self, const py::object& obj) {
   }
 }
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+void InitDistTensorWithTensor(
+    TensorObject* self,
+    const paddle::Tensor& src,
+    const paddle::platform::Place& place,
+    const std::string& name,
+    const std::shared_ptr<TensorDistAttr>& dist_attr) {
+  PADDLE_ENFORCE(src.is_dense_tensor(),
+                 paddle::platform::errors::InvalidArgument(
+                     "DistTensor can only initialize by DenseTensor"));
+  self->tensor.set_name(name);
+  if (place == src.place()) {
+    std::shared_ptr<phi::DenseTensor> tensor =
+        std::static_pointer_cast<phi::DenseTensor>(src.impl());
+    self->tensor.set_impl(std::make_shared<DistTensor>(tensor, dist_attr));
+    VLOG(4) << "Same place, do ShareDataWith";
+  } else {
+    std::shared_ptr<phi::DenseTensor> tensor =
+        std::static_pointer_cast<phi::DenseTensor>(
+            src.copy_to(place, true).impl());
+    self->tensor.set_impl(std::make_shared<DistTensor>(tensor, dist_attr));
+    VLOG(4) << "Different place, do TensorCopy";
+  }
+  if (src.get_autograd_meta()) {
+    egr::EagerUtils::autograd_meta(&(self->tensor))
+        ->SetPersistable(
+            egr::EagerUtils::unsafe_autograd_meta(src)->Persistable());
+  } else {
+    egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false);
+  }
+}
+#endif
+
 void InitTensorWithTensor(TensorObject* self,
                           const paddle::Tensor& src,
                           const paddle::platform::Place& place,
@@ -283,6 +414,25 @@ paddle::platform::Place ParsePlace(
   return place;
 }
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+std::shared_ptr<TensorDistAttr> ParseDistAttrArgs(
+    std::unordered_map<std::string, PyObject*> kws_map,
+    std::unordered_map<std::string, Py_ssize_t> kw_order_map,
+    PyObject* args,
+    bool flag_kwargs,
+    Py_ssize_t args_num) {
+  std::shared_ptr<TensorDistAttr> dist_attr = nullptr;
+  if (kw_order_map["dist_attr"] <= args_num) {
+    dist_attr = CastPyArg2DistAttr(
+        PyTuple_GET_ITEM(args, kw_order_map["dist_attr"] - 1),
+        kw_order_map["dist_attr"] - 1);
+  } else if (flag_kwargs && kws_map["dist_attr"] != NULL) {
+    dist_attr = CastPyArg2DistAttr(kws_map["dist_attr"], 0);
+  }
+  return dist_attr;
+}
+#endif
+
 // boolean arguments: zero_copy, stop_gradient, persistable
 int ParseBooleanArgs(std::string key,
                      std::unordered_map<std::string, PyObject*> kws_map,
@@ -347,13 +497,13 @@ void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr,
   // kw_order_map's value is the position of the arguments respectively.
   // If u want to update this constructor with new arguments,
   // need to update this map and to add or change related code.
-  std::unordered_map<std::string, Py_ssize_t> kw_order_map{
-      {"value", 1},
-      {"place", 2},
-      {"persistable", 3},
-      {"zero_copy", 4},
-      {"name", 5},
-      {"stop_gradient", 6}};
+  std::unordered_map<std::string, Py_ssize_t> kw_order_map{{"value", 1},
+                                                           {"place", 2},
+                                                           {"persistable", 3},
+                                                           {"zero_copy", 4},
+                                                           {"name", 5},
+                                                           {"stop_gradient", 6},
+                                                           {"dist_attr", 7}};
 
   py::object numpy_value = py::object();
   paddle::platform::Place place =
@@ -378,6 +528,18 @@ void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr,
   stop_gradient = ParseBooleanArgs(
       "stop_gradient", kws_map, kw_order_map, args, flag_kwargs, args_num);
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+  std::shared_ptr<TensorDistAttr> dist_attr =
+      ParseDistAttrArgs(kws_map, kw_order_map, args, flag_kwargs, args_num);
+
+  if (dist_attr) {
+    EmptyDistTensorInitializer(
+        py_tensor_ptr, act_name, place, dist_attr, persistable, stop_gradient);
+    InitDistTensorWithNumpyValue(py_tensor_ptr, numpy_value, place, zero_copy);
+    return;
+  }
+#endif
+
   EmptyTensorInitializer(
       py_tensor_ptr, act_name, place, persistable, stop_gradient);
   InitTensorWithNumpyValue(py_tensor_ptr, numpy_value, place, zero_copy);
@@ -399,7 +561,7 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr,
   // If u want to update this constructor with new arguments,
   // need to update this map and to add or change related code.
   std::unordered_map<std::string, Py_ssize_t> kw_order_map{
-      {"value", 1}, {"place", 2}, {"name", 3}};
+      {"value", 1}, {"place", 2}, {"name", 3}, {"dist_attr", 4}};
 
   paddle::platform::Place place =
       egr::Controller::Instance().GetExpectedPlace();
@@ -408,6 +570,11 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr,
   place = ParsePlace(kws_map, kw_order_map, args, flag_kwargs, args_num);
   act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num);
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+  std::shared_ptr<TensorDistAttr> dist_attr =
+      ParseDistAttrArgs(kws_map, kw_order_map, args, flag_kwargs, args_num);
+#endif
+
   if (init_by_egr_tensor) {
     paddle::Tensor src_tensor;
     if (kw_order_map["value"] <= args_num) {
@@ -426,7 +593,16 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr,
             "way."));
       }
     }
+#ifdef PADDLE_WITH_DISTRIBUTE
+    if (dist_attr) {
+      InitDistTensorWithTensor(
+          py_tensor_ptr, src_tensor, place, act_name, dist_attr);
+    } else {
+      InitTensorWithTensor(py_tensor_ptr, src_tensor, place, act_name);
+    }
+#else
     InitTensorWithTensor(py_tensor_ptr, src_tensor, place, act_name);
+#endif
   } else {
     // init by framework tensor
     phi::DenseTensor src_tensor;
@@ -545,7 +721,8 @@ void AutoInitStringTensorByStringTensor(
  * ** persistable: bool,
  * ** zero_copy: bool,
  * ** name: std::string,
- * ** stop_gradient: bool)
+ * ** stop_gradient: bool,
+ * ** dist_attr: phi::distributed::TensorDistAttr)
  * 4.
  * def __init__ (
  * ** value: ndarray)
@@ -558,7 +735,8 @@ void AutoInitStringTensorByStringTensor(
  * def __init__ (
  * ** tensor: Tensor,
  * ** place: paddle::platform::Place,
- * ** name: std::string)
+ * ** name: std::string,
+ * ** dist_attr: phi::distributed::TensorDistAttr)
  * 7. (multi-place) (should have at least one parameter, one parameter similar
  * to case 5, zero parameter equals to case 1.)
  * def __init__ (
@@ -583,6 +761,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
   PyObject* kw_dims = NULL;
   PyObject* kw_dtype = NULL;
   PyObject* kw_type = NULL;
+  PyObject* kw_dist_attr = NULL;
 
   // the keywords argument
   static char* kwlist[] = {const_cast<char*>("value"),
@@ -594,6 +773,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
                            const_cast<char*>("dims"),
                            const_cast<char*>("dtype"),
                            const_cast<char*>("type"),
+                           const_cast<char*>("dist_attr"),
                            NULL};
 
   // 'O' Store a Python object (without any conversion) in a C object pointer,
@@ -604,7 +784,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
   // which enhance case2, case3, case4, case5, case6, case7.
   bool flag_ = PyArg_ParseTupleAndKeywords(args,
                                            kwargs,
-                                           "|OOOOOOOOO",
+                                           "|OOOOOOOOOO",
                                            kwlist,
                                            &kw_value,
                                            &kw_place,
@@ -614,7 +794,8 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
                                            &kw_stop_gradient,
                                            &kw_dims,
                                            &kw_dtype,
-                                           &kw_type);
+                                           &kw_type,
+                                           &kw_dist_attr);
 
   // helper map
   std::unordered_map<std::string, PyObject*> kws_map{
@@ -626,7 +807,8 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
       {"stop_gradient", kw_stop_gradient},
       {"dims", kw_dims},
       {"dtype", kw_dtype},
-      {"type", kw_type}};
+      {"type", kw_type},
+      {"dist_attr", kw_dist_attr}};
 
   PADDLE_ENFORCE_EQ(flag_,
                     true,
@@ -636,7 +818,7 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
                         "sure you are on the right way. "
                         "The expected arguments as follow: ("
                         "value, place, persistable, zero_copy, "
-                        "name, stop_gradient, dims, dtype, type)"));
+                        "name, stop_gradient, dims, dtype, type, dist_attr)"));
 
   PADDLE_ENFORCE_NOT_NULL(
       self,
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 4c88cf30223..eb0e895cf57 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -62,6 +62,9 @@ typedef SSIZE_T ssize_t;
 #include "paddle/phi/core/flags.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#endif
 
 PHI_DECLARE_bool(set_to_1d);
 
@@ -796,6 +799,15 @@ static PyObject* tensor_method_get_underline_tensor(TensorObject* self,
     auto* tensor = static_cast<phi::DenseTensor*>(self->tensor.impl().get());
     VLOG(6) << "tensor: " << tensor->IsInitialized();
     return ToPyObject(tensor);
+  } else if (self->tensor.is_dist_tensor()) {
+#ifdef PADDLE_WITH_DISTRIBUTE
+    auto* tensor = static_cast<phi::distributed::auto_parallel::DistTensor*>(
+        self->tensor.impl().get());
+    VLOG(6) << "dist tensor: " << tensor->IsInitialized();
+    return ToPyObject(tensor);
+#else
+    RETURN_PY_NONE
+#endif
   } else {
     RETURN_PY_NONE
   }
@@ -1697,6 +1709,17 @@ static PyObject* tensor_method_is_dense(TensorObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+static PyObject* tensor_method_is_dist(TensorObject* self,
+                                       PyObject* args,
+                                       PyObject* kwargs) {
+  EAGER_TRY
+  if (!self->tensor.defined()) {
+    return ToPyObject(false);
+  }
+  return ToPyObject(self->tensor.is_dist_tensor());
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 static PyObject* tensor_method_is_sparse(TensorObject* self,
                                          PyObject* args,
                                          PyObject* kwargs) {
@@ -2051,6 +2074,10 @@ PyMethodDef variable_methods[] = {
      (PyCFunction)(void (*)(void))tensor_method_is_dense,
      METH_VARARGS | METH_KEYWORDS,
      NULL},
+    {"is_dist",
+     (PyCFunction)(void (*)(void))tensor_method_is_dist,
+     METH_VARARGS | METH_KEYWORDS,
+     NULL},
     {"_zero_grads",
      (PyCFunction)(void (*)(void))tensor__zero_grads,
      METH_VARARGS | METH_KEYWORDS,
diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc
index 1c2ba1b1dee..028d843c4d7 100644
--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -158,6 +158,23 @@ int tensor_properties_set_persistable(TensorObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NEG
 }
 
+PyObject* tensor_properties_get_dist_attr(TensorObject* self, void* closure) {
+  EAGER_TRY
+  if (self->tensor.is_dist_tensor()) {
+#ifdef PADDLE_WITH_DISTRIBUTE
+    phi::distributed::auto_parallel::DistTensor* dist_tensor =
+        static_cast<phi::distributed::auto_parallel::DistTensor*>(
+            self->tensor.impl().get());
+    return ToPyObject(dist_tensor->dist_attr().get());
+#else
+    RETURN_PY_NONE
+#endif
+  } else {
+    RETURN_PY_NONE
+  }
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) {
   EAGER_TRY
   std::vector<int64_t> value;
@@ -311,6 +328,11 @@ struct PyGetSetDef variable_properties[] = {
     // nullptr,
     //  nullptr},
     {"place", (getter)tensor_properties_get_place, nullptr, nullptr, nullptr},
+    {"dist_attr",
+     (getter)tensor_properties_get_dist_attr,
+     nullptr,
+     nullptr,
+     nullptr},
     {"_place_str",
      (getter)tensor_properties_get_place_str,
      nullptr,
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index dd5a06757c3..8c3bf619947 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -59,6 +59,9 @@ extern PyTypeObject* g_customplace_pytype;
 extern PyTypeObject* g_framework_tensor_pytype;
 extern PyTypeObject* g_framework_lodtensorarray_pytype;
 extern PyTypeObject* g_jit_function_pytype;
+#ifdef PADDLE_WITH_DISTRIBUTE
+extern PyTypeObject* g_tensor_dist_attr_pytype;
+#endif
 
 int TensorDtype2NumpyDtype(phi::DataType dtype) {
   switch (dtype) {
@@ -540,6 +543,23 @@ platform::Place CastPyArg2Place(PyObject* obj, ssize_t arg_pos) {
   return place;
 }
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+using phi::distributed::auto_parallel::TensorDistAttr;
+std::shared_ptr<TensorDistAttr> CastPyArg2DistAttr(PyObject* obj,
+                                                   ssize_t arg_pos) {
+  if (PyObject_IsInstance(
+          obj, reinterpret_cast<PyObject*>(g_tensor_dist_attr_pytype))) {
+    return ::pybind11::handle(obj).cast<std::shared_ptr<TensorDistAttr>>();
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "argument (position %d) must be "
+        "TensorDistAttr, but got %s",
+        arg_pos + 1,
+        reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name));
+  }
+}
+#endif
+
 phi::DenseTensor CastPyArg2FrameworkTensor(PyObject* obj, ssize_t arg_pos) {
   if (PyObject_TypeCheck(obj, g_framework_tensor_pytype)) {
     return ::pybind11::handle(obj).cast<phi::DenseTensor>();
@@ -838,6 +858,21 @@ PyObject* ToPyObject(const phi::DenseTensor* value) {
   return obj.ptr();
 }
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+PyObject* ToPyObject(const phi::distributed::auto_parallel::DistTensor* value) {
+  auto obj = ::pybind11::cast(value, py::return_value_policy::reference);
+  obj.inc_ref();
+  return obj.ptr();
+}
+
+PyObject* ToPyObject(
+    const phi::distributed::auto_parallel::TensorDistAttr* value) {
+  auto obj = ::pybind11::cast(value, py::return_value_policy::reference);
+  obj.inc_ref();
+  return obj.ptr();
+}
+#endif
+
 PyObject* ToPyObject(const phi::SelectedRows* value) {
   auto obj = ::pybind11::cast(value, py::return_value_policy::reference);
   obj.inc_ref();
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index 8f83e8f880f..3d8e6371d01 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -37,6 +37,11 @@ typedef SSIZE_T ssize_t;
 #include "paddle/utils/pybind.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#endif
+
 namespace paddle {
 class CustomOpKernelContext;
 namespace framework {
@@ -106,6 +111,11 @@ PyObject* ToPyObject(const std::vector<std::vector<paddle::Tensor>>& value,
                      bool return_py_none_if_not_initialize = false);
 PyObject* ToPyObject(const platform::Place& value);
 PyObject* ToPyObject(const phi::DenseTensor* value);
+#ifdef PADDLE_WITH_DISTRIBUTE
+PyObject* ToPyObject(const phi::distributed::auto_parallel::DistTensor* value);
+PyObject* ToPyObject(
+    const phi::distributed::auto_parallel::TensorDistAttr* value);
+#endif
 PyObject* ToPyObject(const phi::SelectedRows* value);
 PyObject* ToPyObject(const paddle::framework::proto::VarType::Type& dtype);
 PyObject* ToPyObject(const paddle::framework::proto::VarType& type);
@@ -287,6 +297,11 @@ paddle::DataType CastPyArg2DataType(PyObject* obj,
                                     const std::string& op_type,
                                     ssize_t arg_pos);
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+std::shared_ptr<phi::distributed::auto_parallel::TensorDistAttr>
+CastPyArg2DistAttr(PyObject* obj, ssize_t arg_pos);
+#endif
+
 paddle::optional<paddle::Tensor> GetOptionalTensorFromArgs(
     const std::string& op_type,
     const std::string& arg_name,
diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc
index bdd2a50d96e..ff47d1a2117 100644
--- a/paddle/fluid/pybind/tensor.cc
+++ b/paddle/fluid/pybind/tensor.cc
@@ -174,6 +174,9 @@ limitations under the License. */
 #include "paddle/phi/kernels/autotune/cache.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
 #include "pybind11/stl.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#endif
 
 PHI_DECLARE_bool(use_mkldnn);
 PHI_DECLARE_bool(use_shm_cache);
@@ -1021,6 +1024,17 @@ void BindTensor(pybind11::module &m) {  // NOLINT
           }));
 #endif
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+  using phi::distributed::auto_parallel::DistTensor;
+  py::class_<DistTensor>(m, "DistTensor")
+      .def(
+          "get_tensor",
+          [](DistTensor &self) { return self.mutable_value(); },
+          py::return_value_policy::reference)
+      .def("numel",
+           [](DistTensor &self) -> int64_t { return self.value().numel(); });
+#endif
+
   py::class_<phi::SelectedRows>(m, "SelectedRows")
       .def("__init__",
            [](phi::SelectedRows &instance) {
diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h
index fb4a3c6f7d2..b626df6c670 100644
--- a/paddle/phi/api/include/tensor.h
+++ b/paddle/phi/api/include/tensor.h
@@ -219,6 +219,13 @@ class PADDLE_API Tensor final {
    */
   bool is_dense_tensor() const;
 
+  /**
+   * @brief Determine whether tensor is DistTensor
+   *
+   * @return bool
+   */
+  bool is_dist_tensor() const;
+
   /**
    * @brief Determine whether tensor is SelectedRows
    *
diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h
index 23b375eaf6e..9d05fb15519 100644
--- a/paddle/phi/api/lib/kernel_dispatch.h
+++ b/paddle/phi/api/lib/kernel_dispatch.h
@@ -28,6 +28,10 @@ limitations under the License. */
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#endif
+
 // TODO(chenweihang): split Key, Kernel, Factory into diff files
 #include "paddle/phi/core/kernel_factory.h"
 
diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc
index e9c68367b16..e8caf525308 100644
--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -34,6 +34,9 @@ limitations under the License. */
 #include "paddle/phi/core/tensor_base.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/core/tensor_utils.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#endif
 
 namespace paddle {
 
@@ -128,6 +131,13 @@ DataLayout Tensor::layout() const { return impl_->layout(); }
 bool Tensor::is_dense_tensor() const {
   return phi::DenseTensor::classof(impl_.get());
 }
+bool Tensor::is_dist_tensor() const {
+#ifdef PADDLE_WITH_DISTRIBUTE
+  return phi::distributed::auto_parallel::DistTensor::classof(impl_.get());
+#else
+  return false;
+#endif
+}
 bool Tensor::is_selected_rows() const {
   return phi::SelectedRows::classof(impl_.get());
 }
diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h
index f4f1a7cb25c..2cfdd7493c4 100644
--- a/paddle/phi/core/dense_tensor.h
+++ b/paddle/phi/core/dense_tensor.h
@@ -29,6 +29,11 @@ limitations under the License. */
 namespace phi {
 
 class DenseTensorUtils;
+namespace distributed {
+namespace auto_parallel {
+class DistTensor;
+}  // namespace auto_parallel
+}  // namespace distributed
 
 /// \brief The Dense tensor stores values in a contiguous sequential block
 /// of memory where all values are represented. Tensors or multi-dimensional
@@ -181,6 +186,7 @@ class DenseTensor : public TensorBase,
 
  private:
   friend class DenseTensorUtils;
+  friend class phi::distributed::auto_parallel::DistTensor;
 
  protected:
   DenseTensorMeta meta_;
diff --git a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
index d6e52ca8044..db639bba5f4 100644
--- a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
+++ b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
@@ -1,4 +1,10 @@
 proto_library(auto_parallel_proto SRCS auto_parallel.proto)
 
-collect_srcs(core_srcs SRCS device_mesh.cc process_mesh.cc dist_attr.cc
-             dist_mapper.cc)
+collect_srcs(
+  core_srcs
+  SRCS
+  device_mesh.cc
+  process_mesh.cc
+  dist_attr.cc
+  dist_mapper.cc
+  dist_tensor.cc)
diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
new file mode 100644
index 00000000000..a1f052ed512
--- /dev/null
+++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+
+namespace phi {
+namespace distributed {
+namespace auto_parallel {
+
+void* DistTensor::AllocateFrom(Allocator* allocator,
+                               DataType dtype,
+                               size_t requested_size,
+                               bool fake_alloc) {
+  return value_->AllocateFrom(allocator, dtype, requested_size, fake_alloc);
+}
+
+const Place& DistTensor::place() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      value_->holder_,
+      phi::errors::PreconditionNotMet(
+          "Tensor not initialized yet when DenseTensor::place() is called."));
+  return value_->holder_->place();
+}
+
+int64_t DistTensor::numel() const {
+  if (meta_.is_scalar) {
+    return 1;
+  }
+  return product(meta_.dims);
+}
+
+void DistTensor::set_meta(DenseTensorMeta&& meta) {
+  PADDLE_ENFORCE_EQ(meta_.valid(),
+                    false,
+                    phi::errors::InvalidArgument(
+                        "Only when the original attribute of Tensor is "
+                        "incomplete, can it be reset."));
+  meta_ = std::move(meta);
+}
+
+void DistTensor::set_meta(const DenseTensorMeta& meta) {
+  PADDLE_ENFORCE_EQ(
+      meta.valid(),
+      true,
+      phi::errors::InvalidArgument(
+          "Input meta is invalid, please check the meta attribute."));
+  meta_.dims = meta.dims;
+  meta_.dtype = meta.dtype;
+  meta_.is_scalar = meta.is_scalar;
+  meta_.layout = meta.layout;
+  meta_.lod = meta.lod;
+  meta_.offset = meta.offset;
+  meta_.use_gpudnn = meta.use_gpudnn;
+}
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/dist_tensor.h b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h
new file mode 100644
index 00000000000..ef14abe0c45
--- /dev/null
+++ b/paddle/phi/core/distributed/auto_parallel/dist_tensor.h
@@ -0,0 +1,130 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+namespace distributed {
+namespace auto_parallel {
+
+class TensorDistAttr;
+
+class DistTensor final
+    : public phi::TensorBase,
+      public phi::TypeInfoTraits<phi::TensorBase, DistTensor> {
+ public:
+  /// \brief Construct a dist tensor and allocate space.
+  /// \param a The allocator used to allocate space.
+  /// \param meta The meta data of dense tensor.
+  DistTensor(Allocator* a,
+             const DenseTensorMeta& meta,
+             const std::shared_ptr<TensorDistAttr>& dist_attr)
+      : meta_(meta), dist_attr_(dist_attr) {
+    value_ = std::make_unique<DenseTensor>(a, meta);
+  }
+
+  DistTensor(Allocator* a,
+             DenseTensorMeta&& meta,
+             const std::shared_ptr<TensorDistAttr>& dist_attr)
+      : meta_(std::move(meta)), dist_attr_(dist_attr) {
+    value_ = std::make_unique<DenseTensor>(a, meta);
+  }
+
+  DistTensor(const std::shared_ptr<phi::Allocation>& holder,
+             const DenseTensorMeta& meta,
+             const std::shared_ptr<TensorDistAttr>& dist_attr)
+      : meta_(meta), dist_attr_(dist_attr) {
+    value_ = std::make_unique<DenseTensor>(holder, meta);
+  }
+
+  DistTensor(const std::shared_ptr<phi::DenseTensor>& dense_tensor,
+             const std::shared_ptr<TensorDistAttr>& dist_attr)
+      : dist_attr_(dist_attr) {
+    value_ = std::make_unique<DenseTensor>(*dense_tensor);
+    set_meta(dense_tensor->meta());
+  }
+
+  ~DistTensor() = default;
+
+  static const char* name() { return "DistTensor"; }
+
+  const DenseTensor& value() const { return *value_; }
+
+  DenseTensor* mutable_value() { return value_.get(); }
+
+  const std::shared_ptr<TensorDistAttr>& dist_attr() const {
+    return dist_attr_;
+  }
+
+  /// \brief Returns the number of elements contained in tensor.
+  /// \return The number of elements contained in tensor.
+  int64_t numel() const override;
+
+  /// \brief Returns the dims of the tensor.
+  /// \return The dims of the tensor.
+  const DDim& dims() const override { return meta_.dims; }
+
+  /// \brief Test whether the storage is allocated.
+  /// \return Whether the storage is allocated.
+  bool initialized() const override {
+    return value_->holder_ && value_->holder_->ptr();
+  }
+
+  bool IsInitialized() const { return value_->holder_ != nullptr; }
+
+  /// \brief Test whether the metadata is valid.
+  /// \return Whether the metadata is valid.
+  bool valid() const override { return meta_.valid(); }
+
+  /// \brief Allocate memory with requested size from allocator.
+  /// \return The mutable data pointer value of type T.
+  void* AllocateFrom(Allocator* allocator,
+                     DataType dtype,
+                     size_t requested_size = 0,
+                     bool fake_alloc = false) override;
+
+  /// \brief Returns the data type of the tensor.
+  /// \return The data type of the tensor.
+  DataType dtype() const override { return meta_.dtype; }
+
+  /// \brief Returns the data layout of the tensor.
+  /// \return The data layout of the tensor.
+  DataLayout layout() const override { return meta_.layout; }
+
+  /// \brief Returns the data place of the tensor.
+  /// \return The data place of the tensor.
+  const Place& place() const override;
+
+  const DenseTensorMeta& meta() const noexcept { return meta_; }
+
+  /// \brief Sets the meta information of the tensor. Only when the original
+  /// attribute of Tensor is incomplete, can it be reset.
+  /// \param meta The meta information of the tensor.
+  void set_meta(DenseTensorMeta&& meta);
+
+  void set_meta(const DenseTensorMeta& meta);
+
+ private:
+  DenseTensorMeta meta_;
+  std::shared_ptr<TensorDistAttr> dist_attr_{nullptr};
+  std::unique_ptr<DenseTensor> value_{nullptr};
+};
+
+}  // namespace auto_parallel
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc
index 457824820e5..2a554525024 100644
--- a/paddle/phi/core/utils/type_info.cc
+++ b/paddle/phi/core/utils/type_info.cc
@@ -24,6 +24,9 @@ limitations under the License. */
 #include "paddle/phi/core/storage_properties.h"
 #include "paddle/phi/core/string_tensor.h"
 #include "paddle/phi/core/tensor_array.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#endif
 #include "paddle/phi/core/utils/type_info.h"
 
 namespace phi {
@@ -52,6 +55,11 @@ template class TypeInfoTraits<phi::TensorBase, TensorArray>;
 template class TypeInfoTraits<phi::DeviceContext, CPUContext>;
 template class TypeInfoTraits<phi::DeviceContext, CustomContext>;
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+template class TypeInfoTraits<phi::TensorBase,
+                              phi::distributed::auto_parallel::DistTensor>;
+#endif
+
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
     defined(PADDLE_WITH_XPU_KP)
 template class TypeInfoTraits<phi::DeviceContext, GPUContext>;
diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py
index 0d8cd9a6b81..a6018084941 100644
--- a/python/paddle/tensor/to_string.py
+++ b/python/paddle/tensor/to_string.py
@@ -362,6 +362,27 @@ def sparse_tensor_to_string(tensor, prefix='Tensor'):
         )
 
 
+def dist_tensor_to_string(tensor, prefix='Tensor'):
+    # TODO(dev): Complete tensor will be printed after reshard
+    # is ready.
+    indent = len(prefix) + 1
+    dtype = convert_dtype(tensor.dtype)
+    if tensor.dtype == core.VarDesc.VarType.BF16:
+        dtype = 'bfloat16'
+
+    _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient}, dist_attr={dist_attr},\n{indent}{data})"
+    return _template.format(
+        prefix=prefix,
+        shape=tensor.shape,
+        dtype=dtype,
+        place=tensor._place_str,
+        stop_gradient=tensor.stop_gradient,
+        dist_attr=tensor.dist_attr,
+        indent=' ' * indent,
+        data=None,
+    )
+
+
 def tensor_to_string(tensor, prefix='Tensor'):
     indent = len(prefix) + 1
 
@@ -374,6 +395,9 @@ def tensor_to_string(tensor, prefix='Tensor'):
     if tensor.is_sparse():
         return sparse_tensor_to_string(tensor, prefix)
 
+    if tensor.is_dist():
+        return dist_tensor_to_string(tensor, prefix)
+
     if not tensor._is_dense_tensor_hold_allocation():
         return "Tensor(Not initialized)"
     else:
diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt
index 61c1b578030..a43b51cc8ac 100644
--- a/test/auto_parallel/CMakeLists.txt
+++ b/test/auto_parallel/CMakeLists.txt
@@ -142,6 +142,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_dist_saver MODULES test_dist_saver)
   py_test_modules(test_engine_save_load MODULES test_engine_save_load)
   py_test_modules(test_rule_based_tuner MODULES test_rule_based_tuner)
+  py_test_modules(test_dist_tensor MODULES test_dist_tensor)
   # End of unittests WITH single card WITHOUT timeout
 
 endif()
diff --git a/test/auto_parallel/test_dist_tensor.py b/test/auto_parallel/test_dist_tensor.py
new file mode 100644
index 00000000000..58ebc085004
--- /dev/null
+++ b/test/auto_parallel/test_dist_tensor.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+
+import paddle
+
+
+class TestDistTensor(unittest.TestCase):
+    def test_dist_tensor_creation(self):
+        shape = [10, 5]
+        dist_attr = paddle.fluid.core.TensorDistAttr()
+
+        # create dist tensor using numpy
+        dist_tensor_with_numpy = paddle.Tensor(
+            np.ones(shape, dtype=np.float32), dist_attr=dist_attr
+        )
+
+        # create dist tensor using tensor
+        dist_tensor_with_tensor = paddle.Tensor(
+            paddle.ones(shape), dist_attr=dist_attr
+        )
+
+        # create normal tensor
+        tensor = paddle.ones(shape)
+
+        # test dist tensor properties
+        self.assertEqual(dist_tensor_with_numpy.shape, shape)
+        self.assertEqual(dist_tensor_with_tensor.shape, shape)
+        self.assertEqual(dist_tensor_with_numpy.is_dist(), True)
+        self.assertEqual(dist_tensor_with_tensor.is_dist(), True)
+        self.assertEqual(tensor.is_dist(), False)
+        self.assertEqual(
+            str(dist_tensor_with_numpy), str(dist_tensor_with_tensor)
+        )
+        self.assertEqual(dist_tensor_with_numpy.dist_attr, dist_attr)
+        self.assertEqual(dist_tensor_with_tensor.dist_attr, dist_attr)
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab