Add eager string tensor (#41039)

* Add core.eager.StringTensor __init__ that accepts pyarray args
* Add the numpy method of core.eager.StringTensor
* Revert tensor.to_string modification
* Add ToPyObject for core.eager.StringTensor
* Add debug string for core.eager.StringTensor
* Remove place args of core.eager.StringTensor temporarily
* Fix check string_tensor error
* Remove dtype of core.eager.StringTensor
* Add core.eager.StringTensor unittest
* Remove pstring from VarDesc
* Add InitStringTensorWithStringTensor
* Remove to_string modification
* Remove zero_copy arg from StringTensor creator

Add eager string tensor (#41039)
* Add core.eager.StringTensor __init__ that accepts pyarray args
* Add the numpy method of core.eager.StringTensor
* Revert tensor.to_string modification
* Add ToPyObject for core.eager.StringTensor
* Add debug string for core.eager.StringTensor
* Remove place args of core.eager.StringTensor temporarily
* Fix check string_tensor error
* Remove dtype of core.eager.StringTensor
* Add core.eager.StringTensor unittest
* Remove pstring from VarDesc
* Add InitStringTensorWithStringTensor
* Remove to_string modification
* Remove zero_copy arg from StringTensor creator
a22b68b8 · Jack Zhou · GitHub · ef6ff4ef · a22b68b8 · a22b68b8
12 changed files
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -36,12 +36,14 @@ limitations under the License. */
 #include "paddle/fluid/pybind/tensor_py.h"
 #include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
+#include "paddle/phi/core/string_tensor.h"
 namespace paddle {
 namespace pybind {

 namespace py = ::pybind11;

 PyTypeObject* p_tensor_type;
+PyTypeObject* p_string_tensor_type;  // For StringTensor
 extern PyTypeObject* g_vartype_pytype;
 extern PyTypeObject* g_framework_tensor_pytype;

@@ -101,6 +103,25 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name,
  }
 }

+// Builds an empty phi::StringTensor of shape `dims`, labels the eager tensor
+// with `name`, and installs the new tensor as its implementation.
+// `place` is not consulted here: the tensor is always created on CPUPlace.
+void EmptyStringTensorInitializer(TensorObject* self, const std::string& name,
+                                  const paddle::platform::Place& place,
+                                  const std::vector<int>& dims = {}) {
+  const auto shape = phi::make_ddim(dims);
+  self->tensor.set_name(name);
+  // Note(zhoushunjie): Only support CPUPlace when create StringTensor
+  const auto cpu_place = platform::CPUPlace();
+  // Allocator handed to the StringTensor constructor below.
+  const auto cpu_allocator =
+      std::make_unique<paddle::experimental::DefaultAllocator>(cpu_place);
+  auto tensor_impl = std::make_shared<phi::StringTensor>(
+      cpu_allocator.get(), phi::StringTensorMeta{shape});
+  // Only request element storage when the shape holds at least one element.
+  const bool has_elements = phi::product(shape) > 0;
+  if (has_elements) {
+    tensor_impl->mutable_data(cpu_place);
+  }
+  self->tensor.set_impl(tensor_impl);
+}
+
 void InitTensorWithNumpyValue(TensorObject* self, const py::object& array,
                              const paddle::platform::Place& place,
                              bool zero_copy = false) {
@@ -132,6 +153,28 @@ void InitTensorWithNumpyValue(TensorObject* self, const py::object& array,
  }
 }

+// Fills an already-created eager StringTensor from a NumPy array object.
+// The tensor must report defined() (enforced below). The actual copy is
+// delegated to SetStringTensorFromPyArray, and only CPU places are accepted.
+void InitStringTensorWithNumpyValue(TensorObject* self, const py::object& obj) {
+  PADDLE_ENFORCE_EQ(
+      self->tensor.defined(), true,
+      paddle::platform::errors::Fatal(
+          "Calling InitStringTensorWithNumpyValue of Eager StringTensor "
+          "without "
+          "EmptyStringTensorInitializer is "
+          "forbidden. Please check your code and make sure you new a "
+          "eager tensor before init it with NumPy."));
+  auto* string_impl =
+      static_cast<phi::StringTensor*>(self->tensor.impl().get());
+  paddle::platform::Place dst_place = string_impl->place();
+  auto np_array = obj.cast<py::array>();
+  // Anything other than a CPU place is rejected before any data is copied.
+  if (!platform::is_cpu_place(dst_place)) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "StringTensor only support CPUPlace now, but receive %s",
+        dst_place.DebugString()));
+  }
+  SetStringTensorFromPyArray<platform::CPUPlace>(string_impl, np_array,
+                                                 dst_place);
+}
+
 void InitTensorWithTensor(TensorObject* self,
                          const paddle::experimental::Tensor& src,
                          const paddle::platform::Place& place,
@@ -171,6 +214,17 @@ void InitTensorWithFrameworkTensor(TensorObject* self,
  egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false);
 }

+// Initializes `self` from an existing StringTensor: sets the name and makes
+// `self` hold the same impl object as `src`, so the two tensors share it.
+// `place` is not used by this function.
+void InitStringTensorWithStringTensor(TensorObject* self,
+                                      const paddle::experimental::Tensor& src,
+                                      const paddle::platform::Place& place,
+                                      const std::string& name) {
+  self->tensor.set_name(name);
+  self->tensor.set_impl(
+      std::static_pointer_cast<phi::StringTensor>(src.impl()));
+  VLOG(4)
+      << "Do ShareDataWith when using StringTensor to initialize StringTensor";
+}
+
 py::object ParsePyArray(
    std::unordered_map<std::string, PyObject*> kws_map,
    std::unordered_map<std::string, Py_ssize_t> kw_order_map, PyObject* args,
@@ -236,13 +290,14 @@ int ParseBooleanArgs(std::string key,

 std::string ParseName(std::unordered_map<std::string, PyObject*> kws_map,
                      std::unordered_map<std::string, Py_ssize_t> kw_order_map,
-                      PyObject* args, bool flag_kwargs, Py_ssize_t args_num) {
+                      PyObject* args, bool flag_kwargs, Py_ssize_t args_num,
+                      std::string unique_name_prefix = "generated_tensor") {
  std::string act_name = "";
  if (kw_order_map["name"] <= args_num) {
    PyObject* name_obj = PyTuple_GET_ITEM(args, kw_order_map["name"] - 1);
    if (name_obj == Py_None) {
      act_name =
-          egr::Controller::Instance().GenerateUniqueName("generated_tensor");
+          egr::Controller::Instance().GenerateUniqueName(unique_name_prefix);
    } else {
      act_name = CastPyArg2AttrString(name_obj, kw_order_map["name"] - 1);
    }
@@ -250,13 +305,13 @@ std::string ParseName(std::unordered_map<std::string, PyObject*> kws_map,
    if (flag_kwargs) {
      if ((kws_map["name"] == NULL) || (kws_map["name"] == Py_None)) {
        act_name =
-            egr::Controller::Instance().GenerateUniqueName("generated_tensor");
+            egr::Controller::Instance().GenerateUniqueName(unique_name_prefix);
      } else {
        act_name = CastPyArg2AttrString(kws_map["name"], 0);
      }
    } else {
      act_name =
-          egr::Controller::Instance().GenerateUniqueName("generated_tensor");
+          egr::Controller::Instance().GenerateUniqueName(unique_name_prefix);
    }
  }
  return act_name;
@@ -368,6 +423,70 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr,
  }
 }

+// Constructs a StringTensor from a NumPy array that was passed either
+// positionally or through the `value` keyword.
+void AutoInitStringTensorByPyArray(
+    TensorObject* py_tensor_ptr,
+    std::unordered_map<std::string, PyObject*> kws_map, PyObject* args,
+    bool flag_kwargs, Py_ssize_t args_num) {
+  // This constructor takes two arguments. kw_order_map maps every argument
+  // name to its 1-based position: the PyArray comes first, the name second.
+  // When the constructor gains new arguments, extend this map together with
+  // the parsing code below.
+  std::unordered_map<std::string, Py_ssize_t> kw_order_map{{"value", 1},
+                                                           {"name", 2}};
+  paddle::platform::Place expected_place =
+      egr::Controller::Instance().GetExpectedPlace();
+
+  py::object np_value =
+      ParsePyArray(kws_map, kw_order_map, args, flag_kwargs, args_num);
+  std::string tensor_name =
+      ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num,
+                "generated_string_tensor");
+  EmptyStringTensorInitializer(py_tensor_ptr, tensor_name, expected_place);
+  InitStringTensorWithNumpyValue(py_tensor_ptr, np_value);
+}
+
+// Constructs a StringTensor from another StringTensor that was passed either
+// positionally or through the `value` keyword.
+void AutoInitStringTensorByStringTensor(
+    TensorObject* py_tensor_ptr,
+    std::unordered_map<std::string, PyObject*> kws_map, PyObject* args,
+    bool flag_kwargs, Py_ssize_t args_num) {
+  // This constructor takes two arguments. kw_order_map maps every argument
+  // name to its 1-based position: the source StringTensor comes first, the
+  // name second. When the constructor gains new arguments, extend this map
+  // together with the parsing code below.
+  std::unordered_map<std::string, Py_ssize_t> kw_order_map{{"value", 1},
+                                                           {"name", 2}};
+
+  paddle::platform::Place expected_place =
+      egr::Controller::Instance().GetExpectedPlace();
+  std::string tensor_name =
+      ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num,
+                "generated_string_tensor");
+
+  paddle::experimental::Tensor src_tensor;
+  const Py_ssize_t value_pos = kw_order_map["value"];
+  if (value_pos <= args_num) {
+    // `value` was supplied positionally.
+    src_tensor = CastPyArg2Tensor(PyTuple_GET_ITEM(args, value_pos - 1),
+                                  value_pos - 1);
+  } else if (flag_kwargs && kws_map["value"] != NULL) {
+    // `value` was supplied as a keyword argument.
+    src_tensor = CastPyArg2Tensor(kws_map["value"], 0);
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "The first expected kwargs is {value: Tensor}, "
+        "but could not parse the first argument {value: Tensor} "
+        "successfully. "
+        "Please check your input first and make sure you are on the right "
+        "way."));
+  }
+  InitStringTensorWithStringTensor(py_tensor_ptr, src_tensor, expected_place,
+                                   tensor_name);
+}
+
 /** We should have init function with signature:
   * 1.
   * def __init__ ()
@@ -708,6 +827,204 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
  EAGER_CATCH_AND_THROW_RETURN_NEG
 }

+/** tp_init of core.eager.StringTensor.
+   *
+   * Supported signatures:
+   * 1.
+   * def __init__ ()
+   *
+   * 2.
+   * def __init__ (
+   * ** dims: vector<int>,
+   * ** name: std::string)
+   *
+   * 3.
+   * (should have at least one parameter, one parameter equals to case 4, zero
+   * parameter equals to case 1)
+   * def __init__ (
+   * ** value: ndarray,
+   * ** name: std::string)
+   *
+   * 4.
+   * def __init__ (
+   * ** value: ndarray)
+   *
+   * 5.
+   * def __init__ (
+   * ** tensor: StringTensor)
+   *
+   * 6.
+   * (should have at least one parameter, one parameter equals to case 5, zero
+   * parameter equals to case 1.)
+   * def __init__ (
+   * ** tensor: StringTensor,
+   * ** name: std::string)
+   *
+   * The `zero_copy` keyword is still accepted by the argument parser so that
+   * existing callers keep working, but no code path in this function reads
+   * it.
+   *
+   * Returns 0 on success. Every C++ exception raised below is handed to
+   * EAGER_CATCH_AND_THROW_RETURN_NEG (the same macro TensorInit ends with)
+   * instead of being allowed to propagate out of this C callback.
+   * **/
+int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
+  EAGER_TRY
+  // set a flag to record use kwargs or not
+  const bool flag_kwargs = (kwargs != nullptr);
+
+  // all kwargs
+  PyObject* kw_zero_copy = NULL;  // parsed for compatibility, never read
+
+  PyObject* kw_value = NULL;  // receive PyArray or StringTensor
+  PyObject* kw_name = NULL;
+  PyObject* kw_dims = NULL;
+
+  // the keywords argument
+  static char* kwlist[] = {
+      const_cast<char*>("value"), const_cast<char*>("zero_copy"),
+      const_cast<char*>("name"), const_cast<char*>("dims"), NULL};
+  // 'O' Store a Python object (without any conversion) in a C object pointer,
+  // '|' Indicates that the remaining arguments in the Python argument list are
+  // optional.
+  // PyArg_ParseTupleAndKeywords can Parse the parameters of a function that
+  // takes both positional and keyword parameters into local variables,
+  // which enhance case1, case2, case3, case4, case 5, case 6.
+  const bool parsed_ok =
+      PyArg_ParseTupleAndKeywords(args, kwargs, "|OOOO", kwlist, &kw_value,
+                                  &kw_zero_copy, &kw_name, &kw_dims) != 0;
+
+  // helper map
+  std::unordered_map<std::string, PyObject*> kws_map{
+      {"value", kw_value},
+      {"zero_copy", kw_zero_copy},
+      {"name", kw_name},
+      {"dims", kw_dims}};
+
+  PADDLE_ENFORCE_EQ(parsed_ok, true,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "Could not parse args and kwargs successfully, "
+                        "please check your input first and make "
+                        "sure you are on the right way. "
+                        "The expected arguments as follow: ("
+                        "value, zero_copy, name, dims)"));
+
+  PADDLE_ENFORCE_NOT_NULL(
+      self, paddle::platform::errors::Fatal(
+                "Calling __init__ of Eager Tensor without __new__ is "
+                "forbidden. Please check your code and make sure you new a "
+                "eager tensor before init it."));
+
+  auto py_tensor_ptr = reinterpret_cast<TensorObject*>(self);
+
+  // Type probes shared by all branches below.
+  const auto is_numpy_array = [](PyObject* obj) {
+    return pybind11::detail::npy_api::get().PyArray_Check_(obj);
+  };
+  // PyObject_IsInstance returns -1 on error; only a positive answer counts.
+  const auto is_string_tensor = [](PyObject* obj) {
+    return PyObject_IsInstance(
+               obj, reinterpret_cast<PyObject*>(p_string_tensor_type)) > 0;
+  };
+
+  const Py_ssize_t args_num = PyTuple_Size(args);
+  VLOG(6) << " args_num: " << args_num;
+  // args_num = 0, means that there is no position arguments.
+  if (args_num == 0) {
+    if (!flag_kwargs) {
+      // case 1
+      VLOG(6) << "Calling case1's string initializer.";
+      EmptyStringTensorInitializer(
+          py_tensor_ptr, egr::Controller::Instance().GenerateUniqueName(
+                             "generated_string_tensor"),
+          egr::Controller::Instance().GetExpectedPlace());
+      return 0;
+    }
+    if (kw_value != NULL) {
+      if (is_numpy_array(kw_value)) {
+        VLOG(6) << "Calling case3's or case4's string initializer";
+        AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args,
+                                      flag_kwargs, args_num);
+        return 0;
+      }
+      if (is_string_tensor(kw_value)) {
+        VLOG(6) << "Calling case5's or case6's string initializer";
+        AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args,
+                                           flag_kwargs, args_num);
+        return 0;
+      }
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Could not parse the first keyword argument successfully, "
+          "the first keyword argument is value, but it should be PyArray "
+          "or StringTensor. "
+          "Please check your input first and make sure you are on the "
+          "right way."));
+    } else if (kw_dims != NULL) {
+      VLOG(6) << "Calling case2's string initializer.";
+      std::unordered_map<std::string, Py_ssize_t> kw_order_map{{"dims", 1},
+                                                               {"name", 2}};
+
+      std::vector<int> dims = CastPyArg2VectorOfInt(kw_dims, 0);
+      std::string act_name =
+          ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num,
+                    "generated_string_tensor");
+      EmptyStringTensorInitializer(
+          py_tensor_ptr, act_name,
+          egr::Controller::Instance().GetExpectedPlace(), dims);
+      return 0;
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "We not only support construct Tensor from numpy value "
+          "or StringTensor with python kwargs by this initializer, "
+          "but also even support dtype to init a empty StringTensor. "
+          "Please check your input first and make sure you call the existed "
+          "constructor."));
+    }
+  } else if (args_num == 1) {  // case 3 ~ 6
+    // 1 position args, remaining arguments are kwargs
+    PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0);
+    if (is_numpy_array(arg0_ptr)) {
+      VLOG(6) << "Calling case3's or case4's string initializer.";
+      AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs,
+                                    args_num);
+      return 0;
+    }
+    if (is_string_tensor(arg0_ptr)) {
+      VLOG(6) << "Calling case5's or case6's string initializer.";
+      AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args,
+                                         flag_kwargs, args_num);
+      return 0;
+    }
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Could not parse the first keyword argument successfully, "
+        "the first keyword argument is value, but it should be PyArray "
+        "or StringTensor. "
+        "Please check your input first and make sure you are on the "
+        "right way."));
+  } else if (args_num == 2) {  // case 2, case 3, case 6
+    // 2 position args
+    if (flag_kwargs) {
+      PADDLE_THROW(platform::errors::Fatal(
+          "Can't not find expected num of args, please check your call, and "
+          "make sure u call the existed constructor."));
+    }
+    PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0);
+    if (is_string_tensor(arg0_ptr)) {
+      VLOG(6) << "Calling case6's string initializer.";
+      AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args,
+                                         flag_kwargs, args_num);
+      return 0;
+    }
+    if (is_numpy_array(arg0_ptr)) {
+      VLOG(6) << "Calling case3's string initializer.";
+      AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs,
+                                    args_num);
+      return 0;
+    }
+    VLOG(6) << "Calling case2's string initializer.";
+    std::vector<int> dims = CastPyArg2VectorOfInt(arg0_ptr, 0);
+    PyObject* name_obj = PyTuple_GET_ITEM(args, 1);
+    // A None name asks for an auto-generated one.
+    std::string act_name =
+        (name_obj == Py_None)
+            ? egr::Controller::Instance().GenerateUniqueName(
+                  "generated_string_tensor")
+            : CastPyArg2AttrString(name_obj, 1);
+    EmptyStringTensorInitializer(
+        py_tensor_ptr, act_name,
+        egr::Controller::Instance().GetExpectedPlace(), dims);
+    return 0;
+  }
+  // The "|OOOO" format admits up to four positional arguments, but none of
+  // the constructors above takes more than two. Report that instead of
+  // returning a non-negative value, which the caller of tp_init would take
+  // for a successful initialization of an object that was never set up.
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "StringTensor's initializer accepts at most 2 positional arguments, "
+      "but received %d. Please check your call and make sure you call an "
+      "existing constructor.",
+      static_cast<int>(args_num)));
+  // Not reached; keeps every path of the try block returning a value.
+  return -1;
+  EAGER_CATCH_AND_THROW_RETURN_NEG
+}
+
 static void TensorDealloc(TensorObject* self) {
  if (self->weakrefs != NULL)
    PyObject_ClearWeakRefs(reinterpret_cast<PyObject*>(self));
@@ -716,8 +1033,10 @@ static void TensorDealloc(TensorObject* self) {
 }

 extern struct PyGetSetDef variable_properties[];
+extern struct PyGetSetDef string_tensor_variable_properties[];

 extern PyMethodDef variable_methods[];
+extern PyMethodDef string_tensor_variable_methods[];

 PyNumberMethods number_methods;
 PySequenceMethods sequence_methods;
@@ -772,5 +1091,49 @@ void BindEager(pybind11::module* module) {
  BindEagerOpFunctions(&m);
 }

+// Creates the core.eager.StringTensor heap type and registers it on the
+// `eager` submodule of `module`. The created type is also published through
+// the global p_string_tensor_type. Throws a Fatal error if the type cannot be
+// allocated, readied or added to the module.
+void BindEagerStringTensor(pybind11::module* module) {
+  auto m = module->def_submodule("eager");
+
+  auto heap_type = reinterpret_cast<PyHeapTypeObject*>(
+      PyType_Type.tp_alloc(&PyType_Type, 0));
+  // tp_alloc reports failure with a null pointer; do not write through it.
+  PADDLE_ENFORCE_NOT_NULL(
+      heap_type,
+      platform::errors::Fatal(
+          "Init Paddle error in BindEagerStringTensor(tp_alloc)."));
+  heap_type->ht_name = ToPyObject("StringTensor");
+  heap_type->ht_qualname = ToPyObject("StringTensor");
+  auto type = &heap_type->ht_type;
+  type->tp_name = "StringTensor";
+  type->tp_basicsize = sizeof(TensorObject);
+  type->tp_dealloc = reinterpret_cast<destructor>(TensorDealloc);
+  type->tp_as_number = &number_methods;
+  type->tp_as_sequence = &sequence_methods;
+  type->tp_as_mapping = &mapping_methods;
+  type->tp_methods = string_tensor_variable_methods;
+  type->tp_getset = string_tensor_variable_properties;
+  type->tp_init = StringTensorInit;
+  type->tp_new = TensorNew;
+  Py_INCREF(&PyBaseObject_Type);
+  type->tp_base = reinterpret_cast<PyTypeObject*>(&PyBaseObject_Type);
+  type->tp_flags |=
+      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+#if PY_VERSION_HEX >= 0x03050000
+  type->tp_as_async = &heap_type->as_async;
+#endif
+  p_string_tensor_type = type;
+
+  if (PyType_Ready(type) < 0) {
+    PADDLE_THROW(platform::errors::Fatal(
+        "Init Paddle error in BindEagerStringTensor(PyType_Ready)."));
+  }
+
+  // PyModule_AddObject takes over this reference only when it succeeds.
+  Py_INCREF(type);
+  if (PyModule_AddObject(m.ptr(), "StringTensor",
+                         reinterpret_cast<PyObject*>(type)) < 0) {
+    Py_DECREF(type);
+    // `m` is a pybind11 object and releases its own reference when it goes
+    // out of scope, so the module must not be DECREF'ed by hand here.
+    PADDLE_THROW(platform::errors::Fatal(
+        "Init Paddle error in BindEagerStringTensor(PyModule_AddObject)."));
+  }
+}
+
 }  // namespace pybind
 }  // namespace paddle
--- a/paddle/fluid/pybind/eager.h
+++ b/paddle/fluid/pybind/eager.h
@@ -39,6 +39,7 @@ typedef struct {
 } PyLayerObject;

 void BindEager(pybind11::module* m);
+void BindEagerStringTensor(pybind11::module* module);
 void BindFunctions(PyObject* module);
 void BindEagerPyLayer(PyObject* module);


--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -257,6 +257,72 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
  EAGER_CATCH_AND_THROW_RETURN_NULL
 }

+// Implements StringTensor.numpy(): returns a NumPy unicode ('U<k>') array
+// holding the tensor's strings, where k is the longest string's length in
+// unicode characters (at least 1). An uninitialized tensor yields an empty
+// one-dimensional unicode array. Only CPU tensors are supported; any other
+// tensor raises InvalidArgument. `args` and `kwargs` are not used.
+static PyObject* tensor_method_numpy_for_string_tensor(TensorObject* self,
+                                                       PyObject* args,
+                                                       PyObject* kwargs) {
+  EAGER_TRY
+  auto& api = pybind11::detail::npy_api::get();
+  if (!self->tensor.impl() || !self->tensor.impl()->initialized()) {
+    VLOG(6) << "The StringTensor is uninitialized. Return the empty string "
+               "numpy array.";
+    Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank];
+    Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank];
+    py_dims[0] = 0;
+    py_strides[0] = 0;
+
+    PyObject* array = api.PyArray_NewFromDescr_(
+        api.PyArray_Type_,
+        api.PyArray_DescrFromType_(pybind11::detail::npy_api::NPY_UNICODE_), 1,
+        py_dims, py_strides, nullptr,
+        pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ |
+            pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
+        nullptr);
+    return array;
+  }
+
+  if (!self->tensor.is_cpu()) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "StringTensor.numpy() only support cpu tensor."));
+  }
+
+  VLOG(6) << "Getting StringTensor's numpy value";
+  auto string_tensor =
+      std::dynamic_pointer_cast<phi::StringTensor>(self->tensor.impl());
+  // The cast yields null when the impl is not a phi::StringTensor.
+  PADDLE_ENFORCE_NOT_NULL(
+      string_tensor,
+      platform::errors::InvalidArgument(
+          "StringTensor.numpy() expects the tensor to hold a "
+          "phi::StringTensor."));
+  const auto* st_ptr = string_tensor->data();
+  const auto numel = self->tensor.numel();
+  auto tensor_dims = self->tensor.shape();
+
+  // Measure every string once. The lengths are reused when filling the
+  // buffer, and the running maximum (floored at 1) gives the width of the
+  // numpy unicode dtype. An empty tensor simply skips the loop.
+  std::vector<size_t> unicode_lens;
+  unicode_lens.reserve(static_cast<size_t>(numel));
+  size_t max_unicode_length = 1;
+  for (int64_t i = 0; i < numel; ++i) {
+    const size_t curr_unicode_len =
+        phi::strings::GetUnicodeStrLen(st_ptr[i].data(), st_ptr[i].size());
+    unicode_lens.push_back(curr_unicode_len);
+    max_unicode_length = std::max(max_unicode_length, curr_unicode_len);
+  }
+  VLOG(6) << "The max unicode length is " << max_unicode_length;
+
+  // make_unique<T[]> value-initializes the elements, so the buffer starts out
+  // zero-filled and shorter strings stay zero-padded.
+  auto sp = std::make_unique<uint32_t[]>(max_unicode_length * numel);
+  auto py_array_data = sp.get();
+  for (int64_t i = 0; i < numel; ++i) {
+    phi::strings::GetUnicodeStr(st_ptr[i].data(),
+                                py_array_data + i * max_unicode_length,
+                                unicode_lens[i]);
+  }
+  // NOTE(review): no base object is passed, so pybind11 is expected to copy
+  // the buffer into the new array before `sp` is released - confirm.
+  py::array array(py::dtype("U" + std::to_string(max_unicode_length)),
+                  tensor_dims, {}, py_array_data);
+  return array.release().ptr();
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 static PyObject* tensor_method__is_initialized(TensorObject* self,
                                               PyObject* args,
                                               PyObject* kwargs) {
@@ -1433,6 +1499,18 @@ static PyObject* tensor_method__uva(TensorObject* self, PyObject* args,
  EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 #endif
+// Returns a Python bool: the result of initialized() when the tensor's impl
+// is a phi::StringTensor, and false for any other (or missing) impl.
+// `args` and `kwargs` are not used.
+static PyObject* tensor_method__is_string_tensor_hold_allocation(
+    TensorObject* self, PyObject* args, PyObject* kwargs) {
+  EAGER_TRY
+  const auto string_impl =
+      std::dynamic_pointer_cast<phi::StringTensor>(self->tensor.impl());
+  const bool holds_allocation =
+      string_impl != nullptr && string_impl->initialized();
+  return ToPyObject(holds_allocation);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}

 PyMethodDef variable_methods[] = {
    {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy,
@@ -1545,5 +1623,20 @@ PyMethodDef variable_methods[] = {
 #endif
    {NULL, NULL, 0, NULL}};

+// variable_methods for core.eager.StringTensor
+// Method table for the StringTensor Python type. Every entry takes both
+// positional and keyword arguments (METH_VARARGS | METH_KEYWORDS) and has no
+// docstring; the all-NULL entry terminates the table.
+PyMethodDef string_tensor_variable_methods[] = {
+    // numpy(): StringTensor-specific conversion to a NumPy array.
+    {"numpy",
+     (PyCFunction)(void (*)(void))tensor_method_numpy_for_string_tensor,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    // _is_initialized(): bound to the generic tensor_method__is_initialized.
+    {"_is_initialized",
+     (PyCFunction)(void (*)(void))tensor_method__is_initialized,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    // _is_string_tensor_hold_allocation(): StringTensor-specific check.
+    {"_is_string_tensor_hold_allocation",
+     (PyCFunction)(
+         void (*)(void))tensor_method__is_string_tensor_hold_allocation,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    // TODO(zhoushunjie): Need to add _copy_to, copy_ for StringTensor.
+    {NULL, NULL, 0, NULL}};
+
 }  // namespace pybind
 }  // namespace paddle
--- a/paddle/fluid/pybind/eager_properties.cc
+++ b/paddle/fluid/pybind/eager_properties.cc
@@ -204,5 +204,15 @@ struct PyGetSetDef variable_properties[] = {
    {"is_leaf", (getter)tensor_properties_is_leaf, nullptr, nullptr, nullptr},
    {nullptr, nullptr, nullptr, nullptr, nullptr}};

+// variable_properties for core.eager.StringTensor
+// Property table for the StringTensor Python type. Only `name` has a setter;
+// `shape`, `place` and `_place_str` are read-only (their setter is nullptr).
+// All entries reuse the tensor_properties_* accessors, and the all-nullptr
+// entry terminates the table.
+struct PyGetSetDef string_tensor_variable_properties[] = {
+    {"name", (getter)tensor_properties_get_name,
+     (setter)tensor_properties_set_name, nullptr, nullptr},
+    {"shape", (getter)tensor_properties_get_shape, nullptr, nullptr, nullptr},
+    {"place", (getter)tensor_properties_get_place, nullptr, nullptr, nullptr},
+    {"_place_str", (getter)tensor_properties_get_place_str, nullptr, nullptr,
+     nullptr},
+    {nullptr, nullptr, nullptr, nullptr, nullptr}};
+
 }  // namespace pybind
 }  // namespace paddle
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -36,6 +36,7 @@ namespace paddle {
 namespace pybind {

 extern PyTypeObject* p_tensor_type;
+extern PyTypeObject* p_string_tensor_type;

 extern PyTypeObject* g_framework_scope_pytype;
 extern PyTypeObject* g_vartype_pytype;
@@ -75,6 +76,8 @@ int TensorDtype2NumpyDtype(phi::DataType dtype) {
      return pybind11::detail::NPY_COMPLEX64;
    case phi::DataType::COMPLEX128:
      return pybind11::detail::NPY_COMPLEX128;
+    case phi::DataType::PSTRING:
+      return pybind11::detail::npy_api::NPY_UNICODE_;
    default:
      PADDLE_THROW(paddle::platform::errors::InvalidArgument(
          "Unknow phi::DataType, the int value = %d.",
@@ -198,7 +201,9 @@ bool IsEagerTensor(PyObject* obj) {
 }

 paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) {
-  if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) {
+  if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type)) ||
+      PyObject_IsInstance(obj,
+                          reinterpret_cast<PyObject*>(p_string_tensor_type))) {
    return reinterpret_cast<TensorObject*>(obj)->tensor;
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
@@ -508,7 +513,14 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value,
    Py_INCREF(Py_None);
    return Py_None;
  }
-  PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
+  PyObject* obj = nullptr;
+  if (value.initialized() && value.is_string_tensor()) {
+    // In order to return the core.eager.StringTensor, there is need
+    // to use p_string_tensor_type to create a python obj.
+    obj = p_string_tensor_type->tp_alloc(p_string_tensor_type, 0);
+  } else {
+    obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
+  }
  if (obj) {
    auto v = reinterpret_cast<TensorObject*>(obj);
    new (&(v->tensor)) paddle::experimental::Tensor();
@@ -753,6 +765,9 @@ static paddle::experimental::Tensor& GetTensorFromPyObject(

  if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) {
    return reinterpret_cast<TensorObject*>(obj)->tensor;
+  } else if (PyObject_IsInstance(
+                 obj, reinterpret_cast<PyObject*>(p_string_tensor_type))) {
+    return reinterpret_cast<TensorObject*>(obj)->tensor;
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "%s(): argument '%s' (position %d) must be Tensor, but got %s", op_type,

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -545,6 +545,7 @@ PYBIND11_MODULE(core_noavx, m) {

  BindImperative(&m);
  BindEager(&m);
+  BindEagerStringTensor(&m);
  BindCudaStream(&m);

  // Not used, just make sure cpu_info.cc is linked.

--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -36,6 +36,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/core/string_tensor.h"
+#include "paddle/phi/kernels/strings/unicode.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"

@@ -528,6 +530,60 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj,
  }
 }

+// Copies a NumPy string array into `self`.
+//
+// `array` must have dtype kind 'S' (byte strings) or 'U' (unicode), otherwise
+// InvalidArgument is raised. `self` is resized to the array's shape and, for a
+// CPU `place`, filled element by element; any other place raises
+// InvalidArgument.
+//
+// NOTE(review): elements are read from array.data() at a fixed itemsize
+// stride, which presumes a C-contiguous array - confirm that callers
+// guarantee this.
+template <typename P>
+void SetStringTensorFromPyArray(phi::StringTensor *self, const py::array &array,
+                                const P &place) {
+  const auto dtype_kind = array.dtype().kind();
+  const bool is_string_pyarray = dtype_kind == 'S' || dtype_kind == 'U';
+  PADDLE_ENFORCE_EQ(is_string_pyarray, true,
+                    platform::errors::InvalidArgument(
+                        "Expect the dtype of numpy array is string or "
+                        "unicode, but recevie dtype %s",
+                        array.dtype()));
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
+    // Keep the full 64-bit extent; narrowing to int would truncate it.
+    dims.push_back(static_cast<int64_t>(array.shape()[i]));
+  }
+  self->Resize(phi::make_ddim(dims));
+  const auto itemsize = array.itemsize();
+  if (!paddle::platform::is_cpu_place(place)) {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "StringTensor only support CPUPlace now, but receive %s",
+        place.DebugString()));
+  }
+
+  auto dst = self->mutable_data(place);
+  const int64_t numel = self->numel();
+  if (dtype_kind == 'S') {
+    // Every element occupies exactly `itemsize` bytes.
+    // NOTE(review): the whole slot is copied, so any padding bytes of shorter
+    // strings become part of the pstring - confirm this is intended.
+    const auto *bytes = reinterpret_cast<const char *>(array.data());
+    for (int64_t i = 0; i < numel; ++i) {
+      dst[i] = pstring(bytes + itemsize * i, itemsize);
+    }
+  } else {
+    // dtype_kind == 'U'
+    VLOG(6) << "numpy array itemsize: " << itemsize;
+    // Note(zhoushunjie): The itemsize of unicode numpy array is the
+    // the size of each unicode string. Each unicode string is aligned
+    // to max length of the array of unicode strings, so the size of
+    // each unicode string is same. The size of each unicode character is
+    // 4, so the size of unicode string is 4 times of the length of
+    // unicode string.
+    const auto unicode_len = itemsize / 4;
+    const auto *code_points = reinterpret_cast<const uint32_t *>(array.data());
+    for (int64_t i = 0; i < numel; ++i) {
+      const uint32_t *src = code_points + unicode_len * i;
+      // NOTE(review): the "- 1" presumes GetUTF8StrLen counts a trailing
+      // terminator (and therefore never returns 0) - confirm.
+      auto utf8_len = phi::strings::GetUTF8StrLen(src, unicode_len);
+      pstring pstr(utf8_len - 1, 0);
+      phi::strings::GetUTF8Str(src, pstr.mdata(), unicode_len);
+      dst[i] = pstr;
+    }
+  }
+}
+
 template <typename T>
 void SetUVATensorFromPyArrayImpl(framework::LoDTensor *self_tensor,
                                 const py::array_t<T> &array, int device_id) {

--- a/paddle/phi/api/include/tensor.h
+++ b/paddle/phi/api/include/tensor.h
@@ -245,6 +245,14 @@ class PADDLE_API Tensor final {
   */
  bool is_sparse_csr_tensor() const;

+  /**
+   * @brief Determine whether tensor is StringTensor
+   *
+   * @return true if the tensor is a StringTensor
+   * @return false otherwise
+   */
+  bool is_string_tensor() const;
+
  /* Part 3: Device and Backend methods */

  /**

--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/phi/core/selected_rows.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
+#include "paddle/phi/core/string_tensor.h"
 #include "paddle/phi/core/tensor_base.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/phi/core/tensor_utils.h"
@@ -155,6 +156,9 @@ bool Tensor::is_sparse_coo_tensor() const {
 bool Tensor::is_sparse_csr_tensor() const {
  return phi::SparseCsrTensor::classof(impl_.get());
 }
+bool Tensor::is_string_tensor() const {
+  // Delegate the type check on the held implementation to phi::StringTensor.
+  auto* held_impl = impl_.get();
+  return phi::StringTensor::classof(held_impl);
+}
 /* Part 3: Device and Backend methods */

 Place Tensor::place() const {

--- a/paddle/phi/core/string_tensor.cc
+++ b/paddle/phi/core/string_tensor.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/phi/core/string_tensor.h"
+#include "paddle/phi/api/lib/utils/storage.h"

 namespace phi {

@@ -161,4 +162,32 @@ void* StringTensor::AllocateFrom(Allocator* allocator,
                                 meta_.offset);
 }

+// Returns a writable pointer to this tensor's pstring storage on `place`.
+// The holder is replaced by a fresh allocation when it is null, lives on a
+// different place, or is smaller than the required size plus meta_.offset;
+// in that case meta_.offset is reset to 0.
+dtype::pstring* StringTensor::mutable_data(const phi::Place& place,
+                                           size_t requested_size) {
+  PADDLE_ENFORCE_GE(
+      numel(),
+      0,
+      phi::errors::PreconditionNotMet(
+          "The Tensor's element number must be equal or greater than zero. "
+          "The Tensor's shape is [",
+          dims(),
+          "] now"));
+  // Size in bytes implied by the current shape; a larger request wins.
+  const size_t shape_bytes = numel() * SizeOf(dtype());
+  const size_t alloc_bytes =
+      requested_size > shape_bytes ? requested_size : shape_bytes;
+
+  // The place check is written with operator== only, because some versions
+  // of boost::variant don't have operator!=.
+  const bool reusable = holder_ != nullptr && holder_->place() == place &&
+                        holder_->size() >= alloc_bytes + meta_.offset;
+  if (!reusable) {
+    holder_.reset();
+    holder_ = paddle::memory::AllocShared(place, alloc_bytes);
+    // Initialize the allocated bytes
+    init_holder();
+    meta_.offset = 0;
+  }
+
+  const auto base_addr = reinterpret_cast<uintptr_t>(holder_->ptr());
+  return reinterpret_cast<dtype::pstring*>(base_addr + meta_.offset);
+}
+
 }  // namespace phi
--- a/paddle/phi/core/string_tensor.h
+++ b/paddle/phi/core/string_tensor.h
@@ -122,6 +122,8 @@ class StringTensor : public TensorBase,
  void* AllocateFrom(Allocator* allocator,
                     DataType dtype,
                     size_t requested_size = 0);
+  // Returns a writable pointer to the pstring data on `place`. A non-zero
+  // `requested_size` larger than the shape-derived size is used as the
+  // allocation size instead (see the definition in string_tensor.cc).
+  dtype::pstring* mutable_data(const phi::Place& place,
+                               size_t requested_size = 0);

 private:
  friend class StringTensorUtils;

--- a/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py
+++ b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid.core as core
+import paddle
+import numpy as np
+from paddle.fluid.framework import _test_eager_guard, in_dygraph_mode
+import unittest
+import copy
+
+
+class EagerStringTensorTestCase(unittest.TestCase):
+    def setUp(self):
+        self.str_arr = np.array([
+            ["15.4寸笔记本的键盘确实爽，基本跟台式机差不多了，蛮喜欢数字小键盘，输数字特方便，样子也很美观，做工也相当不错"
+             ],  # From ChnSentiCorp
+            ["One of the very best Three Stooges shorts ever."]
+        ])  # From IMDB
+
+    def test_constructor_with_args(self):
+        with _test_eager_guard():
+            ST1 = core.eager.StringTensor()  # constructor 1
+            self.assertEqual(ST1.name, "generated_string_tensor_0")
+            self.assertEqual(ST1.shape, [])
+            self.assertEqual(ST1.numpy(), '')
+
+            shape = [2, 3]
+            ST2 = core.eager.StringTensor(shape, "ST2")  # constructor 2
+            self.assertEqual(ST2.name, "ST2")
+            self.assertEqual(ST2.shape, shape)
+            self.assertTrue(
+                np.array_equal(
+                    ST2.numpy(), np.empty(
+                        shape, dtype=np.unicode_)))
+
+            ST3 = core.eager.StringTensor(self.str_arr, "ST3")  # constructor 3
+            self.assertEqual(ST3.name, "ST3")
+            self.assertEqual(ST3.shape, list(self.str_arr.shape))
+            self.assertTrue(np.array_equal(ST3.numpy(), self.str_arr))
+
+            ST4 = core.eager.StringTensor(self.str_arr)  # constructor 4
+            self.assertEqual(ST4.name, "generated_string_tensor_1")
+            self.assertEqual(ST4.shape, list(self.str_arr.shape))
+            self.assertTrue(np.array_equal(ST4.numpy(), self.str_arr))
+
+            ST5 = core.eager.StringTensor(ST4)  # constructor 5
+            self.assertEqual(ST5.name, "generated_string_tensor_2")
+            self.assertEqual(ST5.shape, list(self.str_arr.shape))
+            self.assertTrue(np.array_equal(ST5.numpy(), self.str_arr))
+
+            ST6 = core.eager.StringTensor(ST5, "ST6")  # constructor 6
+            self.assertEqual(ST6.name, "ST6")
+            self.assertEqual(ST6.shape, list(self.str_arr.shape))
+            self.assertTrue(np.array_equal(ST6.numpy(), self.str_arr))
+
+            for st in [ST1, ST2, ST3, ST4, ST5, ST6]:
+                # All StringTensors are on cpu place so far.
+                self.assertTrue(st.place._equals(core.CPUPlace()))
+
+    def test_constructor_with_kwargs(self):
+        with _test_eager_guard():
+            shape = [2, 3]
+            ST1 = core.eager.StringTensor(
+                dims=shape, name="ST1")  # constructor 2
+            self.assertEqual(ST1.name, "ST1")
+            self.assertEqual(ST1.shape, shape)
+            self.assertTrue(
+                np.array_equal(
+                    ST1.numpy(), np.empty(
+                        shape, dtype=np.unicode_)))
+
+            ST2 = core.eager.StringTensor(
+                self.str_arr, name="ST2")  # constructor 3
+            self.assertEqual(ST2.name, "ST2")
+            self.assertEqual(ST2.shape, list(self.str_arr.shape))
+            self.assertTrue(np.array_equal(ST2.numpy(), self.str_arr))
+
+            ST3 = core.eager.StringTensor(ST2, name="ST3")  # constructor 6
+            self.assertEqual(ST3.name, "ST3")
+            self.assertEqual(ST3.shape, list(self.str_arr.shape))
+            self.assertTrue(np.array_equal(ST3.numpy(), self.str_arr))
+
+            ST4 = core.eager.StringTensor(
+                value=ST2, name="ST4")  # constructor 6
+            self.assertEqual(ST4.name, "ST4")
+            self.assertEqual(ST4.shape, list(self.str_arr.shape))
+            self.assertTrue(np.array_equal(ST4.numpy(), self.str_arr))
+            for st in [ST1, ST2, ST3, ST4]:
+                # All StringTensors are on cpu place so far.
+                self.assertTrue(st.place._equals(core.CPUPlace()))
+
+
+if __name__ == "__main__":
+    unittest.main()