From a22b68b81fd798e67853fcf8fd7d7e06286fff00 Mon Sep 17 00:00:00 2001 From: Jack Zhou Date: Fri, 15 Apr 2022 13:53:18 +0800 Subject: [PATCH] Add eager string tensor (#41039) * Add core.eager.StringTensor __init__ which pyarray args can be passed * Add the numpy method of core.eager.StringTensor * revert tensor.to_string modification * Add ToPyObject for core.eager.StringTensor * Add debug string for core.eager.StringTensor * Remove place args of core.eager.StringTensor temporarily * Fix check string_tensor error * remove dtype of core.eager.StringTensor * add core.eager.StringTensor unittest * remove pstring from VarDesc * Add InitStringTensorWithStringTensor * Remove to_string modification * Remove zero_copy arg from StringTensor creator --- paddle/fluid/pybind/eager.cc | 371 +++++++++++++++++- paddle/fluid/pybind/eager.h | 1 + paddle/fluid/pybind/eager_method.cc | 93 +++++ paddle/fluid/pybind/eager_properties.cc | 10 + paddle/fluid/pybind/eager_utils.cc | 19 +- paddle/fluid/pybind/pybind.cc | 1 + paddle/fluid/pybind/tensor_py.h | 56 +++ paddle/phi/api/include/tensor.h | 8 + paddle/phi/api/lib/tensor.cc | 4 + paddle/phi/core/string_tensor.cc | 29 ++ paddle/phi/core/string_tensor.h | 2 + .../unittests/test_egr_string_tensor_api.py | 105 +++++ 12 files changed, 693 insertions(+), 6 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index c529d121f39..fa66e55e9c5 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -36,12 +36,14 @@ limitations under the License. */ #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" +#include "paddle/phi/core/string_tensor.h" namespace paddle { namespace pybind { namespace py = ::pybind11; PyTypeObject* p_tensor_type; +PyTypeObject* p_string_tensor_type; // For StringTensor extern PyTypeObject* g_vartype_pytype; extern PyTypeObject* g_framework_tensor_pytype; @@ -101,6 +103,25 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, } } +void EmptyStringTensorInitializer(TensorObject* self, const std::string& name, + const paddle::platform::Place& place, + const std::vector& dims = {}) { + auto ddims = phi::make_ddim(dims); + self->tensor.set_name(name); + // Note(zhoushunjie): Only support CPUPlace when create StringTensor + auto actual_place = platform::CPUPlace(); + // Allocate memory + const auto string_allocator = + std::make_unique(actual_place); + const auto alloc = string_allocator.get(); + std::shared_ptr string_tensor = + std::make_shared(alloc, phi::StringTensorMeta{ddims}); + if (phi::product(ddims) > 0) { + string_tensor->mutable_data(actual_place); + } + self->tensor.set_impl(string_tensor); +} + void InitTensorWithNumpyValue(TensorObject* self, const py::object& array, const paddle::platform::Place& place, bool zero_copy = false) { @@ -132,6 +153,28 @@ void InitTensorWithNumpyValue(TensorObject* self, const py::object& array, } } +void InitStringTensorWithNumpyValue(TensorObject* self, const py::object& obj) { + PADDLE_ENFORCE_EQ( + self->tensor.defined(), true, + paddle::platform::errors::Fatal( + "Calling InitStringTensorWithNumpyValue of Eager StringTensor " + "without " + "EmptyStringTensorInitializer is " + "forbidden. 
Please check your code and make sure you construct the "
+          "eager tensor before initializing it with NumPy."));
+  phi::StringTensor* impl_ptr =
+      static_cast<phi::StringTensor*>(self->tensor.impl().get());
+  paddle::platform::Place place = impl_ptr->place();
+  auto array = obj.cast<py::array>();
+  if (platform::is_cpu_place(place)) {
+    SetStringTensorFromPyArray(impl_ptr, array, place);
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "StringTensor only supports CPUPlace now, but received %s",
+        place.DebugString()));
+  }
+}
+
 void InitTensorWithTensor(TensorObject* self,
                           const paddle::experimental::Tensor& src,
                           const paddle::platform::Place& place,
@@ -171,6 +214,17 @@ void InitTensorWithFrameworkTensor(TensorObject* self,
   egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false);
 }
 
+void InitStringTensorWithStringTensor(TensorObject* self,
+                                      const paddle::experimental::Tensor& src,
+                                      const paddle::platform::Place& place,
+                                      const std::string& name) {
+  self->tensor.set_name(name);
+  auto impl = std::static_pointer_cast<phi::StringTensor>(src.impl());
+  self->tensor.set_impl(impl);
+  VLOG(4)
+      << "Do ShareDataWith when using StringTensor to initialize StringTensor";
+}
+
 py::object ParsePyArray(
     std::unordered_map<std::string, PyObject*> kws_map,
     std::unordered_map<std::string, Py_ssize_t> kw_order_map, PyObject* args,
@@ -236,13 +290,14 @@ int ParseBooleanArgs(std::string key,
 
 std::string ParseName(std::unordered_map<std::string, PyObject*> kws_map,
                       std::unordered_map<std::string, Py_ssize_t> kw_order_map,
-                      PyObject* args, bool flag_kwargs, Py_ssize_t args_num) {
+                      PyObject* args, bool flag_kwargs, Py_ssize_t args_num,
+                      std::string unique_name_prefix = "generated_tensor") {
   std::string act_name = "";
   if (kw_order_map["name"] <= args_num) {
     PyObject* name_obj = PyTuple_GET_ITEM(args, kw_order_map["name"] - 1);
     if (name_obj == Py_None) {
       act_name =
-          egr::Controller::Instance().GenerateUniqueName("generated_tensor");
+          egr::Controller::Instance().GenerateUniqueName(unique_name_prefix);
     } else {
       act_name = CastPyArg2AttrString(name_obj, kw_order_map["name"] - 1);
     }
@@ -250,13 +305,13 @@ std::string ParseName(std::unordered_map<std::string, PyObject*> kws_map,
     if (flag_kwargs) {
      if ((kws_map["name"] == NULL) || (kws_map["name"] == Py_None)) {
        act_name =
-            egr::Controller::Instance().GenerateUniqueName("generated_tensor");
+            egr::Controller::Instance().GenerateUniqueName(unique_name_prefix);
      } else {
        act_name = CastPyArg2AttrString(kws_map["name"], 0);
      }
    } else {
      act_name =
-          egr::Controller::Instance().GenerateUniqueName("generated_tensor");
+          egr::Controller::Instance().GenerateUniqueName(unique_name_prefix);
    }
  }
  return act_name;
@@ -368,6 +423,70 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr,
   }
 }
 
+void AutoInitStringTensorByPyArray(
+    TensorObject* py_tensor_ptr,
+    std::unordered_map<std::string, PyObject*> kws_map, PyObject* args,
+    bool flag_kwargs, Py_ssize_t args_num) {
+  // The first argument of the StringTensor constructor is the PyArray.
+  // kw_order_map's keys are the arguments of the constructor, and its values
+  // are their positions in the argument list.
+  // If you want to update this constructor with new arguments, you need to
+  // update this map and add or change the related code.
+ std::unordered_map kw_order_map{{"value", 1}, + {"name", 2}}; + py::object numpy_value = py::object(); + paddle::platform::Place place = + egr::Controller::Instance().GetExpectedPlace(); + std::string act_name = ""; + + numpy_value = + ParsePyArray(kws_map, kw_order_map, args, flag_kwargs, args_num); + act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num, + "generated_string_tensor"); + EmptyStringTensorInitializer(py_tensor_ptr, act_name, place); + InitStringTensorWithNumpyValue(py_tensor_ptr, numpy_value); +} + +void AutoInitStringTensorByStringTensor( + TensorObject* py_tensor_ptr, + std::unordered_map kws_map, PyObject* args, + bool flag_kwargs, Py_ssize_t args_num) { + // The first argument of the Tensor constructor is StringTensor, + // there are 3 arguments to construct the new StringTensor, + // kw_order_map's key is every arguments of the constructor, + // kw_order_map's value is the position of the arguments respectively. + // If u want to update this constructor with new arguments, + // need to update this map and to add or change related code. + std::unordered_map kw_order_map{{"value", 1}, + {"name", 2}}; + + paddle::platform::Place place = + egr::Controller::Instance().GetExpectedPlace(); + std::string act_name = ""; + + act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num, + "generated_string_tensor"); + paddle::experimental::Tensor src_tensor; + if (kw_order_map["value"] <= args_num) { + src_tensor = + CastPyArg2Tensor(PyTuple_GET_ITEM(args, kw_order_map["value"] - 1), + kw_order_map["value"] - 1); + } else { + if (flag_kwargs && kws_map["value"] != NULL) { + src_tensor = CastPyArg2Tensor(kws_map["value"], 0); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The first expected kwargs is {value: Tensor}, " + "but could not parse the first argument {value: Tensor} " + "successfully. " + "Please check your input first and make sure you are on the right " + "way.")); + } + } + InitStringTensorWithStringTensor(py_tensor_ptr, src_tensor, place, act_name); +} + /** We should have init function with signature: * 1. * def __init__ () @@ -708,6 +827,204 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { EAGER_CATCH_AND_THROW_RETURN_NEG } +/** We should have init function with signature: + * 1. + * def __init__ () + * + * 2. + * def __init__ ( + * ** dims: vector, + * ** name: std::string) + * + * 3. + * (should have at least one parameter, one parameter equals to case 4, zero + * parameter equals to case 1) + * def __init__ ( + * ** value: ndarray, + * ** zero_copy: bool, + * ** name: std::string) + * + * 4. + * def __init__ ( + * ** value: ndarray) + * + * 5. + * def __init__ ( + * ** tensor: Tensor) + * + * 6. + * (should have at least one parameter, one parameter equals to case 5, zero + * parameter equals to case 1.) 
+ * def __init__ ( + * ** tensor: Tensor, + * ** name: std::string) + * **/ +int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { + // set a flag to record use kwargs or not + bool flag_kwargs = false; + if (kwargs) flag_kwargs = true; + + // all kwargs + PyObject* kw_zero_copy = NULL; + + PyObject* kw_value = NULL; // receive PyArray or Tensor + PyObject* kw_name = NULL; + PyObject* kw_dims = NULL; + + // the keywords argument + static char* kwlist[] = { + const_cast("value"), const_cast("zero_copy"), + const_cast("name"), const_cast("dims"), NULL}; + // 'O' Store a Python object (without any conversion) in a C object pointer, + // '|' Indicates that the remaining arguments in the Python argument list are + // optional. + // PyArg_ParseTupleAndKeywords can Parse the parameters of a function that + // takes both positional and keyword parameters into local variables, + // which enhance case1, case2, case3, case4, case 5, case 6. + bool flag_ = + PyArg_ParseTupleAndKeywords(args, kwargs, "|OOOO", kwlist, &kw_value, + &kw_zero_copy, &kw_name, &kw_dims); + + // helper map + std::unordered_map kws_map{ + {"value", kw_value}, + {"zero_copy", kw_zero_copy}, + {"name", kw_name}, + {"dims", kw_dims}}; + + PADDLE_ENFORCE_EQ(flag_, true, + paddle::platform::errors::PreconditionNotMet( + "Could not parse args and kwargs successfully, " + "please check your input first and make" + "sure you are on the right way. " + "The expected arguments as follow: (" + "value, zero_copy, name, dims)")); + + PADDLE_ENFORCE_NOT_NULL( + self, paddle::platform::errors::Fatal( + "Calling __init__ of Eager Tensor without __new__ is " + "forbidden. Please check your code and make sure you new a " + "eager tensor before init it.")); + + auto py_tensor_ptr = reinterpret_cast(self); + + Py_ssize_t args_num = PyTuple_Size(args); + VLOG(6) << " args_num: " << args_num; + // args_num = 0, means that there is no position arguments. + if (args_num == (Py_ssize_t)0) { + if (!flag_kwargs) { + // case 1 + VLOG(6) << "Calling case1's string initializer."; + EmptyStringTensorInitializer( + py_tensor_ptr, egr::Controller::Instance().GenerateUniqueName( + "generated_string_tensor"), + egr::Controller::Instance().GetExpectedPlace()); + return 0; + } else { + if (kw_value != NULL) { + if (pybind11::detail::npy_api::get().PyArray_Check_(kw_value)) { + VLOG(6) << "Calling case3's or case4's string initializer"; + AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, + flag_kwargs, args_num); + return 0; + } else if (PyObject_IsInstance(kw_value, reinterpret_cast( + p_string_tensor_type))) { + VLOG(6) << "Calling case5's or case6's string initializer"; + AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args, + flag_kwargs, args_num); + return 0; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Could not parse the first keyword argument successfully, " + "the first keyword argument is value, but it should be PyArray " + "or StringTensor." 
+ "Please check your input first and make sure you are on the " + "right way.")); + } + } else if (kw_dims != NULL) { + VLOG(6) << "Calling case2's string initializer."; + std::unordered_map kw_order_map{{"dims", 1}, + {"name", 2}}; + + std::vector dims = CastPyArg2VectorOfInt(kw_dims, 0); + std::string act_name = + ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num, + "generated_string_tensor"); + EmptyStringTensorInitializer( + py_tensor_ptr, act_name, + egr::Controller::Instance().GetExpectedPlace(), dims); + return 0; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "We not only support construct Tensor from numpy value " + "or StringTensor with python kwargs by this initializer, " + "but also even support dtype to init a empty StringTensor. " + "Please check your input first and make sure you call the existed " + "constructor.")); + } + } + } else if (args_num == (Py_ssize_t)1) { // case 3 ~ 6 + // 1 position args, remainting arguments are kwargs + PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); + if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { + VLOG(6) << "Calling case3's or case4's string initializer."; + AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); + return 0; + } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast( + p_string_tensor_type))) { + VLOG(6) << "Calling case5's or case6's string initializer."; + AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args, + flag_kwargs, args_num); + return 0; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Could not parse the first keyword argument successfully, " + "the first keyword argument is value, but it should be PyArray " + "or StringTensor." + "Please check your input first and make sure you are on the " + "right way.")); + } + } else if (args_num == (Py_ssize_t)2) { // case 2 + // 2 position args + if (!flag_kwargs) { + PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0); + if (PyObject_IsInstance( + arg0_ptr, reinterpret_cast(p_string_tensor_type))) { + VLOG(6) << "Calling case6's string initializer."; + AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args, + flag_kwargs, args_num); + return 0; + } else if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) { + VLOG(6) << "Calling case3's string initializer."; + AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs, + args_num); + return 0; + } else { + VLOG(6) << "Calling case2's string initializer."; + std::vector dims = CastPyArg2VectorOfInt(arg0_ptr, 0); + std::string act_name = ""; + PyObject* name_obj = PyTuple_GET_ITEM(args, 1); + if (name_obj == Py_None) { + act_name = egr::Controller::Instance().GenerateUniqueName( + "generated_string_tensor"); + } else { + act_name = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); + } + EmptyStringTensorInitializer( + py_tensor_ptr, act_name, + egr::Controller::Instance().GetExpectedPlace(), dims); + return 0; + } + } else { + PADDLE_THROW(platform::errors::Fatal( + "Can't not find expected num of args, please check your call, and " + "make sure u call the existed constructor.")); + } + } + return 1; +} + static void TensorDealloc(TensorObject* self) { if (self->weakrefs != NULL) PyObject_ClearWeakRefs(reinterpret_cast(self)); @@ -716,8 +1033,10 @@ static void TensorDealloc(TensorObject* self) { } extern struct PyGetSetDef variable_properties[]; +extern struct PyGetSetDef string_tensor_variable_properties[]; extern PyMethodDef variable_methods[]; +extern PyMethodDef 
string_tensor_variable_methods[]; PyNumberMethods number_methods; PySequenceMethods sequence_methods; @@ -772,5 +1091,49 @@ void BindEager(pybind11::module* module) { BindEagerOpFunctions(&m); } +void BindEagerStringTensor(pybind11::module* module) { + auto m = module->def_submodule("eager"); + + auto heap_type = reinterpret_cast( + PyType_Type.tp_alloc(&PyType_Type, 0)); + heap_type->ht_name = ToPyObject("StringTensor"); + heap_type->ht_qualname = ToPyObject("StringTensor"); + auto type = &heap_type->ht_type; + type->tp_name = "StringTensor"; + type->tp_basicsize = sizeof(TensorObject); + type->tp_dealloc = (destructor)TensorDealloc; + type->tp_as_number = &number_methods; + type->tp_as_sequence = &sequence_methods; + type->tp_as_mapping = &mapping_methods; + type->tp_methods = string_tensor_variable_methods; + type->tp_getset = string_tensor_variable_properties; + type->tp_init = StringTensorInit; + type->tp_new = TensorNew; + Py_INCREF(&PyBaseObject_Type); + type->tp_base = reinterpret_cast(&PyBaseObject_Type); + type->tp_flags |= + Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE; +#if PY_VERSION_HEX >= 0x03050000 + type->tp_as_async = &heap_type->as_async; +#endif + p_string_tensor_type = type; + + if (PyType_Ready(type) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle error in BindEager(PyType_Ready).")); + return; + } + + Py_INCREF(type); + if (PyModule_AddObject(m.ptr(), "StringTensor", + reinterpret_cast(type)) < 0) { + Py_DECREF(type); + Py_DECREF(m.ptr()); + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle error in BindEagerStringTensor(PyModule_AddObject).")); + return; + } +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager.h b/paddle/fluid/pybind/eager.h index 03676a677ac..84df71ddeeb 100644 --- a/paddle/fluid/pybind/eager.h +++ b/paddle/fluid/pybind/eager.h @@ -39,6 +39,7 @@ typedef struct { } PyLayerObject; void BindEager(pybind11::module* m); +void BindEagerStringTensor(pybind11::module* module); void BindFunctions(PyObject* module); void BindEagerPyLayer(PyObject* module); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 8304db13c46..542d59318bb 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -257,6 +257,72 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_numpy_for_string_tensor(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto& api = pybind11::detail::npy_api::get(); + if (!self->tensor.impl() || !self->tensor.impl()->initialized()) { + VLOG(6) << "The StringTensor is uninitialized. 
Return an empty string "
+               "numpy array.";
+    Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank];
+    Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank];
+    py_dims[0] = 0;
+    py_strides[0] = 0;
+
+    PyObject* array = api.PyArray_NewFromDescr_(
+        api.PyArray_Type_,
+        api.PyArray_DescrFromType_(pybind11::detail::npy_api::NPY_UNICODE_), 1,
+        py_dims, py_strides, nullptr,
+        pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ |
+            pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
+        nullptr);
+    return array;
+  }
+
+  if (self->tensor.is_cpu()) {
+    VLOG(6) << "Getting StringTensor's numpy value";
+    auto string_tensor =
+        std::dynamic_pointer_cast<phi::StringTensor>(self->tensor.impl());
+    const auto* st_ptr = string_tensor->data();
+    auto numel = self->tensor.numel();
+    auto tensor_dims = self->tensor.shape();
+    // Get the max unicode length of the StringTensor to create the numpy
+    // unicode string array.
+    auto* longest_pstring = std::max_element(
+        st_ptr, st_ptr + numel, [](const auto& a, const auto& b) {
+          auto a_unicode_len =
+              phi::strings::GetUnicodeStrLen(a.data(), a.size());
+          auto b_unicode_len =
+              phi::strings::GetUnicodeStrLen(b.data(), b.size());
+          return a_unicode_len < b_unicode_len;
+        });
+    size_t max_unicode_length = phi::strings::GetUnicodeStrLen(
+        longest_pstring->data(), longest_pstring->size());
+    max_unicode_length = (max_unicode_length == 0) ? 1 : max_unicode_length;
+    VLOG(6) << "The max unicode length is " << max_unicode_length;
+    auto sp = std::make_unique<uint32_t[]>(max_unicode_length * numel);
+    auto py_array_data = sp.get();
+    memset(py_array_data, 0, max_unicode_length * numel * sizeof(uint32_t));
+    for (int64_t i = 0; i < numel; ++i) {
+      auto curr_unicode_len =
+          phi::strings::GetUnicodeStrLen(st_ptr[i].data(), st_ptr[i].size());
+      phi::strings::GetUnicodeStr(st_ptr[i].data(),
+                                  py_array_data + i * max_unicode_length,
+                                  curr_unicode_len);
+    }
+    py::array array(py::dtype("U" + std::to_string(max_unicode_length)),
+                    tensor_dims, {}, py_array_data);
+    return array.release().ptr();
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "StringTensor.numpy() only supports CPU tensors."));
+    Py_INCREF(Py_None);
+    return Py_None;
+  }
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 static PyObject* tensor_method__is_initialized(TensorObject* self,
                                                PyObject* args,
                                                PyObject* kwargs) {
@@ -1433,6 +1499,18 @@ static PyObject* tensor_method__uva(TensorObject* self, PyObject* args,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 #endif
+static PyObject* tensor_method__is_string_tensor_hold_allocation(
+    TensorObject* self, PyObject* args, PyObject* kwargs) {
+  EAGER_TRY
+  auto string_tensor =
+      std::dynamic_pointer_cast<phi::StringTensor>(self->tensor.impl());
+  if (string_tensor) {
+    return ToPyObject(string_tensor->initialized());
+  } else {
+    return ToPyObject(false);
+  }
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
 
 PyMethodDef variable_methods[] = {
     {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy,
@@ -1545,5 +1623,20 @@ PyMethodDef variable_methods[] = {
 #endif
     {NULL, NULL, 0, NULL}};
 
+// variable_methods for core.eager.StringTensor
+PyMethodDef string_tensor_variable_methods[] = {
+    {"numpy",
+     (PyCFunction)(void (*)(void))tensor_method_numpy_for_string_tensor,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"_is_initialized",
+     (PyCFunction)(void (*)(void))tensor_method__is_initialized,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"_is_string_tensor_hold_allocation",
+     (PyCFunction)(
+         void (*)(void))tensor_method__is_string_tensor_hold_allocation,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    // TODO(zhoushunjie): Need to add _copy_to, copy_ for
StringTensor. + {NULL, NULL, 0, NULL}}; + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 797b68fcb36..de66308a7ba 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -204,5 +204,15 @@ struct PyGetSetDef variable_properties[] = { {"is_leaf", (getter)tensor_properties_is_leaf, nullptr, nullptr, nullptr}, {nullptr, nullptr, nullptr, nullptr, nullptr}}; +// variable_properties for core.eager.StringTensor +struct PyGetSetDef string_tensor_variable_properties[] = { + {"name", (getter)tensor_properties_get_name, + (setter)tensor_properties_set_name, nullptr, nullptr}, + {"shape", (getter)tensor_properties_get_shape, nullptr, nullptr, nullptr}, + {"place", (getter)tensor_properties_get_place, nullptr, nullptr, nullptr}, + {"_place_str", (getter)tensor_properties_get_place_str, nullptr, nullptr, + nullptr}, + {nullptr, nullptr, nullptr, nullptr, nullptr}}; + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 8baea3d0dbf..8fa21ef45f8 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -36,6 +36,7 @@ namespace paddle { namespace pybind { extern PyTypeObject* p_tensor_type; +extern PyTypeObject* p_string_tensor_type; extern PyTypeObject* g_framework_scope_pytype; extern PyTypeObject* g_vartype_pytype; @@ -75,6 +76,8 @@ int TensorDtype2NumpyDtype(phi::DataType dtype) { return pybind11::detail::NPY_COMPLEX64; case phi::DataType::COMPLEX128: return pybind11::detail::NPY_COMPLEX128; + case phi::DataType::PSTRING: + return pybind11::detail::npy_api::NPY_UNICODE_; default: PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Unknow phi::DataType, the int value = %d.", @@ -198,7 +201,9 @@ bool IsEagerTensor(PyObject* obj) { } paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { - if (PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type))) { + if (PyObject_IsInstance(obj, reinterpret_cast(p_tensor_type)) || + PyObject_IsInstance(obj, + reinterpret_cast(p_string_tensor_type))) { return reinterpret_cast(obj)->tensor; } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -508,7 +513,14 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value, Py_INCREF(Py_None); return Py_None; } - PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); + PyObject* obj = nullptr; + if (value.initialized() && value.is_string_tensor()) { + // In order to return the core.eager.StringTensor, there is need + // to use p_string_tensor_type to create a python obj. 
+    obj = p_string_tensor_type->tp_alloc(p_string_tensor_type, 0);
+  } else {
+    obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
+  }
   if (obj) {
     auto v = reinterpret_cast<TensorObject*>(obj);
     new (&(v->tensor)) paddle::experimental::Tensor();
@@ -753,6 +765,9 @@ static paddle::experimental::Tensor& GetTensorFromPyObject(
 
   if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) {
     return reinterpret_cast<TensorObject*>(obj)->tensor;
+  } else if (PyObject_IsInstance(
+                 obj, reinterpret_cast<PyObject*>(p_string_tensor_type))) {
+    return reinterpret_cast<TensorObject*>(obj)->tensor;
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
         "%s(): argument '%s' (position %d) must be Tensor, but got %s", op_type,
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 45fcd2fad98..d6071617224 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -545,6 +545,7 @@ PYBIND11_MODULE(core_noavx, m) {
 
   BindImperative(&m);
   BindEager(&m);
+  BindEagerStringTensor(&m);
   BindCudaStream(&m);
 
   // Not used, just make sure cpu_info.cc is linked.
diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h
index 3f7ce8b63f9..63b36bd9173 100644
--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -36,6 +36,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/core/string_tensor.h"
+#include "paddle/phi/kernels/strings/unicode.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 
@@ -528,6 +530,60 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj,
   }
 }
 
+template <typename P>
+void SetStringTensorFromPyArray(phi::StringTensor *self, const py::array &array,
+                                const P &place) {
+  bool is_string_pyarray =
+      array.dtype().kind() == 'S' || array.dtype().kind() == 'U';
+  PADDLE_ENFORCE_EQ(is_string_pyarray, true,
+                    platform::errors::InvalidArgument(
+                        "Expect the dtype of the numpy array to be string or "
+                        "unicode, but received dtype %s",
+                        array.dtype()));
+  std::vector<int> dims;
+  dims.reserve(array.ndim());
+  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
+    dims.push_back(static_cast<int>(array.shape()[i]));
+  }
+  self->Resize(phi::make_ddim(dims));
+  auto itemsize = array.itemsize();
+  if (paddle::platform::is_cpu_place(place)) {
+    auto dst = self->mutable_data(place);
+    if (array.dtype().kind() == 'S') {
+      for (int i = 0; i < self->numel(); ++i) {
+        dst[i] = pstring(
+            reinterpret_cast<const char *>(array.data()) + itemsize * i,
+            itemsize);
+      }
+    } else {
+      // array.dtype().kind() == 'U'
+      VLOG(6) << "numpy array itemsize: " << itemsize;
+      for (int i = 0; i < self->numel(); ++i) {
+        // Note(zhoushunjie): The itemsize of a unicode numpy array is the
+        // size of each unicode string. Each unicode string is padded to the
+        // max length of the strings in the array, so every item has the same
+        // size. Each unicode character takes 4 bytes, so the itemsize is
+        // 4 times the length of the unicode string.
+ auto unicode_len = itemsize / 4; + auto utf8_len = phi::strings::GetUTF8StrLen( + reinterpret_cast(array.data()) + unicode_len * i, + unicode_len); + pstring pstr(utf8_len - 1, 0); + phi::strings::GetUTF8Str( + reinterpret_cast(array.data()) + unicode_len * i, + pstr.mdata(), unicode_len); + dst[i] = pstr; + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "StringTensor only support CPUPlace now, but receive %s", + place.DebugString())); + } +} + template void SetUVATensorFromPyArrayImpl(framework::LoDTensor *self_tensor, const py::array_t &array, int device_id) { diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index 3c3da4b749e..e4a97e2c16f 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -245,6 +245,14 @@ class PADDLE_API Tensor final { */ bool is_sparse_csr_tensor() const; + /** + * @brief Determine whether tensor is StringTensor + * + * @return true + * @return false + */ + bool is_string_tensor() const; + /* Part 3: Device and Backend methods */ /** diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 1fb08033798..67c1b711fc9 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/core/string_tensor.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_utils.h" @@ -155,6 +156,9 @@ bool Tensor::is_sparse_coo_tensor() const { bool Tensor::is_sparse_csr_tensor() const { return phi::SparseCsrTensor::classof(impl_.get()); } +bool Tensor::is_string_tensor() const { + return phi::StringTensor::classof(impl_.get()); +} /* Part 3: Device and Backend methods */ Place Tensor::place() const { diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 42f12b78204..35444dc33fe 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/core/string_tensor.h" +#include "paddle/phi/api/lib/utils/storage.h" namespace phi { @@ -161,4 +162,32 @@ void* StringTensor::AllocateFrom(Allocator* allocator, meta_.offset); } +dtype::pstring* StringTensor::mutable_data(const phi::Place& place, + size_t requested_size) { + PADDLE_ENFORCE_GE( + numel(), + 0, + phi::errors::PreconditionNotMet( + "The Tensor's element number must be equal or greater than zero. 
" + "The Tensor's shape is [", + dims(), + "] now")); + size_t size = numel() * SizeOf(dtype()); + if (requested_size && (requested_size > size)) { + size = requested_size; + } + + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + meta_.offset) { + holder_.reset(); + holder_ = paddle::memory::AllocShared(place, size); + // Initialize the allocated bytes + init_holder(); + meta_.offset = 0; + } + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + meta_.offset); +} + } // namespace phi diff --git a/paddle/phi/core/string_tensor.h b/paddle/phi/core/string_tensor.h index 223ecaca581..916c2a2bd4a 100644 --- a/paddle/phi/core/string_tensor.h +++ b/paddle/phi/core/string_tensor.h @@ -122,6 +122,8 @@ class StringTensor : public TensorBase, void* AllocateFrom(Allocator* allocator, DataType dtype, size_t requested_size = 0); + dtype::pstring* mutable_data(const phi::Place& place, + size_t requested_size = 0); private: friend class StringTensorUtils; diff --git a/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py new file mode 100644 index 00000000000..def5f569b8f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_egr_string_tensor_api.py @@ -0,0 +1,105 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.fluid.core as core +import paddle +import numpy as np +from paddle.fluid.framework import _test_eager_guard, in_dygraph_mode +import unittest +import copy + + +class EagerStringTensorTestCase(unittest.TestCase): + def setUp(self): + self.str_arr = np.array([ + ["15.4寸笔记本的键盘确实爽,基本跟台式机差不多了,蛮喜欢数字小键盘,输数字特方便,样子也很美观,做工也相当不错" + ], # From ChnSentiCorp + ["One of the very best Three Stooges shorts ever."] + ]) # From IMDB + + def test_constructor_with_args(self): + with _test_eager_guard(): + ST1 = core.eager.StringTensor() # constructor 1 + self.assertEqual(ST1.name, "generated_string_tensor_0") + self.assertEqual(ST1.shape, []) + self.assertEqual(ST1.numpy(), '') + + shape = [2, 3] + ST2 = core.eager.StringTensor(shape, "ST2") # constructor 2 + self.assertEqual(ST2.name, "ST2") + self.assertEqual(ST2.shape, shape) + self.assertTrue( + np.array_equal( + ST2.numpy(), np.empty( + shape, dtype=np.unicode_))) + + ST3 = core.eager.StringTensor(self.str_arr, "ST3") # constructor 3 + self.assertEqual(ST3.name, "ST3") + self.assertEqual(ST3.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST3.numpy(), self.str_arr)) + + ST4 = core.eager.StringTensor(self.str_arr) # constructor 4 + self.assertEqual(ST4.name, "generated_string_tensor_1") + self.assertEqual(ST4.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST4.numpy(), self.str_arr)) + + ST5 = core.eager.StringTensor(ST4) # constructor 5 + self.assertEqual(ST5.name, "generated_string_tensor_2") + self.assertEqual(ST5.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST5.numpy(), self.str_arr)) + + ST6 = core.eager.StringTensor(ST5, "ST6") # constructor 6 + self.assertEqual(ST6.name, "ST6") + self.assertEqual(ST6.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST6.numpy(), self.str_arr)) + + for st in [ST1, ST2, ST3, ST4, ST5, ST6]: + # All StringTensors are on cpu place so far. + self.assertTrue(st.place._equals(core.CPUPlace())) + + def test_constructor_with_kwargs(self): + with _test_eager_guard(): + shape = [2, 3] + ST1 = core.eager.StringTensor( + dims=shape, name="ST1") # constructor 2 + self.assertEqual(ST1.name, "ST1") + self.assertEqual(ST1.shape, shape) + self.assertTrue( + np.array_equal( + ST1.numpy(), np.empty( + shape, dtype=np.unicode_))) + + ST2 = core.eager.StringTensor( + self.str_arr, name="ST2") # constructor 3 + self.assertEqual(ST2.name, "ST2") + self.assertEqual(ST2.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST2.numpy(), self.str_arr)) + + ST3 = core.eager.StringTensor(ST2, name="ST3") # constructor 6 + self.assertEqual(ST3.name, "ST3") + self.assertEqual(ST3.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST3.numpy(), self.str_arr)) + + ST4 = core.eager.StringTensor( + value=ST2, name="ST4") # constructor 6 + self.assertEqual(ST4.name, "ST4") + self.assertEqual(ST4.shape, list(self.str_arr.shape)) + self.assertTrue(np.array_equal(ST4.numpy(), self.str_arr)) + for st in [ST1, ST2, ST3, ST4]: + # All StringTensors are on cpu place so far. + self.assertTrue(st.place._equals(core.CPUPlace())) + + +if __name__ == "__main__": + unittest.main() -- GitLab
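Usage sketch (illustrative, not part of the patch): the snippet below drives the same StringTensor constructors that test_egr_string_tensor_api.py exercises. It assumes an eager-mode build that exposes core.eager.StringTensor; the variable names are made up for the example.

    import numpy as np
    import paddle.fluid.core as core
    from paddle.fluid.framework import _test_eager_guard

    with _test_eager_guard():
        # Constructors 3/4: build a StringTensor from a numpy unicode array.
        arr = np.array([["hello", "world"], ["paddle", "string tensor"]])
        st = core.eager.StringTensor(arr, "st_from_numpy")
        print(st.shape)    # [2, 2]
        print(st.numpy())  # round-trips the unicode strings

        # Constructor 2: build an empty StringTensor from dims.
        empty_st = core.eager.StringTensor(dims=[2, 3], name="st_from_dims")
        print(empty_st._is_initialized())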