未验证 提交 a22b68b8 编写于 作者: J Jack Zhou 提交者: GitHub

Add eager string tensor (#41039)

* Add core.eager.StringTensor __init__ which pyarray args can be passed

* Add the numpy method of core.eager.StringTensor

* revert tensor.to_string modification

* Add ToPyObject for core.eager.StringTensor

* Add debug string for core.eager.StringTensor

* Remove place args of core.eager.StringTensor temporarily

* Fix check string_tensor error

* remove dtype of core.eager.StringTensor

* add core.eager.StringTensor unittest

* remove pstring from VarDesc

* Add InitStringTensorWithStringTensor

* Remove to_string modification

* Remove zero_copy arg from StringTensor creator
上级 ef6ff4ef
...@@ -36,12 +36,14 @@ limitations under the License. */ ...@@ -36,12 +36,14 @@ limitations under the License. */
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/api/lib/utils/tensor_utils.h"
#include "paddle/phi/core/string_tensor.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
namespace py = ::pybind11; namespace py = ::pybind11;
PyTypeObject* p_tensor_type; PyTypeObject* p_tensor_type;
PyTypeObject* p_string_tensor_type; // For StringTensor
extern PyTypeObject* g_vartype_pytype; extern PyTypeObject* g_vartype_pytype;
extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* g_framework_tensor_pytype;
...@@ -101,6 +103,25 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, ...@@ -101,6 +103,25 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name,
} }
} }
// Creates an empty phi::StringTensor with the given dims and installs it as
// the implementation of `self`, naming the tensor `name`.
// Note: StringTensor creation currently supports CPUPlace only, so `place`
// is accepted for interface symmetry with the dense initializer but is not
// used yet.
void EmptyStringTensorInitializer(TensorObject* self, const std::string& name,
                                  const paddle::platform::Place& place,
                                  const std::vector<int>& dims = {}) {
  self->tensor.set_name(name);
  const auto string_dims = phi::make_ddim(dims);
  // Only CPUPlace is supported when creating a StringTensor.
  auto cpu_place = platform::CPUPlace();
  const auto allocator =
      std::make_unique<paddle::experimental::DefaultAllocator>(cpu_place);
  // NOTE(review): the allocator is a function-local unique_ptr while the
  // StringTensor outlives this scope -- presumably the tensor does not keep
  // the raw Allocator* past construction; confirm against phi::StringTensor.
  auto string_tensor = std::make_shared<phi::StringTensor>(
      allocator.get(), phi::StringTensorMeta{string_dims});
  // Eagerly allocate storage only for non-empty shapes.
  if (phi::product(string_dims) > 0) {
    string_tensor->mutable_data(cpu_place);
  }
  self->tensor.set_impl(string_tensor);
}
void InitTensorWithNumpyValue(TensorObject* self, const py::object& array, void InitTensorWithNumpyValue(TensorObject* self, const py::object& array,
const paddle::platform::Place& place, const paddle::platform::Place& place,
bool zero_copy = false) { bool zero_copy = false) {
...@@ -132,6 +153,28 @@ void InitTensorWithNumpyValue(TensorObject* self, const py::object& array, ...@@ -132,6 +153,28 @@ void InitTensorWithNumpyValue(TensorObject* self, const py::object& array,
} }
} }
// Fills an already-constructed eager StringTensor (`self`) with the contents
// of a numpy array. `self` must have been initialized first (e.g. via
// EmptyStringTensorInitializer) so that a phi::StringTensor impl exists.
void InitStringTensorWithNumpyValue(TensorObject* self, const py::object& obj) {
  PADDLE_ENFORCE_EQ(
      self->tensor.defined(), true,
      paddle::platform::errors::Fatal(
          "Calling InitStringTensorWithNumpyValue of Eager StringTensor "
          "without EmptyStringTensorInitializer is forbidden. Please check "
          "your code and make sure you new a "
          "eager tensor before init it with NumPy."));
  auto* string_tensor =
      static_cast<phi::StringTensor*>(self->tensor.impl().get());
  paddle::platform::Place place = string_tensor->place();
  auto numpy_array = obj.cast<py::array>();
  // StringTensor only lives on CPU; reject anything else.
  if (!platform::is_cpu_place(place)) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "StringTensor only support CPUPlace now, but receive %s",
        place.DebugString()));
  }
  SetStringTensorFromPyArray<platform::CPUPlace>(string_tensor, numpy_array,
                                                 place);
}
void InitTensorWithTensor(TensorObject* self, void InitTensorWithTensor(TensorObject* self,
const paddle::experimental::Tensor& src, const paddle::experimental::Tensor& src,
const paddle::platform::Place& place, const paddle::platform::Place& place,
...@@ -171,6 +214,17 @@ void InitTensorWithFrameworkTensor(TensorObject* self, ...@@ -171,6 +214,17 @@ void InitTensorWithFrameworkTensor(TensorObject* self,
egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false);
} }
// Initializes `self` by sharing the underlying phi::StringTensor of `src`
// (no data copy). `place` is currently unused because StringTensor is
// CPU-only.
void InitStringTensorWithStringTensor(TensorObject* self,
                                      const paddle::experimental::Tensor& src,
                                      const paddle::platform::Place& place,
                                      const std::string& name) {
  self->tensor.set_name(name);
  // Share the impl pointer so both tensors view the same storage.
  self->tensor.set_impl(
      std::static_pointer_cast<phi::StringTensor>(src.impl()));
  VLOG(4)
      << "Do ShareDataWith when using StringTensor to initialize StringTensor";
}
py::object ParsePyArray( py::object ParsePyArray(
std::unordered_map<std::string, PyObject*> kws_map, std::unordered_map<std::string, PyObject*> kws_map,
std::unordered_map<std::string, Py_ssize_t> kw_order_map, PyObject* args, std::unordered_map<std::string, Py_ssize_t> kw_order_map, PyObject* args,
...@@ -236,13 +290,14 @@ int ParseBooleanArgs(std::string key, ...@@ -236,13 +290,14 @@ int ParseBooleanArgs(std::string key,
std::string ParseName(std::unordered_map<std::string, PyObject*> kws_map, std::string ParseName(std::unordered_map<std::string, PyObject*> kws_map,
std::unordered_map<std::string, Py_ssize_t> kw_order_map, std::unordered_map<std::string, Py_ssize_t> kw_order_map,
PyObject* args, bool flag_kwargs, Py_ssize_t args_num) { PyObject* args, bool flag_kwargs, Py_ssize_t args_num,
std::string unique_name_prefix = "generated_tensor") {
std::string act_name = ""; std::string act_name = "";
if (kw_order_map["name"] <= args_num) { if (kw_order_map["name"] <= args_num) {
PyObject* name_obj = PyTuple_GET_ITEM(args, kw_order_map["name"] - 1); PyObject* name_obj = PyTuple_GET_ITEM(args, kw_order_map["name"] - 1);
if (name_obj == Py_None) { if (name_obj == Py_None) {
act_name = act_name =
egr::Controller::Instance().GenerateUniqueName("generated_tensor"); egr::Controller::Instance().GenerateUniqueName(unique_name_prefix);
} else { } else {
act_name = CastPyArg2AttrString(name_obj, kw_order_map["name"] - 1); act_name = CastPyArg2AttrString(name_obj, kw_order_map["name"] - 1);
} }
...@@ -250,13 +305,13 @@ std::string ParseName(std::unordered_map<std::string, PyObject*> kws_map, ...@@ -250,13 +305,13 @@ std::string ParseName(std::unordered_map<std::string, PyObject*> kws_map,
if (flag_kwargs) { if (flag_kwargs) {
if ((kws_map["name"] == NULL) || (kws_map["name"] == Py_None)) { if ((kws_map["name"] == NULL) || (kws_map["name"] == Py_None)) {
act_name = act_name =
egr::Controller::Instance().GenerateUniqueName("generated_tensor"); egr::Controller::Instance().GenerateUniqueName(unique_name_prefix);
} else { } else {
act_name = CastPyArg2AttrString(kws_map["name"], 0); act_name = CastPyArg2AttrString(kws_map["name"], 0);
} }
} else { } else {
act_name = act_name =
egr::Controller::Instance().GenerateUniqueName("generated_tensor"); egr::Controller::Instance().GenerateUniqueName(unique_name_prefix);
} }
} }
return act_name; return act_name;
...@@ -368,6 +423,70 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr, ...@@ -368,6 +423,70 @@ void AutoInitTensorByTensor(TensorObject* py_tensor_ptr,
} }
} }
// Builds a StringTensor from a numpy array, resolving the (value, name)
// arguments from python positional args and/or kwargs.
// The StringTensor(value, name) constructor takes the numpy array first;
// kw_order_map records each argument's 1-based positional slot. Update the
// map (and the related parsing code) if the constructor signature changes.
void AutoInitStringTensorByPyArray(
    TensorObject* py_tensor_ptr,
    std::unordered_map<std::string, PyObject*> kws_map, PyObject* args,
    bool flag_kwargs, Py_ssize_t args_num) {
  const std::unordered_map<std::string, Py_ssize_t> kw_order_map{{"value", 1},
                                                                 {"name", 2}};
  paddle::platform::Place place =
      egr::Controller::Instance().GetExpectedPlace();
  py::object numpy_value =
      ParsePyArray(kws_map, kw_order_map, args, flag_kwargs, args_num);
  std::string act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs,
                                   args_num, "generated_string_tensor");
  // First create an empty StringTensor, then copy the numpy contents in.
  EmptyStringTensorInitializer(py_tensor_ptr, act_name, place);
  InitStringTensorWithNumpyValue(py_tensor_ptr, numpy_value);
}
// Builds a StringTensor that shares storage with another StringTensor,
// resolving the (value, name) arguments from python positional args and/or
// kwargs. kw_order_map records each argument's 1-based positional slot;
// update it (and the related parsing code) if the constructor signature
// changes.
void AutoInitStringTensorByStringTensor(
    TensorObject* py_tensor_ptr,
    std::unordered_map<std::string, PyObject*> kws_map, PyObject* args,
    bool flag_kwargs, Py_ssize_t args_num) {
  const std::unordered_map<std::string, Py_ssize_t> kw_order_map{{"value", 1},
                                                                 {"name", 2}};
  paddle::platform::Place place =
      egr::Controller::Instance().GetExpectedPlace();
  std::string act_name = ParseName(kws_map, kw_order_map, args, flag_kwargs,
                                   args_num, "generated_string_tensor");
  // Resolve the source tensor: prefer the positional slot, then the kwarg.
  paddle::experimental::Tensor src_tensor;
  if (kw_order_map.at("value") <= args_num) {
    const Py_ssize_t value_pos = kw_order_map.at("value") - 1;
    src_tensor =
        CastPyArg2Tensor(PyTuple_GET_ITEM(args, value_pos), value_pos);
  } else if (flag_kwargs && kws_map["value"] != NULL) {
    src_tensor = CastPyArg2Tensor(kws_map["value"], 0);
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "The first expected kwargs is {value: Tensor}, "
        "but could not parse the first argument {value: Tensor} "
        "successfully. "
        "Please check your input first and make sure you are on the right "
        "way."));
  }
  InitStringTensorWithStringTensor(py_tensor_ptr, src_tensor, place, act_name);
}
/** We should have init function with signature: /** We should have init function with signature:
* 1. * 1.
* def __init__ () * def __init__ ()
...@@ -708,6 +827,204 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) { ...@@ -708,6 +827,204 @@ int TensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
EAGER_CATCH_AND_THROW_RETURN_NEG EAGER_CATCH_AND_THROW_RETURN_NEG
} }
/** The __init__ of core.eager.StringTensor supports the signatures below.
 * Arguments may be passed positionally or as keywords; `zero_copy` is
 * accepted for interface compatibility but is currently ignored for
 * StringTensor.
 * 1.
 * def __init__ ()
 *
 * 2.
 * def __init__ (
 * ** dims: vector<int>,
 * ** name: std::string)
 *
 * 3.
 * (should have at least one parameter, one parameter equals to case 4, zero
 * parameter equals to case 1)
 * def __init__ (
 * ** value: ndarray,
 * ** zero_copy: bool,
 * ** name: std::string)
 *
 * 4.
 * def __init__ (
 * ** value: ndarray)
 *
 * 5.
 * def __init__ (
 * ** tensor: Tensor)
 *
 * 6.
 * (should have at least one parameter, one parameter equals to case 5, zero
 * parameter equals to case 1.)
 * def __init__ (
 * ** tensor: Tensor,
 * ** name: std::string)
 * **/
// Returns 0 on success and -1 (with a Python exception set) on failure, as
// the CPython tp_init protocol requires. The body is wrapped in
// EAGER_TRY/EAGER_CATCH_AND_THROW_RETURN_NEG -- consistent with TensorInit --
// so PADDLE_THROW C++ exceptions are converted to Python exceptions instead
// of unwinding through the CPython C frame.
int StringTensorInit(PyObject* self, PyObject* args, PyObject* kwargs) {
  EAGER_TRY
  // Record whether any keyword arguments were supplied.
  bool flag_kwargs = false;
  if (kwargs) flag_kwargs = true;

  // All supported kwargs.
  PyObject* kw_zero_copy = NULL;
  PyObject* kw_value = NULL;  // receive PyArray or Tensor
  PyObject* kw_name = NULL;
  PyObject* kw_dims = NULL;

  // The keyword argument names, in positional order.
  static char* kwlist[] = {
      const_cast<char*>("value"), const_cast<char*>("zero_copy"),
      const_cast<char*>("name"), const_cast<char*>("dims"), NULL};

  // 'O' stores a Python object (without any conversion) in a C object
  // pointer; '|' marks the remaining arguments as optional.
  // PyArg_ParseTupleAndKeywords parses both positional and keyword
  // parameters into the local variables above, covering cases 1-6.
  bool flag_ =
      PyArg_ParseTupleAndKeywords(args, kwargs, "|OOOO", kwlist, &kw_value,
                                  &kw_zero_copy, &kw_name, &kw_dims);

  // Helper map: kwarg name -> parsed PyObject (NULL when absent).
  std::unordered_map<std::string, PyObject*> kws_map{
      {"value", kw_value},
      {"zero_copy", kw_zero_copy},
      {"name", kw_name},
      {"dims", kw_dims}};

  PADDLE_ENFORCE_EQ(flag_, true,
                    paddle::platform::errors::PreconditionNotMet(
                        "Could not parse args and kwargs successfully, "
                        "please check your input first and make "
                        "sure you are on the right way. "
                        "The expected arguments as follow: ("
                        "value, zero_copy, name, dims)"));

  PADDLE_ENFORCE_NOT_NULL(
      self, paddle::platform::errors::Fatal(
                "Calling __init__ of Eager Tensor without __new__ is "
                "forbidden. Please check your code and make sure you new a "
                "eager tensor before init it."));

  auto py_tensor_ptr = reinterpret_cast<TensorObject*>(self);

  Py_ssize_t args_num = PyTuple_Size(args);
  VLOG(6) << " args_num: " << args_num;
  // args_num == 0 means there are no positional arguments.
  if (args_num == (Py_ssize_t)0) {
    if (!flag_kwargs) {
      // case 1: default-construct an empty StringTensor.
      VLOG(6) << "Calling case1's string initializer.";
      EmptyStringTensorInitializer(
          py_tensor_ptr, egr::Controller::Instance().GenerateUniqueName(
                             "generated_string_tensor"),
          egr::Controller::Instance().GetExpectedPlace());
      return 0;
    } else {
      if (kw_value != NULL) {
        if (pybind11::detail::npy_api::get().PyArray_Check_(kw_value)) {
          VLOG(6) << "Calling case3's or case4's string initializer";
          AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args,
                                        flag_kwargs, args_num);
          return 0;
        } else if (PyObject_IsInstance(kw_value, reinterpret_cast<PyObject*>(
                                                     p_string_tensor_type))) {
          VLOG(6) << "Calling case5's or case6's string initializer";
          AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args,
                                             flag_kwargs, args_num);
          return 0;
        } else {
          PADDLE_THROW(platform::errors::InvalidArgument(
              "Could not parse the first keyword argument successfully, "
              "the first keyword argument is value, but it should be PyArray "
              "or StringTensor. "
              "Please check your input first and make sure you are on the "
              "right way."));
        }
      } else if (kw_dims != NULL) {
        // case 2: empty StringTensor with explicit dims (and optional name).
        VLOG(6) << "Calling case2's string initializer.";
        std::unordered_map<std::string, Py_ssize_t> kw_order_map{{"dims", 1},
                                                                 {"name", 2}};
        std::vector<int> dims = CastPyArg2VectorOfInt(kw_dims, 0);
        std::string act_name =
            ParseName(kws_map, kw_order_map, args, flag_kwargs, args_num,
                      "generated_string_tensor");
        EmptyStringTensorInitializer(
            py_tensor_ptr, act_name,
            egr::Controller::Instance().GetExpectedPlace(), dims);
        return 0;
      } else {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "We not only support construct Tensor from numpy value "
            "or StringTensor with python kwargs by this initializer, "
            "but also even support dtype to init a empty StringTensor. "
            "Please check your input first and make sure you call the existed "
            "constructor."));
      }
    }
  } else if (args_num == (Py_ssize_t)1) {  // case 3 ~ 6
    // 1 positional arg; the remaining arguments are kwargs.
    PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0);
    if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) {
      VLOG(6) << "Calling case3's or case4's string initializer.";
      AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs,
                                    args_num);
      return 0;
    } else if (PyObject_IsInstance(arg0_ptr, reinterpret_cast<PyObject*>(
                                                 p_string_tensor_type))) {
      VLOG(6) << "Calling case5's or case6's string initializer.";
      AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args,
                                         flag_kwargs, args_num);
      return 0;
    } else {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "Could not parse the first keyword argument successfully, "
          "the first keyword argument is value, but it should be PyArray "
          "or StringTensor. "
          "Please check your input first and make sure you are on the "
          "right way."));
    }
  } else if (args_num == (Py_ssize_t)2) {  // case 2
    // 2 positional args.
    if (!flag_kwargs) {
      PyObject* arg0_ptr = PyTuple_GET_ITEM(args, 0);
      if (PyObject_IsInstance(
              arg0_ptr, reinterpret_cast<PyObject*>(p_string_tensor_type))) {
        VLOG(6) << "Calling case6's string initializer.";
        AutoInitStringTensorByStringTensor(py_tensor_ptr, kws_map, args,
                                           flag_kwargs, args_num);
        return 0;
      } else if (pybind11::detail::npy_api::get().PyArray_Check_(arg0_ptr)) {
        VLOG(6) << "Calling case3's string initializer.";
        AutoInitStringTensorByPyArray(py_tensor_ptr, kws_map, args, flag_kwargs,
                                      args_num);
        return 0;
      } else {
        VLOG(6) << "Calling case2's string initializer.";
        std::vector<int> dims = CastPyArg2VectorOfInt(arg0_ptr, 0);
        std::string act_name = "";
        PyObject* name_obj = PyTuple_GET_ITEM(args, 1);
        if (name_obj == Py_None) {
          act_name = egr::Controller::Instance().GenerateUniqueName(
              "generated_string_tensor");
        } else {
          act_name = CastPyArg2AttrString(name_obj, 1);
        }
        EmptyStringTensorInitializer(
            py_tensor_ptr, act_name,
            egr::Controller::Instance().GetExpectedPlace(), dims);
        return 0;
      }
    } else {
      PADDLE_THROW(platform::errors::Fatal(
          "Can't not find expected num of args, please check your call, and "
          "make sure u call the existed constructor."));
    }
  } else {
    // More than 2 positional arguments never matches a supported signature;
    // raise instead of silently returning a half-initialized object.
    PADDLE_THROW(platform::errors::InvalidArgument(
        "Too many positional arguments: the StringTensor constructor "
        "supports at most 2 positional arguments. "
        "Please check your call and make sure you use an existed "
        "constructor."));
  }
  // Unreachable: every branch above either returns 0 or throws; kept to
  // satisfy the compiler. tp_init must return a negative value on failure.
  return -1;
  EAGER_CATCH_AND_THROW_RETURN_NEG
}
static void TensorDealloc(TensorObject* self) { static void TensorDealloc(TensorObject* self) {
if (self->weakrefs != NULL) if (self->weakrefs != NULL)
PyObject_ClearWeakRefs(reinterpret_cast<PyObject*>(self)); PyObject_ClearWeakRefs(reinterpret_cast<PyObject*>(self));
...@@ -716,8 +1033,10 @@ static void TensorDealloc(TensorObject* self) { ...@@ -716,8 +1033,10 @@ static void TensorDealloc(TensorObject* self) {
} }
extern struct PyGetSetDef variable_properties[]; extern struct PyGetSetDef variable_properties[];
extern struct PyGetSetDef string_tensor_variable_properties[];
extern PyMethodDef variable_methods[]; extern PyMethodDef variable_methods[];
extern PyMethodDef string_tensor_variable_methods[];
PyNumberMethods number_methods; PyNumberMethods number_methods;
PySequenceMethods sequence_methods; PySequenceMethods sequence_methods;
...@@ -772,5 +1091,49 @@ void BindEager(pybind11::module* module) { ...@@ -772,5 +1091,49 @@ void BindEager(pybind11::module* module) {
BindEagerOpFunctions(&m); BindEagerOpFunctions(&m);
} }
// Registers the core.eager.StringTensor Python type: builds a heap type that
// reuses TensorObject storage/dealloc but installs the StringTensor-specific
// init, methods, and properties, then publishes it on the "eager" submodule
// and records it in the global p_string_tensor_type.
void BindEagerStringTensor(pybind11::module* module) {
  auto m = module->def_submodule("eager");

  auto heap_type = reinterpret_cast<PyHeapTypeObject*>(
      PyType_Type.tp_alloc(&PyType_Type, 0));
  heap_type->ht_name = ToPyObject("StringTensor");
  heap_type->ht_qualname = ToPyObject("StringTensor");
  auto type = &heap_type->ht_type;
  type->tp_name = "StringTensor";
  type->tp_basicsize = sizeof(TensorObject);
  type->tp_dealloc = (destructor)TensorDealloc;
  type->tp_as_number = &number_methods;
  type->tp_as_sequence = &sequence_methods;
  type->tp_as_mapping = &mapping_methods;
  // StringTensor-specific init, methods, and properties.
  type->tp_methods = string_tensor_variable_methods;
  type->tp_getset = string_tensor_variable_properties;
  type->tp_init = StringTensorInit;
  type->tp_new = TensorNew;
  Py_INCREF(&PyBaseObject_Type);
  type->tp_base = reinterpret_cast<PyTypeObject*>(&PyBaseObject_Type);
  type->tp_flags |=
      Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
#if PY_VERSION_HEX >= 0x03050000
  type->tp_as_async = &heap_type->as_async;
#endif
  p_string_tensor_type = type;

  if (PyType_Ready(type) < 0) {
    // Fixed: message previously said "BindEager", masking the real caller.
    PADDLE_THROW(platform::errors::Fatal(
        "Init Paddle error in BindEagerStringTensor(PyType_Ready)."));
    return;
  }

  Py_INCREF(type);
  if (PyModule_AddObject(m.ptr(), "StringTensor",
                         reinterpret_cast<PyObject*>(type)) < 0) {
    Py_DECREF(type);
    // NOTE(review): decrementing the submodule's refcount here mirrors the
    // sibling Bind* functions -- confirm py::module ownership expects this.
    Py_DECREF(m.ptr());
    PADDLE_THROW(platform::errors::Fatal(
        "Init Paddle error in BindEagerStringTensor(PyModule_AddObject)."));
    return;
  }
}
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -39,6 +39,7 @@ typedef struct { ...@@ -39,6 +39,7 @@ typedef struct {
} PyLayerObject; } PyLayerObject;
void BindEager(pybind11::module* m); void BindEager(pybind11::module* m);
void BindEagerStringTensor(pybind11::module* module);
void BindFunctions(PyObject* module); void BindFunctions(PyObject* module);
void BindEagerPyLayer(PyObject* module); void BindEagerPyLayer(PyObject* module);
......
...@@ -257,6 +257,72 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, ...@@ -257,6 +257,72 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
// Python `StringTensor.numpy()`: converts the pstring elements of a CPU
// StringTensor into a numpy unicode ('U<n>') array, where <n> is the longest
// element's unicode length. An uninitialized tensor yields an empty 1-D
// unicode array. Non-CPU tensors raise InvalidArgument.
static PyObject* tensor_method_numpy_for_string_tensor(TensorObject* self,
                                                       PyObject* args,
                                                       PyObject* kwargs) {
  EAGER_TRY
  auto& api = pybind11::detail::npy_api::get();
  if (!self->tensor.impl() || !self->tensor.impl()->initialized()) {
    VLOG(6) << "The StringTensor is uninitialized. Return the empty string "
               "numpy array.";
    // Build a 0-element, 1-D NPY_UNICODE array directly via the numpy C API.
    Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank];
    Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank];
    py_dims[0] = 0;
    py_strides[0] = 0;
    PyObject* array = api.PyArray_NewFromDescr_(
        api.PyArray_Type_,
        api.PyArray_DescrFromType_(pybind11::detail::npy_api::NPY_UNICODE_), 1,
        py_dims, py_strides, nullptr,
        pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ |
            pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
        nullptr);
    return array;
  }
  if (self->tensor.is_cpu()) {
    VLOG(6) << "Getting StringTensor's numpy value";
    auto string_tensor =
        std::dynamic_pointer_cast<phi::StringTensor>(self->tensor.impl());
    const auto* st_ptr = string_tensor->data();
    auto numel = self->tensor.numel();
    auto tensor_dims = self->tensor.shape();
    // Get the max unicode length of StringTensor to create numpy unicode string
    // array.
    auto* longest_pstring = std::max_element(
        st_ptr, st_ptr + numel, [](const auto& a, const auto& b) {
          auto a_unicode_len =
              phi::strings::GetUnicodeStrLen(a.data(), a.size());
          auto b_unicode_len =
              phi::strings::GetUnicodeStrLen(b.data(), b.size());
          return a_unicode_len < b_unicode_len;
        });
    size_t max_unicode_length = phi::strings::GetUnicodeStrLen(
        longest_pstring->data(), longest_pstring->size());
    // numpy rejects a 'U0' dtype, so clamp the width to at least 1.
    max_unicode_length = (max_unicode_length == 0) ? 1 : max_unicode_length;
    VLOG(6) << "The max unicode length is " << max_unicode_length;
    // Staging buffer of numel fixed-width (max_unicode_length) UCS4 slots,
    // zero-filled so shorter strings are NUL-padded.
    auto sp = std::make_unique<uint32_t[]>(max_unicode_length * numel);
    auto py_array_data = sp.get();
    memset(py_array_data, 0, max_unicode_length * numel * sizeof(uint32_t));
    for (int64_t i = 0; i < numel; ++i) {
      auto curr_unicode_len =
          phi::strings::GetUnicodeStrLen(st_ptr[i].data(), st_ptr[i].size());
      phi::strings::GetUnicodeStr(st_ptr[i].data(),
                                  py_array_data + i * max_unicode_length,
                                  curr_unicode_len);
    }
    // py::array with empty strides copies the staging buffer, so releasing
    // `sp` at scope exit is safe.
    py::array array(py::dtype("U" + std::to_string(max_unicode_length)),
                    tensor_dims, {}, py_array_data);
    return array.release().ptr();
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "StringTensor.numpy() only support cpu tensor."));
    // NOTE(review): PADDLE_THROW throws, so the two lines below look
    // unreachable; presumably kept to satisfy the non-void return path.
    Py_INCREF(Py_None);
    return Py_None;
  }
  EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method__is_initialized(TensorObject* self, static PyObject* tensor_method__is_initialized(TensorObject* self,
PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
...@@ -1433,6 +1499,18 @@ static PyObject* tensor_method__uva(TensorObject* self, PyObject* args, ...@@ -1433,6 +1499,18 @@ static PyObject* tensor_method__uva(TensorObject* self, PyObject* args,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
#endif #endif
// Python `_is_string_tensor_hold_allocation`: returns True iff `self` wraps
// a phi::StringTensor whose storage has been allocated; False when the impl
// is not a StringTensor or is uninitialized.
static PyObject* tensor_method__is_string_tensor_hold_allocation(
    TensorObject* self, PyObject* args, PyObject* kwargs) {
  EAGER_TRY
  const auto string_tensor =
      std::dynamic_pointer_cast<phi::StringTensor>(self->tensor.impl());
  const bool holds_allocation = string_tensor && string_tensor->initialized();
  return ToPyObject(holds_allocation);
  EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyMethodDef variable_methods[] = { PyMethodDef variable_methods[] = {
{"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy,
...@@ -1545,5 +1623,20 @@ PyMethodDef variable_methods[] = { ...@@ -1545,5 +1623,20 @@ PyMethodDef variable_methods[] = {
#endif #endif
{NULL, NULL, 0, NULL}}; {NULL, NULL, 0, NULL}};
// variable_methods for core.eager.StringTensor
PyMethodDef string_tensor_variable_methods[] = {
{"numpy",
(PyCFunction)(void (*)(void))tensor_method_numpy_for_string_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_is_initialized",
(PyCFunction)(void (*)(void))tensor_method__is_initialized,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_is_string_tensor_hold_allocation",
(PyCFunction)(
void (*)(void))tensor_method__is_string_tensor_hold_allocation,
METH_VARARGS | METH_KEYWORDS, NULL},
// TODO(zhoushunjie): Need to add _copy_to, copy_ for StringTensor.
{NULL, NULL, 0, NULL}};
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -204,5 +204,15 @@ struct PyGetSetDef variable_properties[] = { ...@@ -204,5 +204,15 @@ struct PyGetSetDef variable_properties[] = {
{"is_leaf", (getter)tensor_properties_is_leaf, nullptr, nullptr, nullptr}, {"is_leaf", (getter)tensor_properties_is_leaf, nullptr, nullptr, nullptr},
{nullptr, nullptr, nullptr, nullptr, nullptr}}; {nullptr, nullptr, nullptr, nullptr, nullptr}};
// variable_properties for core.eager.StringTensor
struct PyGetSetDef string_tensor_variable_properties[] = {
{"name", (getter)tensor_properties_get_name,
(setter)tensor_properties_set_name, nullptr, nullptr},
{"shape", (getter)tensor_properties_get_shape, nullptr, nullptr, nullptr},
{"place", (getter)tensor_properties_get_place, nullptr, nullptr, nullptr},
{"_place_str", (getter)tensor_properties_get_place_str, nullptr, nullptr,
nullptr},
{nullptr, nullptr, nullptr, nullptr, nullptr}};
} // namespace pybind } // namespace pybind
} // namespace paddle } // namespace paddle
...@@ -36,6 +36,7 @@ namespace paddle { ...@@ -36,6 +36,7 @@ namespace paddle {
namespace pybind { namespace pybind {
extern PyTypeObject* p_tensor_type; extern PyTypeObject* p_tensor_type;
extern PyTypeObject* p_string_tensor_type;
extern PyTypeObject* g_framework_scope_pytype; extern PyTypeObject* g_framework_scope_pytype;
extern PyTypeObject* g_vartype_pytype; extern PyTypeObject* g_vartype_pytype;
...@@ -75,6 +76,8 @@ int TensorDtype2NumpyDtype(phi::DataType dtype) { ...@@ -75,6 +76,8 @@ int TensorDtype2NumpyDtype(phi::DataType dtype) {
return pybind11::detail::NPY_COMPLEX64; return pybind11::detail::NPY_COMPLEX64;
case phi::DataType::COMPLEX128: case phi::DataType::COMPLEX128:
return pybind11::detail::NPY_COMPLEX128; return pybind11::detail::NPY_COMPLEX128;
case phi::DataType::PSTRING:
return pybind11::detail::npy_api::NPY_UNICODE_;
default: default:
PADDLE_THROW(paddle::platform::errors::InvalidArgument( PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Unknow phi::DataType, the int value = %d.", "Unknow phi::DataType, the int value = %d.",
...@@ -198,7 +201,9 @@ bool IsEagerTensor(PyObject* obj) { ...@@ -198,7 +201,9 @@ bool IsEagerTensor(PyObject* obj) {
} }
paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) { paddle::experimental::Tensor CastPyArg2Tensor(PyObject* obj, ssize_t arg_pos) {
if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) { if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type)) ||
PyObject_IsInstance(obj,
reinterpret_cast<PyObject*>(p_string_tensor_type))) {
return reinterpret_cast<TensorObject*>(obj)->tensor; return reinterpret_cast<TensorObject*>(obj)->tensor;
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
...@@ -508,7 +513,14 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value, ...@@ -508,7 +513,14 @@ PyObject* ToPyObject(const paddle::experimental::Tensor& value,
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
PyObject* obj = p_tensor_type->tp_alloc(p_tensor_type, 0); PyObject* obj = nullptr;
if (value.initialized() && value.is_string_tensor()) {
// In order to return the core.eager.StringTensor, there is need
// to use p_string_tensor_type to create a python obj.
obj = p_string_tensor_type->tp_alloc(p_string_tensor_type, 0);
} else {
obj = p_tensor_type->tp_alloc(p_tensor_type, 0);
}
if (obj) { if (obj) {
auto v = reinterpret_cast<TensorObject*>(obj); auto v = reinterpret_cast<TensorObject*>(obj);
new (&(v->tensor)) paddle::experimental::Tensor(); new (&(v->tensor)) paddle::experimental::Tensor();
...@@ -753,6 +765,9 @@ static paddle::experimental::Tensor& GetTensorFromPyObject( ...@@ -753,6 +765,9 @@ static paddle::experimental::Tensor& GetTensorFromPyObject(
if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) { if (PyObject_IsInstance(obj, reinterpret_cast<PyObject*>(p_tensor_type))) {
return reinterpret_cast<TensorObject*>(obj)->tensor; return reinterpret_cast<TensorObject*>(obj)->tensor;
} else if (PyObject_IsInstance(
obj, reinterpret_cast<PyObject*>(p_string_tensor_type))) {
return reinterpret_cast<TensorObject*>(obj)->tensor;
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument '%s' (position %d) must be Tensor, but got %s", op_type, "%s(): argument '%s' (position %d) must be Tensor, but got %s", op_type,
......
...@@ -545,6 +545,7 @@ PYBIND11_MODULE(core_noavx, m) { ...@@ -545,6 +545,7 @@ PYBIND11_MODULE(core_noavx, m) {
BindImperative(&m); BindImperative(&m);
BindEager(&m); BindEager(&m);
BindEagerStringTensor(&m);
BindCudaStream(&m); BindCudaStream(&m);
// Not used, just make sure cpu_info.cc is linked. // Not used, just make sure cpu_info.cc is linked.
......
...@@ -36,6 +36,8 @@ limitations under the License. */ ...@@ -36,6 +36,8 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/kernels/strings/unicode.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
...@@ -528,6 +530,60 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj, ...@@ -528,6 +530,60 @@ void SetTensorFromPyArray(framework::Tensor *self, const py::object &obj,
} }
} }
// Copies a numpy string array into a phi::StringTensor, resizing the tensor
// to the array's shape. Accepts byte-string ('S') and unicode ('U') dtypes;
// any other dtype raises InvalidArgument. Only CPU places are supported.
//
// @param self   destination StringTensor; resized and overwritten.
// @param array  numpy array whose dtype kind is 'S' or 'U'.
// @param place  destination placement; must be a CPU place.
template <typename P>
void SetStringTensorFromPyArray(phi::StringTensor *self, const py::array &array,
                                const P &place) {
  bool is_string_pyarray =
      array.dtype().kind() == 'S' || array.dtype().kind() == 'U';
  PADDLE_ENFORCE_EQ(is_string_pyarray, true,
                    platform::errors::InvalidArgument(
                        "Expect the dtype of numpy array is string or "
                        "unicode, but receive dtype %s",
                        array.dtype()));
  std::vector<int64_t> dims;
  dims.reserve(array.ndim());
  for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) {
    // Widen directly to int64_t; narrowing through int would truncate very
    // large dimensions.
    dims.push_back(static_cast<int64_t>(array.shape()[i]));
  }
  self->Resize(phi::make_ddim(dims));
  auto itemsize = array.itemsize();
  if (paddle::platform::is_cpu_place(place)) {
    auto dst = self->mutable_data(place);
    if (array.dtype().kind() == 'S') {
      // 'S': fixed-width byte strings; each element occupies exactly
      // `itemsize` bytes (zero-padded to the array's max length).
      for (int64_t i = 0; i < self->numel(); ++i) {
        dst[i] =
            pstring(reinterpret_cast<const char *>(array.data()) + itemsize * i,
                    itemsize);
      }
    } else {
      // array.dtype().kind() == 'U'
      VLOG(6) << "numpy array itemsize: " << itemsize;
      for (int64_t i = 0; i < self->numel(); ++i) {
        // Note(zhoushunjie): The itemsize of unicode numpy array is the
        // the size of each unicode string. Each unicode string is aligned
        // to max length of the array of unicode strings, so the size of
        // each unicode string is same. The size of each unicode character is
        // 4, so the size of unicode string is 4 times of the length of
        // unicode string.
        auto unicode_len = itemsize / 4;
        auto utf8_len = phi::strings::GetUTF8StrLen(
            reinterpret_cast<const uint32_t *>(array.data()) + unicode_len * i,
            unicode_len);
        // NOTE(review): GetUTF8StrLen appears to count the terminating NUL
        // (hence `utf8_len - 1`) -- confirm against phi strings/unicode.h.
        pstring pstr(utf8_len - 1, 0);
        phi::strings::GetUTF8Str(
            reinterpret_cast<const uint32_t *>(array.data()) + unicode_len * i,
            pstr.mdata(), unicode_len);
        dst[i] = pstr;
      }
    }
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "StringTensor only support CPUPlace now, but receive %s",
        place.DebugString()));
  }
}
template <typename T> template <typename T>
void SetUVATensorFromPyArrayImpl(framework::LoDTensor *self_tensor, void SetUVATensorFromPyArrayImpl(framework::LoDTensor *self_tensor,
const py::array_t<T> &array, int device_id) { const py::array_t<T> &array, int device_id) {
......
...@@ -245,6 +245,14 @@ class PADDLE_API Tensor final { ...@@ -245,6 +245,14 @@ class PADDLE_API Tensor final {
*/ */
bool is_sparse_csr_tensor() const; bool is_sparse_csr_tensor() const;
/**
* @brief Determine whether tensor is StringTensor
*
* @return true
* @return false
*/
bool is_string_tensor() const;
/* Part 3: Device and Backend methods */ /* Part 3: Device and Backend methods */
/** /**
......
...@@ -28,6 +28,7 @@ limitations under the License. */ ...@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/selected_rows.h"
#include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_base.h"
#include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/tensor_utils.h"
...@@ -155,6 +156,9 @@ bool Tensor::is_sparse_coo_tensor() const { ...@@ -155,6 +156,9 @@ bool Tensor::is_sparse_coo_tensor() const {
bool Tensor::is_sparse_csr_tensor() const { bool Tensor::is_sparse_csr_tensor() const {
return phi::SparseCsrTensor::classof(impl_.get()); return phi::SparseCsrTensor::classof(impl_.get());
} }
// Returns true when the underlying implementation object is a
// phi::StringTensor (checked via phi's classof type-identification
// mechanism, mirroring is_sparse_coo_tensor/is_sparse_csr_tensor above).
bool Tensor::is_string_tensor() const {
  return phi::StringTensor::classof(impl_.get());
}
/* Part 3: Device and Backend methods */ /* Part 3: Device and Backend methods */
Place Tensor::place() const { Place Tensor::place() const {
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/core/string_tensor.h" #include "paddle/phi/core/string_tensor.h"
#include "paddle/phi/api/lib/utils/storage.h"
namespace phi { namespace phi {
...@@ -161,4 +162,32 @@ void* StringTensor::AllocateFrom(Allocator* allocator, ...@@ -161,4 +162,32 @@ void* StringTensor::AllocateFrom(Allocator* allocator,
meta_.offset); meta_.offset);
} }
dtype::pstring* StringTensor::mutable_data(const phi::Place& place,
                                           size_t requested_size) {
  // A negative element count means the tensor meta is corrupted; fail
  // before computing the allocation size.
  PADDLE_ENFORCE_GE(
      numel(),
      0,
      phi::errors::PreconditionNotMet(
          "The Tensor's element number must be equal or greater than zero. "
          "The Tensor's shape is [",
          dims(),
          "] now"));
  // Bytes needed to hold every pstring element; honor a larger explicit
  // request when the caller supplies one.
  size_t bytes = numel() * SizeOf(dtype());
  if (requested_size > bytes) {
    bytes = requested_size;
  }
  // Reallocate when there is no buffer yet, the buffer lives on a different
  // place, or it is too small for the requested span. (Places are compared
  // with == because some boost::variant versions lack operator!=.)
  const bool needs_new_holder = holder_ == nullptr ||
                                !(holder_->place() == place) ||
                                holder_->size() < bytes + meta_.offset;
  if (needs_new_holder) {
    holder_.reset();  // release the old buffer before allocating the new one
    holder_ = paddle::memory::AllocShared(place, bytes);
    // Default-construct the pstrings living in the fresh storage.
    init_holder();
    meta_.offset = 0;
  }
  return reinterpret_cast<dtype::pstring*>(
      reinterpret_cast<uintptr_t>(holder_->ptr()) + meta_.offset);
}
} // namespace phi } // namespace phi
...@@ -122,6 +122,8 @@ class StringTensor : public TensorBase, ...@@ -122,6 +122,8 @@ class StringTensor : public TensorBase,
void* AllocateFrom(Allocator* allocator, void* AllocateFrom(Allocator* allocator,
DataType dtype, DataType dtype,
size_t requested_size = 0); size_t requested_size = 0);
dtype::pstring* mutable_data(const phi::Place& place,
size_t requested_size = 0);
private: private:
friend class StringTensorUtils; friend class StringTensorUtils;
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.core as core
import paddle
import numpy as np
from paddle.fluid.framework import _test_eager_guard, in_dygraph_mode
import unittest
import copy
class EagerStringTensorTestCase(unittest.TestCase):
    """Exercises every constructor overload of ``core.eager.StringTensor``."""

    def setUp(self):
        # A 2x1 array of UTF-8 text: a Chinese product review plus an
        # English movie review, covering multi-byte and ASCII content.
        self.str_arr = np.array([
            ["15.4寸笔记本的键盘确实爽,基本跟台式机差不多了,蛮喜欢数字小键盘,输数字特方便,样子也很美观,做工也相当不错"
             ],  # from the ChnSentiCorp dataset
            ["One of the very best Three Stooges shorts ever."]
        ])  # from the IMDB dataset

    def _check_matches_array(self, tensor, expected_name, expected_arr):
        """Assert the tensor's name, shape and contents match a numpy array."""
        self.assertEqual(tensor.name, expected_name)
        self.assertEqual(tensor.shape, list(expected_arr.shape))
        self.assertTrue(np.array_equal(tensor.numpy(), expected_arr))

    def _check_all_on_cpu(self, tensors):
        """All StringTensors are placed on CPUPlace so far."""
        for tensor in tensors:
            self.assertTrue(tensor.place._equals(core.CPUPlace()))

    def test_constructor_with_args(self):
        with _test_eager_guard():
            # constructor 1: no args -> empty tensor with a generated name
            st_default = core.eager.StringTensor()
            self.assertEqual(st_default.name, "generated_string_tensor_0")
            self.assertEqual(st_default.shape, [])
            self.assertEqual(st_default.numpy(), '')

            dims = [2, 3]
            # constructor 2: positional shape + name
            st_shaped = core.eager.StringTensor(dims, "ST2")
            self.assertEqual(st_shaped.name, "ST2")
            self.assertEqual(st_shaped.shape, dims)
            self.assertTrue(
                np.array_equal(st_shaped.numpy(),
                               np.empty(dims, dtype=np.unicode_)))

            # constructor 3: positional numpy array + name
            st_from_arr = core.eager.StringTensor(self.str_arr, "ST3")
            self._check_matches_array(st_from_arr, "ST3", self.str_arr)

            # constructor 4: numpy array only -> name is auto-generated
            st_auto_named = core.eager.StringTensor(self.str_arr)
            self._check_matches_array(st_auto_named,
                                      "generated_string_tensor_1",
                                      self.str_arr)

            # constructor 5: copy from another StringTensor
            st_copy = core.eager.StringTensor(st_auto_named)
            self._check_matches_array(st_copy, "generated_string_tensor_2",
                                      self.str_arr)

            # constructor 6: copy from another StringTensor + name
            st_named_copy = core.eager.StringTensor(st_copy, "ST6")
            self._check_matches_array(st_named_copy, "ST6", self.str_arr)

            self._check_all_on_cpu([
                st_default, st_shaped, st_from_arr, st_auto_named, st_copy,
                st_named_copy
            ])

    def test_constructor_with_kwargs(self):
        with _test_eager_guard():
            dims = [2, 3]
            # constructor 2 via keywords: dims= and name=
            st_shaped = core.eager.StringTensor(dims=dims, name="ST1")
            self.assertEqual(st_shaped.name, "ST1")
            self.assertEqual(st_shaped.shape, dims)
            self.assertTrue(
                np.array_equal(st_shaped.numpy(),
                               np.empty(dims, dtype=np.unicode_)))

            # constructor 3 via keywords: positional array + name=
            st_from_arr = core.eager.StringTensor(self.str_arr, name="ST2")
            self._check_matches_array(st_from_arr, "ST2", self.str_arr)

            # constructor 6 via keywords: positional StringTensor + name=
            st_copy = core.eager.StringTensor(st_from_arr, name="ST3")
            self._check_matches_array(st_copy, "ST3", self.str_arr)

            # constructor 6 with the source passed as value=
            st_value_copy = core.eager.StringTensor(value=st_from_arr,
                                                    name="ST4")
            self._check_matches_array(st_value_copy, "ST4", self.str_arr)

            self._check_all_on_cpu(
                [st_shaped, st_from_arr, st_copy, st_value_copy])
# Run the StringTensor constructor tests when executed directly.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册