提交 6f733ec1 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!1308 Stage 2 of adding support for string Tensor

Merge pull request !1308 from h.farahat/string_tensor2
ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto)
ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto)
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
add_library(core OBJECT add_library(core OBJECT
${EXAMPLE_SRCS}
${FEATURE_SRCS}
client.cc client.cc
config_manager.cc config_manager.cc
cv_tensor.cc cv_tensor.cc
...@@ -9,4 +13,5 @@ add_library(core OBJECT ...@@ -9,4 +13,5 @@ add_library(core OBJECT
tensor.cc tensor.cc
tensor_shape.cc tensor_shape.cc
) )
add_dependencies(core mindspore::protobuf)
target_include_directories(core PRIVATE ${pybind11_INCLUDE_DIRS}) target_include_directories(core PRIVATE ${pybind11_INCLUDE_DIRS})
...@@ -25,14 +25,14 @@ namespace dataset { ...@@ -25,14 +25,14 @@ namespace dataset {
uint8_t DataType::SizeInBytes() const { uint8_t DataType::SizeInBytes() const {
if (type_ < DataType::NUM_OF_TYPES) if (type_ < DataType::NUM_OF_TYPES)
return SIZE_IN_BYTES[type_]; return kTypeInfo[type_].sizeInBytes_;
else else
return 0; return 0;
} }
py::dtype DataType::AsNumpyType() const { py::dtype DataType::AsNumpyType() const {
if (type_ < DataType::NUM_OF_TYPES) if (type_ < DataType::NUM_OF_TYPES)
return py::dtype(PYBIND_TYPES[type_]); return py::dtype(kTypeInfo[type_].pybindType_);
else else
return py::dtype("unknown"); return py::dtype("unknown");
} }
...@@ -40,7 +40,7 @@ py::dtype DataType::AsNumpyType() const { ...@@ -40,7 +40,7 @@ py::dtype DataType::AsNumpyType() const {
uint8_t DataType::AsCVType() const { uint8_t DataType::AsCVType() const {
uint8_t res = kCVInvalidType; uint8_t res = kCVInvalidType;
if (type_ < DataType::NUM_OF_TYPES) { if (type_ < DataType::NUM_OF_TYPES) {
res = CV_TYPES[type_]; res = kTypeInfo[type_].cvType_;
} }
if (res == kCVInvalidType) { if (res == kCVInvalidType) {
...@@ -108,7 +108,7 @@ DataType::DataType(const std::string &type_str) { ...@@ -108,7 +108,7 @@ DataType::DataType(const std::string &type_str) {
std::string DataType::ToString() const { std::string DataType::ToString() const {
if (type_ < DataType::NUM_OF_TYPES) if (type_ < DataType::NUM_OF_TYPES)
return TO_STRINGS[type_]; return kTypeInfo[type_].name_;
else else
return "unknown"; return "unknown";
} }
...@@ -149,7 +149,7 @@ DataType DataType::FromNpArray(const py::array &arr) { ...@@ -149,7 +149,7 @@ DataType DataType::FromNpArray(const py::array &arr) {
std::string DataType::GetPybindFormat() const { std::string DataType::GetPybindFormat() const {
std::string res; std::string res;
if (type_ < DataType::NUM_OF_TYPES) { if (type_ < DataType::NUM_OF_TYPES) {
res = PYBIND_FORMAT_DESCRIPTOR[type_]; res = kTypeInfo[type_].pybindFormatDescriptor_;
} }
if (res.empty()) { if (res.empty()) {
......
...@@ -51,56 +51,31 @@ class DataType { ...@@ -51,56 +51,31 @@ class DataType {
NUM_OF_TYPES NUM_OF_TYPES
}; };
inline static constexpr uint8_t SIZE_IN_BYTES[] = {0, // DE_UNKNOWN struct TypeInfo {
1, // DE_BOOL const char *name_; // name to be represent the type while printing
1, // DE_INT8 const uint8_t sizeInBytes_; // number of bytes needed for this type
1, // DE_UINT8 const char *pybindType_; // Python matching type, used in get_output_types
2, // DE_INT16 const std::string pybindFormatDescriptor_; // pybind format used for numpy types
2, // DE_UINT16 const uint8_t cvType_; // OpenCv matching type
4, // DE_INT32 };
4, // DE_UINT32
8, // DE_INT64 static inline const TypeInfo kTypeInfo[] = {
8, // DE_UINT64 // name, sizeInBytes, pybindType, formatDescriptor, openCV
2, // DE_FLOAT16 {"unknown", 0, "object", "", kCVInvalidType}, // DE_UNKNOWN
4, // DE_FLOAT32 {"bool", 1, "bool", py::format_descriptor<bool>::format(), CV_8U}, // DE_BOOL
8, // DE_FLOAT64 {"int8", 1, "int8", py::format_descriptor<int8_t>::format(), CV_8S}, // DE_INT8
0}; // DE_STRING {"uint8", 1, "uint8", py::format_descriptor<uint8_t>::format(), CV_8U}, // DE_UINT8
{"int16", 2, "int16", py::format_descriptor<int16_t>::format(), CV_16S}, // DE_INT16
inline static const char *TO_STRINGS[] = {"unknown", "bool", "int8", "uint8", "int16", "uint16", "int32", {"uint16", 2, "uint16", py::format_descriptor<uint16_t>::format(), CV_16U}, // DE_UINT16
"uint32", "int64", "uint64", "float16", "float32", "float64", "string"}; {"int32", 4, "int32", py::format_descriptor<int32_t>::format(), CV_32S}, // DE_INT32
{"uint32", 4, "uint32", py::format_descriptor<uint32_t>::format(), kCVInvalidType}, // DE_UINT32
inline static const char *PYBIND_TYPES[] = {"object", "bool", "int8", "uint8", "int16", "uint16", "int32", {"int64", 8, "int64", py::format_descriptor<int64_t>::format(), kCVInvalidType}, // DE_INT64
"uint32", "int64", "uint64", "float16", "float32", "double", "bytes"}; {"uint64", 8, "uint64", py::format_descriptor<uint64_t>::format(), kCVInvalidType}, // DE_UINT64
{"float16", 2, "float16", "e", CV_16F}, // DE_FLOAT16
inline static const std::string PYBIND_FORMAT_DESCRIPTOR[] = {"", // DE_UNKNOWN {"float32", 4, "float32", py::format_descriptor<float>::format(), CV_32F}, // DE_FLOAT32
py::format_descriptor<bool>::format(), // DE_BOOL {"float64", 8, "double", py::format_descriptor<double>::format(), CV_64F}, // DE_FLOAT64
py::format_descriptor<int8_t>::format(), // DE_INT8 {"string", 0, "bytes", "S", kCVInvalidType} // DE_STRING
py::format_descriptor<uint8_t>::format(), // DE_UINT8 };
py::format_descriptor<int16_t>::format(), // DE_INT16
py::format_descriptor<uint16_t>::format(), // DE_UINT16
py::format_descriptor<int32_t>::format(), // DE_INT32
py::format_descriptor<uint32_t>::format(), // DE_UINT32
py::format_descriptor<int64_t>::format(), // DE_INT64
py::format_descriptor<uint64_t>::format(), // DE_UINT64
"e", // DE_FLOAT16
py::format_descriptor<float>::format(), // DE_FLOAT32
py::format_descriptor<double>::format(), // DE_FLOAT64
"S"}; // DE_STRING
inline static constexpr uint8_t CV_TYPES[] = {kCVInvalidType, // DE_UNKNOWN
CV_8U, // DE_BOOL
CV_8S, // DE_INT8
CV_8U, // DE_UINT8
CV_16S, // DE_INT16
CV_16U, // DE_UINT16
CV_32S, // DE_INT32
kCVInvalidType, // DE_UINT32
kCVInvalidType, // DE_INT64
kCVInvalidType, // DE_UINT64
CV_16F, // DE_FLOAT16
CV_32F, // DE_FLOAT32
CV_64F, // DE_FLOAT64
kCVInvalidType}; // DE_STRING
// No arg constructor to create an unknown shape // No arg constructor to create an unknown shape
DataType() : type_(DE_UNKNOWN) {} DataType() : type_(DE_UNKNOWN) {}
......
...@@ -57,18 +57,40 @@ Tensor::Tensor(const TensorShape &shape, const DataType &type) : shape_(shape), ...@@ -57,18 +57,40 @@ Tensor::Tensor(const TensorShape &shape, const DataType &type) : shape_(shape),
} }
// Construct a tensor of the given shape/type and, when `data` is not null, copy the input bytes into it.
// The constructor first delegates to Tensor(shape, type).
// Only numeric types are accepted here: for a non-numeric type nothing is allocated or copied and an
// error is logged. Failures are reported through MS_LOG(ERROR) only.
// @param shape TensorShape of the tensor
// @param type DataType of the tensor; the copy path is taken only when type.IsNumeric() is true
// @param data unsigned char*, pointer to the source bytes; may be nullptr, in which case no buffer is populated
Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data) : Tensor(shape, type) { Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data) : Tensor(shape, type) {
if (type.IsNumeric()) {
// If the data pointer was given, then we can also populate the tensor with data
if (data != nullptr) {
// Given the shape/type of this tensor, compute the data size and copy in the input bytes.
// The caller's buffer is read for exactly SizeInBytes() bytes.
int64_t byte_size = this->SizeInBytes();
Status s = this->AllocateBuffer(byte_size); // Allocates data_ inside itself
// Copy only when allocation reported success and actually produced a buffer.
if (s.IsOk() && data_ != nullptr) {
int ret_code = memcpy_s(data_, byte_size, data, byte_size);
if (ret_code != 0) {
MS_LOG(ERROR) << "Failed to copy data into Tensor!";
}
} else {
MS_LOG(ERROR) << "Failed to create memory for Tensor!";
}
}
} else {
// Non-numeric types are rejected by this constructor.
MS_LOG(ERROR) << "Type should be numeric to use this constructor.";
}
}
Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length)
: Tensor(shape, type) {
// If the data pointer was given, then we can also populate the tensor with data // If the data pointer was given, then we can also populate the tensor with data
if (data != nullptr) { if (data != nullptr) {
// Given the shape/type of this tensor, compute the data size and copy in the input bytes. // Allocates data_ inside itself
int64_t byte_size = this->SizeInBytes(); Status s = AllocateBuffer(length);
static_cast<void>(this->GetMutableBuffer()); // Allocates data_ inside itself if (s.IsError()) {
MS_LOG(ERROR) << "Failed to create memory for Tensor!";
}
if (data_ != nullptr) { if (data_ != nullptr) {
int ret_code = memcpy_s(data_, byte_size, data, byte_size); int ret_code = memcpy_s(data_, length, data, length);
if (ret_code != 0) { if (ret_code != 0) {
MS_LOG(ERROR) << "Failed to copy data into Tensor!"; MS_LOG(ERROR) << "Failed to copy data into Tensor!";
} }
} else {
MS_LOG(ERROR) << "Failed to create memory for Tensor!";
} }
} }
} }
...@@ -98,32 +120,79 @@ Tensor::Tensor(const std::vector<std::string> &strings, const TensorShape &shape ...@@ -98,32 +120,79 @@ Tensor::Tensor(const std::vector<std::string> &strings, const TensorShape &shape
auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; }; auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; };
dsize_t total_length = std::accumulate(strings.begin(), strings.end(), 0, length_sum); dsize_t total_length = std::accumulate(strings.begin(), strings.end(), 0, length_sum);
dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + total_length; // total bytes needed = offset array + strings
// offset array needs to store one offset var per element + 1 extra to get the length of the last string.
// strings will be null-terminated --> need 1 extra byte per element
dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + kOffsetSize + total_length;
data_ = data_allocator_->allocate(num_bytes); data_ = data_allocator_->allocate(num_bytes);
auto offset_arr = reinterpret_cast<offset_t *>(data_); auto offset_arr = reinterpret_cast<offset_t *>(data_);
uchar *buf = GetStringsBuffer(); uchar *buf = GetStringsBuffer();
offset_t offset = -1; offset_t offset = buf - data_; // the first string will start here
uint32_t i = 0; uint32_t i = 0;
for (const auto &str : strings) { for (const auto &str : strings) {
// insert the end index of the string // insert the start index of the string.
// end index of a string is the end index of previous string + the length (including \0)
offset = offset + str.length() + 1;
offset_arr[i++] = offset; offset_arr[i++] = offset;
// total bytes are reduced by kOffsetSize // total bytes are reduced by kOffsetSize
num_bytes -= kOffsetSize; num_bytes -= kOffsetSize;
// insert actual string // insert actual string
memcpy_s(buf, num_bytes, str.c_str(), str.length() + 1); int ret_code = memcpy_s(data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1);
buf += str.length() + 1; if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor";
// next string will be stored right after the current one.
offset = offset + str.length() + 1;
// total bytes are reduced by the length of the string
num_bytes -= str.length() + 1; num_bytes -= str.length() + 1;
} }
this->data_end_ = buf; // store one more offset value so we can get the length of the last string
// length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element]
offset_arr[i] = offset;
this->data_end_ = data_ + offset_arr[i];
DS_ASSERT(num_bytes == 0); DS_ASSERT(num_bytes == 0);
if (shape.known()) Tensor::Reshape(shape); if (shape.known()) Tensor::Reshape(shape);
} }
// Construct a DE_STRING tensor from a protobuf BytesList.
// Buffer layout: (n + 1) offsets of type offset_t followed by n null-terminated strings. Offset k holds the
// start position of string k relative to data_; the extra last offset marks the end of the last string so
// that its length can be computed.
// @param bytes_list protobuf list whose values become the elements of the tensor
// @param shape if known, the tensor is reshaped to it after the data is copied; otherwise the tensor keeps
//     the 1-D shape {bytes_list.value_size()}
Tensor::Tensor(const dataengine::BytesList &bytes_list, const TensorShape &shape)
    : Tensor(TensorShape({static_cast<dsize_t>(bytes_list.value_size())}), DataType(DataType::DE_STRING)) {
  const int num_strings = bytes_list.value_size();

  // Sum the payload lengths explicitly. BytesList::ByteSizeLong() must not be used for this: it is the
  // serialized protobuf size (field tag + varint length prefix + payload for every element), which does not
  // equal payload + '\0' and would make the size bookkeeping below meaningless.
  dsize_t total_length = 0;
  for (int k = 0; k < num_strings; k++) {
    total_length += static_cast<dsize_t>(bytes_list.value(k).length());
  }

  // total bytes needed = offset array + strings
  // offset array needs to store one offset var per element + 1 extra to get the length of the last string.
  // strings will be null-terminated --> need 1 extra byte per element
  const dsize_t total_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + kOffsetSize + total_length;
  data_ = data_allocator_->allocate(total_bytes);

  auto offset_arr = reinterpret_cast<offset_t *>(data_);
  uchar *buf = GetStringsBuffer();
  offset_t offset = buf - data_;  // the first string will start here

  for (int k = 0; k < num_strings; k++) {
    const std::string &str = bytes_list.value(k);
    // insert the start index of the string.
    offset_arr[k] = offset;
    // copy the string together with its terminating '\0'; the destination capacity is exactly what is
    // left of the buffer after the current write position.
    int ret_code = memcpy_s(data_ + offset, total_bytes - offset, common::SafeCStr(str), str.length() + 1);
    if (ret_code != 0) {
      MS_LOG(ERROR) << "Cannot copy string into Tensor";
    }
    // next string will be stored right after the current one.
    offset = offset + str.length() + 1;
  }

  // store one more offset value so we can get the length of the last string
  // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element]
  offset_arr[num_strings] = offset;
  data_end_ = data_ + offset_arr[num_strings];

  // every allocated byte must have been consumed by the offsets and the strings
  DS_ASSERT(static_cast<dsize_t>(offset) == total_bytes);

  if (shape.known()) Tensor::Reshape(shape);
}
Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl, const TensorShape &shape, Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl, const TensorShape &shape,
DataType type, const unsigned char *data) { DataType type, const unsigned char *data) {
if (!shape.known()) { if (!shape.known()) {
...@@ -152,20 +221,17 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl ...@@ -152,20 +221,17 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl
} }
return Status::OK(); // returns base-class shared_ptr return Status::OK(); // returns base-class shared_ptr
} }
std::string to(std::string x) { return x; }
Status Tensor::CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::array arr) { Status Tensor::CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::array arr) {
std::vector<dsize_t> shape; std::vector<dsize_t> shape;
for (dsize_t i = 0; i < arr.ndim(); i++) { for (dsize_t i = 0; i < arr.ndim(); i++) {
shape.push_back(static_cast<dsize_t>(arr.shape()[i])); shape.push_back(static_cast<dsize_t>(arr.shape()[i]));
} }
arr.resize({arr.size()}); arr.resize({arr.size()}); // flatten the py::array so we can iterate once
auto itr = arr.begin();
std::vector<std::string> strings; std::vector<std::string> strings;
for (; itr != arr.end(); itr++) { std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); });
std::string s = to(py::cast<py::bytes>(*itr));
strings.push_back(s); arr.resize(shape); // resize arr back to the original shape
}
arr.resize(shape);
return CreateTensor(ptr, strings, TensorShape{shape}); return CreateTensor(ptr, strings, TensorShape{shape});
} }
...@@ -190,8 +256,9 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr) { ...@@ -190,8 +256,9 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr) {
std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool(); std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool();
(*ptr)->data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool); (*ptr)->data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool);
static_cast<void>((*ptr)->GetMutableBuffer());
int64_t byte_size = (*ptr)->SizeInBytes(); int64_t byte_size = (*ptr)->SizeInBytes();
RETURN_IF_NOT_OK((*ptr)->AllocateBuffer(byte_size));
unsigned char *data = static_cast<unsigned char *>(arr.request().ptr); unsigned char *data = static_cast<unsigned char *>(arr.request().ptr);
if ((*ptr)->data_ == nullptr) { if ((*ptr)->data_ == nullptr) {
RETURN_STATUS_UNEXPECTED("Failed to create memory for Tensor."); RETURN_STATUS_UNEXPECTED("Failed to create memory for Tensor.");
...@@ -232,6 +299,13 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std: ...@@ -232,6 +299,13 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std:
return Status::OK(); return Status::OK();
} }
// Factory method that builds a string Tensor out of a protobuf BytesList.
// @param ptr output argument that receives the newly created Tensor
// @param bytes_list protobuf list forwarded to the Tensor(BytesList, shape) constructor
// @param shape shape forwarded to the Tensor(BytesList, shape) constructor
// @return Status::OK()
Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list,
                            const TensorShape &shape) {
  // The Tensor object itself is allocated through the global tensor allocator.
  const TensorAlloc *tensor_alloc = GlobalContext::Instance()->tensor_allocator();
  (*ptr) = std::allocate_shared<Tensor>(*tensor_alloc, bytes_list, shape);
  return Status::OK();
}
// Memcpy the given strided array's used part to consecutive memory // Memcpy the given strided array's used part to consecutive memory
// Consider a 3-d array // Consider a 3-d array
// A[(i * shape[1] + j) * shape[2] + k] = B[i][j][k] = C[i * strides[0] + j * strides[1] + k * strides[2]] // A[(i * shape[1] + j) * shape[2] + k] = B[i][j][k] = C[i * strides[0] + j * strides[1] + k * strides[2]]
...@@ -370,25 +444,20 @@ void Tensor::Print(std::ostream &out) const { ...@@ -370,25 +444,20 @@ void Tensor::Print(std::ostream &out) const {
out << "[Data area is null]"; out << "[Data area is null]";
} }
} }
Status Tensor::AllocateBuffer(const dsize_t &length) {
// Name: ToFlatIndex() if (data_ == nullptr) {
// Description: convert a vector style index to number, used to access memory internal use only if (data_allocator_ != nullptr) {
Status Tensor::ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const { data_ = data_allocator_->allocate(length);
if (!shape_.IsValidIndex(index)) { RETURN_UNEXPECTED_IF_NULL(data_);
std::string err = "Not a valid index"; data_end_ = data_ + length;
RETURN_STATUS_UNEXPECTED(err); } else {
} data_ = static_cast<unsigned char *>(malloc(length));
*flat_index = 0; data_end_ = data_ + length;
for (size_t k = 0; k < index.size(); k++) { RETURN_UNEXPECTED_IF_NULL(data_);
dsize_t product = 1;
for (size_t l = k + 1; l < index.size(); l++) {
product *= shape_[l];
} }
*flat_index += index[k] * product;
} }
return Status::OK(); return Status::OK();
} }
const unsigned char *Tensor::GetBuffer() const { const unsigned char *Tensor::GetBuffer() const {
// This version cannot modify anything. data_ could possibly be null. // This version cannot modify anything. data_ could possibly be null.
return data_; return data_;
...@@ -404,17 +473,11 @@ unsigned char *Tensor::GetMutableBuffer() { ...@@ -404,17 +473,11 @@ unsigned char *Tensor::GetMutableBuffer() {
} else { } else {
// If the data area is not created, then identify the memory size based // If the data area is not created, then identify the memory size based
// on the shape and type and allocate it. // on the shape and type and allocate it.
if (data_allocator_ != nullptr) { if (this->AllocateBuffer(this->SizeInBytes()).IsOk()) {
data_ = data_allocator_->allocate(this->SizeInBytes()); return data_;
data_end_ = data_ + SizeInBytes();
} else { } else {
data_ = static_cast<unsigned char *>(malloc(this->SizeInBytes())); return nullptr;
data_end_ = data_ + SizeInBytes();
if (data_ == nullptr) {
return nullptr;
}
} }
return data_;
} }
} }
...@@ -444,7 +507,7 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector<dsize_t> &index) const { ...@@ -444,7 +507,7 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector<dsize_t> &index) const {
RETURN_STATUS_UNEXPECTED(err); RETURN_STATUS_UNEXPECTED(err);
} }
dsize_t flat_idx; dsize_t flat_idx;
RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx)); RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx));
*ptr = reinterpret_cast<T *>(data_ + flat_idx * type_.SizeInBytes()); *ptr = reinterpret_cast<T *>(data_ + flat_idx * type_.SizeInBytes());
return Status::OK(); return Status::OK();
...@@ -461,7 +524,7 @@ Status Tensor::GetItemPtr(uchar **ptr, const std::vector<dsize_t> &index, offset ...@@ -461,7 +524,7 @@ Status Tensor::GetItemPtr(uchar **ptr, const std::vector<dsize_t> &index, offset
RETURN_STATUS_UNEXPECTED(err); RETURN_STATUS_UNEXPECTED(err);
} }
dsize_t flat_idx; dsize_t flat_idx;
RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx)); RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx));
offset_t length_temp = 0; offset_t length_temp = 0;
RETURN_IF_NOT_OK(GetStringAt(flat_idx, ptr, &length_temp)); RETURN_IF_NOT_OK(GetStringAt(flat_idx, ptr, &length_temp));
if (length != nullptr) *length = length_temp; if (length != nullptr) *length = length_temp;
...@@ -481,7 +544,7 @@ Status Tensor::StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_ ...@@ -481,7 +544,7 @@ Status Tensor::StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_
std::vector<dsize_t> r(t_shape.begin() + ind.size(), t_shape.end()); std::vector<dsize_t> r(t_shape.begin() + ind.size(), t_shape.end());
*remaining = TensorShape(r); *remaining = TensorShape(r);
ind.resize(this->Rank(), 0); // same as -> while (ind.size() < this->Rank()) ind.push_back(0); ind.resize(this->Rank(), 0); // same as -> while (ind.size() < this->Rank()) ind.push_back(0);
RETURN_IF_NOT_OK(ToFlatIndex(ind, &flat_ind)); RETURN_IF_NOT_OK(shape_.ToFlatIndex(ind, &flat_ind));
// check if GetBuffer() returns null, we should flag this as an error, this sanity check will only // check if GetBuffer() returns null, we should flag this as an error, this sanity check will only
// be true is the tensor failed to allocate memory. // be true is the tensor failed to allocate memory.
if (GetMutableBuffer() == nullptr) { if (GetMutableBuffer() == nullptr) {
...@@ -588,10 +651,10 @@ Status Tensor::GetItemAt(std::string_view *o, const std::vector<dsize_t> &index) ...@@ -588,10 +651,10 @@ Status Tensor::GetItemAt(std::string_view *o, const std::vector<dsize_t> &index)
RETURN_UNEXPECTED_IF_NULL(o); RETURN_UNEXPECTED_IF_NULL(o);
CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Type is not DE_STRING"); CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Type is not DE_STRING");
uchar *buf = nullptr; uchar *start = nullptr;
offset_t length = 0; offset_t length = 0;
RETURN_IF_NOT_OK(GetItemPtr(&buf, index, &length)); RETURN_IF_NOT_OK(GetItemPtr(&start, index, &length));
std::string_view sv{reinterpret_cast<const char *>(buf), length}; std::string_view sv{reinterpret_cast<const char *>(start)};
o->swap(sv); o->swap(sv);
return Status::OK(); return Status::OK();
} }
...@@ -778,13 +841,11 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length ...@@ -778,13 +841,11 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length
RETURN_UNEXPECTED_IF_NULL(string_start); RETURN_UNEXPECTED_IF_NULL(string_start);
RETURN_UNEXPECTED_IF_NULL(length); RETURN_UNEXPECTED_IF_NULL(length);
auto *offset_ptr = reinterpret_cast<offset_t *>(data_); // offsets starts here auto *offset_ptr = reinterpret_cast<offset_t *>(data_); // offsets starts here
offset_t end = offset_ptr[index]; offset_t start = offset_ptr[index];
offset_t start = 0; *string_start = data_ + start;
if (index != 0) start = offset_ptr[index - 1] + 1; // string starts at where the previous string ends + 1 *length = offset_ptr[index + 1] - start - 1; // -1 to skip the \0 from the string length
uchar *buf = GetStringsBuffer(); // string data starts here
*string_start = buf + start;
*length = end - start;
return Status::OK(); return Status::OK();
} }
} // namespace dataset } // namespace dataset
} // namespace mindspore } // namespace mindspore
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include "dataset/util/allocator.h" #include "dataset/util/allocator.h"
#include "dataset/util/de_error.h" #include "dataset/util/de_error.h"
#include "dataset/util/status.h" #include "dataset/util/status.h"
#include "proto/example.pb.h"
namespace py = pybind11; namespace py = pybind11;
namespace mindspore { namespace mindspore {
...@@ -64,6 +65,8 @@ class Tensor { ...@@ -64,6 +65,8 @@ class Tensor {
// @param data unsigned char*, pointer to the data. // @param data unsigned char*, pointer to the data.
Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data); Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data);
Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length);
Tensor(const Tensor &other) = delete; Tensor(const Tensor &other) = delete;
Tensor &operator=(const Tensor &other) = delete; Tensor &operator=(const Tensor &other) = delete;
...@@ -72,6 +75,8 @@ class Tensor { ...@@ -72,6 +75,8 @@ class Tensor {
Tensor &operator=(Tensor &&other) noexcept; Tensor &operator=(Tensor &&other) noexcept;
Status AllocateBuffer(const dsize_t &length);
// type of the offset values used to store string information // type of the offset values used to store string information
using offset_t = uint32_t; using offset_t = uint32_t;
// const of the size of the offset variable // const of the size of the offset variable
...@@ -84,15 +89,24 @@ class Tensor { ...@@ -84,15 +89,24 @@ class Tensor {
// Construct a tensor from a list of strings. Reshape the tensor with `shape` if given, otherwise assume the shape is // Construct a tensor from a list of strings. Reshape the tensor with `shape` if given, otherwise assume the shape is
// the size of the vector `strings`. // the size of the vector `strings`.
// The memory layout of a Tensor of strings consists of the Offset_array followed by the strings. // The memory layout of a Tensor of strings consists of the Offset_array followed by the strings.
// OFFSET1, OFFSET2, ... String1, String2, ... // Thr offset array will store one extra value to find the length of the last string.
// The value of each offset is the end index of the corresponding string // OFFSET1, OFFSET2, ..., OFFSETn+1, STRING1, STRING2, ..., STRINGn
// The value of each offset is the start index of the corresponding string
// Offsets are of type offset_t // Offsets are of type offset_t
// strings will be null-terminated // strings will be null-terminated
// example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING) // example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING)
// 3 6 a b c \0 d e \0 // |----------------------------------------------------------------|
// | OFFSET ARRAY | STRINGS |
// | bytes 0-3 | bytes 4-7 | bytes 8-11 | bytes 12-15 | bytes 16-18 |
// | 12 | 16 | 19 | abc\0 | de\0 |
// |----------------------------------------------------------------|
explicit Tensor(const std::vector<std::string> &strings, explicit Tensor(const std::vector<std::string> &strings,
const TensorShape &shape = TensorShape::CreateUnknownRankShape()); const TensorShape &shape = TensorShape::CreateUnknownRankShape());
// Same as Tensor(vector<string>) but the input is protobuf bytelist
explicit Tensor(const dataengine::BytesList &bytes_list,
const TensorShape &shape = TensorShape::CreateUnknownRankShape());
// A static factory method to create the given flavour of derived Tensor // A static factory method to create the given flavour of derived Tensor
// Returns the base class reference for the Tensor. // Returns the base class reference for the Tensor.
// @param ptr output argument to hold the created Tensor of given tensor_impl // @param ptr output argument to hold the created Tensor of given tensor_impl
...@@ -121,6 +135,9 @@ class Tensor { ...@@ -121,6 +135,9 @@ class Tensor {
static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std::string> &strings, static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std::string> &strings,
const TensorShape &shape = TensorShape::CreateUnknownRankShape()); const TensorShape &shape = TensorShape::CreateUnknownRankShape());
static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list,
const TensorShape &shape);
// Copy raw data of a array based on shape and strides to the destination pointer // Copy raw data of a array based on shape and strides to the destination pointer
// @param dst Pointer to the destination array where the content is to be copied // @param dst Pointer to the destination array where the content is to be copied
// @param src Pointer to the source of strided array to be copied // @param src Pointer to the source of strided array to be copied
...@@ -166,7 +183,7 @@ class Tensor { ...@@ -166,7 +183,7 @@ class Tensor {
// @param value of type `T` // @param value of type `T`
template <typename T> template <typename T>
Status SetItemAt(const std::vector<dsize_t> &index, const T &value) { Status SetItemAt(const std::vector<dsize_t> &index, const T &value) {
static_cast<void>(GetMutableBuffer()); RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes()));
T *ptr = nullptr; T *ptr = nullptr;
RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index)); RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index));
*ptr = value; *ptr = value;
...@@ -203,7 +220,7 @@ class Tensor { ...@@ -203,7 +220,7 @@ class Tensor {
template <typename T> template <typename T>
Status Fill(const T &value) { Status Fill(const T &value) {
CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings."); CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings.");
static_cast<void>(GetMutableBuffer()); RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes()));
int64_t cellSize = type_.SizeInBytes(); int64_t cellSize = type_.SizeInBytes();
if ((data_ != nullptr) && type_.IsCompatible<T>()) { if ((data_ != nullptr) && type_.IsCompatible<T>()) {
for (dsize_t i = 0; i < Size(); i++) { for (dsize_t i = 0; i < Size(); i++) {
...@@ -418,32 +435,28 @@ class Tensor { ...@@ -418,32 +435,28 @@ class Tensor {
using pointer = std::string_view *; using pointer = std::string_view *;
using reference = std::string_view &; using reference = std::string_view &;
explicit TensorIterator(uchar *offset = nullptr, const uchar *buf = nullptr, dsize_t index = 0) { explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) {
offset_ = reinterpret_cast<offset_t *>(offset); data_ = reinterpret_cast<const char *>(data);
buf_ = reinterpret_cast<const char *>(buf);
index_ = index; index_ = index;
} }
TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) { TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) {
offset_ = raw_iterator.offset_; data_ = raw_iterator.data_;
buf_ = raw_iterator.buf_;
index_ = raw_iterator.index_; index_ = raw_iterator.index_;
} }
~TensorIterator() = default; ~TensorIterator() = default;
bool operator==(const TensorIterator<std::string_view> &rhs) { bool operator==(const TensorIterator<std::string_view> &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; }
return buf_ == rhs.buf_ && offset_ == rhs.offset_ && index_ == rhs.index_;
}
bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); } bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); }
operator bool() const { return offset_ != nullptr; } operator bool() const { return data_ != nullptr; }
std::string_view operator*() const { std::string_view operator*() const {
offset_t start = 0; auto offset_ = reinterpret_cast<const offset_t *>(data_);
if (index_ != 0) start = offset_[index_ - 1] + 1; offset_t start = offset_[index_];
return std::string_view{buf_ + start}; return std::string_view{data_ + start};
} }
TensorIterator<std::string_view> &operator+=(const dsize_t &inc) { TensorIterator<std::string_view> &operator+=(const dsize_t &inc) {
...@@ -496,8 +509,7 @@ class Tensor { ...@@ -496,8 +509,7 @@ class Tensor {
protected: protected:
dsize_t index_; dsize_t index_;
offset_t *offset_; const char *data_;
const char *buf_;
}; };
// Return a TensorIterator that points to the start of the Tensor. // Return a TensorIterator that points to the start of the Tensor.
...@@ -518,11 +530,6 @@ class Tensor { ...@@ -518,11 +530,6 @@ class Tensor {
} }
protected: protected:
// Returns the location of the item assuming row major memory layout.
// @param index
// @return
Status ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const;
// A function that prints Tensor recursively, first called by print // A function that prints Tensor recursively, first called by print
// @param out // @param out
// @param cur_dim // @param cur_dim
...@@ -559,7 +566,7 @@ class Tensor { ...@@ -559,7 +566,7 @@ class Tensor {
// Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if the // Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if the
// tensor's type is a string, otherwise undefined address would be returned. // tensor's type is a string, otherwise undefined address would be returned.
// @return address of the first string of the tensor. // @return address of the first string of the tensor.
uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements(); } uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; }
// all access to shape_ should be via shape // all access to shape_ should be via shape
TensorShape shape_; TensorShape shape_;
...@@ -573,14 +580,8 @@ class Tensor { ...@@ -573,14 +580,8 @@ class Tensor {
unsigned char *data_end_ = nullptr; unsigned char *data_end_ = nullptr;
}; };
template <> template <>
inline Tensor::TensorIterator<std::string_view> Tensor::begin<std::string_view>() {
uchar *buf = GetStringsBuffer();
return TensorIterator<std::string_view>(data_, buf);
}
template <>
inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() { inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() {
uchar *buf = GetStringsBuffer(); return TensorIterator<std::string_view>(data_, shape_.NumOfElements());
return TensorIterator<std::string_view>(data_, buf, shape_.NumOfElements());
} }
} // namespace dataset } // namespace dataset
} // namespace mindspore } // namespace mindspore
......
...@@ -40,16 +40,7 @@ dsize_t TensorShape::NumOfElements() const { ...@@ -40,16 +40,7 @@ dsize_t TensorShape::NumOfElements() const {
if (!known()) { if (!known()) {
return 0; return 0;
} }
dsize_t num = 1; return strides_[0];
for (auto i : raw_shape_) {
if (multi_ok(num, i)) {
num *= i;
} else {
// dsize_t can wrap since it is signed int, we double check here
MS_LOG(ERROR) << "Tensor shape larger than maximum allowed value!";
}
}
return num;
} }
void TensorShape::Print(std::ostream &out) const { void TensorShape::Print(std::ostream &out) const {
...@@ -72,20 +63,23 @@ void TensorShape::Print(std::ostream &out) const { ...@@ -72,20 +63,23 @@ void TensorShape::Print(std::ostream &out) const {
} }
TensorShape::TensorShape(const std::initializer_list<dsize_t> &list) TensorShape::TensorShape(const std::initializer_list<dsize_t> &list)
: raw_shape_(*GlobalContext::Instance()->int_allocator()) { : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
AddListToShape(list); AddListToShape(list);
} }
TensorShape::TensorShape(const std::vector<dsize_t> &list) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { TensorShape::TensorShape(const std::vector<dsize_t> &list)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
AddListToShape(list); AddListToShape(list);
} }
TensorShape::TensorShape(const TensorShape &shape) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { TensorShape::TensorShape(const TensorShape &shape)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
AddListToShape(shape.AsVector()); AddListToShape(shape.AsVector());
known_ = shape.known_; // override with the input shape in case of unknown-rank tensor shape. known_ = shape.known_; // override with the input shape in case of unknown-rank tensor shape.
} }
TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { TensorShape::TensorShape(py::list l)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
std::vector<dsize_t> list_c; std::vector<dsize_t> list_c;
for (auto &i : l) { for (auto &i : l) {
if (!i.is_none()) { if (!i.is_none()) {
...@@ -97,6 +91,18 @@ TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->in ...@@ -97,6 +91,18 @@ TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->in
AddListToShape(list_c); AddListToShape(list_c);
} }
TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type)
: raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
for (int i = 0; i < cv_size.dims(); i++) {
raw_shape_.push_back(cv_size[i]);
}
auto channels = static_cast<uint8_t>(1 + (type >> static_cast<uint8_t>(CV_CN_SHIFT)));
if (channels != 1) {
raw_shape_.push_back(channels);
}
known_ = true;
}
TensorShape TensorShape::CreateUnknownRankShape() { TensorShape TensorShape::CreateUnknownRankShape() {
TensorShape s({}); TensorShape s({});
s.known_ = false; s.known_ = false;
...@@ -109,17 +115,6 @@ TensorShape TensorShape::InsertDim(dsize_t axis, dsize_t dim) const { ...@@ -109,17 +115,6 @@ TensorShape TensorShape::InsertDim(dsize_t axis, dsize_t dim) const {
return TensorShape(tmp); return TensorShape(tmp);
} }
TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type) : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
for (int i = 0; i < cv_size.dims(); i++) {
raw_shape_.push_back(cv_size[i]);
}
auto channels = static_cast<uint8_t>(1 + (type >> static_cast<uint8_t>(CV_CN_SHIFT)));
if (channels != 1) {
raw_shape_.push_back(channels);
}
known_ = true;
}
std::vector<dsize_t> TensorShape::AsVector() const { std::vector<dsize_t> TensorShape::AsVector() const {
return std::vector<dsize_t>(raw_shape_.begin(), raw_shape_.end()); return std::vector<dsize_t>(raw_shape_.begin(), raw_shape_.end());
} }
...@@ -139,23 +134,28 @@ bool TensorShape::IsValidIndex(const std::vector<dsize_t> &index) const { ...@@ -139,23 +134,28 @@ bool TensorShape::IsValidIndex(const std::vector<dsize_t> &index) const {
template <typename T> template <typename T>
void TensorShape::AddListToShape(const T &list) { void TensorShape::AddListToShape(const T &list) {
raw_shape_.resize(list.size());
strides_.resize(list.size() + 1);
strides_[list.size()] = 1;
known_ = true; known_ = true;
dsize_t num = 1;
dsize_t size = 0; dsize_t size = 0;
for (const auto &itr : list) { auto itr = std::rbegin(list); // iterate over the list in reverse order
if (itr > 0) { auto s = list.size() - 1; // to compute strides while adding dims
if (num > std::numeric_limits<int64_t>::max() / itr) { for (; itr != std::rend(list); itr++, s--) {
dsize_t dim = *itr;
if (dim > 0) {
if (strides_[s + 1] > std::numeric_limits<int64_t>::max() / dim) {
MS_LOG(ERROR) << "Invalid shape data, overflow occurred!"; MS_LOG(ERROR) << "Invalid shape data, overflow occurred!";
known_ = false; known_ = false;
raw_shape_.clear(); raw_shape_.clear();
return; return;
} }
num *= itr; strides_[s] = dim * strides_[s + 1];
} }
if (itr < 0) { if (dim < 0) {
known_ = false; known_ = false;
} }
if (itr > kDeMaxDim) { if (dim > kDeMaxDim) {
std::stringstream ss; std::stringstream ss;
ss << "Invalid shape data, dim (" << size << ") is larger than the maximum dim size(" << kDeMaxDim << ")!"; ss << "Invalid shape data, dim (" << size << ") is larger than the maximum dim size(" << kDeMaxDim << ")!";
MS_LOG(ERROR) << ss.str().c_str(); MS_LOG(ERROR) << ss.str().c_str();
...@@ -163,7 +163,7 @@ void TensorShape::AddListToShape(const T &list) { ...@@ -163,7 +163,7 @@ void TensorShape::AddListToShape(const T &list) {
raw_shape_.clear(); raw_shape_.clear();
return; return;
} }
raw_shape_.push_back(itr); raw_shape_[s] = dim;
size++; size++;
} }
if (size > kDeMaxRank) { if (size > kDeMaxRank) {
...@@ -215,17 +215,18 @@ TensorShape TensorShape::Squeeze() const { ...@@ -215,17 +215,18 @@ TensorShape TensorShape::Squeeze() const {
} }
return TensorShape(new_shape); return TensorShape(new_shape);
} }
std::vector<dsize_t> TensorShape::Strides() {
std::vector<dsize_t> strides(Rank()); std::vector<dsize_t> TensorShape::Strides() const { return std::vector<dsize_t>{strides_.begin() + 1, strides_.end()}; }
dsize_t count = NumOfElements();
for (dsize_t i = 0; i < Rank(); i++) { // Name: ToFlatIndex()
if (raw_shape_[i] != 0) // Description: convert a vector style index to number, used to access memory internal use only
count /= raw_shape_[i]; Status TensorShape::ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const {
else *flat_index = 0;
count = 0; for (size_t k = 0; k < index.size(); k++) {
strides[i] = count; *flat_index += index[k] * strides_[k + 1]; // skip the first element of strides_ which is numOfElements
} }
return strides; CHECK_FAIL_RETURN_UNEXPECTED(*flat_index < NumOfElements(), "Not a valid index");
return Status::OK();
} }
} // namespace dataset } // namespace dataset
} // namespace mindspore } // namespace mindspore
...@@ -156,13 +156,20 @@ class TensorShape { ...@@ -156,13 +156,20 @@ class TensorShape {
TensorShape Squeeze() const; TensorShape Squeeze() const;
std::vector<dsize_t> Strides(); std::vector<dsize_t> Strides() const;
// Returns the location of the item assuming row major memory layout.
// @param index
// @return
Status ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const;
private: private:
// True if known and valid shape, false otherwise // True if known and valid shape, false otherwise
bool known_; bool known_;
// Vector to keep the dims of the shape. // Vector to keep the dims of the shape.
std::vector<dsize_t, IntAlloc> raw_shape_; std::vector<dsize_t, IntAlloc> raw_shape_;
// Vector to keep the strides of the shape. The size is rank+1
std::vector<dsize_t, IntAlloc> strides_;
// Internal utility function to iterate over a list, check if the dim is valid and then insert it into the shape. // Internal utility function to iterate over a list, check if the dim is valid and then insert it into the shape.
// @tparam T list // @tparam T list
......
ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto)
ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto)
add_subdirectory(sampler) add_subdirectory(sampler)
file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
...@@ -15,13 +13,9 @@ add_library(engine-datasetops-source OBJECT ...@@ -15,13 +13,9 @@ add_library(engine-datasetops-source OBJECT
image_folder_op.cc image_folder_op.cc
mnist_op.cc mnist_op.cc
voc_op.cc voc_op.cc
${EXAMPLE_SRCS}
${FEATURE_SRCS}
manifest_op.cc manifest_op.cc
cifar_op.cc cifar_op.cc
random_data_op.cc random_data_op.cc
celeba_op.cc celeba_op.cc
text_file_op.cc text_file_op.cc
) )
\ No newline at end of file
add_dependencies(engine-datasetops-source mindspore::protobuf)
...@@ -127,8 +127,10 @@ Status MindRecordOp::Init() { ...@@ -127,8 +127,10 @@ Status MindRecordOp::Init() {
std::string type_str = mindrecord::ColumnDataTypeNameNormalized[col_data_types[i]]; std::string type_str = mindrecord::ColumnDataTypeNameNormalized[col_data_types[i]];
DataType t_dtype = DataType(type_str); // valid types: {"bytes", "string", "int32", "int64", "float32", "float64"} DataType t_dtype = DataType(type_str); // valid types: {"bytes", "string", "int32", "int64", "float32", "float64"}
if (col_data_types[i] == mindrecord::ColumnBytes || col_data_types[i] == mindrecord::ColumnString) { // rank = 1 if (col_data_types[i] == mindrecord::ColumnBytes) { // rank = 1
col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 1); col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 1);
} else if (col_data_types[i] == mindrecord::ColumnString) { // rank = 0
col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 0);
} else if (col_shapes[i].size() > 0) { } else if (col_shapes[i].size() > 0) {
std::vector<dsize_t> vec(col_shapes[i].size()); // temporary vector to hold shape std::vector<dsize_t> vec(col_shapes[i].size()); // temporary vector to hold shape
(void)std::copy(col_shapes[i].begin(), col_shapes[i].end(), vec.begin()); (void)std::copy(col_shapes[i].begin(), col_shapes[i].end(), vec.begin());
...@@ -309,7 +311,10 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint ...@@ -309,7 +311,10 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint
// Set shape // Set shape
auto num_elements = n_bytes / column_data_type_size; auto num_elements = n_bytes / column_data_type_size;
if (column.hasShape()) { if (type == DataType::DE_STRING) {
std::string s{data, data + n_bytes};
RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, {s}, TensorShape::CreateScalar()));
} else if (column.hasShape()) {
auto new_shape = TensorShape(column.shape()); auto new_shape = TensorShape(column.shape());
RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape)); RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape));
RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, column.tensorImpl(), new_shape, type, data)); RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, column.tensorImpl(), new_shape, type, data));
......
...@@ -63,7 +63,8 @@ Status Sampler::CreateSamplerTensor(std::shared_ptr<Tensor> *sample_ids, int64_t ...@@ -63,7 +63,8 @@ Status Sampler::CreateSamplerTensor(std::shared_ptr<Tensor> *sample_ids, int64_t
} }
TensorShape shape(std::vector<dsize_t>(1, num_elements)); TensorShape shape(std::vector<dsize_t>(1, num_elements));
RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, col_desc_->tensorImpl(), shape, col_desc_->type())); RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, col_desc_->tensorImpl(), shape, col_desc_->type()));
(void)(*sample_ids)->GetMutableBuffer(); // allocate memory in case user forgets! RETURN_IF_NOT_OK(
(*sample_ids)->AllocateBuffer((*sample_ids)->SizeInBytes())); // allocate memory in case user forgets!
return Status::OK(); return Status::OK();
} }
......
...@@ -724,18 +724,26 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor &current_col, const dataeng ...@@ -724,18 +724,26 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor &current_col, const dataeng
// kBytesList can map to the following DE types ONLY! // kBytesList can map to the following DE types ONLY!
// DE_UINT8, DE_INT8 // DE_UINT8, DE_INT8
// Must be single byte type for each element! // Must be single byte type for each element!
if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8) { if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8 &&
current_col.type() != DataType::DE_STRING) {
std::string err_msg = "Invalid datatype for Tensor at column: " + current_col.name(); std::string err_msg = "Invalid datatype for Tensor at column: " + current_col.name();
RETURN_STATUS_UNEXPECTED(err_msg); RETURN_STATUS_UNEXPECTED(err_msg);
} }
const dataengine::BytesList &bytes_list = column_values_list.bytes_list(); const dataengine::BytesList &bytes_list = column_values_list.bytes_list();
*num_elements = bytes_list.value_size();
if (current_col.type() == DataType::DE_STRING) {
TensorShape shape = TensorShape::CreateScalar();
RETURN_IF_NOT_OK(current_col.MaterializeTensorShape(*num_elements, &shape));
RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, bytes_list, shape));
return Status::OK();
}
uint64_t max_size = 0; uint64_t max_size = 0;
for (uint32_t i = 0; i < bytes_list.value_size(); ++i) max_size = std::max(max_size, bytes_list.value(i).size()); for (uint32_t i = 0; i < bytes_list.value_size(); ++i) max_size = std::max(max_size, bytes_list.value(i).size());
*num_elements = bytes_list.value_size();
int64_t pad_size = max_size; int64_t pad_size = max_size;
// if user provides a shape in the form of [-1, d1, 2d, ... , dn], we need to pad to d1 * d2 * ... * dn // if user provides a shape in the form of [-1, d1, 2d, ... , dn], we need to pad to d1 * d2 * ... * dn
...@@ -879,7 +887,7 @@ Status TFReaderOp::LoadIntList(const ColDescriptor &current_col, const dataengin ...@@ -879,7 +887,7 @@ Status TFReaderOp::LoadIntList(const ColDescriptor &current_col, const dataengin
RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, current_col.tensorImpl(), current_shape, current_col.type())); RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, current_col.tensorImpl(), current_shape, current_col.type()));
// Tensors are lazily allocated, this eagerly allocates memory for the tensor. // Tensors are lazily allocated, this eagerly allocates memory for the tensor.
(void)(*tensor)->GetMutableBuffer(); RETURN_IF_NOT_OK((*tensor)->AllocateBuffer((*tensor)->SizeInBytes()));
int64_t i = 0; int64_t i = 0;
auto it = (*tensor)->begin<T>(); auto it = (*tensor)->begin<T>();
......
...@@ -162,7 +162,7 @@ void CastFrom(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out ...@@ -162,7 +162,7 @@ void CastFrom(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out
Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type) { Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type) {
RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type)); RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type));
static_cast<void>((*output)->GetMutableBuffer()); RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
switch (input->type().value()) { switch (input->type().value()) {
case DataType::DE_BOOL: case DataType::DE_BOOL:
CastFrom<bool>(input, output); CastFrom<bool>(input, output);
...@@ -211,7 +211,7 @@ Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> * ...@@ -211,7 +211,7 @@ Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *
// initiate new tensor for type cast // initiate new tensor for type cast
DataType new_type = DataType("float16"); DataType new_type = DataType("float16");
RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type)); RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type));
static_cast<void>((*output)->GetMutableBuffer()); RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
auto in_itr = input->begin<float>(); auto in_itr = input->begin<float>();
auto out_itr = (*output)->begin<float16>(); auto out_itr = (*output)->begin<float16>();
......
...@@ -64,7 +64,8 @@ Status Flip(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, int ...@@ -64,7 +64,8 @@ Status Flip(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, int
std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(input_cv->shape(), input_cv->type()); std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(input_cv->shape(), input_cv->type());
RETURN_UNEXPECTED_IF_NULL(output_cv); RETURN_UNEXPECTED_IF_NULL(output_cv);
(void)output_cv->GetMutableBuffer(); RETURN_IF_NOT_OK(output_cv->AllocateBuffer(output_cv->SizeInBytes()));
if (input_cv->mat().data) { if (input_cv->mat().data) {
try { try {
cv::flip(input_cv->mat(), output_cv->mat(), flip_code); cv::flip(input_cv->mat(), output_cv->mat(), flip_code);
......
...@@ -51,7 +51,7 @@ enum ColumnDataType { ...@@ -51,7 +51,7 @@ enum ColumnDataType {
// mapping as {"bytes", "string", "int32", "int64", "float32", "float64"}; // mapping as {"bytes", "string", "int32", "int64", "float32", "float64"};
const uint32_t ColumnDataTypeSize[kDataTypes] = {1, 1, 4, 8, 4, 8}; const uint32_t ColumnDataTypeSize[kDataTypes] = {1, 1, 4, 8, 4, 8};
const std::vector<std::string> ColumnDataTypeNameNormalized = {"uint8", "uint8", "int32", const std::vector<std::string> ColumnDataTypeNameNormalized = {"uint8", "string", "int32",
"int64", "float32", "float64"}; "int64", "float32", "float64"};
const std::unordered_map<std::string, ColumnDataType> ColumnDataTypeMap = { const std::unordered_map<std::string, ColumnDataType> ColumnDataTypeMap = {
......
...@@ -48,6 +48,7 @@ def mstype_to_detype(type_): ...@@ -48,6 +48,7 @@ def mstype_to_detype(type_):
mstype.float16: cde.DataType("float16"), mstype.float16: cde.DataType("float16"),
mstype.float32: cde.DataType("float32"), mstype.float32: cde.DataType("float32"),
mstype.float64: cde.DataType("float64"), mstype.float64: cde.DataType("float64"),
mstype.string: cde.DataType("string"),
}[type_] }[type_]
......
...@@ -26,7 +26,7 @@ from . import datasets ...@@ -26,7 +26,7 @@ from . import datasets
INT32_MAX = 2147483647 INT32_MAX = 2147483647
valid_detype = [ valid_detype = [
"bool", "int8", "int16", "int32", "int64", "uint8", "uint16", "bool", "int8", "int16", "int32", "int64", "uint8", "uint16",
"uint32", "uint64", "float16", "float32", "float64" "uint32", "uint64", "float16", "float32", "float64", "string"
] ]
......
...@@ -32,47 +32,47 @@ class MindDataTestDatatype : public UT::Common { ...@@ -32,47 +32,47 @@ class MindDataTestDatatype : public UT::Common {
TEST_F(MindDataTestDatatype, TestSizes) { TEST_F(MindDataTestDatatype, TestSizes) {
uint8_t x = DataType::SIZE_IN_BYTES[DataType::DE_BOOL]; uint8_t x = DataType::kTypeInfo[DataType::DE_BOOL].sizeInBytes_;
DataType d = DataType(DataType::DE_BOOL); DataType d = DataType(DataType::DE_BOOL);
ASSERT_EQ(x, 1); ASSERT_EQ(x, 1);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_INT8]; x = DataType::kTypeInfo[DataType::DE_INT8].sizeInBytes_;
d = DataType(DataType::DE_INT8); d = DataType(DataType::DE_INT8);
ASSERT_EQ(x, 1); ASSERT_EQ(x, 1);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_UINT8]; x = DataType::kTypeInfo[DataType::DE_UINT8].sizeInBytes_;
d = DataType(DataType::DE_UINT8); d = DataType(DataType::DE_UINT8);
ASSERT_EQ(x, 1); ASSERT_EQ(x, 1);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_INT16]; x = DataType::kTypeInfo[DataType::DE_INT16].sizeInBytes_;
d = DataType(DataType::DE_INT16); d = DataType(DataType::DE_INT16);
ASSERT_EQ(x, 2); ASSERT_EQ(x, 2);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_UINT16]; x = DataType::kTypeInfo[DataType::DE_UINT16].sizeInBytes_;
d = DataType(DataType::DE_UINT16); d = DataType(DataType::DE_UINT16);
ASSERT_EQ(x, 2); ASSERT_EQ(x, 2);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_INT32]; x = DataType::kTypeInfo[DataType::DE_INT32].sizeInBytes_;
d = DataType(DataType::DE_INT32); d = DataType(DataType::DE_INT32);
ASSERT_EQ(x, 4); ASSERT_EQ(x, 4);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_UINT32]; x = DataType::kTypeInfo[DataType::DE_UINT32].sizeInBytes_;
d = DataType(DataType::DE_UINT32); d = DataType(DataType::DE_UINT32);
ASSERT_EQ(x, 4); ASSERT_EQ(x, 4);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_INT64]; x = DataType::kTypeInfo[DataType::DE_INT64].sizeInBytes_;
d = DataType(DataType::DE_INT64); d = DataType(DataType::DE_INT64);
ASSERT_EQ(x, 8); ASSERT_EQ(x, 8);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_UINT64]; x = DataType::kTypeInfo[DataType::DE_UINT64].sizeInBytes_;
d = DataType(DataType::DE_UINT64); d = DataType(DataType::DE_UINT64);
ASSERT_EQ(x, 8); ASSERT_EQ(x, 8);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT32]; x = DataType::kTypeInfo[DataType::DE_FLOAT32].sizeInBytes_;
d = DataType(DataType::DE_FLOAT32); d = DataType(DataType::DE_FLOAT32);
ASSERT_EQ(x, 4); ASSERT_EQ(x, 4);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT64]; x = DataType::kTypeInfo[DataType::DE_FLOAT64].sizeInBytes_;
d = DataType(DataType::DE_FLOAT64); d = DataType(DataType::DE_FLOAT64);
ASSERT_EQ(x, 8); ASSERT_EQ(x, 8);
ASSERT_EQ(d.SizeInBytes(), x); ASSERT_EQ(d.SizeInBytes(), x);
......
...@@ -14,9 +14,7 @@ ...@@ -14,9 +14,7 @@
* limitations under the License. * limitations under the License.
*/ */
#include "common/common.h" #include "common/common.h"
#include "common/cvop_common.h"
#include "dataset/kernels/data/one_hot_op.h" #include "dataset/kernels/data/one_hot_op.h"
#include "dataset/core/cv_tensor.h"
#include "utils/log_adapter.h" #include "utils/log_adapter.h"
using namespace mindspore::dataset; using namespace mindspore::dataset;
...@@ -24,9 +22,9 @@ using mindspore::MsLogLevel::INFO; ...@@ -24,9 +22,9 @@ using mindspore::MsLogLevel::INFO;
using mindspore::ExceptionType::NoExceptionType; using mindspore::ExceptionType::NoExceptionType;
using mindspore::LogStream; using mindspore::LogStream;
class MindDataTestOneHotOp : public UT::CVOP::CVOpCommon { class MindDataTestOneHotOp : public UT::Common {
protected: protected:
MindDataTestOneHotOp() : CVOpCommon() {} MindDataTestOneHotOp() {}
}; };
TEST_F(MindDataTestOneHotOp, TestOp) { TEST_F(MindDataTestOneHotOp, TestOp) {
......
...@@ -65,14 +65,14 @@ TEST_F(MindDataTestStringTensorDE, Basics) { ...@@ -65,14 +65,14 @@ TEST_F(MindDataTestStringTensorDE, Basics) {
TEST_F(MindDataTestStringTensorDE, Basics2) { TEST_F(MindDataTestStringTensorDE, Basics2) {
std::shared_ptr<Tensor> t = std::shared_ptr<Tensor> t =
std::make_shared<Tensor>(std::vector<std::string>{"abc", "defg", "hi", "klmno", "123", "789"}, TensorShape({2, 3})); std::make_shared<Tensor>(std::vector<std::string>{"abc", "defg", "hi", "klmno", "123", "789"}, TensorShape({2, 3}));
ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20); ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20 + 4);
std::vector<uint32_t> offsets = {3, 8, 11, 17, 21, 25}; std::vector<uint32_t> offsets = {0, 4, 9, 12, 18, 22, 26};
uint32_t ctr = 0; uint32_t ctr = 0;
for (auto i : offsets) { for (auto i : offsets) {
ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i); ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i + 28);
ctr += 4; ctr += 4;
} }
const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4; const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4 + 4;
std::vector<uint32_t> starts = {0, 4, 9, 12, 18, 22}; std::vector<uint32_t> starts = {0, 4, 9, 12, 18, 22};
uint32_t index = 0; uint32_t index = 0;
...@@ -90,14 +90,14 @@ TEST_F(MindDataTestStringTensorDE, Empty) { ...@@ -90,14 +90,14 @@ TEST_F(MindDataTestStringTensorDE, Empty) {
std::shared_ptr<Tensor> t = std::make_shared<Tensor>(strings, TensorShape({2, 3})); std::shared_ptr<Tensor> t = std::make_shared<Tensor>(strings, TensorShape({2, 3}));
// abc_defg___123__ // abc_defg___123__
// 0123456789012345 // 0123456789012345
ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10); ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10 + 4);
std::vector<uint32_t> offsets = {3, 8, 9, 10, 14, 15}; std::vector<uint32_t> offsets = {0, 4, 9, 10, 11, 15, 16};
uint32_t ctr = 0; uint32_t ctr = 0;
for (auto i : offsets) { for (auto i : offsets) {
ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i); ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i + 28);
ctr += 4; ctr += 4;
} }
const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4; const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4 + 4;
std::vector<uint32_t> starts = {0, 4, 9, 10, 11, 15}; std::vector<uint32_t> starts = {0, 4, 9, 10, 11, 15};
uint32_t index = 0; uint32_t index = 0;
......
...@@ -41,6 +41,7 @@ class MindDataTestTensorDE : public UT::Common { ...@@ -41,6 +41,7 @@ class MindDataTestTensorDE : public UT::Common {
TEST_F(MindDataTestTensorDE, Basics) { TEST_F(MindDataTestTensorDE, Basics) {
std::shared_ptr<Tensor> t = std::make_shared<Tensor>(TensorShape({2, 3}), DataType(DataType::DE_UINT64)); std::shared_ptr<Tensor> t = std::make_shared<Tensor>(TensorShape({2, 3}), DataType(DataType::DE_UINT64));
ASSERT_TRUE((t->AllocateBuffer(t->SizeInBytes())).IsOk());
ASSERT_EQ(t->shape(), TensorShape({2, 3})); ASSERT_EQ(t->shape(), TensorShape({2, 3}));
ASSERT_EQ(t->type(), DataType::DE_UINT64); ASSERT_EQ(t->type(), DataType::DE_UINT64);
ASSERT_EQ(t->SizeInBytes(), 2 * 3 * 8); ASSERT_EQ(t->SizeInBytes(), 2 * 3 * 8);
......
{
"datasetType": "TF",
"numRows": 3,
"columns": {
"line": {
"type": "string",
"rank": 0
},
"words": {
"type": "string",
"rank": 1
},
"chinese": {
"type": "string",
"rank": 0
}
}
}
...@@ -584,7 +584,7 @@ def test_cv_minddataset_reader_basic_tutorial_5_epoch(add_and_remove_cv_file): ...@@ -584,7 +584,7 @@ def test_cv_minddataset_reader_basic_tutorial_5_epoch(add_and_remove_cv_file):
def test_cv_minddataset_reader_basic_tutorial_5_epoch_with_batch(add_and_remove_cv_file): def test_cv_minddataset_reader_basic_tutorial_5_epoch_with_batch(add_and_remove_cv_file):
"""tutorial for cv minderdataset.""" """tutorial for cv minderdataset."""
columns_list = ["data", "file_name", "label"] columns_list = ["data", "label"]
num_readers = 4 num_readers = 4
data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers) data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers)
...@@ -948,8 +948,7 @@ def test_write_with_multi_bytes_and_array_and_read_by_MindDataset(): ...@@ -948,8 +948,7 @@ def test_write_with_multi_bytes_and_array_and_read_by_MindDataset():
data_value_to_list = [] data_value_to_list = []
for item in data: for item in data:
new_data = {} new_data = {}
new_data['file_name'] = np.asarray( new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8)
new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32)
new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
...@@ -1153,8 +1152,7 @@ def test_write_with_multi_bytes_and_MindDataset(): ...@@ -1153,8 +1152,7 @@ def test_write_with_multi_bytes_and_MindDataset():
data_value_to_list = [] data_value_to_list = []
for item in data: for item in data:
new_data = {} new_data = {}
new_data['file_name'] = np.asarray( new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8)
new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32)
new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
......
...@@ -27,6 +27,7 @@ import mindspore.dataset as ds ...@@ -27,6 +27,7 @@ import mindspore.dataset as ds
import mindspore.dataset.transforms.vision.c_transforms as vision import mindspore.dataset.transforms.vision.c_transforms as vision
from mindspore import log as logger from mindspore import log as logger
from mindspore.dataset.transforms.vision import Inter from mindspore.dataset.transforms.vision import Inter
from mindspore.dataset.transforms.text import as_text
from mindspore.mindrecord import FileWriter from mindspore.mindrecord import FileWriter
FILES_NUM = 4 FILES_NUM = 4
...@@ -72,7 +73,7 @@ def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file): ...@@ -72,7 +73,7 @@ def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file):
for item in data_set.create_dict_iterator(): for item in data_set.create_dict_iterator():
logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter)) logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
logger.info("-------------- item[file_name]: \ logger.info("-------------- item[file_name]: \
{}------------------------".format("".join([chr(x) for x in item["file_name"]]))) {}------------------------".format(as_text(item["file_name"])))
logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
num_iter += 1 num_iter += 1
...@@ -92,7 +93,7 @@ def test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file): ...@@ -92,7 +93,7 @@ def test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file):
logger.info("-------------- item[data]: \ logger.info("-------------- item[data]: \
{}------------------------".format(item["data"][:10])) {}------------------------".format(item["data"][:10]))
logger.info("-------------- item[file_name]: \ logger.info("-------------- item[file_name]: \
{}------------------------".format("".join([chr(x) for x in item["file_name"]]))) {}------------------------".format(as_text(item["file_name"])))
logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
num_iter += 1 num_iter += 1
...@@ -110,7 +111,7 @@ def test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file): ...@@ -110,7 +111,7 @@ def test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file):
for item in data_set.create_dict_iterator(): for item in data_set.create_dict_iterator():
logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter)) logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
logger.info("-------------- item[file_name]: \ logger.info("-------------- item[file_name]: \
{}------------------------".format("".join([chr(x) for x in item["file_name"]]))) {}------------------------".format(as_text(item["file_name"])))
logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
num_iter += 1 num_iter += 1
...@@ -127,7 +128,7 @@ def test_cv_minddataset_pk_sample_out_of_range(add_and_remove_cv_file): ...@@ -127,7 +128,7 @@ def test_cv_minddataset_pk_sample_out_of_range(add_and_remove_cv_file):
for item in data_set.create_dict_iterator(): for item in data_set.create_dict_iterator():
logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter)) logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
logger.info("-------------- item[file_name]: \ logger.info("-------------- item[file_name]: \
{}------------------------".format("".join([chr(x) for x in item["file_name"]]))) {}------------------------".format(as_text(item["file_name"])))
logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
num_iter += 1 num_iter += 1
......
...@@ -17,17 +17,15 @@ import numpy as np ...@@ -17,17 +17,15 @@ import numpy as np
import pytest import pytest
import mindspore.dataset as ds import mindspore.dataset as ds
import mindspore.common.dtype as mstype
# pylint: disable=comparison-with-itself # pylint: disable=comparison-with-itself
def test_basic(): def test_basic():
x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S') x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S')
# x = np.array(["ab", "cde"], dtype='S')
n = cde.Tensor(x) n = cde.Tensor(x)
arr = n.as_array() arr = n.as_array()
y = np.array([1, 2]) np.testing.assert_array_equal(x, arr)
assert all(y == y)
# assert np.testing.assert_array_equal(y,y)
def compare(strings): def compare(strings):
...@@ -60,7 +58,125 @@ def test_batching_strings(): ...@@ -60,7 +58,125 @@ def test_batching_strings():
assert "[Batch ERROR] Batch does not support" in str(info) assert "[Batch ERROR] Batch does not support" in str(info)
def test_map():
    """Map a Python tokenizer over a single-row string column and verify the words.

    The generator yields one row holding the bytes string "ab cde 121"; the
    mapped operation splits it on whitespace and every row read back from the
    dataset must equal the three expected tokens.
    """
    def source():
        # Each yielded row is a 1-tuple, matching the single column "col".
        yield (np.array(["ab cde 121"], dtype='S'),)

    def tokenize(cell):
        # `cell` holds one bytes element: decode it, then split on whitespace.
        text = cell.item().decode("utf8")
        return np.array(text.split(), dtype='S')

    dataset = ds.GeneratorDataset(source, column_names=["col"])
    dataset = dataset.map(input_columns=["col"], operations=tokenize)

    want = np.array(["ab", "cde", "121"], dtype='S')
    for row in dataset:
        np.testing.assert_array_equal(row[0], want)
def as_str(arr):
    """Decode every bytes element of ``arr`` as UTF-8 and return the text array.

    Args:
        arr: numpy array (or scalar) whose elements expose ``decode``.

    Returns:
        A numpy array of the same shape holding the decoded strings.
    """
    to_text = np.vectorize(lambda raw: raw.decode("utf8"))
    return to_text(arr)
# Expected values used by the TFRecord / MindRecord string tests below,
# indexed by row number.

# One English sentence per row.
line = np.array([
    "This is a text file.",
    "Be happy every day.",
    "Good luck to everyone.",
])

# Four words per row; the last row mixes a Chinese character and an empty string.
words = np.array([
    ["This", "text", "file", "a"],
    ["Be", "happy", "day", "b"],
    ["女", "", "everyone", "c"],
])

# One Chinese sentence per row.
chinese = np.array([
    "今天天气太好了我们一起去外面玩吧",
    "男默女泪",
    "江州市长江大桥参加了长江大桥的通车仪式",
])
def test_tfrecord1():
    """Read string columns from the text TFRecord using a schema built with type names.

    The schema declares "line" and "chinese" as scalar strings and "words" as a
    1-D string column of unknown length; every row is compared against the
    module-level expected arrays.
    """
    schema = ds.Schema()
    schema.add_column("line", "string", [])
    schema.add_column("words", "string", [-1])
    schema.add_column("chinese", "string", [])

    dataset = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=schema)

    # (column name, expected array) pairs, checked in this order.
    checks = (("line", line), ("words", words), ("chinese", chinese))
    for row_idx, row in enumerate(dataset.create_dict_iterator()):
        # All shapes are verified before any content is compared.
        for column, expected in checks:
            assert row[column].shape == expected[row_idx].shape
        for column, expected in checks:
            np.testing.assert_array_equal(expected[row_idx], as_str(row[column]))
def test_tfrecord2():
    """Read string columns from the text TFRecord using a JSON schema file."""
    dataset = ds.TFRecordDataset(
        "../data/dataset/testTextTFRecord/text.tfrecord",
        shuffle=False,
        schema='../data/dataset/testTextTFRecord/datasetSchema.json',
    )

    # (column name, expected array) pairs, checked in this order.
    checks = (("line", line), ("words", words), ("chinese", chinese))
    for row_idx, row in enumerate(dataset.create_dict_iterator()):
        # All shapes are verified before any content is compared.
        for column, expected in checks:
            assert row[column].shape == expected[row_idx].shape
        for column, expected in checks:
            np.testing.assert_array_equal(expected[row_idx], as_str(row[column]))
def test_tfrecord3():
    """Read string columns from the text TFRecord using ``mstype.string`` and a reshaped column.

    The "words" column is declared with shape [-1, 2], so each expected row of
    four words is reshaped to 2x2 before comparison.
    """
    schema = ds.Schema()
    schema.add_column("line", mstype.string, [])
    schema.add_column("words", mstype.string, [-1, 2])
    schema.add_column("chinese", mstype.string, [])

    dataset = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=schema)

    for row_idx, row in enumerate(dataset.create_dict_iterator()):
        # Shapes first, then contents.
        assert row["line"].shape == line[row_idx].shape
        assert row["words"].shape == words[row_idx].reshape([2, 2]).shape
        assert row["chinese"].shape == chinese[row_idx].shape

        np.testing.assert_array_equal(line[row_idx], as_str(row["line"]))
        np.testing.assert_array_equal(words[row_idx].reshape([2, 2]), as_str(row["words"]))
        np.testing.assert_array_equal(chinese[row_idx], as_str(row["chinese"]))
def create_text_mindrecord():
    """Create a MindRecord file with string data.

    Helper used to generate testTextMindRecord/test.mindrecord; it writes three
    records, each with an "english" and a "chinese" string field, to
    "test.mindrecord".
    """
    from mindspore.mindrecord import FileWriter

    english_rows = [
        "This is a text file.",
        "Be happy every day.",
        "Good luck to everyone.",
    ]
    chinese_rows = [
        "今天天气太好了我们一起去外面玩吧",
        "男默女泪",
        "江州市长江大桥参加了长江大桥的通车仪式",
    ]
    records = [{"english": en, "chinese": zh} for en, zh in zip(english_rows, chinese_rows)]

    # Both fields are stored as plain strings.
    schema = {
        "english": {"type": "string"},
        "chinese": {"type": "string"},
    }

    writer = FileWriter("test.mindrecord")
    writer.add_schema(schema)
    writer.write_raw_data(records)
    writer.commit()
def test_mindrecord():
    """Read the string columns of the text MindRecord file and compare them to the expected rows."""
    dataset = ds.MindDataset("../data/dataset/testTextMindRecord/test.mindrecord", shuffle=False)

    # (column name, expected array) pairs, checked in this order.
    checks = (("english", line), ("chinese", chinese))
    for row_idx, row in enumerate(dataset.create_dict_iterator()):
        # All shapes are verified before any content is compared.
        for column, expected in checks:
            assert row[column].shape == expected[row_idx].shape
        for column, expected in checks:
            np.testing.assert_array_equal(expected[row_idx], as_str(row[column]))
if __name__ == '__main__': if __name__ == '__main__':
test_generator() # test_generator()
test_basic() # test_basic()
test_batching_strings() # test_batching_strings()
test_map()
# test_tfrecord1()
# test_tfrecord2()
# test_tfrecord3()
# test_mindrecord()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册