diff --git a/mindspore/ccsrc/dataset/core/CMakeLists.txt b/mindspore/ccsrc/dataset/core/CMakeLists.txt index 8141009bf6202c1561d8db3042a8f3b7b5105222..0b9f08d0702e1a08caa11b97d758b0895a44bab1 100644 --- a/mindspore/ccsrc/dataset/core/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/core/CMakeLists.txt @@ -1,6 +1,10 @@ +ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto) +ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto) file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) add_library(core OBJECT + ${EXAMPLE_SRCS} + ${FEATURE_SRCS} client.cc config_manager.cc cv_tensor.cc @@ -9,4 +13,5 @@ add_library(core OBJECT tensor.cc tensor_shape.cc ) +add_dependencies(core mindspore::protobuf) target_include_directories(core PRIVATE ${pybind11_INCLUDE_DIRS}) diff --git a/mindspore/ccsrc/dataset/core/data_type.cc b/mindspore/ccsrc/dataset/core/data_type.cc index 4420f78e2dce94f48bb30d80d2c86304560dfd98..744c8c1ca043cce0ca7aaa89754c2f68b444cbbd 100644 --- a/mindspore/ccsrc/dataset/core/data_type.cc +++ b/mindspore/ccsrc/dataset/core/data_type.cc @@ -25,14 +25,14 @@ namespace dataset { uint8_t DataType::SizeInBytes() const { if (type_ < DataType::NUM_OF_TYPES) - return SIZE_IN_BYTES[type_]; + return kTypeInfo[type_].sizeInBytes_; else return 0; } py::dtype DataType::AsNumpyType() const { if (type_ < DataType::NUM_OF_TYPES) - return py::dtype(PYBIND_TYPES[type_]); + return py::dtype(kTypeInfo[type_].pybindType_); else return py::dtype("unknown"); } @@ -40,7 +40,7 @@ py::dtype DataType::AsNumpyType() const { uint8_t DataType::AsCVType() const { uint8_t res = kCVInvalidType; if (type_ < DataType::NUM_OF_TYPES) { - res = CV_TYPES[type_]; + res = kTypeInfo[type_].cvType_; } if (res == kCVInvalidType) { @@ -108,7 +108,7 @@ DataType::DataType(const std::string &type_str) { std::string DataType::ToString() const { if (type_ < 
DataType::NUM_OF_TYPES) - return TO_STRINGS[type_]; + return kTypeInfo[type_].name_; else return "unknown"; } @@ -149,7 +149,7 @@ DataType DataType::FromNpArray(const py::array &arr) { std::string DataType::GetPybindFormat() const { std::string res; if (type_ < DataType::NUM_OF_TYPES) { - res = PYBIND_FORMAT_DESCRIPTOR[type_]; + res = kTypeInfo[type_].pybindFormatDescriptor_; } if (res.empty()) { diff --git a/mindspore/ccsrc/dataset/core/data_type.h b/mindspore/ccsrc/dataset/core/data_type.h index eb4bc24c77c8976f10430756b5ad662c9757a01c..f1f0bb2ebbb10b4992b66c3a4b9de99133e047c2 100644 --- a/mindspore/ccsrc/dataset/core/data_type.h +++ b/mindspore/ccsrc/dataset/core/data_type.h @@ -51,56 +51,31 @@ class DataType { NUM_OF_TYPES }; - inline static constexpr uint8_t SIZE_IN_BYTES[] = {0, // DE_UNKNOWN - 1, // DE_BOOL - 1, // DE_INT8 - 1, // DE_UINT8 - 2, // DE_INT16 - 2, // DE_UINT16 - 4, // DE_INT32 - 4, // DE_UINT32 - 8, // DE_INT64 - 8, // DE_UINT64 - 2, // DE_FLOAT16 - 4, // DE_FLOAT32 - 8, // DE_FLOAT64 - 0}; // DE_STRING - - inline static const char *TO_STRINGS[] = {"unknown", "bool", "int8", "uint8", "int16", "uint16", "int32", - "uint32", "int64", "uint64", "float16", "float32", "float64", "string"}; - - inline static const char *PYBIND_TYPES[] = {"object", "bool", "int8", "uint8", "int16", "uint16", "int32", - "uint32", "int64", "uint64", "float16", "float32", "double", "bytes"}; - - inline static const std::string PYBIND_FORMAT_DESCRIPTOR[] = {"", // DE_UNKNOWN - py::format_descriptor::format(), // DE_BOOL - py::format_descriptor::format(), // DE_INT8 - py::format_descriptor::format(), // DE_UINT8 - py::format_descriptor::format(), // DE_INT16 - py::format_descriptor::format(), // DE_UINT16 - py::format_descriptor::format(), // DE_INT32 - py::format_descriptor::format(), // DE_UINT32 - py::format_descriptor::format(), // DE_INT64 - py::format_descriptor::format(), // DE_UINT64 - "e", // DE_FLOAT16 - py::format_descriptor::format(), // DE_FLOAT32 - 
py::format_descriptor::format(), // DE_FLOAT64 - "S"}; // DE_STRING - - inline static constexpr uint8_t CV_TYPES[] = {kCVInvalidType, // DE_UNKNOWN - CV_8U, // DE_BOOL - CV_8S, // DE_INT8 - CV_8U, // DE_UINT8 - CV_16S, // DE_INT16 - CV_16U, // DE_UINT16 - CV_32S, // DE_INT32 - kCVInvalidType, // DE_UINT32 - kCVInvalidType, // DE_INT64 - kCVInvalidType, // DE_UINT64 - CV_16F, // DE_FLOAT16 - CV_32F, // DE_FLOAT32 - CV_64F, // DE_FLOAT64 - kCVInvalidType}; // DE_STRING + struct TypeInfo { + const char *name_; // name used to represent the type when printing + const uint8_t sizeInBytes_; // number of bytes needed for this type + const char *pybindType_; // Python matching type, used in get_output_types + const std::string pybindFormatDescriptor_; // pybind format used for numpy types + const uint8_t cvType_; // OpenCV matching type + }; + + static inline const TypeInfo kTypeInfo[] = { + // name, sizeInBytes, pybindType, formatDescriptor, openCV + {"unknown", 0, "object", "", kCVInvalidType}, // DE_UNKNOWN + {"bool", 1, "bool", py::format_descriptor::format(), CV_8U}, // DE_BOOL + {"int8", 1, "int8", py::format_descriptor::format(), CV_8S}, // DE_INT8 + {"uint8", 1, "uint8", py::format_descriptor::format(), CV_8U}, // DE_UINT8 + {"int16", 2, "int16", py::format_descriptor::format(), CV_16S}, // DE_INT16 + {"uint16", 2, "uint16", py::format_descriptor::format(), CV_16U}, // DE_UINT16 + {"int32", 4, "int32", py::format_descriptor::format(), CV_32S}, // DE_INT32 + {"uint32", 4, "uint32", py::format_descriptor::format(), kCVInvalidType}, // DE_UINT32 + {"int64", 8, "int64", py::format_descriptor::format(), kCVInvalidType}, // DE_INT64 + {"uint64", 8, "uint64", py::format_descriptor::format(), kCVInvalidType}, // DE_UINT64 + {"float16", 2, "float16", "e", CV_16F}, // DE_FLOAT16 + {"float32", 4, "float32", py::format_descriptor::format(), CV_32F}, // DE_FLOAT32 + {"float64", 8, "double", py::format_descriptor::format(), CV_64F}, // DE_FLOAT64 + {"string", 0, "bytes", "S", 
kCVInvalidType} // DE_STRING + }; // No arg constructor to create an unknown shape DataType() : type_(DE_UNKNOWN) {} diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/example.proto b/mindspore/ccsrc/dataset/core/example.proto similarity index 100% rename from mindspore/ccsrc/dataset/engine/datasetops/source/example.proto rename to mindspore/ccsrc/dataset/core/example.proto diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/feature.proto b/mindspore/ccsrc/dataset/core/feature.proto similarity index 100% rename from mindspore/ccsrc/dataset/engine/datasetops/source/feature.proto rename to mindspore/ccsrc/dataset/core/feature.proto diff --git a/mindspore/ccsrc/dataset/core/tensor.cc b/mindspore/ccsrc/dataset/core/tensor.cc index 54e11ca0fbe74cfec98241b604bf03be39b6330f..629daefc61646ed84a8cd347d177e241dd768afc 100644 --- a/mindspore/ccsrc/dataset/core/tensor.cc +++ b/mindspore/ccsrc/dataset/core/tensor.cc @@ -57,18 +57,40 @@ Tensor::Tensor(const TensorShape &shape, const DataType &type) : shape_(shape), } Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data) : Tensor(shape, type) { + if (type.IsNumeric()) { + // If the data pointer was given, then we can also populate the tensor with data + if (data != nullptr) { + // Given the shape/type of this tensor, compute the data size and copy in the input bytes. 
+ int64_t byte_size = this->SizeInBytes(); + Status s = this->AllocateBuffer(byte_size); // Allocates data_ inside itself + if (s.IsOk() && data_ != nullptr) { + int ret_code = memcpy_s(data_, byte_size, data, byte_size); + if (ret_code != 0) { + MS_LOG(ERROR) << "Failed to copy data into Tensor!"; + } + } else { + MS_LOG(ERROR) << "Failed to create memory for Tensor!"; + } + } + } else { + MS_LOG(ERROR) << "Type should be numeric to use this constructor."; + } +} + +Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length) + : Tensor(shape, type) { // If the data pointer was given, then we can also populate the tensor with data if (data != nullptr) { - // Given the shape/type of this tensor, compute the data size and copy in the input bytes. - int64_t byte_size = this->SizeInBytes(); - static_cast(this->GetMutableBuffer()); // Allocates data_ inside itself + // Allocates data_ inside itself + Status s = AllocateBuffer(length); + if (s.IsError()) { + MS_LOG(ERROR) << "Failed to create memory for Tensor!"; + } if (data_ != nullptr) { - int ret_code = memcpy_s(data_, byte_size, data, byte_size); + int ret_code = memcpy_s(data_, length, data, length); if (ret_code != 0) { MS_LOG(ERROR) << "Failed to copy data into Tensor!"; } - } else { - MS_LOG(ERROR) << "Failed to create memory for Tensor!"; } } } @@ -98,32 +120,79 @@ Tensor::Tensor(const std::vector &strings, const TensorShape &shape auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; }; dsize_t total_length = std::accumulate(strings.begin(), strings.end(), 0, length_sum); - dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + total_length; + // total bytes needed = offset array + strings + // offset array needs to store one offset var per element + 1 extra to get the length of the last string. 
+ // strings will be null-terminated --> need 1 extra byte per element + dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + kOffsetSize + total_length; data_ = data_allocator_->allocate(num_bytes); auto offset_arr = reinterpret_cast(data_); uchar *buf = GetStringsBuffer(); - offset_t offset = -1; + offset_t offset = buf - data_; // the first string will start here uint32_t i = 0; for (const auto &str : strings) { - // insert the end index of the string - // end index of a string is the end index of previous string + the length (including \0) - offset = offset + str.length() + 1; + // insert the start index of the string. offset_arr[i++] = offset; // total bytes are reduced by kOffsetSize num_bytes -= kOffsetSize; // insert actual string - memcpy_s(buf, num_bytes, str.c_str(), str.length() + 1); - buf += str.length() + 1; + int ret_code = memcpy_s(data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1); + if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor"; + // next string will be stored right after the current one. + offset = offset + str.length() + 1; + // total bytes are reduced by the length of the string num_bytes -= str.length() + 1; } - this->data_end_ = buf; + // store one more offset value so we can get the length of the last string + // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element] + offset_arr[i] = offset; + + this->data_end_ = data_ + offset_arr[i]; + DS_ASSERT(num_bytes == 0); if (shape.known()) Tensor::Reshape(shape); } +Tensor::Tensor(const dataengine::BytesList &bytes_list, const TensorShape &shape) + : Tensor(TensorShape({static_cast(bytes_list.value_size())}), DataType(DataType::DE_STRING)) { + // total bytes needed = offset array + strings + // offset array needs to store one offset var per element + 1 extra to get the length of the last string. 
+ // strings will be null-terminated --> need 1 extra byte per element + dsize_t num_bytes = (kOffsetSize)*shape_.NumOfElements() + kOffsetSize + bytes_list.ByteSizeLong(); + + data_ = data_allocator_->allocate(num_bytes); + + auto offset_arr = reinterpret_cast(data_); + uchar *buf = GetStringsBuffer(); + + offset_t offset = buf - data_; // the first string will start here + uint32_t i = 0; + for (; i < bytes_list.value_size(); i++) { + const std::string &str = bytes_list.value(i); + // insert the start index of the string. + offset_arr[i] = offset; + // total bytes are reduced by kOffsetSize + num_bytes -= kOffsetSize; + // insert actual string + int ret_code = memcpy_s(data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1); + if (ret_code != 0) { + MS_LOG(ERROR) << "Cannot copy string into Tensor"; + } + // next string will be stored right after the current one. + offset = offset + str.length() + 1; + // total bytes are reduced by the length of the string + num_bytes -= str.length() + 1; + } + // store one more offset value so we can get the length of the last string + // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element] + offset_arr[i] = offset; + + data_end_ = data_ + offset_arr[i]; + DS_ASSERT(num_bytes == 0); + if (shape.known()) Tensor::Reshape(shape); +} Status Tensor::CreateTensor(std::shared_ptr *ptr, TensorImpl tensor_impl, const TensorShape &shape, DataType type, const unsigned char *data) { if (!shape.known()) { @@ -152,20 +221,17 @@ Status Tensor::CreateTensor(std::shared_ptr *ptr, TensorImpl tensor_impl } return Status::OK(); // returns base-class shared_ptr } -std::string to(std::string x) { return x; } + Status Tensor::CreateTensorFromNumpyString(std::shared_ptr *ptr, py::array arr) { std::vector shape; for (dsize_t i = 0; i < arr.ndim(); i++) { shape.push_back(static_cast(arr.shape()[i])); } - arr.resize({arr.size()}); - auto itr = arr.begin(); + arr.resize({arr.size()}); // flatten the py::array so we can 
iterate once std::vector strings; - for (; itr != arr.end(); itr++) { - std::string s = to(py::cast(*itr)); - strings.push_back(s); - } - arr.resize(shape); + std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast(s)); }); + + arr.resize(shape); // resize arr back to the original shape return CreateTensor(ptr, strings, TensorShape{shape}); } @@ -190,8 +256,9 @@ Status Tensor::CreateTensor(std::shared_ptr *ptr, py::array arr) { std::shared_ptr global_pool = GlobalContext::Instance()->mem_pool(); (*ptr)->data_allocator_ = std::make_unique>(global_pool); - static_cast((*ptr)->GetMutableBuffer()); int64_t byte_size = (*ptr)->SizeInBytes(); + RETURN_IF_NOT_OK((*ptr)->AllocateBuffer(byte_size)); + unsigned char *data = static_cast(arr.request().ptr); if ((*ptr)->data_ == nullptr) { RETURN_STATUS_UNEXPECTED("Failed to create memory for Tensor."); @@ -232,6 +299,13 @@ Status Tensor::CreateTensor(std::shared_ptr *ptr, const std::vector *ptr, const dataengine::BytesList &bytes_list, + const TensorShape &shape) { + const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator(); + *ptr = std::allocate_shared(*alloc, bytes_list, shape); + return Status::OK(); +} + // Memcpy the given strided array's used part to consecutive memory // Consider a 3-d array // A[(i * shape[1] + j) * shape[2] + k] = B[i][j][k] = C[i * strides[0] + j * strides[1] + k * strides[2]] @@ -370,25 +444,20 @@ void Tensor::Print(std::ostream &out) const { out << "[Data area is null]"; } } - -// Name: ToFlatIndex() -// Description: convert a vector style index to number, used to access memory internal use only -Status Tensor::ToFlatIndex(const std::vector &index, dsize_t *flat_index) const { - if (!shape_.IsValidIndex(index)) { - std::string err = "Not a valid index"; - RETURN_STATUS_UNEXPECTED(err); - } - *flat_index = 0; - for (size_t k = 0; k < index.size(); k++) { - dsize_t product = 1; - for (size_t l = k + 1; l < index.size(); l++) { - product *= 
shape_[l]; +Status Tensor::AllocateBuffer(const dsize_t &length) { + if (data_ == nullptr) { + if (data_allocator_ != nullptr) { + data_ = data_allocator_->allocate(length); + RETURN_UNEXPECTED_IF_NULL(data_); + data_end_ = data_ + length; + } else { + data_ = static_cast(malloc(length)); + data_end_ = data_ + length; + RETURN_UNEXPECTED_IF_NULL(data_); } - *flat_index += index[k] * product; } return Status::OK(); } - const unsigned char *Tensor::GetBuffer() const { // This version cannot modify anything. data_ could possibly be null. return data_; @@ -404,17 +473,11 @@ unsigned char *Tensor::GetMutableBuffer() { } else { // If the data area is not created, then identify the memory size based // on the shape and type and allocate it. - if (data_allocator_ != nullptr) { - data_ = data_allocator_->allocate(this->SizeInBytes()); - data_end_ = data_ + SizeInBytes(); + if (this->AllocateBuffer(this->SizeInBytes()).IsOk()) { + return data_; } else { - data_ = static_cast(malloc(this->SizeInBytes())); - data_end_ = data_ + SizeInBytes(); - if (data_ == nullptr) { - return nullptr; - } + return nullptr; } - return data_; } } @@ -444,7 +507,7 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector &index) const { RETURN_STATUS_UNEXPECTED(err); } dsize_t flat_idx; - RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx)); + RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx)); *ptr = reinterpret_cast(data_ + flat_idx * type_.SizeInBytes()); return Status::OK(); @@ -461,7 +524,7 @@ Status Tensor::GetItemPtr(uchar **ptr, const std::vector &index, offset RETURN_STATUS_UNEXPECTED(err); } dsize_t flat_idx; - RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx)); + RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx)); offset_t length_temp = 0; RETURN_IF_NOT_OK(GetStringAt(flat_idx, ptr, &length_temp)); if (length != nullptr) *length = length_temp; @@ -481,7 +544,7 @@ Status Tensor::StartAddrOfIndex(std::vector ind, uchar **start_addr_of_ std::vector r(t_shape.begin() + ind.size(), 
t_shape.end()); *remaining = TensorShape(r); ind.resize(this->Rank(), 0); // same as -> while (ind.size() < this->Rank()) ind.push_back(0); - RETURN_IF_NOT_OK(ToFlatIndex(ind, &flat_ind)); + RETURN_IF_NOT_OK(shape_.ToFlatIndex(ind, &flat_ind)); // check if GetBuffer() returns null, we should flag this as an error, this sanity check will only // be true is the tensor failed to allocate memory. if (GetMutableBuffer() == nullptr) { @@ -588,10 +651,10 @@ Status Tensor::GetItemAt(std::string_view *o, const std::vector &index) RETURN_UNEXPECTED_IF_NULL(o); CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Type is not DE_STRING"); - uchar *buf = nullptr; + uchar *start = nullptr; offset_t length = 0; - RETURN_IF_NOT_OK(GetItemPtr(&buf, index, &length)); - std::string_view sv{reinterpret_cast(buf), length}; + RETURN_IF_NOT_OK(GetItemPtr(&start, index, &length)); + std::string_view sv{reinterpret_cast(start)}; o->swap(sv); return Status::OK(); } @@ -778,13 +841,11 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length RETURN_UNEXPECTED_IF_NULL(string_start); RETURN_UNEXPECTED_IF_NULL(length); auto *offset_ptr = reinterpret_cast(data_); // offsets starts here - offset_t end = offset_ptr[index]; - offset_t start = 0; - if (index != 0) start = offset_ptr[index - 1] + 1; // string starts at where the previous string ends + 1 - uchar *buf = GetStringsBuffer(); // string data starts here - *string_start = buf + start; - *length = end - start; + offset_t start = offset_ptr[index]; + *string_start = data_ + start; + *length = offset_ptr[index + 1] - start - 1; // -1 to skip the \0 from the string length return Status::OK(); } + } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/tensor.h b/mindspore/ccsrc/dataset/core/tensor.h index 1f3a2a40f8e9adfac9b1cd945f1b321b74c995e2..5efd989fc9c397a452c3edabcb7fdd3a5c800c96 100644 --- a/mindspore/ccsrc/dataset/core/tensor.h +++ 
b/mindspore/ccsrc/dataset/core/tensor.h @@ -35,6 +35,7 @@ #include "dataset/util/allocator.h" #include "dataset/util/de_error.h" #include "dataset/util/status.h" +#include "proto/example.pb.h" namespace py = pybind11; namespace mindspore { @@ -64,6 +65,8 @@ class Tensor { // @param data unsigned char*, pointer to the data. Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data); + Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length); + Tensor(const Tensor &other) = delete; Tensor &operator=(const Tensor &other) = delete; @@ -72,6 +75,8 @@ class Tensor { Tensor &operator=(Tensor &&other) noexcept; + Status AllocateBuffer(const dsize_t &length); + // type of offest values to store strings information using offset_t = uint32_t; // const of the size of the offset variable @@ -84,15 +89,24 @@ class Tensor { // Construct a tensor from a list of strings. Reshape the tensor with `shape` if given, otherwise assume the shape is // the size of the vector `strings`. // The memory layout of a Tensor of strings consists of the Offset_array followed by the strings. - // OFFSET1, OFFSET2, ... String1, String2, ... - // The value of each offset is the end index of the corresponding string + // The offset array will store one extra value to find the length of the last string. 
+ // OFFSET1, OFFSET2, ..., OFFSETn+1, STRING1, STRING2, ..., STRINGn + // The value of each offset is the start index of the corresponding string // Offsets is of type offest_t // strings will ne null-terminated // example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING) - // 3 6 a b c \0 d e \0 + // |------------------------------------------------------------------| + // | OFFSET ARRAY | STRINGS | + // | bytes 0-3 | bytes 4-7 | bytes 8-11 | bytes 12-15 | bytes 16-18 | + // | 12 | 16 | 19 | abc\0 | de\0 | + // |------------------------------------------------------------------| explicit Tensor(const std::vector &strings, const TensorShape &shape = TensorShape::CreateUnknownRankShape()); + // Same as Tensor(vector) but the input is protobuf bytelist + explicit Tensor(const dataengine::BytesList &bytes_list, + const TensorShape &shape = TensorShape::CreateUnknownRankShape()); + // A static factory method to create the given flavour of derived Tensor // Returns the base class reference for the Tensor. 
// @param ptr output argument to hold the created Tensor of given tensor_impl @@ -121,6 +135,9 @@ class Tensor { static Status CreateTensor(std::shared_ptr *ptr, const std::vector &strings, const TensorShape &shape = TensorShape::CreateUnknownRankShape()); + static Status CreateTensor(std::shared_ptr *ptr, const dataengine::BytesList &bytes_list, + const TensorShape &shape); + // Copy raw data of a array based on shape and strides to the destination pointer // @param dst Pointer to the destination array where the content is to be copied // @param src Pointer to the source of strided array to be copied @@ -166,7 +183,7 @@ class Tensor { // @param value of type `T` template Status SetItemAt(const std::vector &index, const T &value) { - static_cast(GetMutableBuffer()); + RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes())); T *ptr = nullptr; RETURN_IF_NOT_OK(GetItemPtr(&ptr, index)); *ptr = value; @@ -203,7 +220,7 @@ class Tensor { template Status Fill(const T &value) { CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings."); - static_cast(GetMutableBuffer()); + RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes())); int64_t cellSize = type_.SizeInBytes(); if ((data_ != nullptr) && type_.IsCompatible()) { for (dsize_t i = 0; i < Size(); i++) { @@ -418,32 +435,28 @@ class Tensor { using pointer = std::string_view *; using reference = std::string_view &; - explicit TensorIterator(uchar *offset = nullptr, const uchar *buf = nullptr, dsize_t index = 0) { - offset_ = reinterpret_cast(offset); - buf_ = reinterpret_cast(buf); + explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) { + data_ = reinterpret_cast(data); index_ = index; } TensorIterator(const TensorIterator &raw_iterator) { - offset_ = raw_iterator.offset_; - buf_ = raw_iterator.buf_; + data_ = raw_iterator.data_; index_ = raw_iterator.index_; } ~TensorIterator() = default; - bool operator==(const TensorIterator &rhs) { - return buf_ == rhs.buf_ && offset_ == 
rhs.offset_ && index_ == rhs.index_; - } + bool operator==(const TensorIterator &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; } bool operator!=(const TensorIterator &rhs) { return !(*this == rhs); } - operator bool() const { return offset_ != nullptr; } + operator bool() const { return data_ != nullptr; } std::string_view operator*() const { - offset_t start = 0; - if (index_ != 0) start = offset_[index_ - 1] + 1; - return std::string_view{buf_ + start}; + auto offset_ = reinterpret_cast(data_); + offset_t start = offset_[index_]; + return std::string_view{data_ + start}; } TensorIterator &operator+=(const dsize_t &inc) { @@ -496,8 +509,7 @@ class Tensor { protected: dsize_t index_; - offset_t *offset_; - const char *buf_; + const char *data_; }; // Return a TensorIterator that points to the start of the Tensor. @@ -518,11 +530,6 @@ class Tensor { } protected: - // Returns the location of the item assuming row major memory layout. - // @param index - // @return - Status ToFlatIndex(const std::vector &index, dsize_t *flat_index) const; - // A function that prints Tensor recursively, first called by print // @param out // @param cur_dim @@ -559,7 +566,7 @@ class Tensor { // Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if the // tensor's type is a string, otherwise undefined address would be returned. // @return address of the first string of the tensor. 
- uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements(); } + uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; } // all access to shape_ should be via shape TensorShape shape_; @@ -573,14 +580,8 @@ class Tensor { unsigned char *data_end_ = nullptr; }; template <> -inline Tensor::TensorIterator Tensor::begin() { - uchar *buf = GetStringsBuffer(); - return TensorIterator(data_, buf); -} -template <> inline Tensor::TensorIterator Tensor::end() { - uchar *buf = GetStringsBuffer(); - return TensorIterator(data_, buf, shape_.NumOfElements()); + return TensorIterator(data_, shape_.NumOfElements()); } } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/tensor_shape.cc b/mindspore/ccsrc/dataset/core/tensor_shape.cc index e24b2bc12ba350e10169e0b69ba9d4e84828c3b1..30afdf38bc87400bee4b29a2363c24220a180501 100644 --- a/mindspore/ccsrc/dataset/core/tensor_shape.cc +++ b/mindspore/ccsrc/dataset/core/tensor_shape.cc @@ -40,16 +40,7 @@ dsize_t TensorShape::NumOfElements() const { if (!known()) { return 0; } - dsize_t num = 1; - for (auto i : raw_shape_) { - if (multi_ok(num, i)) { - num *= i; - } else { - // dsize_t can wrap since it is signed int, we double check here - MS_LOG(ERROR) << "Tensor shape larger than maximum allowed value!"; - } - } - return num; + return strides_[0]; } void TensorShape::Print(std::ostream &out) const { @@ -72,20 +63,23 @@ void TensorShape::Print(std::ostream &out) const { } TensorShape::TensorShape(const std::initializer_list &list) - : raw_shape_(*GlobalContext::Instance()->int_allocator()) { + : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { AddListToShape(list); } -TensorShape::TensorShape(const std::vector &list) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { +TensorShape::TensorShape(const std::vector &list) + : 
raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { AddListToShape(list); } -TensorShape::TensorShape(const TensorShape &shape) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { +TensorShape::TensorShape(const TensorShape &shape) + : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { AddListToShape(shape.AsVector()); known_ = shape.known_; // override with the input shape in case of unknown-rank tensor shape. } -TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { +TensorShape::TensorShape(py::list l) + : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { std::vector list_c; for (auto &i : l) { if (!i.is_none()) { @@ -97,6 +91,18 @@ TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->in AddListToShape(list_c); } +TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type) + : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) { + for (int i = 0; i < cv_size.dims(); i++) { + raw_shape_.push_back(cv_size[i]); + } + auto channels = static_cast(1 + (type >> static_cast(CV_CN_SHIFT))); + if (channels != 1) { + raw_shape_.push_back(channels); + } + known_ = true; +} + TensorShape TensorShape::CreateUnknownRankShape() { TensorShape s({}); s.known_ = false; @@ -109,17 +115,6 @@ TensorShape TensorShape::InsertDim(dsize_t axis, dsize_t dim) const { return TensorShape(tmp); } -TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type) : raw_shape_(*GlobalContext::Instance()->int_allocator()) { - for (int i = 0; i < cv_size.dims(); i++) { - raw_shape_.push_back(cv_size[i]); - } - auto channels = static_cast(1 + (type >> static_cast(CV_CN_SHIFT))); - if (channels != 1) { - raw_shape_.push_back(channels); - } - known_ = true; -} - std::vector 
TensorShape::AsVector() const { return std::vector(raw_shape_.begin(), raw_shape_.end()); } @@ -139,23 +134,28 @@ bool TensorShape::IsValidIndex(const std::vector &index) const { template void TensorShape::AddListToShape(const T &list) { + raw_shape_.resize(list.size()); + strides_.resize(list.size() + 1); + strides_[list.size()] = 1; known_ = true; - dsize_t num = 1; dsize_t size = 0; - for (const auto &itr : list) { - if (itr > 0) { - if (num > std::numeric_limits::max() / itr) { + auto itr = std::rbegin(list); // iterate over the list in reverse order + auto s = list.size() - 1; // to compute strides while adding dims + for (; itr != std::rend(list); itr++, s--) { + dsize_t dim = *itr; + if (dim > 0) { + if (strides_[s + 1] > std::numeric_limits::max() / dim) { MS_LOG(ERROR) << "Invalid shape data, overflow occurred!"; known_ = false; raw_shape_.clear(); return; } - num *= itr; + strides_[s] = dim * strides_[s + 1]; } - if (itr < 0) { + if (dim < 0) { known_ = false; } - if (itr > kDeMaxDim) { + if (dim > kDeMaxDim) { std::stringstream ss; ss << "Invalid shape data, dim (" << size << ") is larger than the maximum dim size(" << kDeMaxDim << ")!"; MS_LOG(ERROR) << ss.str().c_str(); @@ -163,7 +163,7 @@ void TensorShape::AddListToShape(const T &list) { raw_shape_.clear(); return; } - raw_shape_.push_back(itr); + raw_shape_[s] = dim; size++; } if (size > kDeMaxRank) { @@ -215,17 +215,18 @@ TensorShape TensorShape::Squeeze() const { } return TensorShape(new_shape); } -std::vector TensorShape::Strides() { - std::vector strides(Rank()); - dsize_t count = NumOfElements(); - for (dsize_t i = 0; i < Rank(); i++) { - if (raw_shape_[i] != 0) - count /= raw_shape_[i]; - else - count = 0; - strides[i] = count; + +std::vector TensorShape::Strides() const { return std::vector{strides_.begin() + 1, strides_.end()}; } + +// Name: ToFlatIndex() +// Description: convert a vector style index to number, used to access memory internal use only +Status TensorShape::ToFlatIndex(const 
std::vector &index, dsize_t *flat_index) const { + *flat_index = 0; + for (size_t k = 0; k < index.size(); k++) { + *flat_index += index[k] * strides_[k + 1]; // skip the first element of strides_ which is numOfElements } - return strides; + CHECK_FAIL_RETURN_UNEXPECTED(*flat_index < NumOfElements(), "Not a valid index"); + return Status::OK(); } } // namespace dataset } // namespace mindspore diff --git a/mindspore/ccsrc/dataset/core/tensor_shape.h b/mindspore/ccsrc/dataset/core/tensor_shape.h index 33dd0a81ee9ad5871e6513720e62ff7f594bccc9..6cfb007b561da2c7440008b437c9fd4e1a5f3e56 100644 --- a/mindspore/ccsrc/dataset/core/tensor_shape.h +++ b/mindspore/ccsrc/dataset/core/tensor_shape.h @@ -156,13 +156,20 @@ class TensorShape { TensorShape Squeeze() const; - std::vector Strides(); + std::vector Strides() const; + + // Returns the location of the item assuming row major memory layout. + // @param index + // @return + Status ToFlatIndex(const std::vector &index, dsize_t *flat_index) const; private: // True if known and valid shape, false otherwise bool known_; // Vector to keep the dims of the shape. std::vector raw_shape_; + // Vector to keep the strides of the shape. The size is rank+1 + std::vector strides_; // Internal utility function to iterate over a list, check if the dim is valid and then insert it into the shape. 
// @tparam T list diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt b/mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt index b909a76ee1212a0b19b6a23670cfc5dd7e1dc6dc..a1d0b22f152e955fb4eef1fcbddd7b48598f2519 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt @@ -1,5 +1,3 @@ -ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto) -ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto) add_subdirectory(sampler) file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc") set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD) @@ -15,13 +13,9 @@ add_library(engine-datasetops-source OBJECT image_folder_op.cc mnist_op.cc voc_op.cc - ${EXAMPLE_SRCS} - ${FEATURE_SRCS} manifest_op.cc cifar_op.cc random_data_op.cc celeba_op.cc text_file_op.cc - ) - -add_dependencies(engine-datasetops-source mindspore::protobuf) + ) \ No newline at end of file diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc index 49c7e78a60938f79afa0ff130edc4e4ef2b2a5f7..358dd07872bca150898092729010dc7ed55da392 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc @@ -127,8 +127,10 @@ Status MindRecordOp::Init() { std::string type_str = mindrecord::ColumnDataTypeNameNormalized[col_data_types[i]]; DataType t_dtype = DataType(type_str); // valid types: {"bytes", "string", "int32", "int64", "float32", "float64"} - if (col_data_types[i] == mindrecord::ColumnBytes || col_data_types[i] == mindrecord::ColumnString) { // rank = 1 + if (col_data_types[i] == mindrecord::ColumnBytes) { // rank = 1 col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 1); + } else if (col_data_types[i] == 
mindrecord::ColumnString) { // rank = 0 + col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 0); } else if (col_shapes[i].size() > 0) { std::vector vec(col_shapes[i].size()); // temporary vector to hold shape (void)std::copy(col_shapes[i].begin(), col_shapes[i].end(), vec.begin()); @@ -309,7 +311,10 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector(num_elements), &new_shape)); RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, column.tensorImpl(), new_shape, type, data)); diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc index 90e950fceefe4af1cd620f85ad96c4bbc162faea..695f364b7f22e75a01a4751542156d3231fa1eff 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc @@ -63,7 +63,8 @@ Status Sampler::CreateSamplerTensor(std::shared_ptr *sample_ids, int64_t } TensorShape shape(std::vector(1, num_elements)); RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, col_desc_->tensorImpl(), shape, col_desc_->type())); - (void)(*sample_ids)->GetMutableBuffer(); // allocate memory in case user forgets! + RETURN_IF_NOT_OK( + (*sample_ids)->AllocateBuffer((*sample_ids)->SizeInBytes())); // allocate memory in case user forgets! return Status::OK(); } diff --git a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc index 8b679388a7b04610f5e4dd5ac265906bd4e3de37..1335344e6dccad57ec60f0568c95e6139893a5e0 100644 --- a/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc +++ b/mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc @@ -724,18 +724,26 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor ¤t_col, const dataeng // kBytesList can map to the following DE types ONLY! // DE_UINT8, DE_INT8 // Must be single byte type for each element! 
- if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8) { + if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8 && + current_col.type() != DataType::DE_STRING) { std::string err_msg = "Invalid datatype for Tensor at column: " + current_col.name(); RETURN_STATUS_UNEXPECTED(err_msg); } const dataengine::BytesList &bytes_list = column_values_list.bytes_list(); + *num_elements = bytes_list.value_size(); + + if (current_col.type() == DataType::DE_STRING) { + TensorShape shape = TensorShape::CreateScalar(); + RETURN_IF_NOT_OK(current_col.MaterializeTensorShape(*num_elements, &shape)); + RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, bytes_list, shape)); + return Status::OK(); + } + uint64_t max_size = 0; for (uint32_t i = 0; i < bytes_list.value_size(); ++i) max_size = std::max(max_size, bytes_list.value(i).size()); - *num_elements = bytes_list.value_size(); - int64_t pad_size = max_size; // if user provides a shape in the form of [-1, d1, 2d, ... , dn], we need to pad to d1 * d2 * ... * dn @@ -879,7 +887,7 @@ Status TFReaderOp::LoadIntList(const ColDescriptor ¤t_col, const dataengin RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, current_col.tensorImpl(), current_shape, current_col.type())); // Tensors are lazily allocated, this eagerly allocates memory for the tensor. 
- (void)(*tensor)->GetMutableBuffer(); + RETURN_IF_NOT_OK((*tensor)->AllocateBuffer((*tensor)->SizeInBytes())); int64_t i = 0; auto it = (*tensor)->begin(); diff --git a/mindspore/ccsrc/dataset/kernels/data/data_utils.cc b/mindspore/ccsrc/dataset/kernels/data/data_utils.cc index 03f1b99e2ac34b2657625f206744d56c7123a98b..f2635c1fe343867ea1702fe120f035901edb74ab 100644 --- a/mindspore/ccsrc/dataset/kernels/data/data_utils.cc +++ b/mindspore/ccsrc/dataset/kernels/data/data_utils.cc @@ -162,7 +162,7 @@ void CastFrom(const std::shared_ptr &input, std::shared_ptr *out Status TypeCast(const std::shared_ptr &input, std::shared_ptr *output, const DataType &data_type) { RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type)); - static_cast((*output)->GetMutableBuffer()); + RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes())); switch (input->type().value()) { case DataType::DE_BOOL: CastFrom(input, output); @@ -211,7 +211,7 @@ Status ToFloat16(const std::shared_ptr &input, std::shared_ptr * // initiate new tensor for type cast DataType new_type = DataType("float16"); RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type)); - static_cast((*output)->GetMutableBuffer()); + RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes())); auto in_itr = input->begin(); auto out_itr = (*output)->begin(); diff --git a/mindspore/ccsrc/dataset/kernels/image/image_utils.cc b/mindspore/ccsrc/dataset/kernels/image/image_utils.cc index a166f863b53bcfb5f189d6168c506568c8a39eb0..bf470173d9a693985094f76003905264bce0d3a0 100644 --- a/mindspore/ccsrc/dataset/kernels/image/image_utils.cc +++ b/mindspore/ccsrc/dataset/kernels/image/image_utils.cc @@ -64,7 +64,8 @@ Status Flip(std::shared_ptr input, std::shared_ptr *output, int std::shared_ptr output_cv = std::make_shared(input_cv->shape(), input_cv->type()); RETURN_UNEXPECTED_IF_NULL(output_cv); - (void)output_cv->GetMutableBuffer(); + 
RETURN_IF_NOT_OK(output_cv->AllocateBuffer(output_cv->SizeInBytes())); + if (input_cv->mat().data) { try { cv::flip(input_cv->mat(), output_cv->mat(), flip_code); diff --git a/mindspore/ccsrc/mindrecord/include/shard_column.h b/mindspore/ccsrc/mindrecord/include/shard_column.h index e327ef511a906e012102eb707f267f376bf1822e..496e7ec3ea39f8260c18093d4e1d1ed9925c87da 100644 --- a/mindspore/ccsrc/mindrecord/include/shard_column.h +++ b/mindspore/ccsrc/mindrecord/include/shard_column.h @@ -51,7 +51,7 @@ enum ColumnDataType { // mapping as {"bytes", "string", "int32", "int64", "float32", "float64"}; const uint32_t ColumnDataTypeSize[kDataTypes] = {1, 1, 4, 8, 4, 8}; -const std::vector ColumnDataTypeNameNormalized = {"uint8", "uint8", "int32", +const std::vector ColumnDataTypeNameNormalized = {"uint8", "string", "int32", "int64", "float32", "float64"}; const std::unordered_map ColumnDataTypeMap = { diff --git a/mindspore/dataset/core/datatypes.py b/mindspore/dataset/core/datatypes.py index a8411d729a815c168b26109520be2b82ffb9b2d3..292af67e8ad4e9e95e47b3634be86369c9e78df4 100644 --- a/mindspore/dataset/core/datatypes.py +++ b/mindspore/dataset/core/datatypes.py @@ -48,6 +48,7 @@ def mstype_to_detype(type_): mstype.float16: cde.DataType("float16"), mstype.float32: cde.DataType("float32"), mstype.float64: cde.DataType("float64"), + mstype.string: cde.DataType("string"), }[type_] diff --git a/mindspore/dataset/engine/validators.py b/mindspore/dataset/engine/validators.py index f5005e688cdb855e6c588ae9452aa80fc12493c4..c9c06e559ca0a0e13c80e906654ea54b16ec3319 100644 --- a/mindspore/dataset/engine/validators.py +++ b/mindspore/dataset/engine/validators.py @@ -26,7 +26,7 @@ from . 
import datasets INT32_MAX = 2147483647 valid_detype = [ "bool", "int8", "int16", "int32", "int64", "uint8", "uint16", - "uint32", "uint64", "float16", "float32", "float64" + "uint32", "uint64", "float16", "float32", "float64", "string" ] diff --git a/tests/ut/cpp/dataset/datatype_test.cc b/tests/ut/cpp/dataset/datatype_test.cc index 82843d4285d79e9281405a5bf6fb79099e96b262..a55853c4c599b9c1c511b3c97929851ba2439550 100644 --- a/tests/ut/cpp/dataset/datatype_test.cc +++ b/tests/ut/cpp/dataset/datatype_test.cc @@ -32,47 +32,47 @@ class MindDataTestDatatype : public UT::Common { TEST_F(MindDataTestDatatype, TestSizes) { - uint8_t x = DataType::SIZE_IN_BYTES[DataType::DE_BOOL]; + uint8_t x = DataType::kTypeInfo[DataType::DE_BOOL].sizeInBytes_; DataType d = DataType(DataType::DE_BOOL); ASSERT_EQ(x, 1); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::SIZE_IN_BYTES[DataType::DE_INT8]; + x = DataType::kTypeInfo[DataType::DE_INT8].sizeInBytes_; d = DataType(DataType::DE_INT8); ASSERT_EQ(x, 1); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::SIZE_IN_BYTES[DataType::DE_UINT8]; + x = DataType::kTypeInfo[DataType::DE_UINT8].sizeInBytes_; d = DataType(DataType::DE_UINT8); ASSERT_EQ(x, 1); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::SIZE_IN_BYTES[DataType::DE_INT16]; + x = DataType::kTypeInfo[DataType::DE_INT16].sizeInBytes_; d = DataType(DataType::DE_INT16); ASSERT_EQ(x, 2); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::SIZE_IN_BYTES[DataType::DE_UINT16]; + x = DataType::kTypeInfo[DataType::DE_UINT16].sizeInBytes_; d = DataType(DataType::DE_UINT16); ASSERT_EQ(x, 2); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::SIZE_IN_BYTES[DataType::DE_INT32]; + x = DataType::kTypeInfo[DataType::DE_INT32].sizeInBytes_; d = DataType(DataType::DE_INT32); ASSERT_EQ(x, 4); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::SIZE_IN_BYTES[DataType::DE_UINT32]; + x = DataType::kTypeInfo[DataType::DE_UINT32].sizeInBytes_; d = DataType(DataType::DE_UINT32); ASSERT_EQ(x, 4); ASSERT_EQ(d.SizeInBytes(), x); 
- x = DataType::SIZE_IN_BYTES[DataType::DE_INT64]; + x = DataType::kTypeInfo[DataType::DE_INT64].sizeInBytes_; d = DataType(DataType::DE_INT64); ASSERT_EQ(x, 8); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::SIZE_IN_BYTES[DataType::DE_UINT64]; + x = DataType::kTypeInfo[DataType::DE_UINT64].sizeInBytes_; d = DataType(DataType::DE_UINT64); ASSERT_EQ(x, 8); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT32]; + x = DataType::kTypeInfo[DataType::DE_FLOAT32].sizeInBytes_; d = DataType(DataType::DE_FLOAT32); ASSERT_EQ(x, 4); ASSERT_EQ(d.SizeInBytes(), x); - x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT64]; + x = DataType::kTypeInfo[DataType::DE_FLOAT64].sizeInBytes_; d = DataType(DataType::DE_FLOAT64); ASSERT_EQ(x, 8); ASSERT_EQ(d.SizeInBytes(), x); diff --git a/tests/ut/cpp/dataset/one_hot_op_test.cc b/tests/ut/cpp/dataset/one_hot_op_test.cc index 4b8bbc1bdd2b0b529c27ecc5846fb5105b3c498b..c414e371e5ab9f49ff6f4593204f3c52f62aefb6 100644 --- a/tests/ut/cpp/dataset/one_hot_op_test.cc +++ b/tests/ut/cpp/dataset/one_hot_op_test.cc @@ -14,9 +14,7 @@ * limitations under the License. 
*/ #include "common/common.h" -#include "common/cvop_common.h" #include "dataset/kernels/data/one_hot_op.h" -#include "dataset/core/cv_tensor.h" #include "utils/log_adapter.h" using namespace mindspore::dataset; @@ -24,9 +22,9 @@ using mindspore::MsLogLevel::INFO; using mindspore::ExceptionType::NoExceptionType; using mindspore::LogStream; -class MindDataTestOneHotOp : public UT::CVOP::CVOpCommon { +class MindDataTestOneHotOp : public UT::Common { protected: - MindDataTestOneHotOp() : CVOpCommon() {} + MindDataTestOneHotOp() {} }; TEST_F(MindDataTestOneHotOp, TestOp) { diff --git a/tests/ut/cpp/dataset/tensor_string_test.cc b/tests/ut/cpp/dataset/tensor_string_test.cc index 8c58f68982040f577e3fb32866c68139a45da5a3..a440a93c15fa6a6c92063be3681c91c1f670c878 100644 --- a/tests/ut/cpp/dataset/tensor_string_test.cc +++ b/tests/ut/cpp/dataset/tensor_string_test.cc @@ -65,14 +65,14 @@ TEST_F(MindDataTestStringTensorDE, Basics) { TEST_F(MindDataTestStringTensorDE, Basics2) { std::shared_ptr t = std::make_shared(std::vector{"abc", "defg", "hi", "klmno", "123", "789"}, TensorShape({2, 3})); - ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20); - std::vector offsets = {3, 8, 11, 17, 21, 25}; + ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20 + 4); + std::vector offsets = {0, 4, 9, 12, 18, 22, 26}; uint32_t ctr = 0; for (auto i : offsets) { - ASSERT_TRUE(*(reinterpret_cast(t->GetMutableBuffer() + ctr)) == i); + ASSERT_TRUE(*(reinterpret_cast(t->GetMutableBuffer() + ctr)) == i + 28); ctr += 4; } - const char *buf = reinterpret_cast(t->GetMutableBuffer()) + 6 * 4; + const char *buf = reinterpret_cast(t->GetMutableBuffer()) + 6 * 4 + 4; std::vector starts = {0, 4, 9, 12, 18, 22}; uint32_t index = 0; @@ -90,14 +90,14 @@ TEST_F(MindDataTestStringTensorDE, Empty) { std::shared_ptr t = std::make_shared(strings, TensorShape({2, 3})); // abc_defg___123__ // 0123456789012345 - ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10); - std::vector offsets = {3, 8, 9, 10, 14, 15}; + ASSERT_TRUE(t->SizeInBytes() 
== 6 * 5 + 10 + 4); + std::vector offsets = {0, 4, 9, 10, 11, 15, 16}; uint32_t ctr = 0; for (auto i : offsets) { - ASSERT_TRUE(*(reinterpret_cast(t->GetMutableBuffer() + ctr)) == i); + ASSERT_TRUE(*(reinterpret_cast(t->GetMutableBuffer() + ctr)) == i + 28); ctr += 4; } - const char *buf = reinterpret_cast(t->GetMutableBuffer()) + 6 * 4; + const char *buf = reinterpret_cast(t->GetMutableBuffer()) + 6 * 4 + 4; std::vector starts = {0, 4, 9, 10, 11, 15}; uint32_t index = 0; diff --git a/tests/ut/cpp/dataset/tensor_test.cc b/tests/ut/cpp/dataset/tensor_test.cc index 615427ab9247cb66c2f5416a1486871829e85869..b36f71f4ef88c9ab0c9dcfc7391ef8204db881d7 100644 --- a/tests/ut/cpp/dataset/tensor_test.cc +++ b/tests/ut/cpp/dataset/tensor_test.cc @@ -41,6 +41,7 @@ class MindDataTestTensorDE : public UT::Common { TEST_F(MindDataTestTensorDE, Basics) { std::shared_ptr t = std::make_shared(TensorShape({2, 3}), DataType(DataType::DE_UINT64)); + ASSERT_TRUE((t->AllocateBuffer(t->SizeInBytes())).IsOk()); ASSERT_EQ(t->shape(), TensorShape({2, 3})); ASSERT_EQ(t->type(), DataType::DE_UINT64); ASSERT_EQ(t->SizeInBytes(), 2 * 3 * 8); diff --git a/tests/ut/data/dataset/testTextMindRecord/test.mindrecord b/tests/ut/data/dataset/testTextMindRecord/test.mindrecord new file mode 100644 index 0000000000000000000000000000000000000000..1a3bb4a12db47643ba910dc320c9bb8cb247db78 Binary files /dev/null and b/tests/ut/data/dataset/testTextMindRecord/test.mindrecord differ diff --git a/tests/ut/data/dataset/testTextMindRecord/test.mindrecord.db b/tests/ut/data/dataset/testTextMindRecord/test.mindrecord.db new file mode 100644 index 0000000000000000000000000000000000000000..8f0fa403f69c0e7d99a97b4803ba34d136f8267f Binary files /dev/null and b/tests/ut/data/dataset/testTextMindRecord/test.mindrecord.db differ diff --git a/tests/ut/data/dataset/testTextTFRecord/datasetSchema.json b/tests/ut/data/dataset/testTextTFRecord/datasetSchema.json new file mode 100644 index 
0000000000000000000000000000000000000000..d0493c524294280e7f6cdb837cc72b572889be97 --- /dev/null +++ b/tests/ut/data/dataset/testTextTFRecord/datasetSchema.json @@ -0,0 +1,18 @@ +{ + "datasetType": "TF", + "numRows": 3, + "columns": { + "line": { + "type": "string", + "rank": 0 + }, + "words": { + "type": "string", + "rank": 1 + }, + "chinese": { + "type": "string", + "rank": 0 + } + } +} diff --git a/tests/ut/data/dataset/testTextTFRecord/text.tfrecord b/tests/ut/data/dataset/testTextTFRecord/text.tfrecord new file mode 100644 index 0000000000000000000000000000000000000000..e33a1c4b91323bae7da34726b6c6e7ba5abf739b Binary files /dev/null and b/tests/ut/data/dataset/testTextTFRecord/text.tfrecord differ diff --git a/tests/ut/python/dataset/test_minddataset.py b/tests/ut/python/dataset/test_minddataset.py index a882dc6bcb1ef10b0dafddda190a4c462e6d8be6..79a98491c2d3b56e2afbaafd87cfdff46eb1e127 100644 --- a/tests/ut/python/dataset/test_minddataset.py +++ b/tests/ut/python/dataset/test_minddataset.py @@ -584,7 +584,7 @@ def test_cv_minddataset_reader_basic_tutorial_5_epoch(add_and_remove_cv_file): def test_cv_minddataset_reader_basic_tutorial_5_epoch_with_batch(add_and_remove_cv_file): """tutorial for cv minderdataset.""" - columns_list = ["data", "file_name", "label"] + columns_list = ["data", "label"] num_readers = 4 data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers) @@ -948,8 +948,7 @@ def test_write_with_multi_bytes_and_array_and_read_by_MindDataset(): data_value_to_list = [] for item in data: new_data = {} - new_data['file_name'] = np.asarray( - list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8) + new_data['file_name'] = np.asarray(item["file_name"], dtype='S') new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) @@ -1153,8 +1152,7 @@ def 
test_write_with_multi_bytes_and_MindDataset(): data_value_to_list = [] for item in data: new_data = {} - new_data['file_name'] = np.asarray( - list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8) + new_data['file_name'] = np.asarray(item["file_name"], dtype='S') new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32) new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8) new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8) diff --git a/tests/ut/python/dataset/test_minddataset_sampler.py b/tests/ut/python/dataset/test_minddataset_sampler.py index 100d2d1e16c30edb57813aed8c27f0f65a7f9e49..c1affcea01e6ae758cdae25cee962e48ba17468f 100644 --- a/tests/ut/python/dataset/test_minddataset_sampler.py +++ b/tests/ut/python/dataset/test_minddataset_sampler.py @@ -27,6 +27,7 @@ import mindspore.dataset as ds import mindspore.dataset.transforms.vision.c_transforms as vision from mindspore import log as logger from mindspore.dataset.transforms.vision import Inter +from mindspore.dataset.transforms.text import as_text from mindspore.mindrecord import FileWriter FILES_NUM = 4 @@ -72,7 +73,7 @@ def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file): for item in data_set.create_dict_iterator(): logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter)) logger.info("-------------- item[file_name]: \ - {}------------------------".format("".join([chr(x) for x in item["file_name"]]))) + {}------------------------".format(as_text(item["file_name"]))) logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) num_iter += 1 @@ -92,7 +93,7 @@ def test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file): logger.info("-------------- item[data]: \ {}------------------------".format(item["data"][:10])) logger.info("-------------- item[file_name]: \ - {}------------------------".format("".join([chr(x) for x in item["file_name"]]))) + 
{}------------------------".format(as_text(item["file_name"]))) logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) num_iter += 1 @@ -110,7 +111,7 @@ def test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file): for item in data_set.create_dict_iterator(): logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter)) logger.info("-------------- item[file_name]: \ - {}------------------------".format("".join([chr(x) for x in item["file_name"]]))) + {}------------------------".format(as_text(item["file_name"]))) logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) num_iter += 1 @@ -127,7 +128,7 @@ def test_cv_minddataset_pk_sample_out_of_range(add_and_remove_cv_file): for item in data_set.create_dict_iterator(): logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter)) logger.info("-------------- item[file_name]: \ - {}------------------------".format("".join([chr(x) for x in item["file_name"]]))) + {}------------------------".format(as_text(item["file_name"]))) logger.info("-------------- item[label]: {} ----------------------------".format(item["label"])) num_iter += 1 diff --git a/tests/ut/python/dataset/test_tensor_string.py b/tests/ut/python/dataset/test_tensor_string.py index 7f905d61e5847932a16cac0ed42909bea0a22ee4..e5e2be865b13fcb4d1edb47a8b7a96dd76a7a91e 100644 --- a/tests/ut/python/dataset/test_tensor_string.py +++ b/tests/ut/python/dataset/test_tensor_string.py @@ -17,17 +17,15 @@ import numpy as np import pytest import mindspore.dataset as ds +import mindspore.common.dtype as mstype # pylint: disable=comparison-with-itself def test_basic(): x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S') - # x = np.array(["ab", "cde"], dtype='S') n = cde.Tensor(x) arr = n.as_array() - y = np.array([1, 2]) - assert all(y == y) - # assert np.testing.assert_array_equal(y,y) + 
np.testing.assert_array_equal(x, arr) def compare(strings): @@ -60,7 +58,125 @@ def test_batching_strings(): assert "[Batch ERROR] Batch does not support" in str(info) +def test_map(): + def gen(): + yield np.array(["ab cde 121"], dtype='S'), + + data = ds.GeneratorDataset(gen, column_names=["col"]) + + def split(b): + splits = b.item().decode("utf8").split() + return np.array(splits, dtype='S') + + data = data.map(input_columns=["col"], operations=split) + expected = np.array(["ab", "cde", "121"], dtype='S') + for d in data: + np.testing.assert_array_equal(d[0], expected) + + +def as_str(arr): + def decode(s): return s.decode("utf8") + + decode_v = np.vectorize(decode) + return decode_v(arr) + + +line = np.array(["This is a text file.", + "Be happy every day.", + "Good luck to everyone."]) + +words = np.array([["This", "text", "file", "a"], + ["Be", "happy", "day", "b"], + ["女", "", "everyone", "c"]]) + +chinese = np.array(["今天天气太好了我们一起去外面玩吧", + "男默女泪", + "江州市长江大桥参加了长江大桥的通车仪式"]) + + +def test_tfrecord1(): + s = ds.Schema() + s.add_column("line", "string", []) + s.add_column("words", "string", [-1]) + s.add_column("chinese", "string", []) + + data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s) + + for i, d in enumerate(data.create_dict_iterator()): + assert d["line"].shape == line[i].shape + assert d["words"].shape == words[i].shape + assert d["chinese"].shape == chinese[i].shape + np.testing.assert_array_equal(line[i], as_str(d["line"])) + np.testing.assert_array_equal(words[i], as_str(d["words"])) + np.testing.assert_array_equal(chinese[i], as_str(d["chinese"])) + + +def test_tfrecord2(): + data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, + schema='../data/dataset/testTextTFRecord/datasetSchema.json') + for i, d in enumerate(data.create_dict_iterator()): + assert d["line"].shape == line[i].shape + assert d["words"].shape == words[i].shape + assert d["chinese"].shape == 
chinese[i].shape + np.testing.assert_array_equal(line[i], as_str(d["line"])) + np.testing.assert_array_equal(words[i], as_str(d["words"])) + np.testing.assert_array_equal(chinese[i], as_str(d["chinese"])) + + +def test_tfrecord3(): + s = ds.Schema() + s.add_column("line", mstype.string, []) + s.add_column("words", mstype.string, [-1, 2]) + s.add_column("chinese", mstype.string, []) + + data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s) + + for i, d in enumerate(data.create_dict_iterator()): + assert d["line"].shape == line[i].shape + assert d["words"].shape == words[i].reshape([2, 2]).shape + assert d["chinese"].shape == chinese[i].shape + np.testing.assert_array_equal(line[i], as_str(d["line"])) + np.testing.assert_array_equal(words[i].reshape([2, 2]), as_str(d["words"])) + np.testing.assert_array_equal(chinese[i], as_str(d["chinese"])) + + +def create_text_mindrecord(): + # methood to create mindrecord with string data, used to generate testTextMindRecord/test.mindrecord + from mindspore.mindrecord import FileWriter + + mindrecord_file_name = "test.mindrecord" + data = [{"english": "This is a text file.", + "chinese": "今天天气太好了我们一起去外面玩吧"}, + {"english": "Be happy every day.", + "chinese": "男默女泪"}, + {"english": "Good luck to everyone.", + "chinese": "江州市长江大桥参加了长江大桥的通车仪式"}, + ] + writer = FileWriter(mindrecord_file_name) + schema = {"english": {"type": "string"}, + "chinese": {"type": "string"}, + } + writer.add_schema(schema) + writer.write_raw_data(data) + writer.commit() + + +def test_mindrecord(): + data = ds.MindDataset("../data/dataset/testTextMindRecord/test.mindrecord", shuffle=False) + + for i, d in enumerate(data.create_dict_iterator()): + assert d["english"].shape == line[i].shape + assert d["chinese"].shape == chinese[i].shape + np.testing.assert_array_equal(line[i], as_str(d["english"])) + np.testing.assert_array_equal(chinese[i], as_str(d["chinese"])) + + if __name__ == '__main__': - test_generator() - 
test_basic() - test_batching_strings() + # test_generator() + # test_basic() + # test_batching_strings() + test_map() + # test_tfrecord1() + # test_tfrecord2() + # test_tfrecord3() + # test_mindrecord()