Commit 6f733ec1 — magicwindyyd/mindspore (fork of MindSpore / mindspore)
Authored May 22, 2020 by mindspore-ci-bot; committed via Gitee on May 22, 2020.
!1308 Stage 2 of adding support for string Tensor
Merge pull request !1308 from h.farahat/string_tensor2
Parents: 3947c3d0, df361d1d
Showing 29 changed files with 440 additions and 248 deletions (+440 −248).
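To make the intent of the change concrete, here is a minimal Python sketch of what string-Tensor support looks like from the dataset API. It is assembled from the new unit tests in test_tensor_string.py further down (the generator, the column name, and the "string" schema column are taken from those tests); treat it as illustrative rather than API documentation.

import numpy as np
import mindspore.dataset as ds

def gen():
    # NumPy byte-string ('S') arrays are the Python-side representation of string tensors.
    yield np.array(["ab cde 121"], dtype='S'),

data = ds.GeneratorDataset(gen, column_names=["col"])

# Declaring a "string" column lets the TFRecord/MindRecord readers produce DE_STRING tensors.
schema = ds.Schema()
schema.add_column("line", "string", [])

for row in data.create_dict_iterator():
    print(row["col"])  # a numpy 'S' array holding b"ab cde 121"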
Changed files:
  mindspore/ccsrc/dataset/core/CMakeLists.txt (+5 −0)
  mindspore/ccsrc/dataset/core/data_type.cc (+5 −5)
  mindspore/ccsrc/dataset/core/data_type.h (+25 −50)
  mindspore/ccsrc/dataset/core/example.proto (+0 −0)
  mindspore/ccsrc/dataset/core/feature.proto (+0 −0)
  mindspore/ccsrc/dataset/core/tensor.cc (+120 −59)
  mindspore/ccsrc/dataset/core/tensor.h (+33 −32)
  mindspore/ccsrc/dataset/core/tensor_shape.cc (+44 −43)
  mindspore/ccsrc/dataset/core/tensor_shape.h (+8 −1)
  mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt (+1 −7)
  mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc (+7 −2)
  mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc (+2 −1)
  mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc (+12 −4)
  mindspore/ccsrc/dataset/kernels/data/data_utils.cc (+2 −2)
  mindspore/ccsrc/dataset/kernels/image/image_utils.cc (+2 −1)
  mindspore/ccsrc/mindrecord/include/shard_column.h (+1 −1)
  mindspore/dataset/core/datatypes.py (+1 −0)
  mindspore/dataset/engine/validators.py (+1 −1)
  tests/ut/cpp/dataset/datatype_test.cc (+11 −11)
  tests/ut/cpp/dataset/one_hot_op_test.cc (+2 −4)
  tests/ut/cpp/dataset/tensor_string_test.cc (+8 −8)
  tests/ut/cpp/dataset/tensor_test.cc (+1 −0)
  tests/ut/data/dataset/testTextMindRecord/test.mindrecord (+0 −0)
  tests/ut/data/dataset/testTextMindRecord/test.mindrecord.db (+0 −0)
  tests/ut/data/dataset/testTextTFRecord/datasetSchema.json (+18 −0)
  tests/ut/data/dataset/testTextTFRecord/text.tfrecord (+0 −0)
  tests/ut/python/dataset/test_minddataset.py (+3 −5)
  tests/ut/python/dataset/test_minddataset_sampler.py (+5 −4)
  tests/ut/python/dataset/test_tensor_string.py (+123 −7)
mindspore/ccsrc/dataset/core/CMakeLists.txt
+ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto)
+ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto)
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
 add_library(core OBJECT
+    ${EXAMPLE_SRCS}
+    ${FEATURE_SRCS}
     client.cc
     config_manager.cc
     cv_tensor.cc
...
@@ -9,4 +13,5 @@ add_library(core OBJECT
     tensor.cc
     tensor_shape.cc
     )
+add_dependencies(core mindspore::protobuf)
 target_include_directories(core PRIVATE ${pybind11_INCLUDE_DIRS})
mindspore/ccsrc/dataset/core/data_type.cc
...
@@ -25,14 +25,14 @@ namespace dataset {
 uint8_t DataType::SizeInBytes() const {
   if (type_ < DataType::NUM_OF_TYPES)
-    return SIZE_IN_BYTES[type_];
+    return kTypeInfo[type_].sizeInBytes_;
   else
     return 0;
 }

 py::dtype DataType::AsNumpyType() const {
   if (type_ < DataType::NUM_OF_TYPES)
-    return py::dtype(PYBIND_TYPES[type_]);
+    return py::dtype(kTypeInfo[type_].pybindType_);
   else
     return py::dtype("unknown");
 }
...
@@ -40,7 +40,7 @@ py::dtype DataType::AsNumpyType() const {
 uint8_t DataType::AsCVType() const {
   uint8_t res = kCVInvalidType;
   if (type_ < DataType::NUM_OF_TYPES) {
-    res = CV_TYPES[type_];
+    res = kTypeInfo[type_].cvType_;
   }
   if (res == kCVInvalidType) {
...
@@ -108,7 +108,7 @@ DataType::DataType(const std::string &type_str) {
 std::string DataType::ToString() const {
   if (type_ < DataType::NUM_OF_TYPES)
-    return TO_STRINGS[type_];
+    return kTypeInfo[type_].name_;
   else
     return "unknown";
 }
...
@@ -149,7 +149,7 @@ DataType DataType::FromNpArray(const py::array &arr) {
 std::string DataType::GetPybindFormat() const {
   std::string res;
   if (type_ < DataType::NUM_OF_TYPES) {
-    res = PYBIND_FORMAT_DESCRIPTOR[type_];
+    res = kTypeInfo[type_].pybindFormatDescriptor_;
   }
   if (res.empty()) {
...
mindspore/ccsrc/dataset/core/data_type.h
...
@@ -51,56 +51,31 @@ class DataType {
     NUM_OF_TYPES
   };

-  inline static constexpr uint8_t SIZE_IN_BYTES[] = {
-    0,  // DE_UNKNOWN
-    1,  // DE_BOOL
-    1,  // DE_INT8
-    1,  // DE_UINT8
-    2,  // DE_INT16
-    2,  // DE_UINT16
-    4,  // DE_INT32
-    4,  // DE_UINT32
-    8,  // DE_INT64
-    8,  // DE_UINT64
-    2,  // DE_FLOAT16
-    4,  // DE_FLOAT32
-    8,  // DE_FLOAT64
-    0};  // DE_STRING
-
-  inline static const char *TO_STRINGS[] = {"unknown", "bool", "int8", "uint8", "int16", "uint16", "int32",
-                                            "uint32", "int64", "uint64", "float16", "float32", "float64", "string"};
-
-  inline static const char *PYBIND_TYPES[] = {"object", "bool", "int8", "uint8", "int16", "uint16", "int32",
-                                              "uint32", "int64", "uint64", "float16", "float32", "double", "bytes"};
-
-  inline static const std::string PYBIND_FORMAT_DESCRIPTOR[] = {
-    "",                                         // DE_UNKNOWN
-    py::format_descriptor<bool>::format(),      // DE_BOOL
-    py::format_descriptor<int8_t>::format(),    // DE_INT8
-    py::format_descriptor<uint8_t>::format(),   // DE_UINT8
-    py::format_descriptor<int16_t>::format(),   // DE_INT16
-    py::format_descriptor<uint16_t>::format(),  // DE_UINT16
-    py::format_descriptor<int32_t>::format(),   // DE_INT32
-    py::format_descriptor<uint32_t>::format(),  // DE_UINT32
-    py::format_descriptor<int64_t>::format(),   // DE_INT64
-    py::format_descriptor<uint64_t>::format(),  // DE_UINT64
-    "e",                                        // DE_FLOAT16
-    py::format_descriptor<float>::format(),     // DE_FLOAT32
-    py::format_descriptor<double>::format(),    // DE_FLOAT64
-    "S"};                                       // DE_STRING
-
-  inline static constexpr uint8_t CV_TYPES[] = {
-    kCVInvalidType,  // DE_UNKNOWN
-    CV_8U,           // DE_BOOL
-    CV_8S,           // DE_INT8
-    CV_8U,           // DE_UINT8
-    CV_16S,          // DE_INT16
-    CV_16U,          // DE_UINT16
-    CV_32S,          // DE_INT32
-    kCVInvalidType,  // DE_UINT32
-    kCVInvalidType,  // DE_INT64
-    kCVInvalidType,  // DE_UINT64
-    CV_16F,          // DE_FLOAT16
-    CV_32F,          // DE_FLOAT32
-    CV_64F,          // DE_FLOAT64
-    kCVInvalidType};  // DE_STRING
+  struct TypeInfo {
+    const char *name_;                          // name to be represent the type while printing
+    const uint8_t sizeInBytes_;                 // number of bytes needed for this type
+    const char *pybindType_;                    // Python matching type, used in get_output_types
+    const std::string pybindFormatDescriptor_;  // pybind format used for numpy types
+    const uint8_t cvType_;                      // OpenCv matching type
+  };
+
+  static inline const TypeInfo kTypeInfo[] = {
+    // name, sizeInBytes, pybindTypem formatDescriptor, openCV
+    {"unknown", 0, "object", "", kCVInvalidType},                                        // DE_UNKNOWN
+    {"bool", 1, "bool", py::format_descriptor<bool>::format(), CV_8U},                   // DE_BOOL
+    {"int8", 1, "int8", py::format_descriptor<int8_t>::format(), CV_8S},                 // DE_INT8
+    {"uint8", 1, "uint8", py::format_descriptor<uint8_t>::format(), CV_8U},              // DE_UINT8
+    {"int16", 2, "int16", py::format_descriptor<int16_t>::format(), CV_16S},             // DE_INT16
+    {"uint16", 2, "uint16", py::format_descriptor<uint16_t>::format(), CV_16U},          // DE_UINT16
+    {"int32", 4, "int32", py::format_descriptor<int32_t>::format(), CV_32S},             // DE_INT32
+    {"uint32", 4, "uint32", py::format_descriptor<uint32_t>::format(), kCVInvalidType},  // DE_UINT32
+    {"int64", 8, "int64", py::format_descriptor<int64_t>::format(), kCVInvalidType},     // DE_INT64
+    {"uint64", 8, "uint64", py::format_descriptor<uint64_t>::format(), kCVInvalidType},  // DE_UINT64
+    {"float16", 2, "float16", "e", CV_16F},                                              // DE_FLOAT16
+    {"float32", 4, "float32", py::format_descriptor<float>::format(), CV_32F},           // DE_FLOAT32
+    {"float64", 8, "double", py::format_descriptor<double>::format(), CV_64F},           // DE_FLOAT64
+    {"string", 0, "bytes", "S", kCVInvalidType}                                          // DE_STRING
+  };

   // No arg constructor to create an unknown shape
   DataType() : type_(DE_UNKNOWN) {}
...
mindspore/ccsrc/dataset/engine/datasetops/source/example.proto → mindspore/ccsrc/dataset/core/example.proto (file moved)
mindspore/ccsrc/dataset/engine/datasetops/source/feature.proto → mindspore/ccsrc/dataset/core/feature.proto (file moved)
mindspore/ccsrc/dataset/core/tensor.cc
...
@@ -57,18 +57,40 @@ Tensor::Tensor(const TensorShape &shape, const DataType &type) : shape_(shape),
 }

+Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data) : Tensor(shape, type) {
+  if (type.IsNumeric()) {
+    // If the data pointer was given, then we can also populate the tensor with data
+    if (data != nullptr) {
+      // Given the shape/type of this tensor, compute the data size and copy in the input bytes.
+      int64_t byte_size = this->SizeInBytes();
+      Status s = this->AllocateBuffer(byte_size);  // Allocates data_ inside itself
+      if (s.IsOk() && data_ != nullptr) {
+        int ret_code = memcpy_s(data_, byte_size, data, byte_size);
+        if (ret_code != 0) {
+          MS_LOG(ERROR) << "Failed to copy data into Tensor!";
+        }
+      } else {
+        MS_LOG(ERROR) << "Failed to create memory for Tensor!";
+      }
+    }
+  } else {
+    MS_LOG(ERROR) << "Type should be numeric to use this constructor.";
+  }
+}
+
-Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data) : Tensor(shape, type) {
+Tensor::Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length)
+    : Tensor(shape, type) {
   // If the data pointer was given, then we can also populate the tensor with data
   if (data != nullptr) {
-    // Given the shape/type of this tensor, compute the data size and copy in the input bytes.
-    int64_t byte_size = this->SizeInBytes();
-    static_cast<void>(this->GetMutableBuffer());  // Allocates data_ inside itself
+    Status s = AllocateBuffer(length);  // Allocates data_ inside itself
+    if (s.IsError()) {
+      MS_LOG(ERROR) << "Failed to create memory for Tensor!";
+    }
     if (data_ != nullptr) {
-      int ret_code = memcpy_s(data_, byte_size, data, byte_size);
+      int ret_code = memcpy_s(data_, length, data, length);
       if (ret_code != 0) {
         MS_LOG(ERROR) << "Failed to copy data into Tensor!";
       }
     } else {
       MS_LOG(ERROR) << "Failed to create memory for Tensor!";
     }
   }
 }
...
@@ -98,32 +120,79 @@ Tensor::Tensor(const std::vector<std::string> &strings, const TensorShape &shape
   auto length_sum = [](dsize_t sum, const std::string &s) { return s.length() + sum; };
   dsize_t total_length = std::accumulate(strings.begin(), strings.end(), 0, length_sum);

-  dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + total_length;
+  // total bytes needed = offset array + strings
+  // offset array needs to store one offset var per element + 1 extra to get the length of the last string.
+  // strings will be null-terminated --> need 1 extra byte per element
+  dsize_t num_bytes = (kOffsetSize + 1) * shape_.NumOfElements() + kOffsetSize + total_length;

   data_ = data_allocator_->allocate(num_bytes);

   auto offset_arr = reinterpret_cast<offset_t *>(data_);
   uchar *buf = GetStringsBuffer();

-  offset_t offset = -1;
+  offset_t offset = buf - data_;  // the first string will start here
   uint32_t i = 0;
   for (const auto &str : strings) {
-    // insert the end index of the string
-    // end index of a string is the end index of previous string + the length (including \0)
-    offset = offset + str.length() + 1;
+    // insert the start index of the string.
     offset_arr[i++] = offset;
+    // total bytes are reduced by kOffsetSize
+    num_bytes -= kOffsetSize;
     // insert actual string
-    memcpy_s(buf, num_bytes, str.c_str(), str.length() + 1);
-    buf += str.length() + 1;
+    int ret_code = memcpy_s(data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1);
+    if (ret_code != 0) MS_LOG(ERROR) << "Cannot copy string into Tensor";
+    // next string will be stored right after the current one.
+    offset = offset + str.length() + 1;
+    // total bytes are reduced by the length of the string
+    num_bytes -= str.length() + 1;
   }
-  this->data_end_ = buf;
+  // store one more offset value so we can get the length of the last string
+  // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element]
+  offset_arr[i] = offset;
+
+  this->data_end_ = data_ + offset_arr[i];
+
+  DS_ASSERT(num_bytes == 0);
   if (shape.known()) Tensor::Reshape(shape);
 }

+Tensor::Tensor(const dataengine::BytesList &bytes_list, const TensorShape &shape)
+    : Tensor(TensorShape({static_cast<dsize_t>(bytes_list.value_size())}), DataType(DataType::DE_STRING)) {
+  // total bytes needed = offset array + strings
+  // offset array needs to store one offset var per element + 1 extra to get the length of the last string.
+  // strings will be null-terminated --> need 1 extra byte per element
+  dsize_t num_bytes = (kOffsetSize)*shape_.NumOfElements() + kOffsetSize + bytes_list.ByteSizeLong();
+
+  data_ = data_allocator_->allocate(num_bytes);
+
+  auto offset_arr = reinterpret_cast<offset_t *>(data_);
+  uchar *buf = GetStringsBuffer();
+
+  offset_t offset = buf - data_;  // the first string will start here
+  uint32_t i = 0;
+  for (; i < bytes_list.value_size(); i++) {
+    const std::string &str = bytes_list.value(i);
+    // insert the start index of the string.
+    offset_arr[i] = offset;
+    // total bytes are reduced by kOffsetSize
+    num_bytes -= kOffsetSize;
+    // insert actual string
+    int ret_code = memcpy_s(data_ + offset, num_bytes, common::SafeCStr(str), str.length() + 1);
+    if (ret_code != 0) {
+      MS_LOG(ERROR) << "Cannot copy string into Tensor";
+    }
+    // next string will be stored right after the current one.
+    offset = offset + str.length() + 1;
+    // total bytes are reduced by the length of the string
+    num_bytes -= str.length() + 1;
+  }
+  // store one more offset value so we can get the length of the last string
+  // length[last_element] = offset_arr[last_element + 1] - offset_arr[last_element]
+  offset_arr[i] = offset;
+  data_end_ = data_ + offset_arr[i];
+  DS_ASSERT(num_bytes == 0);
+  if (shape.known()) Tensor::Reshape(shape);
+}
+
 Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl, const TensorShape &shape,
                             DataType type, const unsigned char *data) {
   if (!shape.known()) {
...
@@ -152,20 +221,17 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, TensorImpl tensor_impl
   }
   return Status::OK();  // returns base-class shared_ptr
 }

-std::string to(std::string x) { return x; }
 Status Tensor::CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::array arr) {
   std::vector<dsize_t> shape;
   for (dsize_t i = 0; i < arr.ndim(); i++) {
     shape.push_back(static_cast<dsize_t>(arr.shape()[i]));
   }
-  arr.resize({arr.size()});
-  auto itr = arr.begin();
+  arr.resize({arr.size()});  // flatten the py::array so we can iterate once
   std::vector<std::string> strings;
-  for (; itr != arr.end(); itr++) {
-    std::string s = to(py::cast<py::bytes>(*itr));
-    strings.push_back(s);
-  }
-  arr.resize(shape);
+  std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); });
+
+  arr.resize(shape);  // resize arr back to the original shape

   return CreateTensor(ptr, strings, TensorShape{shape});
 }
...
@@ -190,8 +256,9 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, py::array arr) {
   std::shared_ptr<MemoryPool> global_pool = GlobalContext::Instance()->mem_pool();
   (*ptr)->data_allocator_ = std::make_unique<Allocator<unsigned char>>(global_pool);
-  static_cast<void>((*ptr)->GetMutableBuffer());
+  int64_t byte_size = (*ptr)->SizeInBytes();
+  RETURN_IF_NOT_OK((*ptr)->AllocateBuffer(byte_size));
   unsigned char *data = static_cast<unsigned char *>(arr.request().ptr);
   if ((*ptr)->data_ == nullptr) {
     RETURN_STATUS_UNEXPECTED("Failed to create memory for Tensor.");
...
@@ -232,6 +299,13 @@ Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std:
   return Status::OK();
 }

+Status Tensor::CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list,
+                            const TensorShape &shape) {
+  const TensorAlloc *alloc = GlobalContext::Instance()->tensor_allocator();
+  *ptr = std::allocate_shared<Tensor>(*alloc, bytes_list, shape);
+  return Status::OK();
+}
+
 // Memcpy the given strided array's used part to consecutive memory
 // Consider a 3-d array
 // A[(i * shape[1] + j) * shape[2] + k] = B[i][j][k] = C[i * strides[0] + j * strides[1] + k * strides[2]]
...
@@ -370,25 +444,20 @@ void Tensor::Print(std::ostream &out) const {
     out << "[Data area is null]";
   }
 }

-// Name: ToFlatIndex()
-// Description: convert a vector style index to number, used to access memory internal use only
-Status Tensor::ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const {
-  if (!shape_.IsValidIndex(index)) {
-    std::string err = "Not a valid index";
-    RETURN_STATUS_UNEXPECTED(err);
-  }
-  *flat_index = 0;
-  for (size_t k = 0; k < index.size(); k++) {
-    dsize_t product = 1;
-    for (size_t l = k + 1; l < index.size(); l++) {
-      product *= shape_[l];
+Status Tensor::AllocateBuffer(const dsize_t &length) {
+  if (data_ == nullptr) {
+    if (data_allocator_ != nullptr) {
+      data_ = data_allocator_->allocate(length);
+      RETURN_UNEXPECTED_IF_NULL(data_);
+      data_end_ = data_ + length;
+    } else {
+      data_ = static_cast<unsigned char *>(malloc(length));
+      data_end_ = data_ + length;
+      RETURN_UNEXPECTED_IF_NULL(data_);
     }
-    *flat_index += index[k] * product;
   }
   return Status::OK();
 }

 const unsigned char *Tensor::GetBuffer() const {
   // This version cannot modify anything. data_ could possibly be null.
   return data_;
...
@@ -404,17 +473,11 @@ unsigned char *Tensor::GetMutableBuffer() {
   } else {
     // If the data area is not created, then identify the memory size based
     // on the shape and type and allocate it.
-    if (data_allocator_ != nullptr) {
-      data_ = data_allocator_->allocate(this->SizeInBytes());
-      data_end_ = data_ + SizeInBytes();
+    if (this->AllocateBuffer(this->SizeInBytes()).IsOk()) {
+      return data_;
     } else {
-      data_ = static_cast<unsigned char *>(malloc(this->SizeInBytes()));
-      data_end_ = data_ + SizeInBytes();
-      if (data_ == nullptr) {
-        return nullptr;
-      }
+      return nullptr;
     }
-    return data_;
   }
 }
...
@@ -444,7 +507,7 @@ Status Tensor::GetItemPtr(T **ptr, const std::vector<dsize_t> &index) const {
     RETURN_STATUS_UNEXPECTED(err);
   }
   dsize_t flat_idx;
-  RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx));
+  RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx));
   *ptr = reinterpret_cast<T *>(data_ + flat_idx * type_.SizeInBytes());

   return Status::OK();
...
@@ -461,7 +524,7 @@ Status Tensor::GetItemPtr(uchar **ptr, const std::vector<dsize_t> &index, offset
     RETURN_STATUS_UNEXPECTED(err);
   }
   dsize_t flat_idx;
-  RETURN_IF_NOT_OK(ToFlatIndex(index, &flat_idx));
+  RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &flat_idx));
   offset_t length_temp = 0;
   RETURN_IF_NOT_OK(GetStringAt(flat_idx, ptr, &length_temp));
   if (length != nullptr) *length = length_temp;
...
@@ -481,7 +544,7 @@ Status Tensor::StartAddrOfIndex(std::vector<dsize_t> ind, uchar **start_addr_of_
   std::vector<dsize_t> r(t_shape.begin() + ind.size(), t_shape.end());
   *remaining = TensorShape(r);
   ind.resize(this->Rank(), 0);  //  same as -> while (ind.size() < this->Rank()) ind.push_back(0);
-  RETURN_IF_NOT_OK(ToFlatIndex(ind, &flat_ind));
+  RETURN_IF_NOT_OK(shape_.ToFlatIndex(ind, &flat_ind));
   // check if GetBuffer() returns null, we should flag this as an error, this sanity check will only
   // be true is the tensor failed to allocate memory.
   if (GetMutableBuffer() == nullptr) {
...
@@ -588,10 +651,10 @@ Status Tensor::GetItemAt(std::string_view *o, const std::vector<dsize_t> &index)
   RETURN_UNEXPECTED_IF_NULL(o);
   CHECK_FAIL_RETURN_UNEXPECTED(type_ == DataType::DE_STRING, "Type is not DE_STRING");

-  uchar *buf = nullptr;
+  uchar *start = nullptr;
   offset_t length = 0;
-  RETURN_IF_NOT_OK(GetItemPtr(&buf, index, &length));
-  std::string_view sv{reinterpret_cast<const char *>(buf), length};
+  RETURN_IF_NOT_OK(GetItemPtr(&start, index, &length));
+  std::string_view sv{reinterpret_cast<const char *>(start)};
   o->swap(sv);
   return Status::OK();
 }
...
@@ -778,13 +841,11 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length
   RETURN_UNEXPECTED_IF_NULL(string_start);
   RETURN_UNEXPECTED_IF_NULL(length);
   auto *offset_ptr = reinterpret_cast<offset_t *>(data_);  // offsets starts here
-  offset_t end = offset_ptr[index];
-  offset_t start = 0;
-  if (index != 0) start = offset_ptr[index - 1] + 1;  // string starts at where the previous string ends + 1
-  uchar *buf = GetStringsBuffer();  // string data starts here
-  *string_start = buf + start;
-  *length = end - start;
+  offset_t start = offset_ptr[index];
+  *string_start = data_ + start;
+  *length = offset_ptr[index + 1] - start - 1;  // -1 to skip the \0 from the string length
   return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
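The strided-copy comment kept in the context above (A[(i * shape[1] + j) * shape[2] + k] = B[i][j][k] = C[i * strides[0] + j * strides[1] + k * strides[2]]) is easy to sanity-check with NumPy. The sketch below is purely illustrative and is not part of the commit; the padded-buffer setup is an assumption chosen to make the strides non-trivial.

import numpy as np

# C is a flat source buffer with per-row padding, so B (a strided view of C) is not contiguous;
# the copy gathers only the used part of C into the consecutive destination A.
shape = (2, 3, 4)
strides = (15, 5, 1)                  # element strides of B inside C (rows padded from 4 to 5)
c = np.arange(2 * 15)                 # 30 elements, only 24 of them belong to B
b = np.lib.stride_tricks.as_strided(c, shape=shape,
                                    strides=tuple(s * c.itemsize for s in strides))
a = np.empty(shape[0] * shape[1] * shape[2], dtype=c.dtype)

for i in range(shape[0]):
    for j in range(shape[1]):
        for k in range(shape[2]):
            # A[(i*shape[1]+j)*shape[2]+k] = B[i][j][k] = C[i*strides[0]+j*strides[1]+k*strides[2]]
            a[(i * shape[1] + j) * shape[2] + k] = c[i * strides[0] + j * strides[1] + k * strides[2]]

assert (a.reshape(shape) == b).all()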
mindspore/ccsrc/dataset/core/tensor.h
...
@@ -35,6 +35,7 @@
 #include "dataset/util/allocator.h"
 #include "dataset/util/de_error.h"
 #include "dataset/util/status.h"
+#include "proto/example.pb.h"

 namespace py = pybind11;
 namespace mindspore {
...
@@ -64,6 +65,8 @@ class Tensor {
   // @param data unsigned char*, pointer to the data.
   Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data);

+  Tensor(const TensorShape &shape, const DataType &type, const unsigned char *data, const dsize_t &length);
+
   Tensor(const Tensor &other) = delete;

   Tensor &operator=(const Tensor &other) = delete;
...
@@ -72,6 +75,8 @@ class Tensor {
   Tensor &operator=(Tensor &&other) noexcept;

+  Status AllocateBuffer(const dsize_t &length);
+
   // type of offest values to store strings information
   using offset_t = uint32_t;
   // const of the size of the offset variable
...
@@ -84,15 +89,24 @@ class Tensor {
   // Construct a tensor from a list of strings. Reshape the tensor with `shape` if given, otherwise assume the shape is
   // the size of the vector `strings`.
   // The memory layout of a Tensor of strings consists of the Offset_array followed by the strings.
-  // OFFSET1, OFFSET2, ... String1, String2, ...
-  // The value of each offset is the end index of the corresponding string
+  // Thr offset array will store one extra value to find the length of the last string.
+  // OFFSET1, OFFSET2, ..., OFFSETn+1, STRING1, STRING2, ..., STRINGn
+  // The value of each offset is the start index of the corresponding string
   // Offsets is of type offest_t
   // strings will ne null-terminated
   // example: Tensor(['abc', 'de'], shape={2}, type=DE_STRING)
-  // 3 6 a b c \0 d e \0
+  // |----------------------------------------------------------------|
+  // |  OFFSET ARRAY           |          STRINGS                     |
+  // | bytes 0-3 | bytes 3-6   |  bytes 7-10 | bytes 11-14 |bytes 15-17|
+  // |     11    |    15       |     18      |     abc\0   |   de\0   |
+  // |----------------------------------------------------------------|
   explicit Tensor(const std::vector<std::string> &strings,
                   const TensorShape &shape = TensorShape::CreateUnknownRankShape());

+  // Same as Tensor(vector<string>) but the input is protobuf bytelist
+  explicit Tensor(const dataengine::BytesList &bytes_list,
+                  const TensorShape &shape = TensorShape::CreateUnknownRankShape());
+
   // A static factory method to create the given flavour of derived Tensor
   // Returns the base class reference for the Tensor.
   // @param ptr output argument to hold the created Tensor of given tensor_impl
...
@@ -121,6 +135,9 @@ class Tensor {
   static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const std::vector<std::string> &strings,
                              const TensorShape &shape = TensorShape::CreateUnknownRankShape());

+  static Status CreateTensor(std::shared_ptr<Tensor> *ptr, const dataengine::BytesList &bytes_list,
+                             const TensorShape &shape);
+
   // Copy raw data of a array based on shape and strides to the destination pointer
   // @param dst Pointer to the destination array where the content is to be copied
   // @param src Pointer to the source of strided array to be copied
...
@@ -166,7 +183,7 @@ class Tensor {
   // @param value of type `T`
   template <typename T>
   Status SetItemAt(const std::vector<dsize_t> &index, const T &value) {
-    static_cast<void>(GetMutableBuffer());
+    RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes()));
     T *ptr = nullptr;
     RETURN_IF_NOT_OK(GetItemPtr<T>(&ptr, index));
     *ptr = value;
...
@@ -203,7 +220,7 @@ class Tensor {
   template <typename T>
   Status Fill(const T &value) {
     CHECK_FAIL_RETURN_UNEXPECTED(type_ != DataType::DE_STRING, "Cannot use fill on tensor of strings.");
-    static_cast<void>(GetMutableBuffer());
+    RETURN_IF_NOT_OK(AllocateBuffer(SizeInBytes()));
     int64_t cellSize = type_.SizeInBytes();
     if ((data_ != nullptr) && type_.IsCompatible<T>()) {
       for (dsize_t i = 0; i < Size(); i++) {
...
@@ -418,32 +435,28 @@ class Tensor {
     using pointer = std::string_view *;
     using reference = std::string_view &;

-    explicit TensorIterator(uchar *offset = nullptr, const uchar *buf = nullptr, dsize_t index = 0) {
-      offset_ = reinterpret_cast<offset_t *>(offset);
-      buf_ = reinterpret_cast<const char *>(buf);
+    explicit TensorIterator(uchar *data = nullptr, dsize_t index = 0) {
+      data_ = reinterpret_cast<const char *>(data);
       index_ = index;
     }

     TensorIterator(const TensorIterator<std::string_view, DUMMY> &raw_iterator) {
-      offset_ = raw_iterator.offset_;
-      buf_ = raw_iterator.buf_;
+      data_ = raw_iterator.data_;
       index_ = raw_iterator.index_;
     }

     ~TensorIterator() = default;

-    bool operator==(const TensorIterator<std::string_view> &rhs) {
-      return buf_ == rhs.buf_ && offset_ == rhs.offset_ && index_ == rhs.index_;
-    }
+    bool operator==(const TensorIterator<std::string_view> &rhs) { return data_ == rhs.data_ && index_ == rhs.index_; }

     bool operator!=(const TensorIterator<std::string_view> &rhs) { return !(*this == rhs); }

-    operator bool() const { return offset_ != nullptr; }
+    operator bool() const { return data_ != nullptr; }

-    std::string_view operator*() const {
-      offset_t start = 0;
-      if (index_ != 0) start = offset_[index_ - 1] + 1;
-      return std::string_view{buf_ + start};
-    }
+    std::string_view operator*() const {
+      auto offset_ = reinterpret_cast<const offset_t *>(data_);
+      offset_t start = offset_[index_];
+      return std::string_view{data_ + start};
+    }

     TensorIterator<std::string_view> &operator+=(const dsize_t &inc) {
...
@@ -496,8 +509,7 @@ class Tensor {
    protected:
     dsize_t index_;
-    offset_t *offset_;
-    const char *buf_;
+    const char *data_;
   };

   // Return a TensorIterator that points to the start of the Tensor.
...
@@ -518,11 +530,6 @@ class Tensor {
   }

  protected:
-  // Returns the location of the item assuming row major memory layout.
-  // @param index
-  // @return
-  Status ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const;
-
   // A function that prints Tensor recursively, first called by print
   // @param out
   // @param cur_dim
...
@@ -559,7 +566,7 @@ class Tensor {
   // Skip the offsets and returns the start of the buffer where the real strings is stored. Caller needs to check if the
   // tensor's type is a string, otherwise undefined address would be returned.
   // @return address of the first string of the tensor.
-  uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements(); }
+  uchar *GetStringsBuffer() const { return data_ + kOffsetSize * shape_.NumOfElements() + kOffsetSize; }

   // all access to shape_ should be via shape
   TensorShape shape_;
...
@@ -573,14 +580,8 @@ class Tensor {
   unsigned char *data_end_ = nullptr;
 };

 template <>
 inline Tensor::TensorIterator<std::string_view> Tensor::begin<std::string_view>() {
-  uchar *buf = GetStringsBuffer();
-  return TensorIterator<std::string_view>(data_, buf);
+  return TensorIterator<std::string_view>(data_);
 }

 template <>
 inline Tensor::TensorIterator<std::string_view> Tensor::end<std::string_view>() {
-  uchar *buf = GetStringsBuffer();
-  return TensorIterator<std::string_view>(data_, buf, shape_.NumOfElements());
+  return TensorIterator<std::string_view>(data_, shape_.NumOfElements());
 }
 }  // namespace dataset
 }  // namespace mindspore
...
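The string-tensor memory layout documented in the tensor.h comments above (n + 1 uint32 start offsets followed by null-terminated strings) can be prototyped in a few lines of Python. The helper names below are invented for the sketch; only the layout rules and the expected constants come from the diff and from tensor_string_test.cc.

import struct

KOFFSET_SIZE = 4  # sizeof(offset_t), i.e. sizeof(uint32_t)

def pack_string_tensor(strings):
    """Lay out `strings` as described: n+1 start offsets, then null-terminated payloads."""
    n = len(strings)
    offsets = []
    payload = b""
    cursor = (n + 1) * KOFFSET_SIZE          # the first string starts right after the offset array
    for s in strings:
        offsets.append(cursor)
        encoded = s.encode("utf-8") + b"\0"  # strings are stored null-terminated
        payload += encoded
        cursor += len(encoded)
    offsets.append(cursor)                   # one extra offset gives the length of the last string
    return struct.pack("<%dI" % (n + 1), *offsets) + payload

def get_string_at(buf, index):
    """Mirror Tensor::GetStringAt: length = offsets[index+1] - offsets[index] - 1 (drop the '\\0')."""
    start, end = struct.unpack_from("<2I", buf, index * KOFFSET_SIZE)
    return buf[start:end - 1].decode("utf-8")

buf = pack_string_tensor(["abc", "defg", "hi", "klmno", "123", "789"])
assert len(buf) == 6 * 5 + 20 + 4                 # same SizeInBytes() the updated C++ test expects
assert struct.unpack_from("<I", buf, 0)[0] == 28  # first offset, matching the "+ 28" in the test
assert get_string_at(buf, 1) == "defg"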
mindspore/ccsrc/dataset/core/tensor_shape.cc
...
@@ -40,16 +40,7 @@ dsize_t TensorShape::NumOfElements() const {
   if (!known()) {
     return 0;
   }
-  dsize_t num = 1;
-  for (auto i : raw_shape_) {
-    if (multi_ok(num, i)) {
-      num *= i;
-    } else {
-      // dsize_t can wrap since it is signed int, we double check here
-      MS_LOG(ERROR) << "Tensor shape larger than maximum allowed value!";
-    }
-  }
-  return num;
+  return strides_[0];
 }

 void TensorShape::Print(std::ostream &out) const {
...
@@ -72,20 +63,23 @@ void TensorShape::Print(std::ostream &out) const {
 }

 TensorShape::TensorShape(const std::initializer_list<dsize_t> &list)
-    : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
+    : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
   AddListToShape(list);
 }

 TensorShape::TensorShape(const std::vector<dsize_t> &list)
-    : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
+    : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
   AddListToShape(list);
 }

 TensorShape::TensorShape(const TensorShape &shape)
-    : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
+    : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
   AddListToShape(shape.AsVector());
   known_ = shape.known_;  // override with the input shape in case of unknown-rank tensor shape.
 }

 TensorShape::TensorShape(py::list l)
-    : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
+    : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
   std::vector<dsize_t> list_c;
   for (auto &i : l) {
     if (!i.is_none()) {
...
@@ -97,6 +91,18 @@ TensorShape::TensorShape(py::list l) : raw_shape_(*GlobalContext::Instance()->in
   AddListToShape(list_c);
 }

+TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type)
+    : raw_shape_(*GlobalContext::Instance()->int_allocator()), strides_(*GlobalContext::Instance()->int_allocator()) {
+  for (int i = 0; i < cv_size.dims(); i++) {
+    raw_shape_.push_back(cv_size[i]);
+  }
+  auto channels = static_cast<uint8_t>(1 + (type >> static_cast<uint8_t>(CV_CN_SHIFT)));
+  if (channels != 1) {
+    raw_shape_.push_back(channels);
+  }
+  known_ = true;
+}
+
 TensorShape TensorShape::CreateUnknownRankShape() {
   TensorShape s({});
   s.known_ = false;
...
@@ -109,17 +115,6 @@ TensorShape TensorShape::InsertDim(dsize_t axis, dsize_t dim) const {
   return TensorShape(tmp);
 }

-TensorShape::TensorShape(cv::MatSize cv_size, uint32_t type)
-    : raw_shape_(*GlobalContext::Instance()->int_allocator()) {
-  for (int i = 0; i < cv_size.dims(); i++) {
-    raw_shape_.push_back(cv_size[i]);
-  }
-  auto channels = static_cast<uint8_t>(1 + (type >> static_cast<uint8_t>(CV_CN_SHIFT)));
-  if (channels != 1) {
-    raw_shape_.push_back(channels);
-  }
-  known_ = true;
-}
-
 std::vector<dsize_t> TensorShape::AsVector() const {
   return std::vector<dsize_t>(raw_shape_.begin(), raw_shape_.end());
 }
...
@@ -139,23 +134,28 @@ bool TensorShape::IsValidIndex(const std::vector<dsize_t> &index) const {
 template <typename T>
 void TensorShape::AddListToShape(const T &list) {
+  raw_shape_.resize(list.size());
+  strides_.resize(list.size() + 1);
+  strides_[list.size()] = 1;
   known_ = true;
-  dsize_t num = 1;
   dsize_t size = 0;
-  for (const auto &itr : list) {
-    if (itr > 0) {
-      if (num > std::numeric_limits<int64_t>::max() / itr) {
+  auto itr = std::rbegin(list);  // iterate over the list in reverse order
+  auto s = list.size() - 1;      // to compute strides while adding dims
+  for (; itr != std::rend(list); itr++, s--) {
+    dsize_t dim = *itr;
+    if (dim > 0) {
+      if (strides_[s + 1] > std::numeric_limits<int64_t>::max() / dim) {
         MS_LOG(ERROR) << "Invalid shape data, overflow occurred!";
         known_ = false;
         raw_shape_.clear();
         return;
       }
-      num *= itr;
+      strides_[s] = dim * strides_[s + 1];
     }
-    if (itr < 0) {
+    if (dim < 0) {
       known_ = false;
     }
-    if (itr > kDeMaxDim) {
+    if (dim > kDeMaxDim) {
       std::stringstream ss;
       ss << "Invalid shape data, dim (" << size << ") is larger than the maximum dim size(" << kDeMaxDim << ")!";
       MS_LOG(ERROR) << ss.str().c_str();
...
@@ -163,7 +163,7 @@ void TensorShape::AddListToShape(const T &list) {
       raw_shape_.clear();
       return;
     }
-    raw_shape_.push_back(itr);
+    raw_shape_[s] = dim;
     size++;
   }
   if (size > kDeMaxRank) {
...
@@ -215,17 +215,18 @@ TensorShape TensorShape::Squeeze() const {
   }
   return TensorShape(new_shape);
 }

-std::vector<dsize_t> TensorShape::Strides() {
-  std::vector<dsize_t> strides(Rank());
-  dsize_t count = NumOfElements();
-  for (dsize_t i = 0; i < Rank(); i++) {
-    if (raw_shape_[i] != 0)
-      count /= raw_shape_[i];
-    else
-      count = 0;
-    strides[i] = count;
-  }
-  return strides;
+std::vector<dsize_t> TensorShape::Strides() const { return std::vector<dsize_t>{strides_.begin() + 1, strides_.end()}; }
+
+// Name: ToFlatIndex()
+// Description: convert a vector style index to number, used to access memory internal use only
+Status TensorShape::ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const {
+  *flat_index = 0;
+  for (size_t k = 0; k < index.size(); k++) {
+    *flat_index += index[k] * strides_[k + 1];  // skip the first element of strides_ which is numOfElements
+  }
+  CHECK_FAIL_RETURN_UNEXPECTED(*flat_index < NumOfElements(), "Not a valid index");
+  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
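A small Python model of the stride bookkeeping that AddListToShape now performs: strides_ gets rank + 1 entries built right to left, strides_[0] doubles as NumOfElements(), and ToFlatIndex reduces to a dot product with strides_[1:]. The function names below are illustrative only, not part of the API.

def build_strides(shape):
    """strides[rank] = 1 and strides[s] = shape[s] * strides[s + 1], so strides[0] is the element count."""
    strides = [1] * (len(shape) + 1)
    for s in range(len(shape) - 1, -1, -1):
        strides[s] = shape[s] * strides[s + 1]
    return strides

def to_flat_index(strides, index):
    """Row-major flat index: sum(index[k] * strides[k + 1]), skipping strides[0]."""
    flat = sum(i * s for i, s in zip(index, strides[1:]))
    assert flat < strides[0], "Not a valid index"
    return flat

strides = build_strides([2, 3, 4])
assert strides == [24, 12, 4, 1]          # strides[0] doubles as NumOfElements()
assert to_flat_index(strides, [1, 2, 3]) == 23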
mindspore/ccsrc/dataset/core/tensor_shape.h
...
@@ -156,13 +156,20 @@ class TensorShape {
   TensorShape Squeeze() const;

-  std::vector<dsize_t> Strides();
+  std::vector<dsize_t> Strides() const;
+
+  // Returns the location of the item assuming row major memory layout.
+  // @param index
+  // @return
+  Status ToFlatIndex(const std::vector<dsize_t> &index, dsize_t *flat_index) const;

  private:
   // True if known and valid shape, false otherwise
   bool known_;
   // Vector to keep the dims of the shape.
   std::vector<dsize_t, IntAlloc> raw_shape_;
+  // Vector to keep the strides of the shape. The size is rank+1
+  std::vector<dsize_t, IntAlloc> strides_;

   // Internal utility function to iterate over a list, check if the dim is valid and then insert it into the shape.
   // @tparam T list
...
mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt
-ms_protobuf_generate(EXAMPLE_SRCS EXAMPLE_HDRS example.proto)
-ms_protobuf_generate(FEATURE_SRCS FEATURE_HDRS feature.proto)
 add_subdirectory(sampler)
 file(GLOB_RECURSE _CURRENT_SRC_FILES RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.cc")
 set_property(SOURCE ${_CURRENT_SRC_FILES} PROPERTY COMPILE_DEFINITIONS SUBMODULE_ID=mindspore::SubModuleId::SM_MD)
...
@@ -15,13 +13,9 @@ add_library(engine-datasetops-source OBJECT
     image_folder_op.cc
     mnist_op.cc
     voc_op.cc
-    ${EXAMPLE_SRCS}
-    ${FEATURE_SRCS}
     manifest_op.cc
     cifar_op.cc
     random_data_op.cc
     celeba_op.cc
     text_file_op.cc
-    )
-add_dependencies(engine-datasetops-source mindspore::protobuf)
+    )
\ No newline at end of file
mindspore/ccsrc/dataset/engine/datasetops/source/mindrecord_op.cc
...
@@ -127,8 +127,10 @@ Status MindRecordOp::Init() {
     std::string type_str = mindrecord::ColumnDataTypeNameNormalized[col_data_types[i]];
     DataType t_dtype = DataType(type_str);  // valid types: {"bytes", "string", "int32", "int64", "float32", "float64"}
-    if (col_data_types[i] == mindrecord::ColumnBytes || col_data_types[i] == mindrecord::ColumnString) {  // rank = 1
+    if (col_data_types[i] == mindrecord::ColumnBytes) {  // rank = 1
       col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 1);
+    } else if (col_data_types[i] == mindrecord::ColumnString) {  // rank = 0
+      col_desc = ColDescriptor(colname, t_dtype, TensorImpl::kFlexible, 0);
     } else if (col_shapes[i].size() > 0) {
       std::vector<dsize_t> vec(col_shapes[i].size());  // temporary vector to hold shape
       (void)std::copy(col_shapes[i].begin(), col_shapes[i].end(), vec.begin());
...
@@ -309,7 +311,10 @@ Status MindRecordOp::LoadTensorRow(TensorRow *tensor_row, const std::vector<uint
   // Set shape
   auto num_elements = n_bytes / column_data_type_size;
-  if (column.hasShape()) {
+  if (type == DataType::DE_STRING) {
+    std::string s{data, data + n_bytes};
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, {s}, TensorShape::CreateScalar()));
+  } else if (column.hasShape()) {
     auto new_shape = TensorShape(column.shape());
     RETURN_IF_NOT_OK(column.MaterializeTensorShape(static_cast<int32_t>(num_elements), &new_shape));
     RETURN_IF_NOT_OK(Tensor::CreateTensor(&tensor, column.tensorImpl(), new_shape, type, data));
...
mindspore/ccsrc/dataset/engine/datasetops/source/sampler/sampler.cc
...
@@ -63,7 +63,8 @@ Status Sampler::CreateSamplerTensor(std::shared_ptr<Tensor> *sample_ids, int64_t
   }
   TensorShape shape(std::vector<dsize_t>(1, num_elements));
   RETURN_IF_NOT_OK(Tensor::CreateTensor(sample_ids, col_desc_->tensorImpl(), shape, col_desc_->type()));
-  (void)(*sample_ids)->GetMutableBuffer();  // allocate memory in case user forgets!
+  RETURN_IF_NOT_OK(
+    (*sample_ids)->AllocateBuffer((*sample_ids)->SizeInBytes()));  // allocate memory in case user forgets!
   return Status::OK();
 }
...
mindspore/ccsrc/dataset/engine/datasetops/source/tf_reader_op.cc
...
@@ -724,18 +724,26 @@ Status TFReaderOp::LoadBytesList(const ColDescriptor &current_col, const dataeng
   // kBytesList can map to the following DE types ONLY!
   // DE_UINT8, DE_INT8
   // Must be single byte type for each element!
-  if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8) {
+  if (current_col.type() != DataType::DE_UINT8 && current_col.type() != DataType::DE_INT8 &&
+      current_col.type() != DataType::DE_STRING) {
     std::string err_msg = "Invalid datatype for Tensor at column: " + current_col.name();
     RETURN_STATUS_UNEXPECTED(err_msg);
   }

   const dataengine::BytesList &bytes_list = column_values_list.bytes_list();

+  *num_elements = bytes_list.value_size();
+
+  if (current_col.type() == DataType::DE_STRING) {
+    TensorShape shape = TensorShape::CreateScalar();
+    RETURN_IF_NOT_OK(current_col.MaterializeTensorShape(*num_elements, &shape));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, bytes_list, shape));
+    return Status::OK();
+  }
+
   uint64_t max_size = 0;
   for (uint32_t i = 0; i < bytes_list.value_size(); ++i) max_size = std::max(max_size, bytes_list.value(i).size());

-  *num_elements = bytes_list.value_size();
-
   int64_t pad_size = max_size;

   // if user provides a shape in the form of [-1, d1, 2d, ... , dn], we need to pad to d1 * d2 * ... * dn
...
@@ -879,7 +887,7 @@ Status TFReaderOp::LoadIntList(const ColDescriptor &current_col, const dataengin
   RETURN_IF_NOT_OK(Tensor::CreateTensor(tensor, current_col.tensorImpl(), current_shape, current_col.type()));

   // Tensors are lazily allocated, this eagerly allocates memory for the tensor.
-  (void)(*tensor)->GetMutableBuffer();
+  RETURN_IF_NOT_OK((*tensor)->AllocateBuffer((*tensor)->SizeInBytes()));

   int64_t i = 0;
   auto it = (*tensor)->begin<T>();
...
mindspore/ccsrc/dataset/kernels/data/data_utils.cc
...
@@ -162,7 +162,7 @@ void CastFrom(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *out
 Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type) {
   RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), data_type));
-  static_cast<void>((*output)->GetMutableBuffer());
+  RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));
   switch (input->type().value()) {
     case DataType::DE_BOOL:
       CastFrom<bool>(input, output);
...
@@ -211,7 +211,7 @@ Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *
   // initiate new tensor for type cast
   DataType new_type = DataType("float16");
   RETURN_IF_NOT_OK(Tensor::CreateTensor(output, TensorImpl::kFlexible, input->shape(), new_type));
-  static_cast<void>((*output)->GetMutableBuffer());
+  RETURN_IF_NOT_OK((*output)->AllocateBuffer((*output)->SizeInBytes()));

   auto in_itr = input->begin<float>();
   auto out_itr = (*output)->begin<float16>();
...
mindspore/ccsrc/dataset/kernels/image/image_utils.cc
...
@@ -64,7 +64,8 @@ Status Flip(std::shared_ptr<Tensor> input, std::shared_ptr<Tensor> *output, int
   std::shared_ptr<CVTensor> output_cv = std::make_shared<CVTensor>(input_cv->shape(), input_cv->type());
   RETURN_UNEXPECTED_IF_NULL(output_cv);
-  (void)output_cv->GetMutableBuffer();
+  RETURN_IF_NOT_OK(output_cv->AllocateBuffer(output_cv->SizeInBytes()));

   if (input_cv->mat().data) {
     try {
       cv::flip(input_cv->mat(), output_cv->mat(), flip_code);
...
mindspore/ccsrc/mindrecord/include/shard_column.h
...
@@ -51,7 +51,7 @@ enum ColumnDataType {
 // mapping as {"bytes", "string", "int32", "int64", "float32", "float64"};
 const uint32_t ColumnDataTypeSize[kDataTypes] = {1, 1, 4, 8, 4, 8};

-const std::vector<std::string> ColumnDataTypeNameNormalized = {"uint8", "uint8", "int32",
+const std::vector<std::string> ColumnDataTypeNameNormalized = {"uint8", "string", "int32",
                                                                "int64", "float32", "float64"};

 const std::unordered_map<std::string, ColumnDataType> ColumnDataTypeMap = {
...
mindspore/dataset/core/datatypes.py
...
@@ -48,6 +48,7 @@ def mstype_to_detype(type_):
         mstype.float16: cde.DataType("float16"),
         mstype.float32: cde.DataType("float32"),
         mstype.float64: cde.DataType("float64"),
+        mstype.string: cde.DataType("string"),
     }[type_]
...
mindspore/dataset/engine/validators.py
...
@@ -26,7 +26,7 @@ from . import datasets
 INT32_MAX = 2147483647
 valid_detype = [
     "bool", "int8", "int16", "int32", "int64", "uint8", "uint16",
-    "uint32", "uint64", "float16", "float32", "float64"
+    "uint32", "uint64", "float16", "float32", "float64", "string"
 ]
...
tests/ut/cpp/dataset/datatype_test.cc
...
@@ -32,47 +32,47 @@ class MindDataTestDatatype : public UT::Common {
 TEST_F(MindDataTestDatatype, TestSizes) {
-  uint8_t x = DataType::SIZE_IN_BYTES[DataType::DE_BOOL];
+  uint8_t x = DataType::kTypeInfo[DataType::DE_BOOL].sizeInBytes_;
   DataType d = DataType(DataType::DE_BOOL);
   ASSERT_EQ(x, 1);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_INT8];
+  x = DataType::kTypeInfo[DataType::DE_INT8].sizeInBytes_;
   d = DataType(DataType::DE_INT8);
   ASSERT_EQ(x, 1);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_UINT8];
+  x = DataType::kTypeInfo[DataType::DE_UINT8].sizeInBytes_;
   d = DataType(DataType::DE_UINT8);
   ASSERT_EQ(x, 1);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_INT16];
+  x = DataType::kTypeInfo[DataType::DE_INT16].sizeInBytes_;
   d = DataType(DataType::DE_INT16);
   ASSERT_EQ(x, 2);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_UINT16];
+  x = DataType::kTypeInfo[DataType::DE_UINT16].sizeInBytes_;
   d = DataType(DataType::DE_UINT16);
   ASSERT_EQ(x, 2);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_INT32];
+  x = DataType::kTypeInfo[DataType::DE_INT32].sizeInBytes_;
   d = DataType(DataType::DE_INT32);
   ASSERT_EQ(x, 4);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_UINT32];
+  x = DataType::kTypeInfo[DataType::DE_UINT32].sizeInBytes_;
   d = DataType(DataType::DE_UINT32);
   ASSERT_EQ(x, 4);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_INT64];
+  x = DataType::kTypeInfo[DataType::DE_INT64].sizeInBytes_;
   d = DataType(DataType::DE_INT64);
   ASSERT_EQ(x, 8);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_UINT64];
+  x = DataType::kTypeInfo[DataType::DE_UINT64].sizeInBytes_;
   d = DataType(DataType::DE_UINT64);
   ASSERT_EQ(x, 8);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT32];
+  x = DataType::kTypeInfo[DataType::DE_FLOAT32].sizeInBytes_;
   d = DataType(DataType::DE_FLOAT32);
   ASSERT_EQ(x, 4);
   ASSERT_EQ(d.SizeInBytes(), x);
-  x = DataType::SIZE_IN_BYTES[DataType::DE_FLOAT64];
+  x = DataType::kTypeInfo[DataType::DE_FLOAT64].sizeInBytes_;
   d = DataType(DataType::DE_FLOAT64);
   ASSERT_EQ(x, 8);
   ASSERT_EQ(d.SizeInBytes(), x);
...
tests/ut/cpp/dataset/one_hot_op_test.cc
...
@@ -14,9 +14,7 @@
  * limitations under the License.
  */
 #include "common/common.h"
-#include "common/cvop_common.h"
 #include "dataset/kernels/data/one_hot_op.h"
-#include "dataset/core/cv_tensor.h"
 #include "utils/log_adapter.h"

 using namespace mindspore::dataset;
...
@@ -24,9 +22,9 @@ using mindspore::MsLogLevel::INFO;
 using mindspore::ExceptionType::NoExceptionType;
 using mindspore::LogStream;

-class MindDataTestOneHotOp : public UT::CVOP::CVOpCommon {
+class MindDataTestOneHotOp : public UT::Common {
  protected:
-  MindDataTestOneHotOp() : CVOpCommon() {}
+  MindDataTestOneHotOp() {}
 };

 TEST_F(MindDataTestOneHotOp, TestOp) {
...
tests/ut/cpp/dataset/tensor_string_test.cc
...
@@ -65,14 +65,14 @@ TEST_F(MindDataTestStringTensorDE, Basics) {
 TEST_F(MindDataTestStringTensorDE, Basics2) {
   std::shared_ptr<Tensor> t = std::make_shared<Tensor>(
     std::vector<std::string>{"abc", "defg", "hi", "klmno", "123", "789"}, TensorShape({2, 3}));

-  ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20);
-  std::vector<uint32_t> offsets = {3, 8, 11, 17, 21, 25};
+  ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 20 + 4);
+  std::vector<uint32_t> offsets = {0, 4, 9, 12, 18, 22, 26};
   uint32_t ctr = 0;
   for (auto i : offsets) {
-    ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i);
+    ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i + 28);
     ctr += 4;
   }
-  const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4;
+  const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4 + 4;
   std::vector<uint32_t> starts = {0, 4, 9, 12, 18, 22};
   uint32_t index = 0;
...
@@ -90,14 +90,14 @@ TEST_F(MindDataTestStringTensorDE, Empty) {
   std::shared_ptr<Tensor> t = std::make_shared<Tensor>(strings, TensorShape({2, 3}));
   //  abc_defg___123__
   //  0123456789012345
-  ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10);
-  std::vector<uint32_t> offsets = {3, 8, 9, 10, 14, 15};
+  ASSERT_TRUE(t->SizeInBytes() == 6 * 5 + 10 + 4);
+  std::vector<uint32_t> offsets = {0, 4, 9, 10, 11, 15, 16};
   uint32_t ctr = 0;
   for (auto i : offsets) {
-    ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i);
+    ASSERT_TRUE(*(reinterpret_cast<uint32_t *>(t->GetMutableBuffer() + ctr)) == i + 28);
     ctr += 4;
   }
-  const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4;
+  const char *buf = reinterpret_cast<char *>(t->GetMutableBuffer()) + 6 * 4 + 4;
   std::vector<uint32_t> starts = {0, 4, 9, 10, 11, 15};
   uint32_t index = 0;
...
tests/ut/cpp/dataset/tensor_test.cc
...
@@ -41,6 +41,7 @@ class MindDataTestTensorDE : public UT::Common {
 TEST_F(MindDataTestTensorDE, Basics) {
   std::shared_ptr<Tensor> t = std::make_shared<Tensor>(TensorShape({2, 3}), DataType(DataType::DE_UINT64));
+  ASSERT_TRUE((t->AllocateBuffer(t->SizeInBytes())).IsOk());
   ASSERT_EQ(t->shape(), TensorShape({2, 3}));
   ASSERT_EQ(t->type(), DataType::DE_UINT64);
   ASSERT_EQ(t->SizeInBytes(), 2 * 3 * 8);
...
tests/ut/data/dataset/testTextMindRecord/test.mindrecord (new file, mode 100644, file added)
tests/ut/data/dataset/testTextMindRecord/test.mindrecord.db (new file, mode 100644, file added)
tests/ut/data/dataset/testTextTFRecord/datasetSchema.json (new file, mode 100644)
+{
+  "datasetType": "TF",
+  "numRows": 3,
+  "columns": {
+    "line": {"type": "string", "rank": 0},
+    "words": {"type": "string", "rank": 1},
+    "chinese": {"type": "string", "rank": 0}
+  }
+}
tests/ut/data/dataset/testTextTFRecord/text.tfrecord (new file, mode 100644, file added)
tests/ut/python/dataset/test_minddataset.py
...
@@ -584,7 +584,7 @@ def test_cv_minddataset_reader_basic_tutorial_5_epoch(add_and_remove_cv_file):
 def test_cv_minddataset_reader_basic_tutorial_5_epoch_with_batch(add_and_remove_cv_file):
     """tutorial for cv minderdataset."""
-    columns_list = ["data", "file_name", "label"]
+    columns_list = ["data", "label"]
     num_readers = 4
     data_set = ds.MindDataset(CV_FILE_NAME + "0", columns_list, num_readers)
...
@@ -948,8 +948,7 @@ def test_write_with_multi_bytes_and_array_and_read_by_MindDataset():
     data_value_to_list = []
     for item in data:
         new_data = {}
-        new_data['file_name'] = np.asarray(list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8)
+        new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
         new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32)
         new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
         new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
...
@@ -1153,8 +1152,7 @@ def test_write_with_multi_bytes_and_MindDataset():
     data_value_to_list = []
     for item in data:
         new_data = {}
-        new_data['file_name'] = np.asarray(list(bytes(item["file_name"], encoding='utf-8')), dtype=np.uint8)
+        new_data['file_name'] = np.asarray(item["file_name"], dtype='S')
         new_data['label'] = np.asarray(list([item["label"]]), dtype=np.int32)
         new_data['image1'] = np.asarray(list(item["image1"]), dtype=np.uint8)
         new_data['image2'] = np.asarray(list(item["image2"]), dtype=np.uint8)
...
tests/ut/python/dataset/test_minddataset_sampler.py
...
@@ -27,6 +27,7 @@ import mindspore.dataset as ds
 import mindspore.dataset.transforms.vision.c_transforms as vision
 from mindspore import log as logger
 from mindspore.dataset.transforms.vision import Inter
+from mindspore.dataset.transforms.text import as_text
 from mindspore.mindrecord import FileWriter

 FILES_NUM = 4
...
@@ -72,7 +73,7 @@ def test_cv_minddataset_pk_sample_no_column(add_and_remove_cv_file):
     for item in data_set.create_dict_iterator():
         logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
         logger.info("-------------- item[file_name]: \
-                {}------------------------".format("".join([chr(x) for x in item["file_name"]])))
+                {}------------------------".format(as_text(item["file_name"])))
         logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
         num_iter += 1
...
@@ -92,7 +93,7 @@ def test_cv_minddataset_pk_sample_basic(add_and_remove_cv_file):
         logger.info("-------------- item[data]: \
                {}------------------------".format(item["data"][:10]))
         logger.info("-------------- item[file_name]: \
-                {}------------------------".format("".join([chr(x) for x in item["file_name"]])))
+                {}------------------------".format(as_text(item["file_name"])))
         logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
         num_iter += 1
...
@@ -110,7 +111,7 @@ def test_cv_minddataset_pk_sample_shuffle(add_and_remove_cv_file):
     for item in data_set.create_dict_iterator():
         logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
         logger.info("-------------- item[file_name]: \
-                {}------------------------".format("".join([chr(x) for x in item["file_name"]])))
+                {}------------------------".format(as_text(item["file_name"])))
         logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
         num_iter += 1
...
@@ -127,7 +128,7 @@ def test_cv_minddataset_pk_sample_out_of_range(add_and_remove_cv_file):
     for item in data_set.create_dict_iterator():
         logger.info("-------------- cv reader basic: {} ------------------------".format(num_iter))
         logger.info("-------------- item[file_name]: \
-                {}------------------------".format("".join([chr(x) for x in item["file_name"]])))
+                {}------------------------".format(as_text(item["file_name"])))
         logger.info("-------------- item[label]: {} ----------------------------".format(item["label"]))
         num_iter += 1
...
tests/ut/python/dataset/test_tensor_string.py
...
@@ -17,17 +17,15 @@ import numpy as np
 import pytest

 import mindspore.dataset as ds
+import mindspore.common.dtype as mstype

-# pylint: disable=comparison-with-itself
 def test_basic():
     x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S')
     # x = np.array(["ab", "cde"], dtype='S')
     n = cde.Tensor(x)
     arr = n.as_array()
-    y = np.array([1, 2])
-    assert all(y == y)
-    # assert np.testing.assert_array_equal(y,y)
     np.testing.assert_array_equal(x, arr)


 def compare(strings):
...
@@ -60,7 +58,125 @@ def test_batching_strings():
     assert "[Batch ERROR] Batch does not support" in str(info)


+def test_map():
+    def gen():
+        yield np.array(["ab cde 121"], dtype='S'),
+
+    data = ds.GeneratorDataset(gen, column_names=["col"])
+
+    def split(b):
+        splits = b.item().decode("utf8").split()
+        return np.array(splits, dtype='S')
+
+    data = data.map(input_columns=["col"], operations=split)
+    expected = np.array(["ab", "cde", "121"], dtype='S')
+    for d in data:
+        np.testing.assert_array_equal(d[0], expected)
+
+
+def as_str(arr):
+    def decode(s):
+        return s.decode("utf8")
+
+    decode_v = np.vectorize(decode)
+    return decode_v(arr)
+
+
+line = np.array(["This is a text file.",
+                 "Be happy every day.",
+                 "Good luck to everyone."])
+
+words = np.array([["This", "text", "file", "a"],
+                  ["Be", "happy", "day", "b"],
+                  ["女", "", "everyone", "c"]])
+
+chinese = np.array(["今天天气太好了我们一起去外面玩吧",
+                    "男默女泪",
+                    "江州市长江大桥参加了长江大桥的通车仪式"])
+
+
+def test_tfrecord1():
+    s = ds.Schema()
+    s.add_column("line", "string", [])
+    s.add_column("words", "string", [-1])
+    s.add_column("chinese", "string", [])
+
+    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)
+
+    for i, d in enumerate(data.create_dict_iterator()):
+        assert d["line"].shape == line[i].shape
+        assert d["words"].shape == words[i].shape
+        assert d["chinese"].shape == chinese[i].shape
+        np.testing.assert_array_equal(line[i], as_str(d["line"]))
+        np.testing.assert_array_equal(words[i], as_str(d["words"]))
+        np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
+
+
+def test_tfrecord2():
+    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False,
+                              schema='../data/dataset/testTextTFRecord/datasetSchema.json')
+    for i, d in enumerate(data.create_dict_iterator()):
+        assert d["line"].shape == line[i].shape
+        assert d["words"].shape == words[i].shape
+        assert d["chinese"].shape == chinese[i].shape
+        np.testing.assert_array_equal(line[i], as_str(d["line"]))
+        np.testing.assert_array_equal(words[i], as_str(d["words"]))
+        np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
+
+
+def test_tfrecord3():
+    s = ds.Schema()
+    s.add_column("line", mstype.string, [])
+    s.add_column("words", mstype.string, [-1, 2])
+    s.add_column("chinese", mstype.string, [])
+
+    data = ds.TFRecordDataset("../data/dataset/testTextTFRecord/text.tfrecord", shuffle=False, schema=s)
+
+    for i, d in enumerate(data.create_dict_iterator()):
+        assert d["line"].shape == line[i].shape
+        assert d["words"].shape == words[i].reshape([2, 2]).shape
+        assert d["chinese"].shape == chinese[i].shape
+        np.testing.assert_array_equal(line[i], as_str(d["line"]))
+        np.testing.assert_array_equal(words[i].reshape([2, 2]), as_str(d["words"]))
+        np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
+
+
+def create_text_mindrecord():
+    # methood to create mindrecord with string data, used to generate testTextMindRecord/test.mindrecord
+    from mindspore.mindrecord import FileWriter
+
+    mindrecord_file_name = "test.mindrecord"
+    data = [{"english": "This is a text file.",
+             "chinese": "今天天气太好了我们一起去外面玩吧"},
+            {"english": "Be happy every day.",
+             "chinese": "男默女泪"},
+            {"english": "Good luck to everyone.",
+             "chinese": "江州市长江大桥参加了长江大桥的通车仪式"},
+            ]
+    writer = FileWriter(mindrecord_file_name)
+    schema = {"english": {"type": "string"},
+              "chinese": {"type": "string"},
+              }
+    writer.add_schema(schema)
+    writer.write_raw_data(data)
+    writer.commit()
+
+
+def test_mindrecord():
+    data = ds.MindDataset("../data/dataset/testTextMindRecord/test.mindrecord", shuffle=False)
+
+    for i, d in enumerate(data.create_dict_iterator()):
+        assert d["english"].shape == line[i].shape
+        assert d["chinese"].shape == chinese[i].shape
+        np.testing.assert_array_equal(line[i], as_str(d["english"]))
+        np.testing.assert_array_equal(chinese[i], as_str(d["chinese"]))
+
+
 if __name__ == '__main__':
-    test_generator()
-    test_basic()
-    test_batching_strings()
+    # test_generator()
+    # test_basic()
+    # test_batching_strings()
+    test_map()
+    # test_tfrecord1()
+    # test_tfrecord2()
+    # test_tfrecord3()
+    # test_mindrecord()