Commit 5eb95599

Authored Jun 10, 2020 by mindspore-ci-bot
Committed by Gitee, Jun 10, 2020
!1874 Updates to string tensor

Merge pull request !1874 from h.farahat/string_tensor_updates

Parents: 0e783982, f837ddc9

Showing 13 changed files with 526 additions and 218 deletions (+526 -218)
Changed files:

  mindspore/ccsrc/dataset/api/de_pipeline.cc                +28   -12
  mindspore/ccsrc/dataset/api/de_pipeline.h                  +2    -0
  mindspore/ccsrc/dataset/core/data_type.cc                  +1    -1
  mindspore/ccsrc/dataset/core/tensor.cc                    +27    -3
  mindspore/ccsrc/dataset/core/tensor.h                      +6    -0
  mindspore/ccsrc/dataset/core/tensor_shape.h                +4    -1
  mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc     +68  -131
  mindspore/ccsrc/dataset/engine/datasetops/batch_op.h      +24   -35
  mindspore/ccsrc/dataset/kernels/data/data_utils.cc       +122    -0
  mindspore/ccsrc/dataset/kernels/data/data_utils.h         +54    -0
  mindspore/dataset/engine/validators.py                     +4    -2
  tests/ut/cpp/dataset/batch_op_test.cc                     +53    -5
  tests/ut/python/dataset/test_tensor_string.py            +133   -28
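What this change set enables from the Python API, as a minimal sketch adapted from the updated test_batching_strings (the `lines` array and column name are illustrative): batching a column of string tensors now works instead of raising "[Batch ERROR] Batch does not support Tensor of type string yet."

```python
import numpy as np
import mindspore.dataset as ds
from mindspore.dataset.text import to_bytes

lines = np.array(["This is a text file.", "Be happy every day.", "Good luck to everyone."])

def gen():
    for row in lines:
        yield (np.array(row),)

# Before this commit, batching a string column raised a RuntimeError.
data = ds.GeneratorDataset(gen, column_names=["col"])
data = data.batch(2, drop_remainder=True)

for d in data:
    # Rows are stacked into one string (bytes) tensor per batch.
    np.testing.assert_array_equal(d[0], to_bytes(lines[0:2]))
```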
mindspore/ccsrc/dataset/api/de_pipeline.cc

@@ -654,18 +654,8 @@ Status DEPipeline::ParseBatchOp(const py::dict &args, std::shared_ptr<DatasetOp>
       (void)builder->SetColumnsToMap(ToStringVector(value));
     }
     if (key == "pad_info") {
-      std::map<std::string, std::pair<TensorShape, float>> pad_info;
-      for (auto p : py::reinterpret_borrow<py::dict>(value)) {
-        if (!p.second.is_none()) {
-          py::tuple tp = py::reinterpret_borrow<py::tuple>(p.second);
-          CHECK_FAIL_RETURN_UNEXPECTED(tp.size() == 2, "tuple in pad_info must be (list,int) or (list,float)");
-          TensorShape shape = tp[0].is_none() ? TensorShape::CreateUnknownRankShape() : TensorShape(tp[0]);
-          float pad_val = tp[1].is_none() ? 0 : ToFloat(tp[1]);
-          (void)pad_info.insert({ToString(p.first), {shape, pad_val}});
-        } else {  // tuple is None
-          (void)pad_info.insert({ToString(p.first), {TensorShape({}), 0}});
-        }
-      }
+      PadInfo pad_info;
+      RETURN_IF_NOT_OK(ParsePadInfo(value, &pad_info));
       (void)builder->SetPaddingMap(pad_info, true);
     }
   }

@@ -1166,5 +1156,31 @@ Status DEPipeline::ParseTextFileOp(const py::dict &args, std::shared_ptr<Dataset
   *ptr = op;
   return Status::OK();
 }
+
+Status DEPipeline::ParsePadInfo(py::handle value, PadInfo *pad_info) {
+  for (auto p : py::reinterpret_borrow<py::dict>(value)) {
+    if (!p.second.is_none()) {
+      auto tp = py::reinterpret_borrow<py::tuple>(p.second);
+      CHECK_FAIL_RETURN_UNEXPECTED(tp.size() == 2, "tuple in pad_info must be (list,int) or (list,float)");
+      TensorShape shape = tp[0].is_none() ? TensorShape::CreateUnknownRankShape() : TensorShape(tp[0]);
+      std::shared_ptr<Tensor> pad_val = nullptr;
+      if (py::isinstance<py::str>(tp[1])) {
+        std::string pad_val_string = tp[1].is_none() ? "" : ToString(tp[1]);
+        CHECK_FAIL_RETURN_UNEXPECTED(
+          Tensor::CreateTensor(&pad_val, std::vector<std::string>{pad_val_string}, TensorShape::CreateScalar()),
+          "Cannot create pad_value Tensor");
+      } else {
+        float pad_val_float = tp[1].is_none() ? 0 : ToFloat(tp[1]);
+        CHECK_FAIL_RETURN_UNEXPECTED(Tensor::CreateTensor(&pad_val, TensorImpl::kFlexible, TensorShape::CreateScalar(),
+                                                          DataType(DataType::DE_FLOAT32)),
+                                     "Cannot create pad_value Tensor");
+        pad_val->SetItemAt<float>({}, pad_val_float);
+      }
+      (void)pad_info->insert({ToString(p.first), {shape, pad_val}});
+    } else {  // tuple is None
+      (void)pad_info->insert({ToString(p.first), {TensorShape({}), nullptr}});
+    }
+  }
+  return Status::OK();
+}
+
 }  // namespace dataset
 }  // namespace mindspore
mindspore/ccsrc/dataset/api/de_pipeline.h

@@ -169,6 +169,8 @@ class DEPipeline {
   // Validate required args passed to storage op.
   Status ValidateArgStorageOp(const py::dict &args);
 
+  static Status ParsePadInfo(py::handle value, PadInfo *pad_info);
+
   int batch_size_;
   int repeat_num_;
   int num_rows_;
mindspore/ccsrc/dataset/core/data_type.cc

@@ -138,7 +138,7 @@ DataType DataType::FromNpArray(const py::array &arr) {
     return DataType(DataType::DE_FLOAT32);
   } else if (py::isinstance<py::array_t<std::double_t>>(arr)) {
     return DataType(DataType::DE_FLOAT64);
-  } else if (arr.dtype().kind() == 'S') {
+  } else if (arr.dtype().kind() == 'S' || arr.dtype().kind() == 'U') {
     return DataType(DataType::DE_STRING);
   } else {
     MS_LOG(ERROR) << "Cannot convert from numpy type. Unknown data type is returned!";
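The data_type.cc change above maps NumPy unicode arrays (dtype kind 'U') to DE_STRING, alongside the already supported byte strings (kind 'S'). A short sketch of the observable behaviour, mirroring the updated `compare` helper in tests/ut/python/dataset/test_tensor_string.py; strings come back as bytes, so the comparison uses the 'S' view:

```python
import numpy as np
import mindspore.dataset as ds

arr = np.array(["ab", "cde", "121"], dtype='U')  # unicode input is now accepted

def gen():
    yield (arr,)

data = ds.GeneratorDataset(gen, column_names=["col"])
for d in data:
    # Internally strings are stored as bytes, so the round trip equals the 'S' cast.
    np.testing.assert_array_equal(d[0], arr.astype('S'))
```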
mindspore/ccsrc/dataset/core/tensor.cc

@@ -229,7 +229,12 @@ Status Tensor::CreateTensorFromNumpyString(std::shared_ptr<Tensor> *ptr, py::arr
   }
   arr.resize({arr.size()});  // flatten the py::array so we can iterate once
   std::vector<std::string> strings;
-  std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); });
+
+  if (arr.dtype().kind() == 'U') {
+    std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::str>(s)); });
+  } else {
+    std::for_each(arr.begin(), arr.end(), [&strings](const auto &s) { strings.emplace_back(py::cast<py::bytes>(s)); });
+  }
+
   arr.resize(shape);  // resize arr back to the original shape

@@ -699,6 +704,8 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
   for (; itr != end<std::string_view>(); itr++) {
     max = std::max((*itr).length(), max);
   }
+  // if all strings are empty, numpy stores a byte for each string |S1
+  max = (max == 0 ? 1 : max);
   uint64_t total_size = shape_.NumOfElements() * max;
   char *tmp_data = reinterpret_cast<char *>(data_allocator_->allocate(total_size));
   if (tmp_data == nullptr) RETURN_STATUS_UNEXPECTED("Cannot create temp array.");

@@ -708,8 +715,10 @@ Status Tensor::GetDataAsNumpyStrings(py::array *data) {
   itr = begin<std::string_view>();
   uint64_t i = 0;
   for (; itr != end<std::string_view>(); itr++, i++) {
-    ret_code = memcpy_s(tmp_data + i * max, total_size, (*itr).data(), (*itr).length());
-    CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Failed to copy string data.");
+    if (!(*itr).empty()) {
+      ret_code = memcpy_s(tmp_data + i * max, total_size, (*itr).data(), (*itr).length());
+      CHECK_FAIL_RETURN_UNEXPECTED(ret_code == 0, "Failed to copy string data.");
+    }
   }
   auto strides = shape_.Strides();
   std::transform(strides.begin(), strides.end(), strides.begin(), [&max](const auto &s) { return s * max; });

@@ -847,6 +856,21 @@ Status Tensor::GetStringAt(dsize_t index, uchar **string_start, offset_t *length
   *length = offset_ptr[index + 1] - start - 1;  // -1 to skip the \0 from the string length
   return Status::OK();
 }
+
+Status Tensor::CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index) {
+  CHECK_FAIL_RETURN_UNEXPECTED(src->type() == type_, "Source Tensor has a different type");
+  CHECK_FAIL_RETURN_UNEXPECTED(index.back() == 0, "Last dim in index should be 0");
+
+  uint8_t type_size = type_.SizeInBytes();
+  size_t len = std::min(src->shape()[-1], shape_[-1]) * type_size;
+  dsize_t src_flat_ind = 0, dst_flat_ind = 0;
+  RETURN_IF_NOT_OK(src->shape().ToFlatIndex(index, &src_flat_ind));
+  RETURN_IF_NOT_OK(shape_.ToFlatIndex(index, &dst_flat_ind));
+
+  const unsigned char *src_addr = src->GetBuffer() + src_flat_ind * type_size;
+  unsigned char *dst_addr = GetMutableBuffer() + dst_flat_ind * type_size;
+  CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(dst_addr, len, src_addr, len) == 0, "memcpy error");
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
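The GetDataAsNumpyStrings hunks above cover the all-empty case: NumPy still allocates one byte per element (an |S1 array) when every string is empty, and the copy is skipped for empty items. A sketch of the case this handles, following the `compare(["", ""])` tests:

```python
import numpy as np
import mindspore.dataset as ds

empty = np.array(["", ""], dtype='S')  # an |S1 array of empty byte strings

def gen():
    yield (empty,)

data = ds.GeneratorDataset(gen, column_names=["col"])
for d in data:
    np.testing.assert_array_equal(d[0], empty)  # round-trips without error
```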
mindspore/ccsrc/dataset/core/tensor.h

@@ -529,6 +529,12 @@ class Tensor {
     return TensorIterator<T>(data_end_);
   }
 
+  // Copies the last dimension at `index` from Tensor `src` to this Tensor.
+  // @param src Tensor
+  // @param index vector to the start of the dimension. The last dim should be 0
+  // @return Status
+  Status CopyLastDimAt(const std::shared_ptr<Tensor> &src, const std::vector<dsize_t> &index);
+
  protected:
   // A function that prints Tensor recursively, first called by print
   // @param out
mindspore/ccsrc/dataset/core/tensor_shape.h

@@ -118,7 +118,10 @@ class TensorShape {
   bool operator!=(const TensorShape &rhs) const { return !(rhs == *this); }
 
-  dsize_t operator[](const dsize_t index) const { return raw_shape_[index]; }
+  dsize_t operator[](const dsize_t index) const {
+    if (index < 0) return raw_shape_[raw_shape_.size() + index];
+    return raw_shape_[index];
+  }
 
   // Return the Shape as a vector
   // @return
mindspore/ccsrc/dataset/engine/datasetops/batch_op.cc

@@ -23,6 +23,7 @@
 #include "dataset/engine/data_buffer.h"
 #include "dataset/engine/db_connector.h"
 #include "dataset/engine/opt/pass.h"
+#include "dataset/kernels/data/data_utils.h"
 
 using float16 = Eigen::half;

@@ -53,7 +54,7 @@ Status BatchOp::Builder::SanityCheck() {
 BatchOp::BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers,
                  const std::vector<std::string> &cols_to_map, py::function batch_size_func, py::function batch_map_func,
-                 std::map<std::string, std::pair<TensorShape, float>> pad_map)
+                 PadInfo pad_map)
     : ParallelOp(num_workers, op_queue_size),
       start_batch_size_(batch_size),
       drop_(drop),

@@ -75,10 +76,6 @@ Status BatchOp::operator()() {
   std::unique_ptr<TensorQTable> table = std::make_unique<TensorQTable>();
   child_iterator_ = std::make_unique<ChildIterator>(this, 0, 0);
   RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&new_row));
-  for (const auto &t : new_row) {
-    CHECK_FAIL_RETURN_UNEXPECTED(t->type().IsNumeric(),
-                                 "[Batch ERROR] Batch does not support Tensor of type string yet.");
-  }
   RETURN_IF_NOT_OK(DatasetOp::AssignColMapFromChild());  // must come after the first fetch above
   int32_t cur_batch_size = 0;
   RETURN_IF_NOT_OK(GetBatchSize(&cur_batch_size, CBatchInfo(0, 0, 0)));

@@ -134,49 +131,57 @@ void BatchOp::Print(std::ostream &out, bool show_all) const {
   }
 }
 
-Status BatchOp::BatchRows(const std::unique_ptr<TensorQTable> *source_table,
-                          const std::unique_ptr<TensorQTable> *dest_table, size_t batch_size) {
-  if ((*source_table)->size() < batch_size || (*source_table)->size() == 0) {
-    RETURN_STATUS_UNEXPECTED("[Internal Batch ERROR] Insufficient rows in source_table\n");
+Status BatchOp::BatchRows(const std::unique_ptr<TensorQTable> *src, const std::unique_ptr<TensorQTable> *dest,
+                          dsize_t batch_size) {
+  if ((*src)->size() != batch_size) {
+    RETURN_STATUS_UNEXPECTED("[Internal Batch ERROR] Source table size does not match the batch_size");
   }
-  TensorRow row = std::move((*source_table)->front());
-  (*source_table)->pop_front();
-  if (batch_size == 1) {
-    for (std::shared_ptr<Tensor> tensor : row) {
+
+  if (batch_size == 1) {
+    TensorRow row = std::move((*src)->front());
+    (*src)->pop_front();
+    (*dest)->push_back(row);
+    for (const auto &tensor : (*dest)->front()) {
       RETURN_IF_NOT_OK(tensor->ExpandDim(0));
     }
-    (*dest_table)->push_back(row);
-  } else {  // batch_size > 1
-    std::vector<TensorShape> row_shapes;
-    TensorRow batched_row;
-    for (size_t i = 0; i < row.size(); i++) {  // Handle the first row popped
-      row_shapes.push_back(row[i]->shape());
-      std::shared_ptr<Tensor> ts;
-      RETURN_IF_NOT_OK(Tensor::CreateTensor(
-        &ts, TensorImpl::kFlexible, row[i]->shape().PrependDim(static_cast<int64_t>(batch_size)), row[i]->type()));
-      batched_row.emplace_back(ts);
-      RETURN_IF_NOT_OK(batched_row[i]->InsertTensor(std::vector<dsize_t>(1, 0), row[i]));  // {j} = 0
-    }
-    for (size_t j = 1; j < batch_size; j++) {  // Handle the rest of the rows
-      row = std::move((*source_table)->front());
-      (*source_table)->pop_front();
-      for (size_t i = 0; i < row.size(); i++) {
-        if (row[i]->shape() == row_shapes[i]) {  // check the newly popped rows have the same dim as the first
-          RETURN_IF_NOT_OK(batched_row[i]->InsertTensor(std::vector<dsize_t>(1, j), row[i]));
+    return Status::OK();
+  }
+
+  TensorRow batched_row;
+  auto num_columns = (*src)->front().size();
+  for (size_t i = 0; i < num_columns; i++) {
+    std::shared_ptr<Tensor> first_tensor = (*src)->at(0).at(i);  // first row, column i
+    TensorShape first_shape = first_tensor->shape();
+    DataType first_type = first_tensor->type();
+    TensorShape new_shape = first_shape.PrependDim(static_cast<int64_t>(batch_size));
+
+    std::shared_ptr<Tensor> new_tensor;
+    if (first_type.IsNumeric()) {  // numeric tensor
+      RETURN_IF_NOT_OK(Tensor::CreateTensor(&new_tensor, TensorImpl::kFlexible, new_shape, first_type));
+      dsize_t j = 0;
+      for (auto row : **src) {
+        std::shared_ptr<Tensor> old_tensor = row.at(i);  // row j, column i
+        if (old_tensor->shape() == first_shape) {  // check the newly popped rows have the same dim as the first
+          RETURN_IF_NOT_OK(new_tensor->InsertTensor({j++}, old_tensor));
         } else {
-          std::string column_name;
-          for (auto itr : column_name_id_map_) {
-            if (static_cast<size_t>(itr.second) == i) {
-              column_name = itr.first;
-              break;
-            }
-          }
-          RETURN_STATUS_UNEXPECTED("[Batch ERROR] Inconsistent TensorShapes of Column " + column_name);
+          RETURN_STATUS_UNEXPECTED("[Batch ERROR] Inconsistent TensorShapes of Column " + std::to_string(i));
         }
       }
+    } else {  // handle string column differently
+      std::vector<std::string> strings;
+      for (dsize_t j = 0; j < batch_size; j++) {
+        std::shared_ptr<Tensor> old_tensor = (*src)->at(j).at(i);
+        for (auto itr = old_tensor->begin<std::string_view>(); itr != old_tensor->end<std::string_view>(); itr++) {
+          strings.emplace_back(*itr);
+        }
+      }
+      RETURN_IF_NOT_OK(Tensor::CreateTensor(&new_tensor, strings, new_shape));
     }
-    (*dest_table)->emplace_back(batched_row);
+    batched_row.emplace_back(new_tensor);
   }
+
+  (*dest)->emplace_back(batched_row);
+
   return Status::OK();
 }

@@ -202,8 +207,8 @@ Status BatchOp::WorkerEntry(int32_t workerId) {
 Status BatchOp::MakeBatchedBuffer(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> table_pair,
                                   std::unique_ptr<DataBuffer> *db) {
   RETURN_UNEXPECTED_IF_NULL(table_pair.first);
   if (!pyfunc_column_names_.empty()) RETURN_IF_NOT_OK(MapColumns(&table_pair));  // pass it through pyfunc
-  if (pad_) RETURN_IF_NOT_OK(PadColumns(&table_pair));  // do padding if needed
+  if (pad_) RETURN_IF_NOT_OK(PadColumns(&table_pair.first, pad_info_, column_name_id_map_));  // do padding if needed
   (*db) = std::make_unique<DataBuffer>(table_pair.second.batch_num_, DataBuffer::kDeBFlagNone);
   std::unique_ptr<TensorQTable> dest_table = std::make_unique<TensorQTable>();
   RETURN_IF_NOT_OK(BatchRows(&table_pair.first, &dest_table, table_pair.first->size()));

@@ -333,74 +338,27 @@ Status BatchOp::InvokeBatchMapFunc(TensorBatchTable *input, TensorBatchTable *ou
   return Status(StatusCode::kOK);
 }
 
-Status BatchOp::PadTensor(std::shared_ptr<Tensor> src, std::shared_ptr<Tensor> *dst,
-                          const std::vector<dsize_t> &pad_shape, float pad_val) {
-  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
-  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
-    (*dst) = src;  // if no padding, copy the pointer
-  } else {
-    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
-    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, TensorImpl::kFlexible, TensorShape(pad_shape), src->type()));
-    auto tensor_type = src->type().value();
-    if (pad_val == 0) {  // if pad with zero, don't care what type it is
-      RETURN_IF_NOT_OK((*dst)->Zero());
-    } else if (tensor_type == DataType::DE_INT8) {
-      RETURN_IF_NOT_OK((*dst)->Fill<int8_t>(pad_val));
-    } else if (tensor_type == DataType::DE_BOOL) {
-      RETURN_IF_NOT_OK((*dst)->Fill<bool>(pad_val));
-    } else if (tensor_type == DataType::DE_UINT8) {
-      RETURN_IF_NOT_OK((*dst)->Fill<uint8_t>(pad_val));
-    } else if (tensor_type == DataType::DE_INT16) {
-      RETURN_IF_NOT_OK((*dst)->Fill<int16_t>(pad_val));
-    } else if (tensor_type == DataType::DE_FLOAT16) {
-      RETURN_IF_NOT_OK((*dst)->Fill<float16>(static_cast<float16>(pad_val)));
-    } else if (tensor_type == DataType::DE_UINT16) {
-      RETURN_IF_NOT_OK((*dst)->Fill<uint16_t>(pad_val));
-    } else if (tensor_type == DataType::DE_INT32) {
-      RETURN_IF_NOT_OK((*dst)->Fill<int32_t>(pad_val));
-    } else if (tensor_type == DataType::DE_UINT32) {
-      RETURN_IF_NOT_OK((*dst)->Fill<uint32_t>(pad_val));
-    } else if (tensor_type == DataType::DE_INT64) {
-      RETURN_IF_NOT_OK((*dst)->Fill<int64_t>(pad_val));
-    } else if (tensor_type == DataType::DE_UINT64) {
-      RETURN_IF_NOT_OK((*dst)->Fill<uint64_t>(pad_val));
-    } else if (tensor_type == DataType::DE_FLOAT32) {
-      RETURN_IF_NOT_OK((*dst)->Fill<float>(pad_val));
-    } else if (tensor_type == DataType::DE_FLOAT64) {
-      RETURN_IF_NOT_OK((*dst)->Fill<double>(pad_val));
-    } else {
-      RETURN_STATUS_UNEXPECTED("Incorrect/Unknown tensor type");
-    }
-    std::vector<dsize_t> cur_ind(src->Rank(), 0), src_s(src->Rank(), 1), dst_s(src->Rank(), 1);
-    for (dsize_t i = src->Rank() - 2; i >= 0; i--) {
-      src_s[i] = src->shape()[i + 1] * src_s[i + 1];
-      dst_s[i] = pad_shape[i + 1] * dst_s[i + 1];
-    }
-    RETURN_IF_NOT_OK(PadHelper(src, *dst, cur_ind, src_s, dst_s, 0));
-  }
-  return Status::OK();
-}  // namespace dataset
-
-Status BatchOp::PadColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair) {
-  RETURN_UNEXPECTED_IF_NULL(table_pair);  // placeholder for now, might need this in the future
-  CHECK_FAIL_RETURN_UNEXPECTED(table_pair->first->front().size() == column_name_id_map_.size(),
-                               "col_name_map mismatch");
-  std::vector<float> pad_vals(column_name_id_map_.size(), 0);  // value to pad each column's tensor with, default 0
+Status BatchOp::PadColumns(std::unique_ptr<TensorQTable> *table, const PadInfo &pad_info,
+                           const std::unordered_map<std::string, int32_t> &column_name_id_map) {
+  RETURN_UNEXPECTED_IF_NULL(table);  // placeholder for now, might need this in the future
+  CHECK_FAIL_RETURN_UNEXPECTED((*table)->front().size() == column_name_id_map.size(), "col_name_map mismatch");
+  std::vector<std::shared_ptr<Tensor>> pad_vals(column_name_id_map.size(),
+                                                0);  // value to pad each column's tensor with, default 0
   std::set<int32_t> pad_cols;
   // padded_shape provided by user, maximum shapes of current batch of tensors
-  std::vector<std::vector<dsize_t>> pad_shapes(column_name_id_map_.size()), max_shapes(column_name_id_map_.size());
-  RETURN_IF_NOT_OK(UnpackPadInfo(&pad_cols, &pad_vals, &pad_shapes));
+  std::vector<std::vector<dsize_t>> pad_shapes(column_name_id_map.size()), max_shapes(column_name_id_map.size());
+  RETURN_IF_NOT_OK(UnpackPadInfo(pad_info, column_name_id_map, &pad_cols, &pad_vals, &pad_shapes));
 
   // init each shape in max_shape to {-1,-1...} init each unspecified shape in pad_shape to -1 as well
   for (size_t col_id : pad_cols) {
-    max_shapes[col_id] = std::vector<dsize_t>(table_pair->first->front()[col_id]->Rank(), -1);
+    max_shapes[col_id] = std::vector<dsize_t>((*table)->front()[col_id]->Rank(), -1);
     if (pad_shapes[col_id].empty()) pad_shapes[col_id] = max_shapes[col_id];  // fill pad shape with -1
     CHECK_FAIL_RETURN_UNEXPECTED(pad_shapes[col_id].size() == max_shapes[col_id].size(), "wrong rank in pad_shape");
   }
 
   // calculate maximum shape for each column that needs to be padded
-  for (const TensorRow &row : *(table_pair->first)) {  // iterator each row in a batch
-    for (size_t col_id : pad_cols) {                   // iterator each tensor in a row
+  for (const TensorRow &row : **table) {  // iterator each row in a batch
+    for (size_t col_id : pad_cols) {      // iterator each tensor in a row
       CHECK_FAIL_RETURN_UNEXPECTED(row[col_id]->Rank() == max_shapes[col_id].size(),
                                    "Tensor to be padded together need to have the same rank");
       for (size_t dim = 0; dim < row[col_id]->Rank(); dim++) {  // pick the largest number in each dimension

@@ -417,27 +375,29 @@ Status BatchOp::PadColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo>
   }
 
   // call pad on each tensor that needs to be padded
-  for (TensorRow &row : *(table_pair->first)) {
+  for (TensorRow &row : **table) {
     for (size_t col_id : pad_cols) {
       std::shared_ptr<Tensor> pad_tensor;
-      RETURN_IF_NOT_OK(PadTensor(row[col_id], &pad_tensor, pad_shapes[col_id], pad_vals[col_id]));
+      RETURN_IF_NOT_OK(PadEnd(row[col_id], &pad_tensor, pad_shapes[col_id], pad_vals[col_id]));
       row[col_id] = pad_tensor;
     }
   }
   return Status::OK();
 }
 
-Status BatchOp::UnpackPadInfo(std::set<int32_t> *pad_cols, std::vector<float> *pad_vals,
+Status BatchOp::UnpackPadInfo(const PadInfo &pad_info,
+                              const std::unordered_map<std::string, int32_t> &column_name_id_map,
+                              std::set<int32_t> *pad_cols, std::vector<std::shared_ptr<Tensor>> *pad_vals,
                               std::vector<std::vector<dsize_t>> *pad_shapes) {
-  if (pad_info_.empty()) {  // if pad_info empty, pad every columns automatically
-    for (dsize_t col_id = 0; col_id < column_name_id_map_.size(); col_id++) {
+  if (pad_info.empty()) {  // if pad_info empty, pad every columns automatically
+    for (dsize_t col_id = 0; col_id < column_name_id_map.size(); col_id++) {
       pad_cols->insert(col_id);
     }
   } else {
-    for (auto p : pad_info_) {
-      CHECK_FAIL_RETURN_UNEXPECTED(column_name_id_map_.find(p.first) != column_name_id_map_.end(),
-                                   "no column exists with name:" + p.first);
-      dsize_t col_id = static_cast<dsize_t>(column_name_id_map_[p.first]);
+    for (const auto &p : pad_info) {
+      auto location = column_name_id_map.find(p.first);
+      CHECK_FAIL_RETURN_UNEXPECTED(location != column_name_id_map.end(), "no column exists with name:" + p.first);
+      auto col_id = static_cast<dsize_t>(location->second);
       CHECK_FAIL_RETURN_UNEXPECTED(col_id < pad_vals->size() && col_id < pad_shapes->size(), "col_id out of bound");
       pad_cols->insert(col_id);
       (*pad_vals)[col_id] = p.second.second;  // set pad values

@@ -447,29 +407,6 @@ Status BatchOp::UnpackPadInfo(std::set<int32_t> *pad_cols, std::vector<float> *p
   return Status::OK();
 }
 
-Status BatchOp::PadHelper(std::shared_ptr<Tensor> src, std::shared_ptr<Tensor> dst, std::vector<dsize_t> cur_ind,
-                          const std::vector<dsize_t> &src_s, const std::vector<dsize_t> &dst_s, size_t cur_dim) {
-  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
-    uint8_t type_size = src->type().SizeInBytes();
-    size_t len = std::min(src->shape()[cur_dim], dst->shape()[cur_dim]) * type_size;
-    dsize_t src_flat_ind = 0, dst_flat_ind = 0;
-    for (size_t i = 0; i < src->Rank(); i++) {
-      src_flat_ind += src_s[i] * cur_ind[i];
-      dst_flat_ind += dst_s[i] * cur_ind[i];
-    }
-    unsigned char *src_addr = src->GetMutableBuffer() + src_flat_ind * type_size;
-    unsigned char *dst_addr = dst->GetMutableBuffer() + dst_flat_ind * type_size;
-    CHECK_FAIL_RETURN_UNEXPECTED(memcpy_s(dst_addr, len, src_addr, len) == 0, "memcpy error");
-  } else {  // not the last dimension, keep doing recursion
-    dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]);
-    for (dsize_t i = 0; i < min_ind; i++) {
-      cur_ind[cur_dim] = i;
-      RETURN_IF_NOT_OK(PadHelper(src, dst, cur_ind, src_s, dst_s, cur_dim + 1));
-    }
-  }
-  return Status::OK();
-}
-
 // Visitor accept method for NodePass
 Status BatchOp::Accept(NodePass *p, bool *modified) {
   // Downcast shared pointer then call visitor
mindspore/ccsrc/dataset/engine/datasetops/batch_op.h

@@ -38,6 +38,7 @@ class DataBuffer;
 using TensorBatch = std::vector<std::shared_ptr<Tensor>>;
 using TensorBatchTable = std::vector<TensorBatch>;
+using PadInfo = std::map<std::string, std::pair<TensorShape, std::shared_ptr<Tensor>>>;
 
 class BatchOp : public ParallelOp {
  public:

@@ -66,7 +67,7 @@ class BatchOp : public ParallelOp {
       return *this;
     }
 
-    Builder &SetPaddingMap(const std::map<std::string, std::pair<TensorShape, float>> &pad_map, bool pad = true) {
+    Builder &SetPaddingMap(const PadInfo &pad_map, bool pad = true) {
      builder_pad_ = pad;
      builder_pad_map_ = pad_map;
      return *this;

@@ -119,7 +120,7 @@ class BatchOp : public ParallelOp {
    int32_t builder_num_workers_;
    int32_t builder_op_connector_size_;
    std::vector<std::string> builder_cols_to_map_;
-    std::map<std::string, std::pair<TensorShape, float>> builder_pad_map_;
+    PadInfo builder_pad_map_;
    py::function builder_batch_size_func_;
    py::function builder_batch_map_func_;
  };

@@ -150,8 +151,7 @@ class BatchOp : public ParallelOp {
   // @param int32_t rows_per_buf
   // @param int32_t num_workers
   BatchOp(int32_t batch_size, bool drop, bool pad, int32_t op_queue_size, int32_t num_workers,
-          const std::vector<std::string> &, py::function batch_size_func, py::function batch_map_func,
-          std::map<std::string, std::pair<TensorShape, float>> pad_map);
+          const std::vector<std::string> &, py::function batch_size_func, py::function batch_map_func, PadInfo pad_map);
 
   // BatchOp destructor
   ~BatchOp() {}

@@ -183,15 +183,6 @@ class BatchOp : public ParallelOp {
   // @return Status - The error code return
   Status operator()() override;
 
-  // Pad input tensor according pad_shape, need to have same rank.
-  // @param std::shared_ptr<Tensor> src - tensor to pad from
-  // @param std::shared_ptr<Tensor> *dst - return tensor padded
-  // @param std::vector<dsize_t> pad_shape - shape to pad to
-  // @param float pad_val - value to pad with
-  // @return - The error code return
-  Status PadTensor(std::shared_ptr<Tensor> src, std::shared_ptr<Tensor> *dst, const std::vector<dsize_t> &pad_shape,
-                   float pad_val);
-
   // Base-class override for NodePass visitor acceptor.
   // @param p - Pointer to the NodePass to be accepted.
   // @param modified - Whether this node visit modified the pipeline.

@@ -199,18 +190,6 @@ class BatchOp : public ParallelOp {
   Status Accept(NodePass *p, bool *modified) override;
 
  private:
-  // recursive helper function. This function could be very expensive if called on a multi-dimensional tensor
-  // it is only meant to be called by PadTensor.
-  // @tparam T - type of tensor and fill value
-  // @param std::shared_ptr<Tensor> src - Tensor to pad from
-  // @param std::shared_ptr<Tensor>* dst - Tensor to pad to, return value
-  // @param std::vector<dsize_t> cur_ind - recursion helper
-  // @param T pad_val - value to pad tensor with
-  // @param size_t cur_dim - recursion helper
-  // @return Status - The error code return
-  Status PadHelper(std::shared_ptr<Tensor> src, std::shared_ptr<Tensor> dst, std::vector<dsize_t> cur_ind,
-                   const std::vector<dsize_t> &src_s, const std::vector<dsize_t> &dst_s, size_t cur_dim = 0);
-
   // Worker thread for doing the memcpy of batch
   // @param int32_t param workerId
   // @return Status - The error code return

@@ -225,23 +204,33 @@ class BatchOp : public ParallelOp {
   // @param const std::unique_ptr<TensorQTable> *src - table that has the rows for batching
   // @param const std::unique_ptr<TensorQTable> *dest - dest_table to hold batched rows
   // @param int32_t size - batch_size
+  // @param const std::unordered_map<std::string, int32_t>& column_name_id_map - column names to index mapping
   // @return Status - The error code return
-  Status BatchRows(const std::unique_ptr<TensorQTable> *src, const std::unique_ptr<TensorQTable> *dest, size_t size);
+  static Status BatchRows(const std::unique_ptr<TensorQTable> *src, const std::unique_ptr<TensorQTable> *dest,
+                          dsize_t batch_size);
 
   // Function that calls pyfunc to perform map on batch
   // @param (std::pair<std::unique_ptr<TensorQTable>, batch_stats> *table_pair - contains un-batched tensor
   // @return Status - The error code return
   Status MapColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair);
 
+  // @param const PadInfo &pad_info pad info to unpack
+  // @param const std::unordered_map<std::string, int32_t>& column_name_id_map - column names to index mapping
   // @param std::set<int32_t> *cols, col ids to perform pad on
   // @param std::vector<float> *vals, default padding value for each column
   // @param std::vector<std::vector<dsize_t>> *shapes, padding shape specified by user
   // @return Status - The error code return
-  Status UnpackPadInfo(std::set<int32_t> *cols, std::vector<float> *vals, std::vector<std::vector<dsize_t>> *shapes);
+  static Status UnpackPadInfo(const PadInfo &pad_info,
+                              const std::unordered_map<std::string, int32_t> &column_name_id_map,
+                              std::set<int32_t> *pad_cols, std::vector<std::shared_ptr<Tensor>> *pad_vals,
+                              std::vector<std::vector<dsize_t>> *pad_shapes);
 
-  // @param table_pair
+  // @param table
+  // @param const PadInfo &pad_info pad info
+  // @param const std::unordered_map<std::string, int32_t>& column_name_id_map - column names to index mapping
   // @return Status - The error code return
-  Status PadColumns(std::pair<std::unique_ptr<TensorQTable>, CBatchInfo> *table_pair);
+  static Status PadColumns(std::unique_ptr<TensorQTable> *table, const PadInfo &pad_info,
+                           const std::unordered_map<std::string, int32_t> &column_name_id_map);
 
   // the number of thread pulling from the mOutConnector of the Op below
   // @return int32_t, 1

@@ -264,11 +253,11 @@ class BatchOp : public ParallelOp {
   Status InvokeBatchMapFunc(TensorTable *input, TensorTable *output, CBatchInfo info);
 
   int32_t start_batch_size_;
-  bool drop_;                                     // bool for whether to drop remainder or not
-  bool pad_;                                      // bool for whether to perform padding on tensor
-  std::vector<std::string> pyfunc_column_names_;  // Name of the columns to perform map op on
-  std::map<std::string, std::pair<TensorShape, float>> pad_info_;  // column names to perform padding on
-  std::unique_ptr<ChildIterator> child_iterator_;                  // child iterator for fetching TensorRows 1 by 1
+  bool drop_;                                      // bool for whether to drop remainder or not
+  bool pad_;                                       // bool for whether to perform padding on tensor
+  std::vector<std::string> pyfunc_column_names_;   // Name of the columns to perform map op on
+  PadInfo pad_info_;                               // column names to perform padding on
+  std::unique_ptr<ChildIterator> child_iterator_;  // child iterator for fetching TensorRows 1 by 1
   QueueList<std::pair<std::unique_ptr<TensorQTable>, CBatchInfo>> worker_queues_;  // internal queue for syncing worker
   py::function batch_size_func_;  // Function pointer of batch size function
   py::function batch_map_func_;   // Function pointer of per batch map function
mindspore/ccsrc/dataset/kernels/data/data_utils.cc

@@ -15,6 +15,8 @@
  */
 #include "dataset/kernels/data/data_utils.h"
 #include <algorithm>
+#include <string>
+#include <vector>
 #include "dataset/core/constants.h"
 #include "dataset/core/tensor.h"

@@ -220,5 +222,125 @@ Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *
   return Status::OK();
 }
 
+Status PadEnd(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst, const std::vector<dsize_t> &pad_shape,
+              const std::shared_ptr<Tensor> &pad_val) {
+  if (pad_val == nullptr) {
+    if (src->type().IsNumeric()) {
+      return PadEndNumeric(src, dst, pad_shape, 0);
+    } else {
+      return PadEndString(src, dst, pad_shape, "");
+    }
+  }
+  if (pad_val->type().IsNumeric()) {
+    float val = 0;
+    RETURN_IF_NOT_OK(pad_val->GetItemAt<float>(&val, {}));
+    return PadEndNumeric(src, dst, pad_shape, val);
+  }
+  std::string_view val;
+  RETURN_IF_NOT_OK(pad_val->GetItemAt(&val, {}));
+  return PadEndString(src, dst, pad_shape, std::string(val));
+}
+
+Status PadEndNumeric(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
+                     const std::vector<dsize_t> &pad_shape, float pad_val) {
+  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
+  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
+    (*dst) = src;  // if no padding, copy the pointer
+  } else {
+    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, TensorImpl::kFlexible, TensorShape(pad_shape), src->type()));
+    auto tensor_type = src->type().value();
+    if (pad_val == 0) {  // if pad with zero, don't care what type it is
+      RETURN_IF_NOT_OK((*dst)->Zero());
+    } else if (tensor_type == DataType::DE_INT8) {
+      RETURN_IF_NOT_OK((*dst)->Fill<int8_t>(pad_val));
+    } else if (tensor_type == DataType::DE_BOOL) {
+      RETURN_IF_NOT_OK((*dst)->Fill<bool>(pad_val));
+    } else if (tensor_type == DataType::DE_UINT8) {
+      RETURN_IF_NOT_OK((*dst)->Fill<uint8_t>(pad_val));
+    } else if (tensor_type == DataType::DE_INT16) {
+      RETURN_IF_NOT_OK((*dst)->Fill<int16_t>(pad_val));
+    } else if (tensor_type == DataType::DE_FLOAT16) {
+      RETURN_IF_NOT_OK((*dst)->Fill<float16>(static_cast<float16>(pad_val)));
+    } else if (tensor_type == DataType::DE_UINT16) {
+      RETURN_IF_NOT_OK((*dst)->Fill<uint16_t>(pad_val));
+    } else if (tensor_type == DataType::DE_INT32) {
+      RETURN_IF_NOT_OK((*dst)->Fill<int32_t>(pad_val));
+    } else if (tensor_type == DataType::DE_UINT32) {
+      RETURN_IF_NOT_OK((*dst)->Fill<uint32_t>(pad_val));
+    } else if (tensor_type == DataType::DE_INT64) {
+      RETURN_IF_NOT_OK((*dst)->Fill<int64_t>(pad_val));
+    } else if (tensor_type == DataType::DE_UINT64) {
+      RETURN_IF_NOT_OK((*dst)->Fill<uint64_t>(pad_val));
+    } else if (tensor_type == DataType::DE_FLOAT32) {
+      RETURN_IF_NOT_OK((*dst)->Fill<float>(pad_val));
+    } else if (tensor_type == DataType::DE_FLOAT64) {
+      RETURN_IF_NOT_OK((*dst)->Fill<double>(pad_val));
+    } else {
+      RETURN_STATUS_UNEXPECTED("Incorrect/Unknown tensor type");
+    }
+    std::vector<dsize_t> cur_ind(src->Rank(), 0);
+    RETURN_IF_NOT_OK(PadEndNumericHelper(src, *dst, cur_ind, 0));
+  }
+  return Status::OK();
+}
+
+Status PadEndNumericHelper(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> dst,
+                           std::vector<dsize_t> cur_ind, size_t cur_dim) {
+  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
+    dst->CopyLastDimAt(src, cur_ind);
+  } else {  // not the last dimension, keep doing recursion
+    dsize_t min_ind = std::min(dst->shape()[cur_dim], src->shape()[cur_dim]);
+    for (dsize_t i = 0; i < min_ind; i++) {
+      cur_ind[cur_dim] = i;
+      RETURN_IF_NOT_OK(PadEndNumericHelper(src, dst, cur_ind, cur_dim + 1));
+    }
+  }
+  return Status::OK();
+}
+
+Status PadEndString(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
+                    const std::vector<dsize_t> &pad_shape, const std::string &pad_val) {
+  CHECK_FAIL_RETURN_UNEXPECTED(src != nullptr && dst != nullptr, "tensor can't be nullptr");
+  if (src->Rank() == 0 || src->shape().AsVector() == pad_shape) {
+    (*dst) = src;  // if no padding, copy the pointer
+  } else {
+    CHECK_FAIL_RETURN_UNEXPECTED(src->Rank() == pad_shape.size(), "Pad to diff rank not allowed");
+    std::vector<dsize_t> cur_ind(src->Rank(), 0);
+    std::vector<std::string> strings;
+    RETURN_IF_NOT_OK(PadEndStringHelper(src, &strings, TensorShape(pad_shape), cur_ind, 0, pad_val));
+    RETURN_IF_NOT_OK(Tensor::CreateTensor(dst, strings, TensorShape(pad_shape)));
+  }
+  return Status::OK();
+}
+
+Status PadEndStringHelper(const std::shared_ptr<Tensor> &src, std::vector<std::string> *dst,
+                          const TensorShape &dst_shape, std::vector<dsize_t> cur_ind, size_t cur_dim,
+                          const std::string &pad_value) {
+  if (cur_dim == src->Rank() - 1) {  // if this is the last dimension, copy the data
+    dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
+    for (dsize_t i = 0; i < min_ind; i++) {
+      cur_ind[cur_dim] = i;
+      std::string_view item;
+      RETURN_IF_NOT_OK(src->GetItemAt(&item, cur_ind));
+      dst->emplace_back(item);
+    }
+    for (dsize_t i = min_ind; i < dst_shape[cur_dim]; i++) {
+      dst->emplace_back(pad_value);
+    }
+  } else {  // not the last dimension, keep doing recursion
+    dsize_t min_ind = std::min(dst_shape[cur_dim], src->shape()[cur_dim]);
+    for (dsize_t i = 0; i < min_ind; i++) {
+      cur_ind[cur_dim] = i;
+      RETURN_IF_NOT_OK(PadEndStringHelper(src, dst, dst_shape, cur_ind, cur_dim + 1, pad_value));
+    }
+    dsize_t count = (dst_shape[cur_dim] - min_ind) * dst_shape.Strides()[cur_dim];
+    for (dsize_t i = 0; i < count; i++) {
+      dst->emplace_back(pad_value);
+    }
+  }
+  return Status::OK();
+}
+
 }  // namespace dataset
 }  // namespace mindspore
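PadEnd above dispatches on the pad value: a numeric pad value goes to PadEndNumeric, a string value to PadEndString, and a null pad value falls back to 0 for numeric tensors and "" for string tensors. From Python that default is what you get when pad_info gives no explicit value, e.g. an empty pad_info pads every column automatically; a sketch adapted from test_batch_padding_04 (single column for brevity):

```python
import numpy as np
import mindspore.dataset as ds

def gen():
    for i in range(2):
        yield (np.array([str(j) for j in range(i + 1)]),)  # rows of length 1 and 2

data = ds.GeneratorDataset(gen, column_names=["col"])
# An empty pad_info pads every column to the largest shape in the batch,
# using the default pad value: 0 for numeric columns, "" for string columns.
data = data.batch(batch_size=2, drop_remainder=False, pad_info={})

for d in data.create_dict_iterator():
    np.testing.assert_array_equal(d["col"], [[b"0", b""], [b"0", b"1"]])
```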
mindspore/ccsrc/dataset/kernels/data/data_utils.h

@@ -17,6 +17,7 @@
 #define DATASET_KERNELS_DATA_DATA_UTILS_H_
 #include <memory>
+#include <string>
 #include <vector>
 #include "dataset/core/constants.h"
 #include "dataset/core/cv_tensor.h"

@@ -58,6 +59,59 @@ void Cast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output)
 Status ToFloat16(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output);
 
 Status TypeCast(const std::shared_ptr<Tensor> &input, std::shared_ptr<Tensor> *output, const DataType &data_type);
 
+// Pad input tensor according pad_shape, need to have same rank.
+// Based on the type of the input tensor, PadEndNumeric/String will be called.
+// @param std::shared_ptr<Tensor> src - tensor to pad from
+// @param std::shared_ptr<Tensor> *dst - return tensor padded
+// @param std::vector<dsize_t> pad_shape - shape to pad to
+// @param std::shared_ptr<Tensor> pad_val - value to pad with in Tensor format,
+// @return - The error code return
+Status PadEnd(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst, const std::vector<dsize_t> &pad_shape,
+              const std::shared_ptr<Tensor> &pad_val);
+
+// Pad input numeric tensor according pad_shape, need to have same rank.
+// @param std::shared_ptr<Tensor> src - tensor to pad from
+// @param std::shared_ptr<Tensor> *dst - return tensor padded
+// @param std::vector<dsize_t> pad_shape - shape to pad to
+// @param float pad_val - value to pad with
+// @return - The error code return
+Status PadEndNumeric(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
+                     const std::vector<dsize_t> &pad_shape, float pad_val);
+
+// recursive helper function for padding numric tensors. This function could be very expensive if called on a
+// multi-dimensional tensor it is only meant to be called by PadEndNumeric.
+// @tparam T - type of tensor and fill value
+// @param std::shared_ptr<Tensor> src - Tensor to pad from
+// @param std::shared_ptr<Tensor>* dst - Tensor to pad to, return value
+// @param std::vector<dsize_t> cur_ind - recursion helper
+// @param T pad_val - value to pad tensor with
+// @param size_t cur_dim - recursion helper
+// @return Status - The error code return
+Status PadEndNumericHelper(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> dst,
+                           std::vector<dsize_t> cur_ind, size_t cur_dim = 0);
+
+// Pad input string tensor according pad_shape, need to have same rank.
+// @param std::shared_ptr<Tensor> src - tensor to pad from
+// @param std::shared_ptr<Tensor> *dst - return tensor padded
+// @param std::vector<dsize_t> pad_shape - shape to pad to
+// @param std::string pad_val - value to pad with
+// @return - The error code return
+Status PadEndString(const std::shared_ptr<Tensor> &src, std::shared_ptr<Tensor> *dst,
+                    const std::vector<dsize_t> &pad_shape, const std::string &pad_val);
+
+// recursive helper function for padding string tensors. This function could be very expensive if called on a
+// multi-dimensional tensor it is only meant to be called by PadEndNumeric.
+// @tparam T - type of tensor and fill value
+// @param std::shared_ptr<Tensor> src - Tensor to pad from
+// @param std::shared_ptr<Tensor>* dst - Tensor to pad to, return value
+// @param std::vector<dsize_t> cur_ind - recursion helper
+// @param std::string pad_val - value to pad tensor with
+// @param size_t cur_dim - recursion helper
+// @return Status - The error code return
+Status PadEndStringHelper(const std::shared_ptr<Tensor> &src, std::vector<std::string> *dst,
+                          const TensorShape &dst_shape, std::vector<dsize_t> cur_ind, size_t cur_dim,
+                          const std::string &pad_value);
+
 }  // namespace dataset
 }  // namespace mindspore
mindspore/dataset/engine/validators.py

@@ -19,10 +19,12 @@ import inspect as ins
 import os
 from functools import wraps
 from multiprocessing import cpu_count
+
 import numpy as np
+
 from mindspore._c_expression import typing
-from . import samplers
 from . import datasets
+from . import samplers
 
 INT32_MAX = 2147483647
 valid_detype = [

@@ -704,7 +706,7 @@ def check_pad_info(key, val):
             check_type(dim, "dim in pad_shape", int)
             assert dim > 0, "pad shape should be positive integers"
     if val[1] is not None:
-        check_type(val[1], "pad_value", (int, float))
+        check_type(val[1], "pad_value", (int, float, str, bytes))
 
 
 def check_batch(method):
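The validator change above is what lets `pad_value` be a `str` or `bytes` in addition to `int`/`float`. A sketch of the accepted forms, following test_batch_padding_03 (generator and names are illustrative); a `None` pad shape means "pad to the longest tensor in the batch" for that dimension:

```python
import numpy as np
import mindspore.dataset as ds

def gen():
    for i in range(4):
        yield (np.array([str(j) for j in range(i + 1)]),)

data = ds.GeneratorDataset(gen, column_names=["col"])
# pad_value may now be int, float, str, or bytes.
data = data.batch(batch_size=2, drop_remainder=False, pad_info={"col": (None, "PAD_VALUE")})

for d in data.create_dict_iterator():
    print(d["col"])  # shorter rows are filled with b"PAD_VALUE"
```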
tests/ut/cpp/dataset/batch_op_test.cc

@@ -299,8 +299,11 @@ TEST_F(MindDataTestBatchOp, TestBatchDropTrueRepeat) {
 TEST_F(MindDataTestBatchOp, TestSimpleBatchPadding) {
   std::string schema_file = datasets_root_path_ + "/testBatchDataset";
   std::shared_ptr<BatchOp> op;
-  std::map<std::string, std::pair<TensorShape, float>> m;
-  m.insert({"col_1d", std::make_pair(TensorShape({4}), -1)});
+  PadInfo m;
+  std::shared_ptr<Tensor> pad_value;
+  Tensor::CreateTensor(&pad_value, TensorImpl::kFlexible, TensorShape::CreateScalar(), DataType(DataType::DE_FLOAT32));
+  pad_value->SetItemAt<float>({}, -1);
+  m.insert({"col_1d", std::make_pair(TensorShape({4}), pad_value)});
   de::BatchOp::Builder(12).SetDrop(false).SetPaddingMap(m, true).Build(&op);
   auto tree = Build({Storage(schema_file), op});
   tree->Prepare();

@@ -308,9 +311,54 @@ TEST_F(MindDataTestBatchOp, TestSimpleBatchPadding) {
   if (rc.IsError()) {
     MS_LOG(ERROR) << "Return code error detected during tree launch: " << rc.ToString() << ".";
   } else {
     int64_t payload[] = {-9223372036854775807 - 1, 1,  -1, -1, 2,  3,  -1, -1, 4,  5,  -1, -1, 6,  7,  -1, -1,
                          8,  9,  -1, -1, 10, 11, -1, -1, 12, 13, -1, -1, 14, 15, -1, -1, 16, 17, -1, -1,
                          18, 19, -1, -1, 20, 21, -1, -1, 22, 23, -1, -1};
     std::shared_ptr<de::Tensor> t;
     rc = de::Tensor::CreateTensor(&t, TensorImpl::kFlexible, de::TensorShape({12, 4}),
                                   de::DataType(DataType::DE_INT64), (unsigned char *)payload);
tests/ut/python/dataset/test_tensor_string.py

@@ -12,15 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-import pytest
-import mindspore._c_dataengine as cde
 import numpy as np
+import pytest
 
-import mindspore.dataset as ds
+import mindspore._c_dataengine as cde
 import mindspore.common.dtype as mstype
-from mindspore.dataset.text import to_str
+import mindspore.dataset as ds
+from mindspore.dataset.text import to_str, to_bytes
 
+# pylint: disable=comparison-with-itself
 
 def test_basic():
     x = np.array([["ab", "cde", "121"], ["x", "km", "789"]], dtype='S')
     n = cde.Tensor(x)

@@ -28,8 +27,8 @@ def test_basic():
     np.testing.assert_array_equal(x, arr)
 
 
-def compare(strings):
-    arr = np.array(strings, dtype='S')
+def compare(strings, dtype='S'):
+    arr = np.array(strings, dtype=dtype)
 
     def gen():
         (yield arr,)

@@ -37,25 +36,51 @@ def compare(strings):
     data = ds.GeneratorDataset(gen, column_names=["col"])
 
     for d in data:
-        np.testing.assert_array_equal(d[0], arr)
+        np.testing.assert_array_equal(d[0], arr.astype('S'))
 
 
 def test_generator():
     compare(["ab"])
     compare(["", ""])
     compare([""])
     compare(["ab", ""])
     compare(["ab", "cde", "121"])
     compare([["ab", "cde", "121"], ["x", "km", "789"]])
     compare([["ab", "", "121"], ["", "km", "789"]])
+    compare(["ab"], dtype='U')
+    compare(["", ""], dtype='U')
+    compare([""], dtype='U')
+    compare(["ab", ""], dtype='U')
+    compare(["", ""], dtype='U')
+    compare(["", "ab"], dtype='U')
+    compare(["ab", "cde", "121"], dtype='U')
+    compare([["ab", "cde", "121"], ["x", "km", "789"]], dtype='U')
+    compare([["ab", "", "121"], ["", "km", "789"]], dtype='U')
+
+
+line = np.array(["This is a text file.",
+                 "Be happy every day.",
+                 "Good luck to everyone."])
+
+words = np.array([["This", "text", "file", "a"],
+                  ["Be", "happy", "day", "b"],
+                  ["女", "", "everyone", "c"]])
+
+chinese = np.array(["今天天气太好了我们一起去外面玩吧",
+                    "男默女泪",
+                    "江州市长江大桥参加了长江大桥的通车仪式"])
 
 
 def test_batching_strings():
     def gen():
-        yield (np.array(["ab", "cde", "121"], dtype='S'),)
+        for row in chinese:
+            yield (np.array(row),)
 
-    data = ds.GeneratorDataset(gen, column_names=["col"]).batch(10)
+    data = ds.GeneratorDataset(gen, column_names=["col"])
+    data = data.batch(2, drop_remainder=True)
 
-    with pytest.raises(RuntimeError) as info:
-        for _ in data:
-            pass
-    assert "[Batch ERROR] Batch does not support" in str(info.value)
+    for d in data:
+        np.testing.assert_array_equal(d[0], to_bytes(chinese[0:2]))
 
 
 def test_map():

@@ -67,7 +92,7 @@ def test_map():
     def split(b):
         s = to_str(b)
         splits = s.item().split()
-        return np.array(splits, dtype='S')
+        return np.array(splits)
 
     data = data.map(input_columns=["col"], operations=split)
     expected = np.array(["ab", "cde", "121"], dtype='S')

@@ -91,19 +116,6 @@ def test_map2():
         np.testing.assert_array_equal(d[0], expected)
 
 
-line = np.array(["This is a text file.",
-                 "Be happy every day.",
-                 "Good luck to everyone."])
-
-words = np.array([["This", "text", "file", "a"],
-                  ["Be", "happy", "day", "b"],
-                  ["女", "", "everyone", "c"]])
-
-chinese = np.array(["今天天气太好了我们一起去外面玩吧",
-                    "男默女泪",
-                    "江州市长江大桥参加了长江大桥的通车仪式"])
-
-
 def test_tfrecord1():
     s = ds.Schema()
     s.add_column("line", "string", [])

@@ -181,6 +193,94 @@ def test_mindrecord():
         np.testing.assert_array_equal(chinese[i], to_str(d["chinese"]))
 
 
+# The following tests cases were copied from test_pad_batch but changed to strings instead
+
+# this generator function yield two columns
+# col1d: [0],[1], [2], [3]
+# col2d: [[100],[200]], [[101],[201]], [102],[202]], [103],[203]]
+def gen_2cols(num):
+    for i in range(num):
+        yield (np.array([str(i)]), np.array([[str(i + 100)], [str(i + 200)]]))
+
+
+# this generator function yield one column of variable shapes
+# col: [0], [0,1], [0,1,2], [0,1,2,3]
+def gen_var_col(num):
+    for i in range(num):
+        yield (np.array([str(j) for j in range(i + 1)]),)
+
+
+# this generator function yield two columns of variable shapes
+# col1: [0], [0,1], [0,1,2], [0,1,2,3]
+# col2: [100], [100,101], [100,101,102], [100,110,102,103]
+def gen_var_cols(num):
+    for i in range(num):
+        yield (np.array([str(j) for j in range(i + 1)]), np.array([str(100 + j) for j in range(i + 1)]))
+
+
+# this generator function yield two columns of variable shapes
+# col1: [[0]], [[0,1]], [[0,1,2]], [[0,1,2,3]]
+# col2: [[100]], [[100,101]], [[100,101,102]], [[100,110,102,103]]
+def gen_var_cols_2d(num):
+    for i in range(num):
+        yield (np.array([[str(j) for j in range(i + 1)]]), np.array([[str(100 + j) for j in range(i + 1)]]))
+
+
+def test_batch_padding_01():
+    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([2, 2], b"-2"), "col1d": ([2], b"-1")})
+    data1 = data1.repeat(2)
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal([[b"0", b"-1"], [b"1", b"-1"]], data["col1d"])
+        np.testing.assert_array_equal([[[b"100", b"-2"], [b"200", b"-2"]], [[b"101", b"-2"], [b"201", b"-2"]]],
+                                      data["col2d"])
+
+
+def test_batch_padding_02():
+    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([1, 2], "")})
+    data1 = data1.repeat(2)
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal([[b"0"], [b"1"]], data["col1d"])
+        np.testing.assert_array_equal([[[b"100", b""]], [[b"101", b""]]], data["col2d"])
+
+
+def test_batch_padding_03():
+    data1 = ds.GeneratorDataset((lambda: gen_var_col(4)), ["col"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col": (None, "PAD_VALUE")})  # pad automatically
+    data1 = data1.repeat(2)
+    res = dict()
+    for ind, data in enumerate(data1.create_dict_iterator()):
+        res[ind] = data["col"].copy()
+    np.testing.assert_array_equal(res[0], [[b"0", b"PAD_VALUE"], [0, 1]])
+    np.testing.assert_array_equal(res[1], [[b"0", b"1", b"2", b"PAD_VALUE"], [b"0", b"1", b"2", b"3"]])
+    np.testing.assert_array_equal(res[2], [[b"0", b"PAD_VALUE"], [b"0", b"1"]])
+    np.testing.assert_array_equal(res[3], [[b"0", b"1", b"2", b"PAD_VALUE"], [b"0", b"1", b"2", b"3"]])
+
+
+def test_batch_padding_04():
+    data1 = ds.GeneratorDataset((lambda: gen_var_cols(2)), ["col1", "col2"])
+    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={})  # pad automatically
+    data1 = data1.repeat(2)
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal(data["col1"], [[b"0", b""], [b"0", b"1"]])
+        np.testing.assert_array_equal(data["col2"], [[b"100", b""], [b"100", b"101"]])
+
+
+def test_batch_padding_05():
+    data1 = ds.GeneratorDataset((lambda: gen_var_cols_2d(3)), ["col1", "col2"])
+    data1 = data1.batch(batch_size=3, drop_remainder=False,
+                        pad_info={"col2": ([2, None], "-2"), "col1": (None, "-1")})  # pad automatically
+    for data in data1.create_dict_iterator():
+        np.testing.assert_array_equal(data["col1"], [[[b"0", b"-1", b"-1"]], [[b"0", b"1", b"-1"]], [[b"0", b"1", b"2"]]])
+        np.testing.assert_array_equal(data["col2"], [[[b"100", b"-2", b"-2"], [b"-2", b"-2", b"-2"]],
+                                                     [[b"100", b"101", b"-2"], [b"-2", b"-2", b"-2"]],
+                                                     [[b"100", b"101", b"102"], [b"-2", b"-2", b"-2"]]])
+
+
 if __name__ == '__main__':
     test_generator()
     test_basic()

@@ -191,3 +291,8 @@ if __name__ == '__main__':
     test_tfrecord2()
     test_tfrecord3()
     test_mindrecord()
+    test_batch_padding_01()
+    test_batch_padding_02()
+    test_batch_padding_03()
+    test_batch_padding_04()
+    test_batch_padding_05()