magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit 270bf831: Random Data Op
Author: Jesse Lee
Date: Apr 29, 2020
Parent: 05676676

15 changed files with 1,367 additions and 1 deletion.
Changed files:

  mindspore/ccsrc/dataset/api/de_pipeline.cc                           +41  -0
  mindspore/ccsrc/dataset/api/de_pipeline.h                             +3  -0
  mindspore/ccsrc/dataset/api/python_bindings.cc                        +2  -0
  mindspore/ccsrc/dataset/engine/data_schema.cc                        +19  -0
  mindspore/ccsrc/dataset/engine/data_schema.h                          +7  -0
  mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt       +1  -0
  mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.cc  +414  -0
  mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h   +271  -0
  mindspore/dataset/__init__.py                                         +1  -1
  mindspore/dataset/engine/datasets.py                                 +51  -0
  mindspore/dataset/engine/iterators.py                                 +2  -0
  tests/ut/cpp/dataset/random_data_op_test.cc                         +457  -0
  tests/ut/data/dataset/testRandomData/datasetSchema.json              +14  -0
  tests/ut/data/dataset/testRandomData/datasetSchema2.json             +14  -0
  tests/ut/python/dataset/test_random_dataset.py                       +70  -0
mindspore/ccsrc/dataset/api/de_pipeline.cc

@@ -28,6 +28,7 @@
 #include "dataset/engine/datasetops/source/manifest_op.h"
 #include "dataset/engine/datasetops/source/cifar_op.h"
 #include "dataset/engine/datasetops/source/celeba_op.h"
+#include "dataset/engine/datasetops/source/random_data_op.h"
 #include "dataset/engine/datasetops/source/text_file_op.h"
 #include "dataset/engine/datasetops/filter_op.h"
 #include "mindrecord/include/shard_category.h"

@@ -65,6 +66,7 @@ static std::unordered_map<uint32_t, pFunction> g_parse_op_func_ = {{kStorage, &D
   {kCifar10, &DEPipeline::ParseCifar10Op},
   {kCifar100, &DEPipeline::ParseCifar100Op},
   {kCelebA, &DEPipeline::ParseCelebAOp},
+  {kRandomData, &DEPipeline::ParseRandomDataOp},
   {kTextFile, &DEPipeline::ParseTextFileOp}};

 DEPipeline::DEPipeline() : iterator_(nullptr) {

@@ -972,6 +974,45 @@ Status DEPipeline::ParseCifar100Op(const py::dict &args, std::shared_ptr<Dataset
   return Status::OK();
 }

+Status DEPipeline::ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
+  // Required arguments
+  RandomDataOp::Builder builder;
+  if (args["num_samples"].is_none()) {
+    std::string err_msg = "Error: num_samples is a required argument";
+    RETURN_STATUS_UNEXPECTED(err_msg);
+  }
+  std::vector<std::string> columns_to_load;
+  bool schema_exists = false;
+  // Optional arguments
+  for (auto arg : args) {
+    std::string key = py::str(arg.first);
+    py::handle value = arg.second;
+    if (key == "num_parallel_workers") {
+      (void)builder.SetNumWorkers(ToInt(value));
+    } else if (key == "schema_file_path" || key == "schema_json_string") {
+      schema_exists = true;
+    } else if (key == "num_samples") {
+      (void)builder.SetTotalRows(ToInt(value));
+    } else if (key == "columns_list") {
+      columns_to_load = ToStringVector(value);
+    }
+  }
+  if (schema_exists) {
+    std::unique_ptr<DataSchema> schema = std::make_unique<DataSchema>();
+    if (args.contains("schema_file_path")) {
+      RETURN_IF_NOT_OK(schema->LoadSchemaFile(ToString(args["schema_file_path"]), columns_to_load));
+    } else {
+      RETURN_IF_NOT_OK(schema->LoadSchemaString(ToString(args["schema_json_string"]), columns_to_load));
+    }
+    (void)builder.SetDataSchema(std::move(schema));
+  }
+  std::shared_ptr<RandomDataOp> op;
+  RETURN_IF_NOT_OK(builder.Build(&op));
+  *ptr = op;
+  return Status::OK();
+}
+
 int32_t DEPipeline::GetNumClasses() const { return num_classes_; }

 Status DEPipeline::ParseMnistOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr) {
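For reference, here is a sketch of the keyword-argument dict this parser consumes as it would arrive from the Python layer. The dict literal is illustrative only (the values are made up, not part of the commit); the keys themselves are exactly the ones ParseRandomDataOp recognizes above.

# Illustrative only: the keys ParseRandomDataOp recognizes and where each one goes.
args = {
    "num_samples": 50,                        # required; maps to builder.SetTotalRows()
    "num_parallel_workers": 4,                # optional; maps to builder.SetNumWorkers()
    "schema_json_string": '{"columns": {}}',  # or "schema_file_path"; loaded into a DataSchema
    "columns_list": ["image", "label"],       # optional; restricts which schema columns are loaded
}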
mindspore/ccsrc/dataset/api/de_pipeline.h

@@ -60,6 +60,7 @@ enum OpName {
   kCifar10,
   kCifar100,
   kCelebA,
+  kRandomData,
   kTextFile
 };

@@ -142,6 +143,8 @@ class DEPipeline {
   Status ParseCifar100Op(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);

+  Status ParseRandomDataOp(const py::dict &args, std::shared_ptr<DatasetOp> *ptr);
+
   void PrintTree();

   int32_t GetNumClasses() const;
mindspore/ccsrc/dataset/api/python_bindings.cc

@@ -47,6 +47,7 @@
 #include "dataset/engine/datasetops/source/mnist_op.h"
 #include "dataset/engine/datasetops/source/manifest_op.h"
 #include "dataset/engine/datasetops/source/mindrecord_op.h"
+#include "dataset/engine/datasetops/source/random_data_op.h"
 #include "dataset/engine/datasetops/source/sampler/distributed_sampler.h"
 #include "dataset/engine/datasetops/source/sampler/pk_sampler.h"
 #include "dataset/engine/datasetops/source/sampler/random_sampler.h"

@@ -489,6 +490,7 @@ PYBIND11_MODULE(_c_dataengine, m) {
     .value("VOC", OpName::kVoc)
     .value("CIFAR10", OpName::kCifar10)
     .value("CIFAR100", OpName::kCifar100)
+    .value("RANDOMDATA", OpName::kRandomData)
     .value("CELEBA", OpName::kCelebA)
     .value("TEXTFILE", OpName::kTextFile);
mindspore/ccsrc/dataset/engine/data_schema.cc

@@ -466,5 +466,24 @@ Status DataSchema::PreLoadExceptionCheck(const nlohmann::json &js) {
                                "\"columns\" node is required in the schema json file.");
   return Status::OK();
 }

+// Loops through all columns in the schema and returns a map with the column
+// name to column index number.
+Status DataSchema::GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map) {
+  if (out_column_name_map == nullptr) {
+    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "unexpected null output column name map.");
+  }
+
+  for (int32_t i = 0; i < col_descs_.size(); ++i) {
+    if (col_descs_[i].name().empty()) {
+      return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
+                    "Constructing column name map from schema, but found empty column name.");
+    }
+    (*out_column_name_map)[col_descs_[i].name()] = i;
+  }
+
+  return Status::OK();
+}
 }  // namespace dataset
 }  // namespace mindspore
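The new GetColumnNameMap is a plain name-to-index inversion over the column descriptors. Minus the null and empty-name checks, it is roughly this Python one-liner (illustration only; `name` as an attribute is an assumption of the sketch):

# Roughly equivalent to DataSchema::GetColumnNameMap, minus the error checks.
column_name_map = {col.name: index for index, col in enumerate(col_descs)}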
mindspore/ccsrc/dataset/engine/data_schema.h

@@ -20,6 +20,7 @@
 #include <map>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <vector>
 #include <nlohmann/json.hpp>
 #include "dataset/core/constants.h"

@@ -180,6 +181,12 @@ class DataSchema {
   static const char DEFAULT_DATA_SCHEMA_FILENAME[];

+  // Loops through all columns in the schema and returns a map with the column
+  // name to column index number.
+  // @param out_column_name_map - The output map of column names to column index
+  // @return Status - The error code return
+  Status GetColumnNameMap(std::unordered_map<std::string, int32_t> *out_column_name_map);
+
  private:
   // Internal helper function. Parses the json schema file in any order and produces a schema that
   // does not follow any particular order (json standard does not enforce any ordering protocol).
mindspore/ccsrc/dataset/engine/datasetops/source/CMakeLists.txt

@@ -17,6 +17,7 @@ add_library(engine-datasetops-source OBJECT
         ${FEATURE_SRCS}
         manifest_op.cc
         cifar_op.cc
+        random_data_op.cc
         celeba_op.cc
         text_file_op.cc
         )
mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.cc (new file, mode 100644)

/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "dataset/engine/datasetops/source/random_data_op.h"
#include <iomanip>
#include <random>
#include "dataset/engine/execution_tree.h"
#include "dataset/core/config_manager.h"
#include "dataset/util/random.h"
#include "dataset/util/wait_post.h"

namespace mindspore {
namespace dataset {
// Builder constructor. Creates the builder object.
RandomDataOp::Builder::Builder()
    : builder_data_schema_(nullptr),
      builder_num_workers_(0),
      builder_op_connector_size_(0),
      builder_rows_per_buffer_(0),
      builder_total_rows_(0) {
  // Some arguments to the RandomDataOp have a default argument that is taken from the config.
  // The user may override these defaults by using the builder set methods.
  std::shared_ptr<ConfigManager> cfg = GlobalContext::config_manager();
  builder_rows_per_buffer_ = cfg->rows_per_buffer();
  builder_num_workers_ = cfg->num_parallel_workers();
  builder_op_connector_size_ = cfg->op_connector_size();
}

// The build method that produces the instantiated RandomDataOp as a shared pointer
Status RandomDataOp::Builder::Build(std::shared_ptr<RandomDataOp> *out_op) {
  RETURN_IF_NOT_OK(SanityCheck());

  *out_op = std::make_shared<RandomDataOp>(builder_num_workers_, builder_op_connector_size_,
                                           builder_rows_per_buffer_, builder_total_rows_,
                                           std::move(builder_data_schema_));

  // If the user did not provide a schema, then we will ask the op to generate a pseudo-random schema.
  // See details of the GenerateSchema function to learn what type of schema it will create.
  if ((*out_op)->data_schema_ == nullptr) {
    RETURN_IF_NOT_OK((*out_op)->GenerateSchema());
  }

  // Extract the column name mapping from the schema and save it in the class.
  // This will be needed when constructing buffers.
  RETURN_IF_NOT_OK((*out_op)->data_schema_->GetColumnNameMap(&((*out_op)->column_name_map_)));

  return Status::OK();
}

// Check if the required parameters are set by the builder.
Status RandomDataOp::Builder::SanityCheck() const {
  // There are actually no required arguments for the random data op at all.
  // Some arguments are preset with global values from config, and if they are not given by the user
  // then we create them randomly. Leaving this function here for consistency with other operators.
  return Status::OK();
}

// Constructor for RandomDataOp
RandomDataOp::RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64_t rows_per_buffer,
                           int64_t total_rows, std::unique_ptr<DataSchema> data_schema)
    : ParallelOp(num_workers, op_connector_size),
      buffer_id_(0),
      rows_per_buffer_(rows_per_buffer),
      total_rows_(total_rows),
      epoch_buffers_sent_(0),
      guys_in_(0),
      guys_out_(num_workers_),
      eoe_worker_id_(0),
      data_schema_(std::move(data_schema)) {
  rand_gen_.seed(GetSeed());  // seed the random generator
  // If total rows was not given, then randomly pick a number
  if (total_rows_ == 0) {
    total_rows_ = GenRandomInt(1, kMaxTotalRows);
  }
  // Everyone is already out from the sync area.
  all_out_.Set();
}

// A print method typically used for debugging
void RandomDataOp::Print(std::ostream &out, bool show_all) const {
  // Always show the id and name as the first line, regardless of whether this is a summary or detailed print
  out << "(" << std::setw(2) << operator_id_ << ") <RandomDataOp>:";
  if (!show_all) {
    // Call the super class for displaying any common 1-liner info
    ParallelOp::Print(out, show_all);
    // Then show any custom derived-internal 1-liner info for this op
    out << " [total rows: " << total_rows_ << "]\n";
  } else {
    // Call the super class for displaying any common detailed info
    ParallelOp::Print(out, show_all);
    // Then show any custom derived-internal stuff
    out << "\nTotal_rows: " << total_rows_ << "\nRows per buffer: " << rows_per_buffer_ << "\nSchema:\n"
        << *data_schema_ << "\n\n";
  }
}

// Helper function to produce a default/random schema if one didn't exist
Status RandomDataOp::GenerateSchema() {
  if (data_schema_ != nullptr) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Generating a schema but one already exists!");
  }

  // To randomly create a schema, we need to choose:
  // a) how many columns
  // b) the type of each column
  // c) the shape of each column (number of dimensions i.e. rank)
  // d) the shape of each column (dimension values)
  data_schema_ = std::make_unique<DataSchema>();
  std::unique_ptr<TensorShape> newShape;
  std::unique_ptr<ColDescriptor> newCol;

  // Loop over the number of chosen columns
  int32_t numColumns = GenRandomInt(1, kMaxNumColumns);
  for (int32_t i = 0; i < numColumns; i++) {
    // For each column:
    // - choose a datatype
    // - generate a shape that randomly chooses the number of dimensions and the dimension values.
    DataType::Type newType = static_cast<DataType::Type>(GenRandomInt(0, kMaxDataType));
    int32_t rank = GenRandomInt(1, kMaxRank);
    std::vector<dsize_t> dims;
    for (int32_t d = 0; d < rank; d++) {
      // 0 is not a valid dimension value. However, we can support "*" or unknown, so map the random
      // 0 value to the unknown attribute if 0 is chosen
      dsize_t dim_value = static_cast<dsize_t>(GenRandomInt(0, kMaxDimValue));
      if (dim_value == 0) dim_value = TensorShape::kDimUnknown;
      dims.push_back(dim_value);
    }
    newShape = std::make_unique<TensorShape>(dims);

    // Create the column descriptor
    std::string colName = "c" + std::to_string(i);
    newCol = std::make_unique<ColDescriptor>(colName, DataType(newType), TensorImpl::kFlexible, rank, newShape.get());

    data_schema_->AddColumn(*newCol);
  }

  return Status::OK();
}

// Class functor operator () override.
// All DatasetOps operate by launching a thread (see ExecutionTree). This class functor will
// provide the master loop that drives the logic for performing the work.
Status RandomDataOp::operator()() {
  // First, compute how many buffers we'll need to satisfy the total row count.
  // The only reason we do this is for the purpose of throttling worker count if needed.
  int64_t buffers_needed = total_rows_ / rows_per_buffer_;
  if (total_rows_ % rows_per_buffer_ != 0) {
    buffers_needed++;
  }

  // If the amount of workers we have exceeds the number of buffers to produce, then we'll have
  // idle workers doing nothing. In that case, let's throttle the worker count.
  if (num_workers_ > buffers_needed) {
    MS_LOG(INFO) << "RandomDataOp throttling worker count from " << num_workers_ << " to " << buffers_needed;
    num_workers_ = buffers_needed;
    num_producers_ = num_workers_;
    guys_out_ = num_workers_;
    // The output connector was already created with a different worker count. We have to drop and recreate
    // that connector.
    DatasetOp::CreateConnector(num_producers_, num_workers_);
  }

  // Assign the number of rows to each worker in a round robin fashion.
  worker_max_rows_.reserve(num_workers_);
  worker_rows_packed_.reserve(num_workers_);
  // init the counts to zero to start.
  for (int32_t w = 0; w < num_workers_; w++) {
    worker_max_rows_.push_back(0);
    worker_rows_packed_.push_back(0);
  }
  // then assign round robin row counts
  int32_t currentWorker = 0;
  for (int64_t r = 0; r < total_rows_; r++) {
    worker_max_rows_[currentWorker]++;
    currentWorker = (currentWorker + 1) % num_workers_;
  }

  // Next, compute the total buffer count. This stat is needed during reset logic
  for (int32_t w = 0; w < num_workers_; w++) {
    int64_t worker_buffers = 0;
    worker_buffers = worker_max_rows_[w] / rows_per_buffer_;
    if (worker_max_rows_[w] % rows_per_buffer_ != 0) worker_buffers++;
    epoch_buffers_sent_ += worker_buffers;
  }

  // For the connector to work, we need to target the correct worker channel for the eoe.
  // This will initialize it for the first one. reset() handles it for the rest of the epochs.
  eoe_worker_id_ = epoch_buffers_sent_ % num_workers_;
  epoch_buffers_sent_++;  // Add the eoe buffer to the count for subsequent epochs

  // RandomDataOp doesn't need the master thread to stay around. Kick off the workers and then master exits.
  RETURN_IF_NOT_OK(
    tree_->LaunchWorkers(num_workers_, std::bind(&RandomDataOp::WorkerEntry, this, std::placeholders::_1)));

  // required task group setup after launching workers
  TaskManager::FindMe()->Post();
  RETURN_IF_NOT_OK(epoch_sync_wait_post_.Register(tree_->AllTasks()));

  return Status::OK();
}

// Performs a synchronization between workers at the end of an epoch
Status RandomDataOp::EpochSync(int32_t worker_id, bool *quitting) {
  MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " syncing at end of epoch";

  // Sync on the guys_in counter.
  // We have to wait until the last guy is out.
  all_out_.Wait();

  // If we are not in a repeat loop, or that was the last repeat already, then setup our exit
  // condition from the master loop.
  if (!BitTest(op_ctrl_flags_, kDeOpRepeated) || BitTest(op_ctrl_flags_, kDeOpLastRepeat)) {
    *quitting = true;
  }

  auto prev = guys_in_.fetch_add(1);
  bool last_guy_in = (prev + 1) == num_workers_;
  // If we are the last worker to hit this sync point, we have some extra tasks
  if (last_guy_in) {
    MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " is the last one to sync. eoe sent as worker "
                 << eoe_worker_id_;
    // Prepare for sync
    all_out_.Clear();
    // Always flow eoe at the end
    std::unique_ptr<DataBuffer> eoe_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOE);
    RETURN_IF_NOT_OK(out_connector_->Add(eoe_worker_id_, std::move(eoe_buffer)));
    // If we're done then also flow the eof
    if (*quitting) {
      // The eof needs to be sent from the next sender in the round robin, so +1
      int32_t eof_worker_id = (eoe_worker_id_ + 1) % num_workers_;
      MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " has no more epochs. sending eof as worker "
                   << eof_worker_id;
      std::unique_ptr<DataBuffer> eof_buffer = std::make_unique<DataBuffer>(0, DataBuffer::kDeBFlagEOF);
      RETURN_IF_NOT_OK(out_connector_->Add(eof_worker_id, std::move(eof_buffer)));
    }
  }

  // Wait for the reset to wake us up if we're not quitting
  if (!(*quitting)) {
    MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " entering sync wait.";
    RETURN_IF_NOT_OK(epoch_sync_wait_post_.Wait());
    prev = guys_out_.fetch_add(1);
    bool last_guy_out = (prev + 1) == num_workers_;
    // Last guy out will clear the wait post and set the row counts
    if (last_guy_out) {
      MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " last guy out clearing wait post.";
      epoch_sync_wait_post_.Clear();
      guys_in_ = 0;
      all_out_.Set();
    }
  }

  MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " epoch sync complete.";
  return Status::OK();
}

// The entry point code for when workers are launched
Status RandomDataOp::WorkerEntry(int32_t worker_id) {
  MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " entry";

  // handshake with the master first to tell it we're alive
  TaskManager::FindMe()->Post();

  bool quitting = false;
  std::unique_ptr<TensorQTable> new_tensor_table = nullptr;

  // Loop until the quitting variable gets set to true
  do {
    // If we have not yet reached the row count for this worker then produce another record
    if (worker_rows_packed_[worker_id] < worker_max_rows_[worker_id]) {
      TensorRow new_row;

      // Start a new tensor table if needed
      if (new_tensor_table == nullptr) {
        new_tensor_table = std::make_unique<TensorQTable>();
      }

      // Create the data for the row
      RETURN_IF_NOT_OK(CreateRandomRow(worker_id, &new_row));

      // Add the row to our table
      new_tensor_table->push_back(std::move(new_row));
      worker_rows_packed_[worker_id]++;

      // If the tensor table is at capacity then it's time to send it to output
      if (new_tensor_table->size() == rows_per_buffer_) {
        RETURN_IF_NOT_OK(PackAndSend(worker_id, std::move(new_tensor_table)));
      }
    } else {
      // We've reached the total row count for this worker, so it's time for epoch sync.
      // There are likely some records built but not sent yet, so take care of those first
      // (this buffer will be smaller than rows_per_buffer)
      if (new_tensor_table != nullptr && new_tensor_table->size() > 0) {
        RETURN_IF_NOT_OK(PackAndSend(worker_id, std::move(new_tensor_table)));
      }

      // Now, let's enter the epoch sync
      RETURN_IF_NOT_OK(EpochSync(worker_id, &quitting));
    }
  } while (!quitting);

  MS_LOG(INFO) << "RandomDataOp worker " << worker_id << " is now quitting.";
  return Status::OK();
}

// A helper function to stuff the tensor table into a buffer and send it to the output connector
Status RandomDataOp::PackAndSend(int32_t worker_id, std::unique_ptr<TensorQTable> in_table) {
  auto new_buffer = std::make_unique<DataBuffer>(GetNextBufferId(), DataBuffer::kDeBFlagNone);
  new_buffer->set_tensor_table(std::move(in_table));
  new_buffer->set_column_name_map(column_name_map_);
  RETURN_IF_NOT_OK(out_connector_->Add(worker_id, std::move(new_buffer)));
  return Status::OK();
}

// A helper function to create random data for the row
Status RandomDataOp::CreateRandomRow(int32_t worker_id, TensorRow *new_row) {
  if (new_row == nullptr) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Missing tensor row output");
  }

  // Create a tensor for each column, then add the tensor to the row
  for (int32_t i = 0; i < data_schema_->NumColumns(); ++i) {
    const ColDescriptor current_col = data_schema_->column(i);
    std::vector<dsize_t> current_shape = current_col.shape().AsVector();
    std::unique_ptr<TensorShape> new_shape = nullptr;
    std::unique_ptr<unsigned char[]> buf = nullptr;
    std::shared_ptr<Tensor> new_tensor = nullptr;

    // We need to resolve the shape to fill in any unknown dimensions with random
    // values, then use that as our shape for this tensor.
    for (int j = 0; j < current_shape.size(); ++j) {
      if (current_shape[j] == TensorShape::kDimUnknown) {
        current_shape[j] = static_cast<dsize_t>(GenRandomInt(1, kMaxDimValue));
      }
    }

    new_shape = std::make_unique<TensorShape>(current_shape);
    int64_t size_in_bytes = new_shape->NumOfElements() * current_col.type().SizeInBytes();

    // Generate a random byte of data. This may cause some funny data for things like doubles, floats, bools;
    // however the random data op is not too concerned about the physical data itself.
    std::uniform_int_distribution<uint8_t> uniDist(0, 255);
    uint8_t random_byte = uniDist(rand_gen_);

    // Now, create a chunk of memory for the entire tensor and copy this byte in repeatedly.
    buf = std::make_unique<unsigned char[]>(size_in_bytes);
    int ret_code = memset_s(buf.get(), size_in_bytes, random_byte, size_in_bytes);
    if (ret_code != 0) {
      return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__, "Failed to set random bytes for a tensor.");
    }

    RETURN_IF_NOT_OK(
      Tensor::CreateTensor(&new_tensor, current_col.tensorImpl(), *new_shape, current_col.type(), buf.get()));

    // Add this tensor to the tensor row for output
    (*new_row).push_back(std::move(new_tensor));
  }
  return Status::OK();
}

// Overrides base class reset method. When an operator does a reset, it cleans up any state
// info from its previous execution and then initializes itself so that it can be executed
// again.
Status RandomDataOp::Reset() {
  MS_LOG(INFO) << "RandomDataOp resetting.";

  // Ensure all guys are in the waitpost
  if (guys_in_ != num_workers_) {
    return Status(StatusCode::kUnexpectedError, __LINE__, __FILE__,
                  "Issuing a reset, but some workers are missing from epochSync!");
  }

  // reset the row counters for all workers
  for (int32_t w = 0; w < num_workers_; w++) {
    worker_rows_packed_[w] = 0;
    worker_max_rows_[w] = 0;
  }
  buffer_id_ = 0;

  // Re-assign round robin row counts, starting from the worker after the one that gave
  // the eoe last time
  int32_t currentWorker = (eoe_worker_id_ + 1) % num_workers_;
  for (int64_t r = 0; r < total_rows_; r++) {
    worker_max_rows_[currentWorker]++;
    currentWorker = (currentWorker + 1) % num_workers_;
  }

  // Compute which worker should get the eoe for the next epoch
  eoe_worker_id_ = ((epoch_buffers_sent_ % num_workers_) + eoe_worker_id_) % num_workers_;

  // Wake up the workers to get them going again in a new epoch
  guys_out_ = 0;
  epoch_sync_wait_post_.Set();

  return Status::OK();
}
}  // namespace dataset
}  // namespace mindspore
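To make the master-loop arithmetic in operator()() easier to follow, here is a minimal Python sketch (not part of the commit) of how it throttles the worker count, deals rows round-robin, and picks the worker channel that carries the eoe:

# Sketch of the row-distribution arithmetic in RandomDataOp::operator()().
def plan_epoch(total_rows, rows_per_buffer, num_workers):
    # Throttle: more workers than buffers would leave some workers idle.
    buffers_needed = -(-total_rows // rows_per_buffer)  # ceiling division
    num_workers = min(num_workers, buffers_needed)

    # Deal the rows to workers round-robin, exactly as the C++ loop does.
    worker_max_rows = [0] * num_workers
    for r in range(total_rows):
        worker_max_rows[r % num_workers] += 1

    # Total buffers per epoch, and the worker channel that carries the eoe.
    epoch_buffers_sent = sum(-(-rows // rows_per_buffer) for rows in worker_max_rows)
    eoe_worker_id = epoch_buffers_sent % num_workers
    return worker_max_rows, epoch_buffers_sent, eoe_worker_id

print(plan_epoch(25, 2, 4))  # ([7, 6, 6, 6], 13, 1): 13 data buffers, eoe on worker 1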
mindspore/ccsrc/dataset/engine/datasetops/source/random_data_op.h (new file, mode 100644)

/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef DATASET_ENGINE_DATASETOPS_SOURCE_RANDOM_DATA_OP_
#define DATASET_ENGINE_DATASETOPS_SOURCE_RANDOM_DATA_OP_

#include <atomic>
#include <map>
#include <memory>
#include <mutex>
#include <random>
#include <string>
#include <vector>
#include <unordered_map>
#include <utility>
#include "dataset/util/status.h"
#include "dataset/core/tensor.h"
#include "dataset/core/data_type.h"
#include "dataset/engine/data_schema.h"
#include "dataset/engine/datasetops/parallel_op.h"
#include "dataset/util/wait_post.h"

namespace mindspore {
namespace dataset {
// The RandomDataOp is a leaf node storage operator that generates random data based
// on the schema specifications. Typically, it is used for testing and demonstrating
// various dataset operator pipelines. It is not "real" data to train with.
// The randomly created data is just random and repeated bytes; there is no
// "meaning" behind what these bytes are.
class RandomDataOp : public ParallelOp {
 public:
  // Some constants to provide limits to random generation.
  static constexpr int32_t kMaxNumColumns = 4;
  static constexpr int32_t kMaxRank = 4;
  static constexpr int32_t kMaxDimValue = 2048;
  static constexpr int32_t kMaxDataType = (DataType::DE_UNKNOWN - 1);
  static constexpr int32_t kMaxTotalRows = 1024;

  // A nested builder class to aid in the construction of a RandomDataOp
  class Builder {
   public:
    /**
     * Builder constructor. Creates the builder object.
     * @note No default args.
     * @return This is a constructor.
     */
    Builder();

    /**
     * Default destructor
     */
    ~Builder() = default;

    /**
     * The build method that produces the instantiated RandomDataOp as a shared pointer
     * @param out_op - The output RandomDataOp that was constructed
     * @return Status - The error code return
     */
    Status Build(std::shared_ptr<RandomDataOp> *out_op);

    /**
     * Builder set method
     * @param data_schema - A user-provided schema
     * @return Builder - The modified builder by reference
     */
    Builder &SetDataSchema(std::unique_ptr<DataSchema> data_schema) {
      builder_data_schema_ = std::move(data_schema);
      return *this;
    }

    /**
     * Builder set method
     * @param num_workers - The number of workers
     * @return Builder - The modified builder by reference
     */
    Builder &SetNumWorkers(int32_t num_workers) {
      builder_num_workers_ = num_workers;
      return *this;
    }

    /**
     * Builder set method
     * @param op_connector_size - The size of the output connector
     * @return Builder - The modified builder by reference
     */
    Builder &SetOpConnectorSize(int32_t op_connector_size) {
      builder_op_connector_size_ = op_connector_size;
      return *this;
    }

    /**
     * Builder set method
     * @param rows_per_buffer - The number of rows in each DataBuffer
     * @return Builder - The modified builder by reference
     */
    Builder &SetRowsPerBuffer(int64_t rows_per_buffer) {
      builder_rows_per_buffer_ = rows_per_buffer;
      return *this;
    }

    /**
     * Builder set method
     * @param total_rows - The total number of rows in the dataset
     * @return Builder - The modified builder by reference
     */
    Builder &SetTotalRows(int64_t total_rows) {
      builder_total_rows_ = total_rows;
      return *this;
    }

   private:
    /**
     * Check if the required parameters are set by the builder.
     * @return Status - The error code return
     */
    Status SanityCheck() const;

    std::unique_ptr<DataSchema> builder_data_schema_;
    int32_t builder_num_workers_;
    int32_t builder_op_connector_size_;
    int64_t builder_rows_per_buffer_;
    int64_t builder_total_rows_;
  };  // class Builder

  /**
   * Constructor for RandomDataOp
   * @note Private constructor. Must use builder to construct.
   * @param num_workers - The number of workers
   * @param op_connector_size - The size of the output connector
   * @param rows_per_buffer - The number of rows in each DataBuffer
   * @param total_rows - The total number of rows in the dataset
   * @param data_schema - A user-provided schema
   */
  RandomDataOp(int32_t num_workers, int32_t op_connector_size, int64_t rows_per_buffer, int64_t total_rows,
               std::unique_ptr<DataSchema> data_schema);

  /**
   * Destructor
   */
  ~RandomDataOp() = default;

  /**
   * A print method typically used for debugging
   * @param out - The output stream to write output to
   * @param show_all - A bool to control if you want to show all info or just a summary
   */
  void Print(std::ostream &out, bool show_all) const override;

  /**
   * << Stream output operator overload
   * @notes This allows you to write the debug print info using stream operators
   * @param out - reference to the output stream being overloaded
   * @param op - reference to the RandomDataOp to display
   * @return - the output stream must be returned
   */
  friend std::ostream &operator<<(std::ostream &out, const RandomDataOp &op) {
    op.Print(out, false);
    return out;
  }

  /**
   * Class functor operator () override.
   * All DatasetOps operate by launching a thread (see ExecutionTree). This class functor will
   * provide the master loop that drives the logic for performing the work.
   * @return Status - The error code return
   */
  Status operator()() override;

  /**
   * Overrides base class reset method. When an operator does a reset, it cleans up any state
   * info from its previous execution and then initializes itself so that it can be executed
   * again.
   * @return Status - The error code return
   */
  Status Reset() override;

  /**
   * Quick getter for total rows.
   */
  int64_t GetTotalRows() const { return total_rows_; }

 private:
  /**
   * The entry point code for when workers are launched
   * @param worker_id - The worker id
   * @return Status - The error code return
   */
  Status WorkerEntry(int32_t worker_id) override;

  /**
   * Helper function to produce a default/random schema if one didn't exist
   * @return Status - The error code return
   */
  Status GenerateSchema();

  /**
   * Performs a synchronization between workers at the end of an epoch
   * @param worker_id - The worker id
   * @param quitting - Output flag, set to true when there are no more epochs to run
   * @return Status - The error code return
   */
  Status EpochSync(int32_t worker_id, bool *quitting);

  /**
   * A helper function to stuff the tensor table into a buffer and send it to the output connector
   * @param worker_id - The worker id
   * @param in_table - The tensor table to pack and send
   * @return Status - The error code return
   */
  Status PackAndSend(int32_t worker_id, std::unique_ptr<TensorQTable> in_table);

  /**
   * A helper function to create random data for the row
   * @param worker_id - The worker id
   * @param new_row - The output row to produce
   * @return Status - The error code return
   */
  Status CreateRandomRow(int32_t worker_id, TensorRow *new_row);

  /**
   * A quick inline for producing a random number between (and including) min/max
   * @param min - minimum number that can be generated
   * @param max - maximum number that can be generated
   * @return - The generated random number
   */
  inline int32_t GenRandomInt(int32_t min, int32_t max) {
    std::uniform_int_distribution<int32_t> uniDist(min, max);
    return uniDist(rand_gen_);
  }

  /**
   * A quick inline for producing the next buffer id in sequence, threadsafe
   * @return - The next buffer id.
   */
  inline int32_t GetNextBufferId() {
    std::unique_lock<std::mutex> lock(buffer_id_mutex_);
    return ++buffer_id_;
  }

  int32_t buffer_id_;
  int64_t rows_per_buffer_;
  int64_t total_rows_;
  int64_t epoch_buffers_sent_;
  std::atomic<int32_t> guys_in_;
  std::atomic<int32_t> guys_out_;
  int32_t eoe_worker_id_;
  std::unique_ptr<DataSchema> data_schema_;
  std::vector<int64_t> worker_max_rows_;
  std::vector<int64_t> worker_rows_packed_;
  std::unordered_map<std::string, int32_t> column_name_map_;
  std::mt19937 rand_gen_;
  WaitPost epoch_sync_wait_post_;
  WaitPost all_out_;
  std::mutex buffer_id_mutex_;
};  // class RandomDataOp
}  // namespace dataset
}  // namespace mindspore

#endif  // DATASET_ENGINE_DATASETOPS_SOURCE_RANDOM_DATA_OP_
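As a companion to the limit constants above, here is a minimal Python sketch (simplified; not part of the commit) of what GenerateSchema produces when no schema is supplied:

import random

# Limits mirrored from random_data_op.h
K_MAX_NUM_COLUMNS, K_MAX_RANK, K_MAX_DIM_VALUE = 4, 4, 2048
DIM_UNKNOWN = -1  # stands in for TensorShape::kDimUnknown

def generate_schema(rng=random.Random()):
    """Pick 1..4 columns named c0..cN, each with a random rank and dims;
    a randomly drawn dimension of 0 is remapped to "unknown"."""
    schema = []
    for i in range(rng.randint(1, K_MAX_NUM_COLUMNS)):
        rank = rng.randint(1, K_MAX_RANK)
        dims = [rng.randint(0, K_MAX_DIM_VALUE) or DIM_UNKNOWN for _ in range(rank)]
        schema.append(("c" + str(i), dims))
    return schema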
mindspore/dataset/__init__.py

@@ -21,7 +21,7 @@ can also create samplers with this module to sample data.
 from .core.configuration import config
 from .engine.datasets import StorageDataset, TFRecordDataset, ImageFolderDatasetV2, MnistDataset, MindDataset, \
     GeneratorDataset, ManifestDataset, Cifar10Dataset, Cifar100Dataset, VOCDataset, CelebADataset, TextFileDataset, \
-    Schema, Shuffle, zip
+    Schema, Shuffle, zip, RandomDataset
 from .engine.samplers import DistributedSampler, PKSampler, RandomSampler, SequentialSampler, SubsetRandomSampler, \
     WeightedRandomSampler, Sampler
 from .engine.serializer_deserializer import serialize, deserialize, show
mindspore/dataset/engine/datasets.py

@@ -3146,6 +3146,57 @@ class Cifar100Dataset(SourceDataset):
         return get_num_rows(num_rows, self.num_shards)


+class RandomDataset(SourceDataset):
+    """
+    A source dataset that generates random data.
+
+    Args:
+        num_samples (int): number of samples to generate.
+        schema (str or Schema, optional): Path to the json schema file or schema object (default=None).
+            If the schema is not provided, a random schema will be generated.
+        columns_list (list[str], optional): List of columns to be read (default=None, read all columns).
+        num_parallel_workers (int, optional): number of workers to read the data
+            (default=None, number set in the config).
+    """
+
+    def __init__(self, schema=None, columns_list=None, num_samples=None, num_parallel_workers=None):
+        super().__init__(num_parallel_workers)
+        schema_obj = None
+        if (schema is not None) and (not isinstance(schema, Schema)):
+            schema_obj = Schema(schema)  # read the schema file and convert to schema object to validate it
+        self.schema = schema
+        self.columns_list = columns_list
+        self.num_samples = num_samples
+        if schema_obj is not None and num_samples is None:
+            self.num_samples = schema_obj.num_rows
+
+    def get_args(self):
+        args = super().get_args()
+        if self.schema is not None:
+            if isinstance(self.schema, Schema):
+                self.schema.datasetType = 'Random'
+                if self.num_samples is not None:
+                    self.schema.num_rows = self.num_samples
+                args["schema_json_string"] = self.schema.to_json()
+            else:
+                args["schema_file_path"] = self.schema
+            args["schema"] = self.schema
+        if self.columns_list is not None:
+            args["columns_list"] = self.columns_list
+        if self.num_samples is not None:
+            args["num_samples"] = self.num_samples
+        return args
+
+    def get_dataset_size(self):
+        """
+        Get the number of batches in an epoch.
+
+        Return:
+            Number, number of batches.
+        """
+        return self.num_samples
+
+
 class Schema:
     """
     Class to represent a schema of dataset.
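A short usage sketch for the new class, in the style of the unit tests added later in this commit:

import mindspore.common.dtype as mstype
import mindspore.dataset as ds

schema = ds.Schema()
schema.add_column('image', de_type=mstype.uint8, shape=[28, 28])
schema.add_column('label', de_type=mstype.uint8, shape=[1])

# Generates 10 rows of random bytes shaped to match the schema.
data = ds.RandomDataset(schema=schema, num_samples=10, num_parallel_workers=1)
for row in data.create_dict_iterator():
    print(row["image"].shape, row["label"])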
mindspore/dataset/engine/iterators.py

@@ -192,6 +192,8 @@ class Iterator:
             op_type = OpName.CIFAR100
         elif isinstance(dataset, de.CelebADataset):
             op_type = OpName.CELEBA
+        elif isinstance(dataset, de.RandomDataset):
+            op_type = OpName.RANDOMDATA
         elif isinstance(dataset, de.TextFileDataset):
             op_type = OpName.TEXTFILE
         else:
tests/ut/cpp/dataset/random_data_op_test.cc (new file, mode 100644)

/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "dataset/core/client.h"
#include "common/common.h"
#include "gtest/gtest.h"
#include <memory>
#include <vector>
#include <iostream>
#include "dataset/core/tensor_shape.h"
#include "dataset/engine/datasetops/source/random_data_op.h"
#include "dataset/engine/data_schema.h"

using namespace mindspore::dataset;
using mindspore::MsLogLevel::INFO;
using mindspore::ExceptionType::NoExceptionType;
using mindspore::LogStream;

class MindDataTestRandomDataOp : public UT::DatasetOpTesting {};

// Test info:
// - Simple test with a user-provided schema generated purely from the DataSchema C API
// - has an iteration loop
//
// Tree: single node tree with RandomDataOp
//
//     RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic1) {
  Status rc;
  int32_t rank = 0;  // not used
  MS_LOG(INFO) << "UT test RandomDataOpBasic1";
  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  // Create a schema using the C api's
  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();

  // RandomDataOp can randomly fill in unknown dimension lengths of a shape.
  // Most other ops cannot do that as they are limited by the physical data itself. We're
  // more flexible with random data since it is just making stuff up on the fly.
  TensorShape c1Shape({TensorShape::kDimUnknown, TensorShape::kDimUnknown, 3});
  ColDescriptor c1("image", DataType(DataType::DE_INT8), TensorImpl::kFlexible,
                   rank,  // not used
                   &c1Shape);

  // Column 2 will just be a scalar label number
  TensorShape c2Shape({});  // empty shape is a 1-value scalar Tensor
  ColDescriptor c2("label", DataType(DataType::DE_UINT32), TensorImpl::kFlexible, rank, &c2Shape);

  testSchema->AddColumn(c1);
  testSchema->AddColumn(c2);

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;
  rc = builder.SetRowsPerBuffer(2)
         .SetNumWorkers(1)
         .SetDataSchema(std::move(testSchema))
         .SetTotalRows(25)
         .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssignRoot(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  std::ostringstream ss;
  ss << *myRandomDataOp;
  MS_LOG(INFO) << "RandomDataOp print: " << ss.str();

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    // Don't display these rows...too big to show
    MS_LOG(INFO) << "Row fetched #: " << rowCount;
    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 25);
}

// Test info:
// - Simple test with a randomly generated schema
// - no iteration loop on this one, just create the op
//
// Tree: single node tree with RandomDataOp
//
//     RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic2) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpBasic2";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;
  rc = builder.SetRowsPerBuffer(2).SetNumWorkers(1).Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssignRoot(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  std::ostringstream ss;
  ss << *myRandomDataOp;
  MS_LOG(INFO) << "RandomDataOp print: " << ss.str();
}

// Test info:
// - json file test with iteration
//
// Tree: single node tree with RandomDataOp
//
//     RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic3) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpBasic3";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
  rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema.json", {});
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;
  rc = builder.SetRowsPerBuffer(2)
         .SetNumWorkers(1)
         .SetDataSchema(std::move(testSchema))
         .SetTotalRows(10)
         .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssignRoot(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  std::ostringstream ss;
  ss << *myRandomDataOp;
  MS_LOG(INFO) << "RandomDataOp print: " << ss.str();

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    // Don't display these rows...too big to show
    MS_LOG(INFO) << "Row fetched #: " << rowCount;
    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 10);
}

// Test info:
// - json schema input; it's a fairly simple one
// - has an iteration loop
//
// Tree: RepeatOp over RandomDataOp
//
//     RepeatOp
//        |
//     RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic4) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpBasic4";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
  rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema2.json", {});
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;
  rc = builder.SetRowsPerBuffer(2)
         .SetNumWorkers(1)
         .SetDataSchema(std::move(testSchema))
         .SetTotalRows(10)
         .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  uint32_t numRepeats = 2;
  std::shared_ptr<RepeatOp> myRepeatOp;
  rc = RepeatOp::Builder(numRepeats).Build(&myRepeatOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssociateNode(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myRepeatOp->AddChild(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssignRoot(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << rowCount;

    // Display the tensor by calling the printer on it
    for (int i = 0; i < tensorList.size(); i++) {
      std::ostringstream ss;
      ss << *tensorList[i] << std::endl;
      MS_LOG(INFO) << "Tensor print: " << ss.str();
    }

    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 20);
}

// Test info:
// - json schema input; it's a fairly simple one
// - has an iteration loop
// - same as RandomDataOpBasic4 except that this one will have parallel workers
//
// Tree: RepeatOp over RandomDataOp
//
//     RepeatOp
//        |
//     RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpBasic5) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpBasic5";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
  rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema2.json", {});
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;
  rc = builder.SetRowsPerBuffer(2)
         .SetNumWorkers(4)
         .SetDataSchema(std::move(testSchema))
         .SetTotalRows(10)
         .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  uint32_t numRepeats = 3;
  std::shared_ptr<RepeatOp> myRepeatOp;
  rc = RepeatOp::Builder(numRepeats).Build(&myRepeatOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssociateNode(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myRepeatOp->AddChild(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssignRoot(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << rowCount;

    // Display the tensor by calling the printer on it
    for (int i = 0; i < tensorList.size(); i++) {
      std::ostringstream ss;
      ss << *tensorList[i] << std::endl;
      MS_LOG(INFO) << "Tensor print: " << ss.str();
    }

    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 30);
}

// Test info:
// - repeat shuffle random
//
// Tree: RepeatOp over ShuffleOp over RandomDataOp
//
//     RepeatOp
//        |
//     ShuffleOp
//        |
//     RandomDataOp
//
TEST_F(MindDataTestRandomDataOp, RandomDataOpTree1) {
  Status rc;
  MS_LOG(INFO) << "UT test RandomDataOpTree1";

  // Start with an empty execution tree
  auto myTree = std::make_shared<ExecutionTree>();

  std::unique_ptr<DataSchema> testSchema = std::make_unique<DataSchema>();
  rc = testSchema->LoadSchemaFile(datasets_root_path_ + "/testRandomData/datasetSchema2.json", {});
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<RandomDataOp> myRandomDataOp;
  RandomDataOp::Builder builder;
  rc = builder.SetRowsPerBuffer(2)
         .SetNumWorkers(4)
         .SetDataSchema(std::move(testSchema))
         .SetTotalRows(10)
         .Build(&myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myTree->AssociateNode(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());

  std::shared_ptr<ShuffleOp> myShuffleOp;
  rc = ShuffleOp::Builder().SetRowsPerBuffer(2).SetShuffleSize(4).Build(&myShuffleOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssociateNode(myShuffleOp);
  EXPECT_TRUE(rc.IsOk());

  uint32_t numRepeats = 3;
  std::shared_ptr<RepeatOp> myRepeatOp;
  rc = RepeatOp::Builder(numRepeats).Build(&myRepeatOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssociateNode(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  rc = myRepeatOp->AddChild(myShuffleOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myShuffleOp->AddChild(myRandomDataOp);
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->AssignRoot(myRepeatOp);
  EXPECT_TRUE(rc.IsOk());

  MS_LOG(INFO) << "Launching tree and begin iteration";
  rc = myTree->Prepare();
  EXPECT_TRUE(rc.IsOk());
  rc = myTree->Launch();
  EXPECT_TRUE(rc.IsOk());

  // Start the loop of reading tensors from our pipeline
  DatasetIterator dI(myTree);
  TensorRow tensorList;
  rc = dI.FetchNextTensorRow(&tensorList);
  EXPECT_TRUE(rc.IsOk());
  int rowCount = 0;
  while (!tensorList.empty()) {
    MS_LOG(INFO) << "Row display for row #: " << rowCount;

    // Display the tensor by calling the printer on it
    for (int i = 0; i < tensorList.size(); i++) {
      std::ostringstream ss;
      ss << *tensorList[i] << std::endl;
      MS_LOG(INFO) << "Tensor print: " << ss.str();
    }

    rc = dI.FetchNextTensorRow(&tensorList);
    EXPECT_TRUE(rc.IsOk());
    rowCount++;
  }
  ASSERT_EQ(rowCount, 30);
}
tests/ut/data/dataset/testRandomData/datasetSchema.json (new file, mode 100644)

{
  "columns": {
    "image": {
      "type": "uint8",
      "rank": 3,
      "shape": [1920, 1080, 3]
    },
    "label": {
      "type": "int32",
      "rank": 1,
      "shape": [1]
    }
  }
}
tests/ut/data/dataset/testRandomData/datasetSchema2.json (new file, mode 100644)

{
  "columns": {
    "image": {
      "type": "uint8",
      "rank": 2,
      "shape": [28, 28]
    },
    "label": {
      "type": "uint8",
      "rank": 1,
      "shape": [1]
    }
  }
}
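For a sense of scale, the per-row payload implied by datasetSchema.json works out as below (uint8 = 1 byte, int32 = 4 bytes):

image_bytes = 1920 * 1080 * 3 * 1  # uint8 image: 6,220,800 bytes
label_bytes = 1 * 4                # int32 label: 4 bytes
print(image_bytes + label_bytes)   # 6,220,804 bytes per generated row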
tests/ut/python/dataset/test_random_dataset.py (new file, mode 100644)

# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import mindspore.common.dtype as mstype
import mindspore.dataset as ds
from pathlib import Path


# just a basic test with parallel random data op
def test_randomdataset_basic1():
    print("Test randomdataset basic")

    schema = ds.Schema()
    schema.add_column('image', de_type=mstype.uint8, shape=[2])
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # apply dataset operations
    ds1 = ds.RandomDataset(schema=schema, num_samples=50, num_parallel_workers=4)
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        print("{} image: {}".format(num_iter, data["image"]))
        print("{} label: {}".format(num_iter, data["label"]))
        num_iter += 1

    print("Number of data in ds1: ", num_iter)
    assert num_iter == 200


# Another simple test
def test_randomdataset_basic2():
    print("Test randomdataset basic 2")

    schema = ds.Schema()
    # 921600 bytes (a bit less than 1 MB per image)
    schema.add_column('image', de_type=mstype.uint8, shape=[640, 480, 3])
    schema.add_column('label', de_type=mstype.uint8, shape=[1])

    # Make up about 10 samples
    ds1 = ds.RandomDataset(schema=schema, num_samples=10, num_parallel_workers=1)

    # cache size allows for about 4 images since each image is just a bit less than 1 MB;
    # after that we will have to spill
    ds1 = ds1.repeat(4)

    num_iter = 0
    for data in ds1.create_dict_iterator():
        # each data is a dictionary
        # in this example, each dictionary has keys "image" and "label"
        # print(data["image"])
        print("printing the label: {}".format(data["label"]))
        num_iter += 1

    print("Number of data in ds1: ", num_iter)
    assert num_iter == 40


if __name__ == '__main__':
    test_randomdataset_basic1()
    test_randomdataset_basic2()
    print('test_randomdataset_basic Ended.\n')