Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
ed70de80
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
ed70de80
编写于
7月 28, 2020
作者:
L
liyong
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fix coredump when the number of files in the file list is more than 1000.
上级
e4a7ca7f
变更
7
隐藏空白更改
内联
并排
Showing
7 changed files
with
37 additions
and
23 deletions
+37
-23
mindspore/ccsrc/minddata/mindrecord/common/shard_pybind.cc
mindspore/ccsrc/minddata/mindrecord/common/shard_pybind.cc
+1
-0
mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h
...re/ccsrc/minddata/mindrecord/include/common/shard_utils.h
+2
-1
mindspore/ccsrc/minddata/mindrecord/include/shard_header.h
mindspore/ccsrc/minddata/mindrecord/include/shard_header.h
+1
-1
mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc
mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc
+2
-2
mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc
mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc
+14
-4
mindspore/dataset/engine/datasets.py
mindspore/dataset/engine/datasets.py
+15
-15
mindspore/dataset/engine/validators.py
mindspore/dataset/engine/validators.py
+2
-0
未找到文件。
mindspore/ccsrc/minddata/mindrecord/common/shard_pybind.cc
浏览文件 @
ed70de80
...
...
@@ -133,6 +133,7 @@ void BindGlobalParams(py::module *m) {
(
*
m
).
attr
(
"MAX_PAGE_SIZE"
)
=
kMaxPageSize
;
(
*
m
).
attr
(
"MIN_SHARD_COUNT"
)
=
kMinShardCount
;
(
*
m
).
attr
(
"MAX_SHARD_COUNT"
)
=
kMaxShardCount
;
(
*
m
).
attr
(
"MAX_FILE_COUNT"
)
=
kMaxFileCount
;
(
*
m
).
attr
(
"MIN_CONSUMER_COUNT"
)
=
kMinConsumerCount
;
(
void
)(
*
m
).
def
(
"get_max_thread_num"
,
&
GetMaxThreadNum
);
}
...
...
mindspore/ccsrc/minddata/mindrecord/include/common/shard_utils.h
浏览文件 @
ed70de80
...
...
@@ -104,7 +104,8 @@ const uint64_t kInt64Len = 8;
const
uint64_t
kMinFileSize
=
kInt64Len
;
const
int
kMinShardCount
=
1
;
const
int
kMaxShardCount
=
1000
;
const
int
kMaxShardCount
=
1000
;
// write
const
int
kMaxFileCount
=
4096
;
// read
const
int
kMinConsumerCount
=
1
;
const
int
kMaxConsumerCount
=
128
;
...
...
mindspore/ccsrc/minddata/mindrecord/include/shard_header.h
浏览文件 @
ed70de80
...
...
@@ -152,7 +152,7 @@ class ShardHeader {
MSRStatus
CheckIndexField
(
const
std
::
string
&
field
,
const
json
&
schema
);
void
ParsePage
(
const
json
&
page
,
int
shard_index
,
bool
load_dataset
);
MSRStatus
ParsePage
(
const
json
&
page
,
int
shard_index
,
bool
load_dataset
);
MSRStatus
ParseStatistics
(
const
json
&
statistics
);
...
...
mindspore/ccsrc/minddata/mindrecord/io/shard_reader.cc
浏览文件 @
ed70de80
...
...
@@ -252,7 +252,7 @@ std::vector<std::tuple<int, int, int, uint64_t>> ShardReader::ReadRowGroupSummar
if
(
shard_count
<=
0
)
{
return
row_group_summary
;
}
if
(
shard_count
<=
kMax
Shard
Count
)
{
if
(
shard_count
<=
kMax
File
Count
)
{
for
(
int
shard_id
=
0
;
shard_id
<
shard_count
;
++
shard_id
)
{
// return -1 when page's size equals 0.
auto
last_page_id
=
shard_header_
->
GetLastPageId
(
shard_id
);
...
...
@@ -1054,7 +1054,7 @@ MSRStatus ShardReader::CreateTasksByRow(const std::vector<std::tuple<int, int, i
}
auto
offsets
=
std
::
get
<
1
>
(
ret
);
auto
local_columns
=
std
::
get
<
2
>
(
ret
);
if
(
shard_count_
<=
kMax
Shard
Count
)
{
if
(
shard_count_
<=
kMax
File
Count
)
{
for
(
int
shard_id
=
0
;
shard_id
<
shard_count_
;
shard_id
++
)
{
for
(
uint32_t
i
=
0
;
i
<
offsets
[
shard_id
].
size
();
i
+=
1
)
{
tasks_
.
InsertTask
(
TaskType
::
kCommonTask
,
offsets
[
shard_id
][
i
][
0
],
offsets
[
shard_id
][
i
][
1
],
...
...
mindspore/ccsrc/minddata/mindrecord/meta/shard_header.cc
浏览文件 @
ed70de80
...
...
@@ -55,7 +55,9 @@ MSRStatus ShardHeader::InitializeHeader(const std::vector<json> &headers, bool l
header_size_
=
header
[
"header_size"
].
get
<
uint64_t
>
();
page_size_
=
header
[
"page_size"
].
get
<
uint64_t
>
();
}
ParsePage
(
header
[
"page"
],
shard_index
,
load_dataset
);
if
(
SUCCESS
!=
ParsePage
(
header
[
"page"
],
shard_index
,
load_dataset
))
{
return
FAILED
;
}
shard_index
++
;
}
return
SUCCESS
;
...
...
@@ -248,11 +250,16 @@ MSRStatus ShardHeader::ParseIndexFields(const json &index_fields) {
return
SUCCESS
;
}
void
ShardHeader
::
ParsePage
(
const
json
&
pages
,
int
shard_index
,
bool
load_dataset
)
{
MSRStatus
ShardHeader
::
ParsePage
(
const
json
&
pages
,
int
shard_index
,
bool
load_dataset
)
{
// set shard_index when load_dataset is false
if
(
pages_
.
empty
()
&&
shard_count_
<=
kMaxShardCount
)
{
if
(
shard_count_
>
kMaxFileCount
)
{
MS_LOG
(
ERROR
)
<<
"The number of mindrecord files is greater than max value: "
<<
kMaxFileCount
;
return
FAILED
;
}
if
(
pages_
.
empty
()
&&
shard_count_
<=
kMaxFileCount
)
{
pages_
.
resize
(
shard_count_
);
}
for
(
auto
&
page
:
pages
)
{
int
page_id
=
page
[
"page_id"
];
int
shard_id
=
page
[
"shard_id"
];
...
...
@@ -275,6 +282,7 @@ void ShardHeader::ParsePage(const json &pages, int shard_index, bool load_datase
pages_
[
shard_index
].
push_back
(
std
::
move
(
parsed_page
));
}
}
return
SUCCESS
;
}
MSRStatus
ShardHeader
::
ParseStatistics
(
const
json
&
statistics
)
{
...
...
@@ -715,7 +723,9 @@ MSRStatus ShardHeader::FileToPages(const std::string dump_file_name) {
std
::
string
line
;
while
(
std
::
getline
(
page_in_handle
,
line
))
{
ParsePage
(
json
::
parse
(
line
),
-
1
,
true
);
if
(
SUCCESS
!=
ParsePage
(
json
::
parse
(
line
),
-
1
,
true
))
{
return
FAILED
;
}
}
page_in_handle
.
close
();
...
...
mindspore/dataset/engine/datasets.py
浏览文件 @
ed70de80
...
...
@@ -1054,45 +1054,45 @@ class Dataset:
* - type in 'dataset'
- type in 'mindrecord'
- detail
* -
DE_BOOL
* -
bool
- None
- Not support
* -
DE_INT
8
* -
int
8
- int32
-
* -
DE_UINT
8
* -
uint
8
- bytes(1D uint8)
- Drop dimension
* -
DE_INT
16
* -
int
16
- int32
-
* -
DE_UINT
16
* -
uint
16
- int32
-
* -
DE_INT
32
* -
int
32
- int32
-
* -
DE_UINT
32
* -
uint
32
- int64
-
* -
DE_INT
64
* -
int
64
- int64
-
* -
DE_UINT
64
* -
uint
64
- None
- Not support
* -
DE_FLOAT
16
-
Not support
* -
float
16
-
float32
-
* -
DE_FLOAT
32
* -
float
32
- float32
-
* -
DE_FLOAT
64
* -
float
64
- float64
-
* -
DE_STRING
* -
string
- string
- Not support multi-dimensional
DE_STRING
- Not support multi-dimensional
string
Note:
1. To save the samples in order, set the dataset's shuffle to False and num_files to 1.
...
...
mindspore/dataset/engine/validators.py
浏览文件 @
ed70de80
...
...
@@ -278,6 +278,8 @@ def check_minddataset(method):
dataset_file
=
param_dict
.
get
(
'dataset_file'
)
if
isinstance
(
dataset_file
,
list
):
if
len
(
dataset_file
)
>
4096
:
raise
ValueError
(
"length of dataset_file should less than or equal to {}."
.
format
(
4096
))
for
f
in
dataset_file
:
check_file
(
f
)
else
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录