Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
正统之独孤求败
mindspore
提交
88dcd908
M
mindspore
项目概览
正统之独孤求败
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
88dcd908
编写于
7月 18, 2020
作者:
L
Li Hongzhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
limit summary of exhausting the disk
上级
0aaa2d47
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
94 addition
and
55 deletion
+94
-55
mindspore/train/callback/_summary_collector.py
mindspore/train/callback/_summary_collector.py
+49
-38
mindspore/train/summary/_summary_writer.py
mindspore/train/summary/_summary_writer.py
+11
-4
mindspore/train/summary/_writer_pool.py
mindspore/train/summary/_writer_pool.py
+6
-6
mindspore/train/summary/summary_record.py
mindspore/train/summary/summary_record.py
+28
-7
未找到文件。
mindspore/train/callback/_summary_collector.py
浏览文件 @
88dcd908
...
...
@@ -108,6 +108,10 @@ class SummaryCollector(Callback):
custom_lineage_data (Union[dict, None]): Allows you to customize the data and present it on the MingInsight
lineage page. In the custom data, the key type support str, and the value type support str/int/float.
Default: None, it means there is no custom data.
collect_tensor_freq (Optional[int]): Same as the `collect_freq`, but controls TensorSummary specifically.
Default: None, which means the frequency is auto-calculated just to collect at most 50 steps TensorSummary.
max_file_size (Optional[int]): The maximum size in bytes each file can be written to the disk.
Default: None, which means no limit.
Raises:
ValueError: If the parameter value is not expected.
...
...
@@ -145,16 +149,28 @@ class SummaryCollector(Callback):
'histogram_regular'
:
None
}
def
__init__
(
self
,
summary_dir
,
collect_freq
=
10
,
collect_specified_data
=
None
,
keep_default_action
=
True
,
custom_lineage_data
=
None
):
def
__init__
(
self
,
summary_dir
,
collect_freq
=
10
,
collect_specified_data
=
None
,
keep_default_action
=
True
,
custom_lineage_data
=
None
,
collect_tensor_freq
=
None
,
max_file_size
=
None
):
super
(
SummaryCollector
,
self
).
__init__
()
self
.
_summary_dir
=
self
.
_process_summary_dir
(
summary_dir
)
self
.
_record
=
None
self
.
_check_
collect_freq
(
collect_freq
)
self
.
_check_
positive
(
'collect_freq'
,
collect_freq
)
self
.
_collect_freq
=
collect_freq
self
.
_check_positive
(
'collect_tensor_freq'
,
collect_tensor_freq
,
allow_none
=
True
)
self
.
_collect_tensor_freq
=
collect_tensor_freq
self
.
_check_positive
(
'max_file_size'
,
max_file_size
,
allow_none
=
True
)
self
.
_max_file_size
=
max_file_size
self
.
_check_action
(
keep_default_action
)
self
.
_collect_specified_data
=
self
.
_process_specified_data
(
collect_specified_data
,
keep_default_action
)
...
...
@@ -165,16 +181,14 @@ class SummaryCollector(Callback):
self
.
_custom_lineage_data
=
custom_lineage_data
self
.
_temp_optimizer
=
None
self
.
_has_saved_train_network
=
False
self
.
_has_saved_custom_data
=
False
self
.
_is_parse_loss_success
=
True
self
.
_first_step
=
True
self
.
_dataset_sink_mode
=
True
def
__enter__
(
self
):
self
.
_first_step
=
True
self
.
_dataset_sink_mode
=
True
self
.
_record
=
SummaryRecord
(
log_dir
=
self
.
_summary_dir
)
self
.
_record
=
SummaryRecord
(
log_dir
=
self
.
_summary_dir
,
max_file_size
=
self
.
_max_file_size
)
self
.
_first_step
,
self
.
_dataset_sink_mode
=
True
,
True
return
self
def
__exit__
(
self
,
*
err
):
...
...
@@ -198,11 +212,13 @@ class SummaryCollector(Callback):
return
summary_dir
@
staticmethod
def
_check_collect_freq
(
freq
):
"""Check collect freq type and value."""
check_value_type
(
'collect_freq'
,
freq
,
int
)
if
freq
<=
0
:
raise
ValueError
(
f
'For `collect_freq` the value should be greater than 0, but got `
{
freq
}
`.'
)
def
_check_positive
(
name
,
value
,
allow_none
=
False
):
"""Check if the value to be int type and positive."""
if
allow_none
:
return
check_value_type
(
name
,
value
,
int
)
if
value
<=
0
:
raise
ValueError
(
f
'For `
{
name
}
` the value should be greater than 0, but got `
{
value
}
`.'
)
@
staticmethod
def
_check_custom_lineage_data
(
custom_lineage_data
):
...
...
@@ -276,6 +292,9 @@ class SummaryCollector(Callback):
self
.
_collect_graphs
(
cb_params
)
self
.
_collect_dataset_graph
(
cb_params
)
if
self
.
_collect_tensor_freq
is
None
:
total_step
=
cb_params
.
epoch_num
*
cb_params
.
batch_num
self
.
_collect_tensor_freq
=
max
(
self
.
_collect_freq
,
total_step
//
50
)
if
self
.
_custom_lineage_data
and
not
self
.
_has_saved_custom_data
:
packaged_custom_data
=
self
.
_package_custom_lineage_data
(
self
.
_custom_lineage_data
)
...
...
@@ -287,24 +306,29 @@ class SummaryCollector(Callback):
def
step_end
(
self
,
run_context
):
cb_params
=
run_context
.
original_args
()
if
cb_params
.
mode
!=
ModeEnum
.
TRAIN
.
value
:
return
if
self
.
_first_step
:
# Notice: This way of determining whether dataset sink mode is True does not work in the eval scenario
self
.
_dataset_sink_mode
=
bool
(
cb_params
.
cur_step_num
==
cb_params
.
batch_num
)
if
cb_params
.
mode
==
ModeEnum
.
TRAIN
.
value
:
if
not
self
.
_is_collect_this_step
(
cb_params
):
return
self
.
_dataset_sink_mode
=
cb_params
.
cur_step_num
==
cb_params
.
batch_num
self
.
_collect_at_step_end
(
cb_params
,
plugin_filter
=
None
)
self
.
_first_step
=
False
else
:
current
=
cb_params
.
cur_epoch_num
if
self
.
_dataset_sink_mode
else
cb_params
.
cur_step_num
if
current
%
self
.
_collect_freq
==
0
and
current
%
self
.
_collect_tensor_freq
==
0
:
self
.
_collect_at_step_end
(
cb_params
,
plugin_filter
=
None
)
elif
current
%
self
.
_collect_tensor_freq
==
0
:
self
.
_collect_at_step_end
(
cb_params
,
lambda
plugin
:
plugin
==
PluginEnum
.
TENSOR
.
value
)
elif
current
%
self
.
_collect_freq
==
0
:
self
.
_collect_at_step_end
(
cb_params
,
lambda
plugin
:
plugin
!=
PluginEnum
.
TENSOR
.
value
)
if
not
self
.
_has_saved_train_network
:
self
.
_collect_graphs
(
cb_params
)
self
.
_collect_input_data
(
cb_params
)
self
.
_collect_metric
(
cb_params
)
self
.
_collect_histogram
(
cb_params
)
def
_collect_at_step_end
(
self
,
cb_params
,
plugin_filter
):
self
.
_collect_input_data
(
cb_params
)
self
.
_collect_metric
(
cb_params
)
self
.
_collect_histogram
(
cb_params
)
self
.
_record
.
record
(
cb_params
.
cur_step_num
,
plugin_filter
=
plugin_filter
)
self
.
_first_step
=
False
self
.
_record
.
record
(
cb_params
.
cur_step_num
)
def
end
(
self
,
run_context
):
cb_params
=
run_context
.
original_args
()
...
...
@@ -331,18 +355,6 @@ class SummaryCollector(Callback):
raise
ValueError
(
f
"There are more than one
{
self
.
__class__
.
__name__
}
instance in callback list,"
f
"but expected only one
{
self
.
__class__
.
__name__
}
instance."
)
def
_is_collect_this_step
(
self
,
cb_params
):
"""Decide whether to collect data for the current step."""
# Make sure the first step data is recorded
if
not
self
.
_first_step
:
if
self
.
_dataset_sink_mode
:
if
cb_params
.
cur_epoch_num
%
self
.
_collect_freq
:
return
False
else
:
if
cb_params
.
cur_step_num
%
self
.
_collect_freq
:
return
False
return
True
@
staticmethod
def
_package_custom_lineage_data
(
custom_lineage_data
):
"""
...
...
@@ -411,7 +423,6 @@ class SummaryCollector(Callback):
if
graph_proto
is
None
:
return
self
.
_has_saved_train_network
=
True
self
.
_record
.
add_value
(
PluginEnum
.
GRAPH
.
value
,
'train_network/auto'
,
graph_proto
)
def
_collect_metric
(
self
,
cb_params
):
...
...
mindspore/train/summary/_summary_writer.py
浏览文件 @
88dcd908
...
...
@@ -24,8 +24,8 @@ from ._summary_adapter import package_init_event
class
BaseWriter
:
"""BaseWriter to be subclass."""
def
__init__
(
self
,
filepath
)
->
None
:
self
.
_filepath
=
filepath
def
__init__
(
self
,
filepath
,
max_file_size
=
None
)
->
None
:
self
.
_filepath
,
self
.
_max_file_size
=
filepath
,
max_file_size
self
.
_writer
:
EventWriter_
=
None
def
init_writer
(
self
):
...
...
@@ -46,8 +46,15 @@ class BaseWriter:
def
write
(
self
,
plugin
,
data
):
"""Write data to file."""
if
self
.
writer
and
disk_usage
(
self
.
_filepath
).
free
<
len
(
data
)
*
32
:
raise
RuntimeError
(
'The disk space may be soon exhausted.'
)
self
.
writer
.
Write
(
data
)
raise
RuntimeError
(
f
'The disk space may be soon exhausted by the
{
type
(
self
).
__name__
}
.'
)
if
self
.
_max_file_size
is
None
:
self
.
writer
.
Write
(
data
)
elif
self
.
_max_file_size
>
0
:
self
.
_max_file_size
-=
len
(
data
)
self
.
writer
.
Write
(
data
)
else
:
raise
RuntimeError
(
f
"The file written by the
{
type
(
self
).
__name__
}
"
f
"has exceeded the specified max file size."
)
def
flush
(
self
):
"""Flush the writer."""
...
...
mindspore/train/summary/_writer_pool.py
浏览文件 @
88dcd908
...
...
@@ -51,10 +51,11 @@ class WriterPool(Process):
filelist (str): The mapping from short name to long filename.
"""
def
__init__
(
self
,
base_dir
,
**
filedict
)
->
None
:
def
__init__
(
self
,
base_dir
,
max_file_size
,
**
filedict
)
->
None
:
super
().
__init__
()
self
.
_base_dir
,
self
.
_filedict
=
base_dir
,
filedict
self
.
_queue
,
self
.
_writers_
=
Queue
(
cpu_count
()
*
2
),
None
self
.
_max_file_size
=
max_file_size
self
.
start
()
def
run
(
self
):
...
...
@@ -88,9 +89,9 @@ class WriterPool(Process):
for
plugin
,
filename
in
self
.
_filedict
.
items
():
filepath
=
os
.
path
.
join
(
self
.
_base_dir
,
filename
)
if
plugin
==
'summary'
:
self
.
_writers_
.
append
(
SummaryWriter
(
filepath
))
self
.
_writers_
.
append
(
SummaryWriter
(
filepath
,
self
.
_max_file_size
))
elif
plugin
==
'lineage'
:
self
.
_writers_
.
append
(
LineageWriter
(
filepath
))
self
.
_writers_
.
append
(
LineageWriter
(
filepath
,
self
.
_max_file_size
))
return
self
.
_writers_
def
_write
(
self
,
plugin
,
data
):
...
...
@@ -98,9 +99,8 @@ class WriterPool(Process):
for
writer
in
self
.
_writers
[:]:
try
:
writer
.
write
(
plugin
,
data
)
except
RuntimeError
:
logger
.
warning
(
f
'The disk space may be soon exhausted by this
{
type
(
writer
).
__name__
}
, '
'so the writer will be closed and not for further writing.'
)
except
RuntimeError
as
e
:
logger
.
warning
(
e
.
args
[
0
])
self
.
_writers
.
remove
(
writer
)
writer
.
close
()
...
...
mindspore/train/summary/summary_record.py
浏览文件 @
88dcd908
...
...
@@ -75,14 +75,17 @@ class SummaryRecord:
Args:
log_dir (str): The log_dir is a directory location to save the summary.
queue_max_size (int): The capacity of event queue.(reserved). Default: 0.
flush_time (int): Frequency to flush the summaries to disk, the unit is second. Default: 120.
queue_max_size (int):
Deprecated.
The capacity of event queue.(reserved). Default: 0.
flush_time (int):
Deprecated.
Frequency to flush the summaries to disk, the unit is second. Default: 120.
file_prefix (str): The prefix of file. Default: "events".
file_suffix (str): The suffix of file. Default: "_MS".
network (Cell): Obtain a pipeline through network for saving graph summary. Default: None.
max_file_size (Optional[int]): The maximum size in bytes each file can be written to the disk.
\
Unlimited by default.
Raises:
TypeError: If `queue_max_size` and `flush_time` is not int, or `file_prefix` and `file_suffix` is not str.
TypeError: If `max_file_size`, `queue_max_size` or `flush_time` is not int,
\
or `file_prefix` and `file_suffix` is not str.
RuntimeError: If the log_dir can not be resolved to a canonicalized absolute pathname.
Examples:
...
...
@@ -103,7 +106,8 @@ class SummaryRecord:
flush_time
=
120
,
file_prefix
=
"events"
,
file_suffix
=
"_MS"
,
network
=
None
):
network
=
None
,
max_file_size
=
None
):
self
.
_closed
,
self
.
_event_writer
=
False
,
None
self
.
_mode
,
self
.
_data_pool
=
'train'
,
_dictlist
()
...
...
@@ -113,11 +117,18 @@ class SummaryRecord:
self
.
log_path
=
_make_directory
(
log_dir
)
if
not
isinstance
(
max_file_size
,
(
int
,
type
(
None
))):
raise
TypeError
(
"The 'max_file_size' should be int type."
)
if
not
isinstance
(
queue_max_size
,
int
)
or
not
isinstance
(
flush_time
,
int
):
raise
TypeError
(
"`queue_max_size` and `flush_time` should be int"
)
if
not
isinstance
(
file_prefix
,
str
)
or
not
isinstance
(
file_suffix
,
str
):
raise
TypeError
(
"`file_prefix` and `file_suffix` should be str."
)
if
max_file_size
is
not
None
and
max_file_size
<
0
:
logger
.
warning
(
"The 'max_file_size' should be greater than 0."
)
max_file_size
=
None
self
.
queue_max_size
=
queue_max_size
if
queue_max_size
<
0
:
# 0 is not limit
...
...
@@ -142,6 +153,7 @@ class SummaryRecord:
raise
RuntimeError
(
ex
)
self
.
_event_writer
=
WriterPool
(
log_dir
,
max_file_size
,
summary
=
self
.
full_file_name
,
lineage
=
get_event_file_name
(
'events'
,
'_lineage'
))
atexit
.
register
(
self
.
close
)
...
...
@@ -152,7 +164,7 @@ class SummaryRecord:
raise
ValueError
(
'SummaryRecord has been closed.'
)
return
self
def
__exit__
(
self
,
extype
,
exvalue
,
traceback
):
def
__exit__
(
self
,
*
err
):
"""Exit the context manager."""
self
.
close
()
...
...
@@ -229,13 +241,15 @@ class SummaryRecord:
else
:
raise
ValueError
(
f
'No such plugin of
{
repr
(
plugin
)
}
'
)
def
record
(
self
,
step
,
train_network
=
None
):
def
record
(
self
,
step
,
train_network
=
None
,
plugin_filter
=
None
):
"""
Record the summary.
Args:
step (int): Represents training step number.
train_network (Cell): The network that called the callback.
plugin_filter (Optional[Callable[[str], bool]]): The filter function,
\
which is used to filter out plugins from being written by return False.
Returns:
bool, whether the record process is successful or not.
...
...
@@ -266,7 +280,14 @@ class SummaryRecord:
if
self
.
_mode
==
'train'
:
self
.
_add_summary_tensor_data
()
self
.
_event_writer
.
write
(
self
.
_consume_data_pool
(
step
))
if
not
plugin_filter
:
self
.
_event_writer
.
write
(
self
.
_consume_data_pool
(
step
))
else
:
filtered
=
{}
for
plugin
,
datalist
in
self
.
_consume_data_pool
(
step
).
items
():
if
plugin_filter
(
plugin
):
filtered
[
plugin
]
=
datalist
self
.
_event_writer
.
write
(
filtered
)
return
True
def
_add_summary_tensor_data
(
self
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录