Commit 8333aea5 authored by mindspore-ci-bot, committed by Gitee

!4061 Fix several minor issues

Merge pull request !4061 from LiHongzhang/fix_summary
@@ -111,10 +111,10 @@ class SummaryCollector(Callback):
             Default: None, it means there is no custom data.
         collect_tensor_freq (Optional[int]): Same semantic as the `collect_freq`, but controls TensorSummary only.
             Because TensorSummary data is too large compared to other summary data, this parameter is used to reduce
-            its collection. By default, TensorSummary data will be collected at most 21 steps, but not more than how
+            its collection. By default, TensorSummary data will be collected at most 20 steps, but not more than how
             many steps other summary data will be collected.
             Default: None, which means to follow the behavior as described above. For example, given `collect_freq=10`,
-            when the total steps is 600, TensorSummary will be collected 21 steps, while other summary data 61 steps,
+            when the total steps is 600, TensorSummary will be collected 20 steps, while other summary data 61 steps,
             but when the total steps is 20, both TensorSummary and other summary will be collected 3 steps.
             Also note that when in parallel mode, the total steps will be splitted evenly, which will
             affect how many steps TensorSummary will be collected.
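
The 21 to 20 change in this docstring follows from the scheduling rework further down: the old `epoch_begin` code (removed below) collected TensorSummary at the first step and then every `total_step // 20` steps, which gives 21 collections over 600 steps, while the new `_get_tensor_collect_range` caps the schedule at 20 entries. A minimal sketch, assuming the docstring's numbers (`total_step=600`, `collect_freq=10`); the variable names here are illustrative, not part of the commit:

    total_step, collect_freq = 600, 10
    default_tensor_summary_limit = 20

    # Other summary data: one collection per multiple of collect_freq; the
    # range starts at 0, which stands in for the always-collected first step.
    other_collections = len(range(0, total_step + 1, collect_freq))        # 61

    # TensorSummary: the capped schedule computed by _get_tensor_collect_range.
    tensor_freq = total_step // (default_tensor_summary_limit - 1)         # 600 // 19 == 31
    tensor_schedule = range(0, total_step + 1, tensor_freq)[:default_tensor_summary_limit]
    print(other_collections, len(tensor_schedule))                         # prints: 61 20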
@@ -176,6 +176,7 @@ class SummaryCollector(Callback):
         self._check_positive('collect_tensor_freq', collect_tensor_freq, allow_none=True)
         self._collect_tensor_freq = collect_tensor_freq
+        self._tensor_collect_range = None

         self._check_positive('max_file_size', max_file_size, allow_none=True)
         self._max_file_size = max_file_size
@@ -296,12 +297,6 @@ class SummaryCollector(Callback):
         self._record.set_mode(cb_params.mode)
-        if cb_params.mode == ModeEnum.TRAIN.value:
-            if self._collect_tensor_freq is None:
-                default_tensor_summary_limit = 20
-                total_step = cb_params.epoch_num * cb_params.batch_num
-                self._collect_tensor_freq = max(self._collect_freq, total_step // default_tensor_summary_limit)

     def step_end(self, run_context):
         cb_params = run_context.original_args()
         if cb_params.mode != ModeEnum.TRAIN.value:
@@ -322,17 +317,36 @@ class SummaryCollector(Callback):
         if self._first_step:
             # Notice: This way of determining whether dataset sink mode is True does not work in the eval scenario
             self._dataset_sink_mode = cb_params.cur_step_num == cb_params.batch_num
+            self._tensor_collect_range = self._get_tensor_collect_range(cb_params, self._dataset_sink_mode)
             self._collect_at_step_end(cb_params, plugin_filter=None)
             self._first_step = False
         else:
             current = cb_params.cur_epoch_num if self._dataset_sink_mode else cb_params.cur_step_num
-            if current % self._collect_freq == 0 and current % self._collect_tensor_freq == 0:
+            if current % self._collect_freq == 0 and current in self._tensor_collect_range:
                 self._collect_at_step_end(cb_params, plugin_filter=None)
-            elif current % self._collect_tensor_freq == 0:
+            elif current in self._tensor_collect_range:
                 self._collect_at_step_end(cb_params, lambda plugin: plugin == PluginEnum.TENSOR.value)
             elif current % self._collect_freq == 0:
                 self._collect_at_step_end(cb_params, lambda plugin: plugin != PluginEnum.TENSOR.value)

+    def _get_tensor_collect_range(self, cb_params, dataset_sink_mode):
+        """Get tensor collect range."""
+        total_step = cb_params.epoch_num
+        if not dataset_sink_mode:
+            total_step *= cb_params.batch_num
+        if self._collect_tensor_freq is not None:
+            # `total_step + 1`: `total_step` would be a value of `cb_params.cur_step_num`.
+            return range(0, total_step + 1, self._collect_tensor_freq)
+        summary_to_collect = len(range(0, total_step + 1, self._collect_freq))
+        default_tensor_summary_limit = 20
+        if summary_to_collect > default_tensor_summary_limit:
+            tensor_freq = total_step // (default_tensor_summary_limit - 1)
+            if tensor_freq > 1:
+                return range(0, total_step + 1, tensor_freq)[:default_tensor_summary_limit]
+            # `cb_params.cur_step_num` counting from `1`, when `1` is in the range, take `1` more steps.
+            return range(0, total_step + 1)[:default_tensor_summary_limit + 1]
+        return range(0, total_step + 1, self._collect_freq)

     def _collect_at_step_end(self, cb_params, plugin_filter):
         self._collect_input_data(cb_params)
         self._collect_metric(cb_params)
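
One design note on the `current in self._tensor_collect_range` tests above: a Python 3 `range` supports constant-time membership checks, and slicing a `range` returns another `range`, so the schedule can be kept as a lazy object and probed on every step without building a list. A small illustrative snippet, with values chosen to match the 600-step example:

    schedule = range(0, 601, 31)   # schedule for 600 steps at tensor_freq == 31
    print(310 in schedule)         # True: 310 == 31 * 10, computed arithmetically
    print(311 in schedule)         # False
    print(len(schedule[:20]))      # 20; the slice is itself a range, not a list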
@@ -577,7 +591,8 @@ class SummaryCollector(Callback):
         """
         learning_rate = optimizer.learning_rate
         if not isinstance(learning_rate, Parameter):
-            logger.info("The learning rate detected in the optimizer is not a Parameter type, so it is not recorded.")
+            logger.warning("The learning rate detected in the optimizer "
+                           "is not a Parameter type, so it is not recorded.")
             return None
         return learning_rate.data
......
@@ -20,6 +20,8 @@ from shutil import disk_usage
 from ..._c_expression import EventWriter_
 from ._summary_adapter import package_init_event

+FREE_DISK_SPACE_TIMES = 32
+

 class BaseWriter:
     """BaseWriter to be subclass."""
@@ -45,13 +47,13 @@ class BaseWriter:
     def write(self, plugin, data):
         """Write data to file."""
-        if self.writer and disk_usage(self._filepath).free < len(data) * 32:
-            raise RuntimeError(f"The disk space may be soon exhausted by the '{self._filepath}'.")
         # 8: data length
         # 4: crc32 of data length
         # 4: crc32 of data
         metadata_length = 8 + 4 + 4
         required_length = len(data) + metadata_length
+        if self.writer and disk_usage(self._filepath).free < required_length * FREE_DISK_SPACE_TIMES:
+            raise RuntimeError(f"The disk space may be soon exhausted by the '{self._filepath}'.")
         if self._max_file_size is None:
             self.writer.Write(data)
         elif self._max_file_size >= required_length:
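
For reference, here is the relocated disk-space guard in isolation, a sketch using only the stdlib `shutil.disk_usage` call; the path and payload are illustrative stand-ins. The framing constants restate the comments above: an 8-byte record length, a crc32 of the length, and a crc32 of the data.

    from shutil import disk_usage

    FREE_DISK_SPACE_TIMES = 32           # same safety multiplier as in the diff
    data = b"x" * 1024                   # illustrative payload
    metadata_length = 8 + 4 + 4          # length field + crc32(length) + crc32(data)
    required_length = len(data) + metadata_length
    if disk_usage("/tmp").free < required_length * FREE_DISK_SPACE_TIMES:
        raise RuntimeError("The disk space may be soon exhausted.")

Computing `required_length` before the check lets the guard account for the framing bytes as well as the payload, and the `init_writer` change in the next hunk routes the init event through `write`, so that first event is covered by the same guard and the `max_file_size` accounting.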
@@ -77,7 +79,7 @@ class SummaryWriter(BaseWriter):
     def init_writer(self):
         """Write some metadata etc."""
-        self.writer.Write(package_init_event().SerializeToString())
+        self.write('summary', package_init_event().SerializeToString())

     def write(self, plugin, data):
         """Write data to file."""
......
@@ -156,6 +156,7 @@ class SummaryRecord:
             max_file_size,
             summary=self.full_file_name,
             lineage=get_event_file_name('events', '_lineage'))
+        _get_summary_tensor_data()
         atexit.register(self.close)

     def __enter__(self):
......