Commit 0e91d26a authored by Hui Zhang

fix log; add report to trainer

Parent 6de20de3
@@ -36,6 +36,7 @@ from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.training.trainer import Trainer
from deepspeech.training.reporter import report
from deepspeech.utils import error_rate
from deepspeech.utils import layer_tools
from deepspeech.utils import mp_tools
@@ -67,7 +68,9 @@ class DeepSpeech2Trainer(Trainer):
super().__init__(config, args)
def train_batch(self, batch_index, batch_data, msg):
train_conf = self.config.training
batch_size = self.config.collator.batch_size
accum_grad = self.config.training.accum_grad
start = time.time()
# forward
@@ -78,7 +81,7 @@ class DeepSpeech2Trainer(Trainer):
}
# loss backward
if (batch_index + 1) % train_conf.accum_grad != 0:
if (batch_index + 1) % accum_grad != 0:
# Disable gradient synchronizations across DDP processes.
# Within this context, gradients will be accumulated on module
# variables, which will later be synchronized.
@@ -93,20 +96,19 @@ class DeepSpeech2Trainer(Trainer):
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
if (batch_index + 1) % accum_grad == 0:
self.optimizer.step()
self.optimizer.clear_grad()
self.iteration += 1
iteration_time = time.time() - start
msg += "batch cost: {:>.3f}s, ".format(iteration_time)
msg += "batch size: {}, ".format(self.config.collator.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
logger.info(msg)
for k, v in losses_np.items():
report(k, v)
report("batch_size", batch_size)
report("accum", accum_grad)
report("step_cost", iteration_time)
if dist.get_rank() == 0 and self.visualizer:
for k, v in losses_np.items():
# `step -1` since we update `step` after optimizer.step().
......
@@ -17,6 +17,7 @@ import os
import sys
import time
from collections import defaultdict
from collections import OrderedDict
from contextlib import nullcontext
from pathlib import Path
from typing import Optional
@@ -36,6 +37,8 @@ from deepspeech.training.optimizer import OptimizerFactory
from deepspeech.training.scheduler import LRSchedulerFactory
from deepspeech.training.timer import Timer
from deepspeech.training.trainer import Trainer
from deepspeech.training.reporter import report
from deepspeech.training.reporter import ObsScope
from deepspeech.utils import ctc_utils
from deepspeech.utils import error_rate
from deepspeech.utils import layer_tools
@@ -121,12 +124,11 @@ class U2Trainer(Trainer):
iteration_time = time.time() - start
if (batch_index + 1) % train_conf.log_interval == 0:
msg += "train time: {:>.3f}s, ".format(iteration_time)
msg += "batch size: {}, ".format(self.config.collator.batch_size)
msg += "accum: {}, ".format(train_conf.accum_grad)
msg += ', '.join('{}: {:>.6f}'.format(k, v)
for k, v in losses_np.items())
logger.info(msg)
for k, v in losses_np.items():
report(k, v)
report("batch_size", self.config.collator.batch_size)
report("accum", train_conf.accum_grad)
report("step_cost", iteration_time)
if dist.get_rank() == 0 and self.visualizer:
losses_np_v = losses_np.copy()
@@ -199,15 +201,25 @@ class U2Trainer(Trainer):
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
self.after_train_batch()
msg = "Train:"
observation = OrderedDict()
with ObsScope(observation):
report("Rank", dist.get_rank())
report("epoch", self.epoch)
report('step', self.iteration)
report('step/total', (batch_index + 1) / len(self.train_loader))
report("lr", self.lr_scheduler())
self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('reader_cost', dataload_time)
observation['batch_cost'] = observation['reader_cost']+observation['step_cost']
observation['samples'] = observation['batch_size']
observation['ips[sent./sec]'] = observation['batch_size'] / observation['batch_cost']
for k, v in observation.items():
msg += f" {k}: "
msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
msg += ","
logger.info(msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
......
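
The fields appended after the reports in the hunk above are plain derivations of the reported values; a tiny illustration with made-up numbers (the values below are assumptions for illustration, not measurements from this commit):

reader_cost, step_cost, batch_size = 0.02, 0.18, 32  # illustrative example values only
batch_cost = reader_cost + step_cost                 # 0.20 s spent on this batch overall
samples = batch_size                                 # 32 sentences consumed this step
ips = samples / batch_cost                           # 160.0, logged as 'ips[sent./sec]'
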
@@ -14,12 +14,15 @@
import sys
import time
from pathlib import Path
from collections import OrderedDict
import paddle
from paddle import distributed as dist
from tensorboardX import SummaryWriter
from deepspeech.training.timer import Timer
from deepspeech.training.reporter import report
from deepspeech.training.reporter import ObsScope
from deepspeech.utils import mp_tools
from deepspeech.utils import profiler
from deepspeech.utils.checkpoint import Checkpoint
@@ -27,6 +30,7 @@ from deepspeech.utils.log import Log
from deepspeech.utils.utility import seed_all
from deepspeech.utils.utility import UpdateConfig
__all__ = ["Trainer"]
logger = Log(__name__).getlog()
@@ -98,6 +102,9 @@ class Trainer():
self.checkpoint_dir = None
self.iteration = 0
self.epoch = 0
self.rank = dist.get_rank()
logger.info(f"Rank: {self.rank}/{dist.get_world_size()}")
if args.seed:
seed_all(args.seed)
@@ -223,15 +230,25 @@ class Trainer():
data_start_time = time.time()
for batch_index, batch in enumerate(self.train_loader):
dataload_time = time.time() - data_start_time
msg = "Train: Rank: {}, ".format(dist.get_rank())
msg += "epoch: {}, ".format(self.epoch)
msg += "step: {}, ".format(self.iteration)
msg += "batch : {}/{}, ".format(batch_index + 1,
len(self.train_loader))
msg += "lr: {:>.8f}, ".format(self.lr_scheduler())
msg += "data time: {:>.3f}s, ".format(dataload_time)
self.train_batch(batch_index, batch, msg)
self.after_train_batch()
msg = "Train:"
observation = OrderedDict()
with ObsScope(observation):
report("Rank", dist.get_rank())
report("epoch", self.epoch)
report('step', self.iteration)
report('step/total', (batch_index + 1) / len(self.train_loader))
report("lr", self.lr_scheduler())
self.train_batch(batch_index, batch, msg)
self.after_train_batch()
report('reader_cost', dataload_time)
observation['batch_cost'] = observation['reader_cost']+observation['step_cost']
observation['samples'] = observation['batch_size']
observation['ips[sent./sec]'] = observation['batch_size'] / observation['batch_cost']
for k, v in observation.items():
msg += f" {k}: "
msg += f"{v:>.8f}" if isinstance(v, float) else f"{v}"
msg += ","
logger.info(msg)
data_start_time = time.time()
except Exception as e:
logger.error(e)
......
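
The deepspeech.training.reporter module is only imported by this commit, so its implementation is not visible in the diff. Below is a minimal sketch, assuming ObsScope is a context manager that installs the given OrderedDict as the active observation and report() writes key/value pairs into it; the module-level _current_observation slot and every other detail are assumptions for illustration, not the actual reporter source:

from collections import OrderedDict
from contextlib import contextmanager

_current_observation = None  # holds the dict that report() should write into, if any


@contextmanager
def ObsScope(observation):
    # Make `observation` the target of every report() call inside the `with` block.
    global _current_observation
    _current_observation = observation
    try:
        yield observation
    finally:
        _current_observation = None


def report(key, value):
    # Record one metric into the enclosing ObsScope; a no-op outside any scope.
    if _current_observation is not None:
        _current_observation[key] = value


# Usage mirroring the training loop above: report() calls made deep inside
# train_batch() land in the same dict that the outer loop reads back afterwards.
observation = OrderedDict()
with ObsScope(observation):
    report("batch_size", 32)
    report("step_cost", 0.18)
print(observation)  # OrderedDict([('batch_size', 32), ('step_cost', 0.18)])

Under that assumption, the batch_size, accum and step_cost values reported inside train_batch() land next to reader_cost in the same observation dict, which is exactly what the new logging loop iterates over.
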
#!/bin/bash
profiler_options=
benchmark_batch_size=
benchmark_max_step=
benchmark_batch_size=0
benchmark_max_step=0
# seed may break model convergence
seed=0
@@ -52,4 +52,4 @@ if [ $? -ne 0 ]; then
exit 1
fi
exit 0
\ No newline at end of file
exit 0
#!/bin/bash
profiler_options=
benchmark_batch_size=
benchmark_max_step=
benchmark_batch_size=0
benchmark_max_step=0
# seed may break model convergence
seed=0
......