Unverified commit 8911e29b, authored by Yiqun Liu, committed by GitHub

Calculate and print the average time for video models (#4876)

* Calculate and print the average time for video models.

* Calculate and print the average time for dygraph of slowfast.

* Print ips for nextvlad, slowfast, tsm and tsn.
Parent commit: 308da28d
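For reference before the per-file changes below, the timing pattern this commit applies to the video models can be summarized in a minimal sketch. It assumes a generic `data_loader` and `train_step` (hypothetical placeholders, not names from this repo); the actual call sites are `train_with_dataloader` and the dygraph training loops shown in the diff, and the import path of the new helper varies per model directory (`utils.timer` vs. `timer`).

    # Minimal usage sketch of the TimeAverager helper added in this commit:
    # one averager tracks reader (data-loading) cost, another tracks the full
    # batch cost, and both are reset after each log so the printed numbers are
    # averages over the last log_interval iterations.
    import time

    from utils.timer import TimeAverager  # path assumed; some models use `from timer import TimeAverager`

    def train_one_epoch(data_loader, train_step, batch_size, log_interval=10):
        reader_cost_averager = TimeAverager()
        batch_cost_averager = TimeAverager()
        batch_start = time.time()
        for batch_id, data in enumerate(data_loader()):
            # time spent waiting on the data reader for this batch
            reader_cost_averager.record(time.time() - batch_start)
            train_step(data)  # forward / backward / minimize (placeholder)
            # full batch cost; num_samples lets get_ips_average() derive samples/sec
            batch_cost_averager.record(time.time() - batch_start,
                                       num_samples=batch_size)
            if batch_id % log_interval == 0:
                print("batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f samples/sec"
                      % (batch_cost_averager.get_average(),
                         reader_cost_averager.get_average(),
                         batch_cost_averager.get_ips_average()))
                reader_cost_averager.reset()
                batch_cost_averager.reset()
            batch_start = time.time()

The same TimeAverager class is duplicated into each model directory in this commit, so the diff below repeats its definition several times.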
......@@ -75,7 +75,7 @@ class Youtube8mMetrics(Metrics):
perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate(pred,
label)
gap = youtube8m_metrics.calculate_gap(pred, label)
logger.info(info + ' , loss = {0}, Hit@1 = {1}, PERR = {2}, GAP = {3}'.format(\
logger.info(info + ', loss: {0}, Hit@1: {1}, PERR: {2}, GAP: {3}'.format(\
'%.6f' % loss, '%.2f' % hit_at_one, '%.2f' % perr, '%.2f' % gap))
def accumulate(self, fetch_list, info=''):
......@@ -107,9 +107,9 @@ class Youtube8mMetrics(Metrics):
else:
epoch_info_dict = self.calculator.get()
logger.info(info + '\tavg_hit_at_one: {0},\tavg_perr: {1},\tavg_loss :{2},\taps: {3},\tgap:{4}'\
logger.info(info + '\tavg_hit_at_one: {0},\tavg_perr: {1},\tavg_loss :{2},\tgap:{3}'\
.format(epoch_info_dict['avg_hit_at_one'], epoch_info_dict['avg_perr'], \
epoch_info_dict['avg_loss'], epoch_info_dict['aps'], epoch_info_dict['gap']))
epoch_info_dict['avg_loss'], epoch_info_dict['gap']))
def reset(self):
self.calculator.clear()
......
......@@ -232,6 +232,7 @@ def train(args):
train_dataloader,
train_fetch_list,
train_metrics,
train_batch_size=train_config.TRAIN.batch_size,
epochs=epochs,
log_interval=args.log_interval,
valid_interval=args.valid_interval,
......
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
class TimeAverager(object):
def __init__(self):
self.reset()
def reset(self):
self._cnt = 0
self._total_time = 0
self._total_samples = 0
def record(self, usetime, num_samples=None):
self._cnt += 1
self._total_time += usetime
if num_samples:
self._total_samples += num_samples
def get_average(self):
if self._cnt == 0:
return 0
return self._total_time / float(self._cnt)
def get_ips_average(self):
if not self._total_samples or self._cnt == 0:
return 0
return float(self._total_samples) / self._total_time
......@@ -19,6 +19,7 @@ import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid import profiler
from utils.timer import TimeAverager
import logging
import shutil
......@@ -72,39 +73,71 @@ def test_with_dataloader(exe,
test_metrics.finalize_and_log_out("[TEST] Finish")
def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, \
train_fetch_list, train_metrics, epochs = 10, \
log_interval = 0, valid_interval = 0, save_dir = './', \
num_trainers = 1, trainer_id = 0, \
save_model_name = 'model', fix_random_seed = False, \
compiled_test_prog = None, test_dataloader = None, \
test_fetch_list = None, test_metrics = None, \
is_profiler = None, profiler_path = None):
def train_with_dataloader(exe,
train_prog,
compiled_train_prog,
train_dataloader,
train_fetch_list,
train_metrics,
train_batch_size=None,
epochs=10,
log_interval=0,
valid_interval=0,
save_dir='./',
num_trainers=1,
trainer_id=0,
save_model_name='model',
fix_random_seed=False,
compiled_test_prog=None,
test_dataloader=None,
test_fetch_list=None,
test_metrics=None,
is_profiler=None,
profiler_path=None):
if not train_dataloader:
logger.error("[TRAIN] get dataloader failed.")
epoch_periods = []
train_loss = 0
epoch_periods = []
reader_cost_averager = TimeAverager()
batch_cost_averager = TimeAverager()
for epoch in range(epochs):
log_lr_and_step()
train_iter = 0
epoch_periods = []
cur_time = time.time()
batch_start = time.time()
for data in train_dataloader():
reader_cost_averager.record(time.time() - batch_start)
train_outs = exe.run(compiled_train_prog,
fetch_list=train_fetch_list,
feed=data)
period = time.time() - cur_time
epoch_periods.append(period)
timeStamp = time.time()
localTime = time.localtime(timeStamp)
strTime = time.strftime("%Y-%m-%d %H:%M:%S", localTime)
batch_cost = time.time() - batch_start
epoch_periods.append(batch_cost)
batch_cost_averager.record(batch_cost, num_samples=train_batch_size)
local_time = time.localtime(time.time())
str_time = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
if log_interval > 0 and (train_iter % log_interval == 0):
train_metrics.calculate_and_log_out(train_outs, \
info = '[TRAIN {}] Epoch {}, iter {}, time {}, '.format(strTime, epoch, train_iter, period))
time_info_str = "batch_cost: {:.5f} sec, reader_cost: {:.5f} sec".format(
batch_cost_averager.get_average(),
reader_cost_averager.get_average())
if train_batch_size:
time_info_str += ", ips: {:.5f} samples/sec".format(
batch_cost_averager.get_ips_average())
train_metrics.calculate_and_log_out(
train_outs,
info='[TRAIN {}] Epoch {}, iter {}, {}'.format(
str_time, epoch, train_iter, time_info_str))
reader_cost_averager.reset()
batch_cost_averager.reset()
train_iter += 1
cur_time = time.time()
batch_start = time.time()
# NOTE: profiler tools, used for benchmark
if is_profiler and epoch == 0 and train_iter == log_interval:
......@@ -118,8 +151,9 @@ def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader
'No iteration was executed, please check the data reader')
sys.exit(1)
logger.info('[TRAIN] Epoch {} training finished, average time: {}'.
format(epoch, np.mean(epoch_periods[1:])))
logger.info(
'[TRAIN] Epoch {} training finished, average time: {:.5f} sec'.
format(epoch, np.mean(epoch_periods[1:])))
if trainer_id == 0:
save_model(exe, train_prog, save_dir, save_model_name,
......
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
class TimeAverager(object):
def __init__(self):
self.reset()
def reset(self):
self._cnt = 0
self._total_time = 0
self._total_samples = 0
def record(self, usetime, num_samples=None):
self._cnt += 1
self._total_time += usetime
if num_samples:
self._total_samples += num_samples
def get_average(self):
if self._cnt == 0:
return 0
return self._total_time / float(self._cnt)
def get_ips_average(self):
if not self._total_samples or self._cnt == 0:
return 0
return float(self._total_samples) / self._total_time
......@@ -30,6 +30,7 @@ from model import *
from config_utils import *
from lr_policy import get_epoch_lr
from kinetics_dataset import KineticsDataset
from timer import TimeAverager
logging.root.handlers = []
FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
......@@ -345,6 +346,8 @@ def train(args):
+ str(local_rank))
# 4. train loop
reader_cost_averager = TimeAverager()
batch_cost_averager = TimeAverager()
for epoch in range(train_config.TRAIN.epoch):
epoch_start = time.time()
if args.resume and epoch <= args.resume_epoch:
......@@ -361,7 +364,8 @@ def train(args):
train_config.TRAIN.epoch))
batch_start = time.time()
for batch_id, data in enumerate(train_loader):
batch_reader_end = time.time()
reader_cost_averager.record(time.time() - batch_start)
y_data = data[2]
labels = to_variable(y_data)
labels.stop_gradient = True
......@@ -387,9 +391,13 @@ def train(args):
optimizer.minimize(avg_loss)
video_model.clear_gradients()
total_loss += avg_loss.numpy()[0]
total_acc1 += acc_top1.numpy()[0]
total_acc5 += acc_top5.numpy()[0]
avg_loss_value = avg_loss.numpy()[0]
acc_top1_value = acc_top1.numpy()[0]
acc_top5_value = acc_top5.numpy()[0]
total_loss += avg_loss_value
total_acc1 += acc_top1_value
total_acc5 += acc_top5_value
total_sample += 1
if args.use_visualdl:
vdl_writer.add_scalar(
......@@ -405,15 +413,23 @@ def train(args):
step=epoch * train_iter_num + batch_id,
value=1.0 - acc_top5.numpy())
train_batch_cost = time.time() - batch_start
train_reader_cost = batch_reader_end - batch_start
batch_start = time.time()
batch_cost_averager.record(
time.time() - batch_start, num_samples=bs_train_single)
if batch_id % args.log_interval == 0:
print( "[Epoch %d, batch %d] loss %.5f, err1 %.5f, err5 %.5f, batch_cost: %.5f s, reader_cost: %.5f s" % \
(epoch, batch_id, avg_loss.numpy(), 1.0 - acc_top1.numpy(), 1. - acc_top5.numpy(), train_batch_cost, train_reader_cost))
print(
"[Epoch %d, batch %d] loss %.5f, err1 %.5f, err5 %.5f, batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f samples/sec"
%
(epoch, batch_id, avg_loss_value, 1.0 - acc_top1_value,
1. - acc_top5_value, batch_cost_averager.get_average(),
reader_cost_averager.get_average(),
batch_cost_averager.get_ips_average()))
reader_cost_averager.reset()
batch_cost_averager.reset()
batch_start = time.time()
train_epoch_cost = time.time() - epoch_start
print( '[Epoch %d end] avg_loss %.5f, avg_err1 %.5f, avg_err5= %.5f, epoch_cost: %.5f s' % \
print( '[Epoch %d end] avg_loss %.5f, avg_err1 %.5f, avg_err5= %.5f, epoch_cost: %.5f sec' % \
(epoch, total_loss / total_sample, 1. - total_acc1 / total_sample, 1. - total_acc5 / total_sample, train_epoch_cost))
if args.use_visualdl:
vdl_writer.add_scalar(
......
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
class TimeAverager(object):
def __init__(self):
self.reset()
def reset(self):
self._cnt = 0
self._total_time = 0
self._total_samples = 0
def record(self, usetime, num_samples=None):
self._cnt += 1
self._total_time += usetime
if num_samples:
self._total_samples += num_samples
def get_average(self):
if self._cnt == 0:
return 0
return self._total_time / float(self._cnt)
def get_ips_average(self):
if not self._total_samples or self._cnt == 0:
return 0
return float(self._total_samples) / self._total_time
......@@ -25,6 +25,7 @@ from model import TSM_ResNet
from config_utils import *
from reader import KineticsReader
from ucf101_reader import UCF101Reader
from timer import TimeAverager
logging.root.handlers = []
FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
......@@ -92,7 +93,12 @@ def parse_args():
type=str,
default='./ResNet50_pretrained/',
help='default weights is ./ResNet50_pretrained/.')
parser.add_argument(
'--log_interval',
type=int,
default=10,
help='mini-batch interval to log.')
args = parser.parse_args()
return args
......@@ -245,16 +251,23 @@ def train(args):
train_reader)
# 6. train loop
reader_cost_averager = TimeAverager()
batch_cost_averager = TimeAverager()
for epoch in range(train_config.TRAIN.epoch):
epoch_start = time.time()
video_model.train()
total_loss = 0.0
total_acc1 = 0.0
total_acc5 = 0.0
total_sample = 0
t_last = time.time()
# 6.1 for each batch, call model() , backward(), and minimize()
batch_start = time.time()
for batch_id, data in enumerate(train_reader()):
t1 = time.time()
reader_cost_averager.record(t1 - batch_start)
x_data = np.array([item[0] for item in data])
y_data = np.array([item[1] for item in data]).reshape([-1, 1])
......@@ -287,26 +300,38 @@ def train(args):
t4 = time.time()
optimizer.minimize(avg_loss)
video_model.clear_gradients()
t5 = time.time()
total_loss += avg_loss.numpy()[0]
total_acc1 += acc_top1.numpy()[0]
total_acc5 += acc_top5.numpy()[0]
total_sample += 1
avg_loss_value = avg_loss.numpy()[0]
acc_top1_value = acc_top1.numpy()[0]
acc_top5_value = acc_top5.numpy()[0]
print(
'TRAIN Epoch: %d, iter: %d, loss: %.5f, acc1: %.5f, acc5: %.5f, lr: %.5f, forward_cost:%.5f s, backward_cost:%.5f s, minimize_cost:%.5f s, to_variable_cost: %.5f s, batch_cost: %.5f s, reader_cost: %.5f s'
% (epoch, batch_id, avg_loss.numpy()[0],
acc_top1.numpy()[0], acc_top5.numpy()[0],
current_step_lr, t3 - t2, t4 - t3, t5 - t4, t2 - t1,
t5 - t_last, t2 - t_last))
t_last = time.time()
total_loss += avg_loss_value
total_acc1 += acc_top1_value
total_acc5 += acc_top5_value
total_sample += 1
t5 = time.time()
batch_cost_averager.record(
t5 - batch_start, num_samples=train_config.TRAIN.batch_size)
if batch_id % args.log_interval == 0:
print(
'TRAIN Epoch: %d, iter: %d, loss: %.5f, acc1: %.5f, acc5: %.5f, lr: %.5f, forward_cost:%.5f s, backward_cost:%.5f s, minimize_cost:%.5f s, to_variable_cost: %.5f s, batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f samples/sec'
% (epoch, batch_id, avg_loss_value, acc_top1_value,
acc_top5_value, current_step_lr, t3 - t2, t4 - t3,
t5 - t4, t2 - t1, batch_cost_averager.get_average(),
reader_cost_averager.get_average(),
batch_cost_averager.get_ips_average()))
batch_cost_averager.reset()
reader_cost_averager.reset()
batch_start = time.time()
train_epoch_cost = time.time() - epoch_start
print(
'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}, lr={}'.
'TRAIN End, Epoch {}, avg_loss= {:.5f}, avg_acc1= {:.5f}, avg_acc5= {:.5f}, lr={:.5f}, epoch_cost: {:.5f} sec'.
format(epoch, total_loss / total_sample, total_acc1 /
total_sample, total_acc5 / total_sample,
current_step_lr))
total_sample, total_acc5 / total_sample, current_step_lr,
train_epoch_cost))
# 6.2 save checkpoint
if local_rank == 0:
......
......@@ -25,6 +25,7 @@ import paddle.nn.functional as F
from paddle.jit import to_static
from paddle.static import InputSpec
class ConvBNLayer(paddle.nn.Layer):
def __init__(self,
in_channels,
......@@ -48,9 +49,9 @@ class ConvBNLayer(paddle.nn.Layer):
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
self._act = act
self._batch_norm = BatchNorm2D(
out_channels,
weight_attr=ParamAttr(name=bn_name + "_scale"),
......@@ -104,7 +105,6 @@ class BottleneckBlock(paddle.nn.Layer):
self.shortcut = shortcut
def forward(self, inputs):
y = self.conv0(inputs)
conv1 = self.conv1(y)
......@@ -161,8 +161,9 @@ class BasicBlock(paddle.nn.Layer):
else:
short = self.short(inputs)
y = paddle.add(short, conv1)
y = F.relu(y)
return y
y = F.relu(y)
return y
class TSN_ResNet(paddle.nn.Layer):
def __init__(self, config):
......@@ -184,7 +185,7 @@ class TSN_ResNet(paddle.nn.Layer):
elif self.layers == 152:
depth = [3, 8, 36, 3]
in_channels = [64, 256, 512,
1024] if self.layers >= 50 else [64, 64, 128, 256]
1024] if self.layers >= 50 else [64, 64, 128, 256]
out_channels = [64, 128, 256, 512]
self.conv = ConvBNLayer(
......@@ -194,8 +195,7 @@ class TSN_ResNet(paddle.nn.Layer):
stride=2,
act="relu",
name="conv1")
self.pool2D_max = MaxPool2D(
kernel_size=3, stride=2, padding=1)
self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)
self.block_list = []
if self.layers >= 50:
......
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
class TimeAverager(object):
def __init__(self):
self.reset()
def reset(self):
self._cnt = 0
self._total_time = 0
self._total_samples = 0
def record(self, usetime, num_samples=None):
self._cnt += 1
self._total_time += usetime
if num_samples:
self._total_samples += num_samples
def get_average(self):
if self._cnt == 0:
return 0
return self._total_time / float(self._cnt)
def get_ips_average(self):
if not self._total_samples or self._cnt == 0:
return 0
return float(self._total_samples) / self._total_time
......@@ -25,6 +25,8 @@ import ast
from model import TSN_ResNet
from utils.config_utils import *
from reader.ucf101_reader import UCF101Reader
from timer import TimeAverager
import paddle
from paddle.io import DataLoader, DistributedBatchSampler
from compose import TSN_UCF101_Dataset
......@@ -93,6 +95,11 @@ def parse_args():
default=True,
help='whether to validating in training phase.'
'default value is True.')
parser.add_argument(
'--log_interval',
type=int,
default=10,
help='mini-batch interval to log.')
args = parser.parse_args()
return args
......@@ -126,8 +133,7 @@ def val(epoch, model, val_loader, cfg, args):
outputs = model(imgs)
loss = F.cross_entropy(
input=outputs, label=labels, ignore_index=-1)
loss = F.cross_entropy(input=outputs, label=labels, ignore_index=-1)
avg_loss = paddle.mean(loss)
acc_top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)
acc_top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5)
......@@ -182,56 +188,51 @@ def train(args):
place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) \
if use_data_parallel else paddle.CUDAPlace(0)
if use_data_parallel:
paddle.distributed.init_parallel_env()
video_model = TSN_ResNet(train_config)
if use_data_parallel:
video_model = paddle.DataParallel(video_model)
pre_state_dict = paddle.load(args.pretrain)
#if paddle.distributed.parallel.Env().local_rank == 0:
video_model = init_model(video_model, pre_state_dict)
optimizer = create_optimizer(train_config.TRAIN,
video_model.parameters())
optimizer = create_optimizer(train_config.TRAIN, video_model.parameters())
bs_denominator = 1
if args.use_gpu:
# check number of GPUs
# check number of GPUs
gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
if gpus == "":
pass
else:
gpus = gpus.split(",")
num_gpus = len(gpus)
bs_denominator = num_gpus
bs_denominator = num_gpus
bs_train_single = int(train_config.TRAIN.batch_size / bs_denominator)
bs_val_single = int(valid_config.VALID.batch_size / bs_denominator)
train_dataset = TSN_UCF101_Dataset(train_config, 'train')
val_dataset = TSN_UCF101_Dataset(valid_config, 'valid')
train_sampler = DistributedBatchSampler(
train_dataset,
batch_size=bs_train_single,
shuffle=train_config.TRAIN.use_shuffle,
drop_last=True)
train_dataset,
batch_size=bs_train_single,
shuffle=train_config.TRAIN.use_shuffle,
drop_last=True)
train_loader = DataLoader(
train_dataset,
batch_sampler=train_sampler,
places=place,
num_workers=train_config.TRAIN.num_workers,
return_list=True)
val_sampler = DistributedBatchSampler(
val_dataset, batch_size=bs_val_single)
train_dataset,
batch_sampler=train_sampler,
places=place,
num_workers=train_config.TRAIN.num_workers,
return_list=True)
val_sampler = DistributedBatchSampler(val_dataset, batch_size=bs_val_single)
val_loader = DataLoader(
val_dataset,
batch_sampler=val_sampler,
places=place,
num_workers=valid_config.VALID.num_workers,
return_list=True)
val_dataset,
batch_sampler=val_sampler,
places=place,
num_workers=valid_config.VALID.num_workers,
return_list=True)
# resume training the model
if args.resume is not None:
......@@ -239,15 +240,21 @@ def train(args):
video_model.set_dict(model_state)
optimizer.set_dict(opt_state)
reader_cost_averager = TimeAverager()
batch_cost_averager = TimeAverager()
for epoch in range(1, train_config.TRAIN.epoch + 1):
epoch_start = time.time()
video_model.train()
total_loss = 0.0
total_acc1 = 0.0
total_acc5 = 0.0
total_sample = 0
batch_start = time.time()
for batch_id, data in enumerate(train_loader):
train_reader_cost = time.time() - batch_start
reader_cost_averager.record(time.time() - batch_start)
imgs = paddle.to_tensor(data[0], place=paddle.CUDAPinnedPlace())
labels = paddle.to_tensor(data[1], place=paddle.CUDAPinnedPlace())
labels.stop_gradient = True
......@@ -256,14 +263,12 @@ def train(args):
loss = F.cross_entropy(input=outputs, label=labels, ignore_index=-1)
avg_loss = paddle.mean(loss)
acc_top1 = paddle.metric.accuracy(
input=outputs, label=labels, k=1)
acc_top5 = paddle.metric.accuracy(
input=outputs, label=labels, k=5)
acc_top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)
acc_top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5)
dy_out = avg_loss.numpy()[0]
if use_data_parallel:
# (data_parallel step5/6)
# (data_parallel step5/6)
avg_loss = video_model.scale_loss(avg_loss)
avg_loss.backward()
video_model.apply_collective_grads()
......@@ -278,18 +283,27 @@ def train(args):
total_acc1 += acc_top1.numpy()[0]
total_acc5 += acc_top5.numpy()[0]
total_sample += 1
train_batch_cost = time.time() - batch_start
print(
'TRAIN Epoch: {}, iter: {}, batch_cost: {:.5f} s, reader_cost: {:.5f} s, loss={:.6f}, acc1 {:.6f}, acc5 {:.6f} '.
format(epoch, batch_id, train_batch_cost, train_reader_cost,
total_loss / total_sample, total_acc1 / total_sample,
total_acc5 / total_sample))
batch_cost_averager.record(
time.time() - batch_start, num_samples=bs_train_single)
if batch_id % args.log_interval == 0:
print(
'TRAIN Epoch: {}, iter: {}, loss={:.6f}, acc1 {:.6f}, acc5 {:.6f}, batch_cost: {:.5f} sec, reader_cost: {:.5f} sec, ips: {:.5f} samples/sec'.
format(epoch, batch_id, total_loss / total_sample,
total_acc1 / total_sample, total_acc5 / total_sample,
batch_cost_averager.get_average(),
reader_cost_averager.get_average(),
batch_cost_averager.get_ips_average()))
batch_cost_averager.reset()
reader_cost_averager.reset()
batch_start = time.time()
train_epoch_cost = time.time() - epoch_start
print(
'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}'.
format(epoch, total_loss / total_sample, total_acc1 /
total_sample, total_acc5 / total_sample))
'TRAIN End, Epoch {}, avg_loss= {:.6f}, avg_acc1= {:.6f}, avg_acc5= {:.6f}, epoch_cost: {:.5f} sec'.
format(epoch, total_loss / total_sample, total_acc1 / total_sample,
total_acc5 / total_sample, train_epoch_cost))
# save model's and optimizer's parameters which used for resuming the training stage
save_parameters = (not use_data_parallel) or (
......@@ -302,13 +316,12 @@ def train(args):
model_path = os.path.join(
args.checkpoint,
"_" + model_path_pre + "_epoch{}".format(epoch))
paddle.save(
video_model.state_dict(), model_path)
paddle.save(video_model.state_dict(), model_path)
paddle.save(optimizer.state_dict(), model_path)
if args.validate:
video_model.eval()
val_acc = val(epoch, video_model,valid_loader, valid_config, args)
val_acc = val(epoch, video_model, val_loader, valid_config, args)
# save the best parameters in trainging stage
if epoch == 1:
best_acc = val_acc
......@@ -318,12 +331,13 @@ def train(args):
if paddle.distributed.ParallelEnv().local_rank == 0:
if not os.path.isdir(args.weights):
os.makedirs(args.weights)
paddle.save(video_model.state_dict(), args.weights + "/final")
paddle.save(video_model.state_dict(),
args.weights + "/final")
else:
if paddle.distributed.parallel.Env().local_rank == 0:
if not os.path.isdir(args.weights):
os.makedirs(args.weights)
paddle.save(video_model.state_dict(),args.weights + "/final")
paddle.save(video_model.state_dict(), args.weights + "/final")
logger.info('[TRAIN] training finished')
......