Unverified commit 8911e29b, authored by Yiqun Liu, committed by GitHub

Calculate and print the average time for video models (#4876)

* Calculate and print the average time for video models.

* Calculate and print the average time for dygraph of slowfast.

* Print ips for nextvlad, slowfast, tsm and tsn.
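
For reference, the "ips" figure these changes print is throughput in samples per second: the number of samples processed in a logging window divided by the wall-clock time the window took. A toy calculation (all numbers made up):

samples_per_batch = 32            # e.g. the configured training batch size
batch_times = [0.50, 0.48, 0.52]  # seconds per iteration since the last log line
ips = samples_per_batch * len(batch_times) / sum(batch_times)  # = 64.0 samples/sec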
Parent 308da28d
@@ -75,7 +75,7 @@ class Youtube8mMetrics(Metrics):
         perr = youtube8m_metrics.calculate_precision_at_equal_recall_rate(pred,
                                                                           label)
         gap = youtube8m_metrics.calculate_gap(pred, label)
-        logger.info(info + ' , loss = {0}, Hit@1 = {1}, PERR = {2}, GAP = {3}'.format(\
+        logger.info(info + ', loss: {0}, Hit@1: {1}, PERR: {2}, GAP: {3}'.format(\
                     '%.6f' % loss, '%.2f' % hit_at_one, '%.2f' % perr, '%.2f' % gap))

     def accumulate(self, fetch_list, info=''):
@@ -107,9 +107,9 @@ class Youtube8mMetrics(Metrics):
         else:
             epoch_info_dict = self.calculator.get()
-            logger.info(info + '\tavg_hit_at_one: {0},\tavg_perr: {1},\tavg_loss :{2},\taps: {3},\tgap:{4}'\
+            logger.info(info + '\tavg_hit_at_one: {0},\tavg_perr: {1},\tavg_loss :{2},\tgap:{3}'\
                 .format(epoch_info_dict['avg_hit_at_one'], epoch_info_dict['avg_perr'], \
-                    epoch_info_dict['avg_loss'], epoch_info_dict['aps'], epoch_info_dict['gap']))
+                    epoch_info_dict['avg_loss'], epoch_info_dict['gap']))

     def reset(self):
         self.calculator.clear()
...
@@ -232,6 +232,7 @@ def train(args):
         train_dataloader,
         train_fetch_list,
         train_metrics,
+        train_batch_size=train_config.TRAIN.batch_size,
         epochs=epochs,
         log_interval=args.log_interval,
         valid_interval=args.valid_interval,
...
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time


class TimeAverager(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self._cnt = 0
        self._total_time = 0
        self._total_samples = 0

    def record(self, usetime, num_samples=None):
        self._cnt += 1
        self._total_time += usetime
        if num_samples:
            self._total_samples += num_samples

    def get_average(self):
        if self._cnt == 0:
            return 0
        return self._total_time / float(self._cnt)

    def get_ips_average(self):
        if not self._total_samples or self._cnt == 0:
            return 0
        return float(self._total_samples) / self._total_time
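
A minimal sketch of how this helper is meant to be driven, mirroring the training loops changed below (loader, run_one_step, batch_size and log_interval are stand-ins, not names from this commit):

import time

reader_cost_averager = TimeAverager()   # time spent waiting on the dataloader
batch_cost_averager = TimeAverager()    # full iteration time, also used for ips

batch_start = time.time()
for step, data in enumerate(loader):
    reader_cost_averager.record(time.time() - batch_start)
    run_one_step(data)                  # forward/backward or exe.run(...)
    batch_cost_averager.record(time.time() - batch_start,
                               num_samples=batch_size)
    if step % log_interval == 0:
        print('batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f samples/sec'
              % (batch_cost_averager.get_average(),
                 reader_cost_averager.get_average(),
                 batch_cost_averager.get_ips_average()))
        reader_cost_averager.reset()    # averages restart for the next window
        batch_cost_averager.reset()
    batch_start = time.time()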
@@ -19,6 +19,7 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid import profiler
+from utils.timer import TimeAverager
 import logging
 import shutil
@@ -72,39 +73,71 @@ def test_with_dataloader(exe,
     test_metrics.finalize_and_log_out("[TEST] Finish")

-def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, \
-                train_fetch_list, train_metrics, epochs = 10, \
-                log_interval = 0, valid_interval = 0, save_dir = './', \
-                num_trainers = 1, trainer_id = 0, \
-                save_model_name = 'model', fix_random_seed = False, \
-                compiled_test_prog = None, test_dataloader = None, \
-                test_fetch_list = None, test_metrics = None, \
-                is_profiler = None, profiler_path = None):
+def train_with_dataloader(exe,
+                          train_prog,
+                          compiled_train_prog,
+                          train_dataloader,
+                          train_fetch_list,
+                          train_metrics,
+                          train_batch_size=None,
+                          epochs=10,
+                          log_interval=0,
+                          valid_interval=0,
+                          save_dir='./',
+                          num_trainers=1,
+                          trainer_id=0,
+                          save_model_name='model',
+                          fix_random_seed=False,
+                          compiled_test_prog=None,
+                          test_dataloader=None,
+                          test_fetch_list=None,
+                          test_metrics=None,
+                          is_profiler=None,
+                          profiler_path=None):
     if not train_dataloader:
         logger.error("[TRAIN] get dataloader failed.")
-    epoch_periods = []
     train_loss = 0
+    epoch_periods = []
+    reader_cost_averager = TimeAverager()
+    batch_cost_averager = TimeAverager()

     for epoch in range(epochs):
         log_lr_and_step()
         train_iter = 0
         epoch_periods = []
-        cur_time = time.time()
+        batch_start = time.time()
         for data in train_dataloader():
+            reader_cost_averager.record(time.time() - batch_start)
             train_outs = exe.run(compiled_train_prog,
                                  fetch_list=train_fetch_list,
                                  feed=data)
-            period = time.time() - cur_time
-            epoch_periods.append(period)
-            timeStamp = time.time()
-            localTime = time.localtime(timeStamp)
-            strTime = time.strftime("%Y-%m-%d %H:%M:%S", localTime)
+            batch_cost = time.time() - batch_start
+            epoch_periods.append(batch_cost)
+            batch_cost_averager.record(batch_cost, num_samples=train_batch_size)
+            local_time = time.localtime(time.time())
+            str_time = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
             if log_interval > 0 and (train_iter % log_interval == 0):
-                train_metrics.calculate_and_log_out(train_outs, \
-                    info = '[TRAIN {}] Epoch {}, iter {}, time {}, '.format(strTime, epoch, train_iter, period))
+                time_info_str = "batch_cost: {:.5f} sec, reader_cost: {:.5f} sec".format(
+                    batch_cost_averager.get_average(),
+                    reader_cost_averager.get_average())
+                if train_batch_size:
+                    time_info_str += ", ips: {:.5f} samples/sec".format(
+                        batch_cost_averager.get_ips_average())
+                train_metrics.calculate_and_log_out(
+                    train_outs,
+                    info='[TRAIN {}] Epoch {}, iter {}, {}'.format(
+                        str_time, epoch, train_iter, time_info_str))
+                reader_cost_averager.reset()
+                batch_cost_averager.reset()
             train_iter += 1
-            cur_time = time.time()
+            batch_start = time.time()

             # NOTE: profiler tools, used for benchmark
             if is_profiler and epoch == 0 and train_iter == log_interval:
@@ -118,7 +151,8 @@ def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader
                 'No iteration was executed, please check the data reader')
             sys.exit(1)

-        logger.info('[TRAIN] Epoch {} training finished, average time: {}'.
-                    format(epoch, np.mean(epoch_periods[1:])))
+        logger.info(
+            '[TRAIN] Epoch {} training finished, average time: {:.5f} sec'.
+            format(epoch, np.mean(epoch_periods[1:])))

         if trainer_id == 0:
...
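
Note the epoch summary still averages epoch_periods[1:]: the first iteration, which absorbs dataloader start-up and program warm-up, is deliberately excluded. With made-up timings:

import numpy as np

epoch_periods = [2.31, 0.52, 0.49, 0.51]  # hypothetical per-batch costs; batch 0 pays warm-up
print(np.mean(epoch_periods[1:]))         # 0.50667, the value logged as "average time"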
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time


class TimeAverager(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self._cnt = 0
        self._total_time = 0
        self._total_samples = 0

    def record(self, usetime, num_samples=None):
        self._cnt += 1
        self._total_time += usetime
        if num_samples:
            self._total_samples += num_samples

    def get_average(self):
        if self._cnt == 0:
            return 0
        return self._total_time / float(self._cnt)

    def get_ips_average(self):
        if not self._total_samples or self._cnt == 0:
            return 0
        return float(self._total_samples) / self._total_time
@@ -30,6 +30,7 @@ from model import *
 from config_utils import *
 from lr_policy import get_epoch_lr
 from kinetics_dataset import KineticsDataset
+from timer import TimeAverager

 logging.root.handlers = []
 FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
@@ -345,6 +346,8 @@ def train(args):
                 + str(local_rank))

     # 4. train loop
+    reader_cost_averager = TimeAverager()
+    batch_cost_averager = TimeAverager()
     for epoch in range(train_config.TRAIN.epoch):
         epoch_start = time.time()
         if args.resume and epoch <= args.resume_epoch:
@@ -361,7 +364,8 @@ def train(args):
                 train_config.TRAIN.epoch))
         batch_start = time.time()
         for batch_id, data in enumerate(train_loader):
-            batch_reader_end = time.time()
+            reader_cost_averager.record(time.time() - batch_start)
             y_data = data[2]
             labels = to_variable(y_data)
             labels.stop_gradient = True
@@ -387,9 +391,13 @@ def train(args):
             optimizer.minimize(avg_loss)
             video_model.clear_gradients()

-            total_loss += avg_loss.numpy()[0]
-            total_acc1 += acc_top1.numpy()[0]
-            total_acc5 += acc_top5.numpy()[0]
+            avg_loss_value = avg_loss.numpy()[0]
+            acc_top1_value = acc_top1.numpy()[0]
+            acc_top5_value = acc_top5.numpy()[0]
+            total_loss += avg_loss_value
+            total_acc1 += acc_top1_value
+            total_acc5 += acc_top5_value
             total_sample += 1

             if args.use_visualdl:
                 vdl_writer.add_scalar(
@@ -405,15 +413,23 @@ def train(args):
                     step=epoch * train_iter_num + batch_id,
                     value=1.0 - acc_top5.numpy())

-            train_batch_cost = time.time() - batch_start
-            train_reader_cost = batch_reader_end - batch_start
-            batch_start = time.time()
+            batch_cost_averager.record(
+                time.time() - batch_start, num_samples=bs_train_single)
             if batch_id % args.log_interval == 0:
-                print( "[Epoch %d, batch %d] loss %.5f, err1 %.5f, err5 %.5f, batch_cost: %.5f s, reader_cost: %.5f s" % \
-                    (epoch, batch_id, avg_loss.numpy(), 1.0 - acc_top1.numpy(), 1. - acc_top5.numpy(), train_batch_cost, train_reader_cost))
+                print(
+                    "[Epoch %d, batch %d] loss %.5f, err1 %.5f, err5 %.5f, batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f samples/sec"
+                    %
+                    (epoch, batch_id, avg_loss_value, 1.0 - acc_top1_value,
+                     1. - acc_top5_value, batch_cost_averager.get_average(),
+                     reader_cost_averager.get_average(),
+                     batch_cost_averager.get_ips_average()))
+                reader_cost_averager.reset()
+                batch_cost_averager.reset()
+            batch_start = time.time()

         train_epoch_cost = time.time() - epoch_start
-        print( '[Epoch %d end] avg_loss %.5f, avg_err1 %.5f, avg_err5= %.5f, epoch_cost: %.5f s' % \
+        print( '[Epoch %d end] avg_loss %.5f, avg_err1 %.5f, avg_err5= %.5f, epoch_cost: %.5f sec' % \
             (epoch, total_loss / total_sample, 1. - total_acc1 / total_sample, 1. - total_acc5 / total_sample, train_epoch_cost))
         if args.use_visualdl:
             vdl_writer.add_scalar(
...
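
Because both averagers are reset inside the logging branch, each printed batch_cost/reader_cost/ips describes only the window since the previous log line (about log_interval iterations), not the epoch so far. A toy check of that windowing (numbers made up):

averager = TimeAverager()
for cost in (0.50, 0.48, 0.52):    # three batches in one logging window
    averager.record(cost, num_samples=8)
print(averager.get_average())      # 0.5  -> mean cost over this window only
print(averager.get_ips_average())  # 16.0 -> 24 samples / 1.5 sec
averager.reset()                   # the next window starts from zero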
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time


class TimeAverager(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self._cnt = 0
        self._total_time = 0
        self._total_samples = 0

    def record(self, usetime, num_samples=None):
        self._cnt += 1
        self._total_time += usetime
        if num_samples:
            self._total_samples += num_samples

    def get_average(self):
        if self._cnt == 0:
            return 0
        return self._total_time / float(self._cnt)

    def get_ips_average(self):
        if not self._total_samples or self._cnt == 0:
            return 0
        return float(self._total_samples) / self._total_time
@@ -25,6 +25,7 @@ from model import TSM_ResNet
 from config_utils import *
 from reader import KineticsReader
 from ucf101_reader import UCF101Reader
+from timer import TimeAverager

 logging.root.handlers = []
 FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
@@ -92,6 +93,11 @@ def parse_args():
         type=str,
         default='./ResNet50_pretrained/',
         help='default weights is ./ResNet50_pretrained/.')
+    parser.add_argument(
+        '--log_interval',
+        type=int,
+        default=10,
+        help='mini-batch interval to log.')
     args = parser.parse_args()
     return args
@@ -245,16 +251,23 @@ def train(args):
         train_reader)

     # 6. train loop
+    reader_cost_averager = TimeAverager()
+    batch_cost_averager = TimeAverager()
     for epoch in range(train_config.TRAIN.epoch):
+        epoch_start = time.time()
         video_model.train()
         total_loss = 0.0
         total_acc1 = 0.0
         total_acc5 = 0.0
         total_sample = 0
-        t_last = time.time()

         # 6.1 for each batch, call model() , backward(), and minimize()
+        batch_start = time.time()
         for batch_id, data in enumerate(train_reader()):
             t1 = time.time()
+            reader_cost_averager.record(t1 - batch_start)
             x_data = np.array([item[0] for item in data])
             y_data = np.array([item[1] for item in data]).reshape([-1, 1])
@@ -287,26 +300,38 @@ def train(args):
             t4 = time.time()
             optimizer.minimize(avg_loss)
             video_model.clear_gradients()
-            t5 = time.time()

-            total_loss += avg_loss.numpy()[0]
-            total_acc1 += acc_top1.numpy()[0]
-            total_acc5 += acc_top5.numpy()[0]
+            avg_loss_value = avg_loss.numpy()[0]
+            acc_top1_value = acc_top1.numpy()[0]
+            acc_top5_value = acc_top5.numpy()[0]
+            total_loss += avg_loss_value
+            total_acc1 += acc_top1_value
+            total_acc5 += acc_top5_value
             total_sample += 1

-            print(
-                'TRAIN Epoch: %d, iter: %d, loss: %.5f, acc1: %.5f, acc5: %.5f, lr: %.5f, forward_cost:%.5f s, backward_cost:%.5f s, minimize_cost:%.5f s, to_variable_cost: %.5f s, batch_cost: %.5f s, reader_cost: %.5f s'
-                % (epoch, batch_id, avg_loss.numpy()[0],
-                   acc_top1.numpy()[0], acc_top5.numpy()[0],
-                   current_step_lr, t3 - t2, t4 - t3, t5 - t4, t2 - t1,
-                   t5 - t_last, t2 - t_last))
-            t_last = time.time()
+            t5 = time.time()
+            batch_cost_averager.record(
+                t5 - batch_start, num_samples=train_config.TRAIN.batch_size)
+            if batch_id % args.log_interval == 0:
+                print(
+                    'TRAIN Epoch: %d, iter: %d, loss: %.5f, acc1: %.5f, acc5: %.5f, lr: %.5f, forward_cost:%.5f s, backward_cost:%.5f s, minimize_cost:%.5f s, to_variable_cost: %.5f s, batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f samples/sec'
+                    % (epoch, batch_id, avg_loss_value, acc_top1_value,
+                       acc_top5_value, current_step_lr, t3 - t2, t4 - t3,
+                       t5 - t4, t2 - t1, batch_cost_averager.get_average(),
+                       reader_cost_averager.get_average(),
+                       batch_cost_averager.get_ips_average()))
+                batch_cost_averager.reset()
+                reader_cost_averager.reset()
+            batch_start = time.time()

+        train_epoch_cost = time.time() - epoch_start
         print(
-            'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}, lr={}'.
+            'TRAIN End, Epoch {}, avg_loss= {:.5f}, avg_acc1= {:.5f}, avg_acc5= {:.5f}, lr={:.5f}, epoch_cost: {:.5f} sec'.
             format(epoch, total_loss / total_sample, total_acc1 /
-                   total_sample, total_acc5 / total_sample,
-                   current_step_lr))
+                   total_sample, total_acc5 / total_sample, current_step_lr,
+                   train_epoch_cost))

         # 6.2 save checkpoint
         if local_rank == 0:
...
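
For orientation, the t1..t5 timestamps that survive in the TSM log bracket the per-phase costs named in its format string; schematically (a sketch of the bookkeeping, not the repo's code):

import time

t1 = time.time()  # batch received from the reader
# ... to_variable(...) ...         to_variable_cost = t2 - t1
t2 = time.time()
# ... forward pass ...             forward_cost     = t3 - t2
t3 = time.time()
# ... avg_loss.backward() ...      backward_cost    = t4 - t3
t4 = time.time()
# ... optimizer.minimize(...) ...  minimize_cost    = t5 - t4
t5 = time.time()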
@@ -25,6 +25,7 @@ import paddle.nn.functional as F
 from paddle.jit import to_static
 from paddle.static import InputSpec

+
 class ConvBNLayer(paddle.nn.Layer):
     def __init__(self,
                  in_channels,
@@ -104,7 +105,6 @@ class BottleneckBlock(paddle.nn.Layer):
         self.shortcut = shortcut

-
     def forward(self, inputs):
         y = self.conv0(inputs)
         conv1 = self.conv1(y)
@@ -164,6 +164,7 @@ class BasicBlock(paddle.nn.Layer):
         y = F.relu(y)
         return y

+
 class TSN_ResNet(paddle.nn.Layer):
     def __init__(self, config):
         super(TSN_ResNet, self).__init__()
@@ -194,8 +195,7 @@ class TSN_ResNet(paddle.nn.Layer):
             stride=2,
             act="relu",
             name="conv1")
-        self.pool2D_max = MaxPool2D(
-            kernel_size=3, stride=2, padding=1)
+        self.pool2D_max = MaxPool2D(kernel_size=3, stride=2, padding=1)

         self.block_list = []
         if self.layers >= 50:
...
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time


class TimeAverager(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self._cnt = 0
        self._total_time = 0
        self._total_samples = 0

    def record(self, usetime, num_samples=None):
        self._cnt += 1
        self._total_time += usetime
        if num_samples:
            self._total_samples += num_samples

    def get_average(self):
        if self._cnt == 0:
            return 0
        return self._total_time / float(self._cnt)

    def get_ips_average(self):
        if not self._total_samples or self._cnt == 0:
            return 0
        return float(self._total_samples) / self._total_time
@@ -25,6 +25,8 @@ import ast
 from model import TSN_ResNet
 from utils.config_utils import *
 from reader.ucf101_reader import UCF101Reader
+from timer import TimeAverager
+
 import paddle
 from paddle.io import DataLoader, DistributedBatchSampler
 from compose import TSN_UCF101_Dataset
@@ -93,6 +95,11 @@ def parse_args():
         default=True,
         help='whether to validating in training phase.'
         'default value is True.')
+    parser.add_argument(
+        '--log_interval',
+        type=int,
+        default=10,
+        help='mini-batch interval to log.')
     args = parser.parse_args()
     return args
@@ -126,8 +133,7 @@ def val(epoch, model, val_loader, cfg, args):
         outputs = model(imgs)

-        loss = F.cross_entropy(
-            input=outputs, label=labels, ignore_index=-1)
+        loss = F.cross_entropy(input=outputs, label=labels, ignore_index=-1)
         avg_loss = paddle.mean(loss)
         acc_top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)
         acc_top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5)
@@ -182,20 +188,17 @@ def train(args):
     place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id) \
         if use_data_parallel else paddle.CUDAPlace(0)

     if use_data_parallel:
         paddle.distributed.init_parallel_env()

     video_model = TSN_ResNet(train_config)
     if use_data_parallel:
         video_model = paddle.DataParallel(video_model)

     pre_state_dict = paddle.load(args.pretrain)
     #if paddle.distributed.parallel.Env().local_rank == 0:
     video_model = init_model(video_model, pre_state_dict)

-    optimizer = create_optimizer(train_config.TRAIN,
-                                 video_model.parameters())
+    optimizer = create_optimizer(train_config.TRAIN, video_model.parameters())

     bs_denominator = 1
     if args.use_gpu:
@@ -223,8 +226,7 @@ def train(args):
         places=place,
         num_workers=train_config.TRAIN.num_workers,
         return_list=True)

-    val_sampler = DistributedBatchSampler(
-        val_dataset, batch_size=bs_val_single)
+    val_sampler = DistributedBatchSampler(val_dataset, batch_size=bs_val_single)
     val_loader = DataLoader(
         val_dataset,
         batch_sampler=val_sampler,
@@ -232,22 +234,27 @@ def train(args):
         num_workers=valid_config.VALID.num_workers,
         return_list=True)

     # resume training the model
     if args.resume is not None:
         model_state, opt_state = paddle.load(args.resume)
         video_model.set_dict(model_state)
         optimizer.set_dict(opt_state)

+    reader_cost_averager = TimeAverager()
+    batch_cost_averager = TimeAverager()
     for epoch in range(1, train_config.TRAIN.epoch + 1):
+        epoch_start = time.time()
         video_model.train()
         total_loss = 0.0
         total_acc1 = 0.0
         total_acc5 = 0.0
         total_sample = 0

         batch_start = time.time()
         for batch_id, data in enumerate(train_loader):
-            train_reader_cost = time.time() - batch_start
+            reader_cost_averager.record(time.time() - batch_start)
             imgs = paddle.to_tensor(data[0], place=paddle.CUDAPinnedPlace())
             labels = paddle.to_tensor(data[1], place=paddle.CUDAPinnedPlace())
             labels.stop_gradient = True
@@ -256,10 +263,8 @@ def train(args):
             loss = F.cross_entropy(input=outputs, label=labels, ignore_index=-1)
             avg_loss = paddle.mean(loss)

-            acc_top1 = paddle.metric.accuracy(
-                input=outputs, label=labels, k=1)
-            acc_top5 = paddle.metric.accuracy(
-                input=outputs, label=labels, k=5)
+            acc_top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1)
+            acc_top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5)

             dy_out = avg_loss.numpy()[0]
             if use_data_parallel:
@@ -278,18 +283,27 @@ def train(args):
             total_acc1 += acc_top1.numpy()[0]
             total_acc5 += acc_top5.numpy()[0]
             total_sample += 1

-            train_batch_cost = time.time() - batch_start
-            print(
-                'TRAIN Epoch: {}, iter: {}, batch_cost: {:.5f} s, reader_cost: {:.5f} s, loss={:.6f}, acc1 {:.6f}, acc5 {:.6f} '.
-                format(epoch, batch_id, train_batch_cost, train_reader_cost,
-                       total_loss / total_sample, total_acc1 / total_sample,
-                       total_acc5 / total_sample))
+            batch_cost_averager.record(
+                time.time() - batch_start, num_samples=bs_train_single)
+            if batch_id % args.log_interval == 0:
+                print(
+                    'TRAIN Epoch: {}, iter: {}, loss={:.6f}, acc1 {:.6f}, acc5 {:.6f}, batch_cost: {:.5f} sec, reader_cost: {:.5f} sec, ips: {:.5f} samples/sec'.
+                    format(epoch, batch_id, total_loss / total_sample,
+                           total_acc1 / total_sample, total_acc5 / total_sample,
+                           batch_cost_averager.get_average(),
+                           reader_cost_averager.get_average(),
+                           batch_cost_averager.get_ips_average()))
+                batch_cost_averager.reset()
+                reader_cost_averager.reset()
             batch_start = time.time()

+        train_epoch_cost = time.time() - epoch_start
         print(
-            'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}'.
-            format(epoch, total_loss / total_sample, total_acc1 /
-                   total_sample, total_acc5 / total_sample))
+            'TRAIN End, Epoch {}, avg_loss= {:.6f}, avg_acc1= {:.6f}, avg_acc5= {:.6f}, epoch_cost: {:.5f} sec'.
+            format(epoch, total_loss / total_sample, total_acc1 / total_sample,
+                   total_acc5 / total_sample, train_epoch_cost))

         # save model's and optimizer's parameters which used for resuming the training stage
         save_parameters = (not use_data_parallel) or (
@@ -302,13 +316,12 @@ def train(args):
             model_path = os.path.join(
                 args.checkpoint,
                 "_" + model_path_pre + "_epoch{}".format(epoch))
-            paddle.save(
-                video_model.state_dict(), model_path)
+            paddle.save(video_model.state_dict(), model_path)
             paddle.save(optimizer.state_dict(), model_path)

         if args.validate:
             video_model.eval()
-            val_acc = val(epoch, video_model,valid_loader, valid_config, args)
+            val_acc = val(epoch, video_model, val_loader, valid_config, args)
             # save the best parameters in trainging stage
             if epoch == 1:
                 best_acc = val_acc
@@ -318,12 +331,13 @@ def train(args):
             if paddle.distributed.ParallelEnv().local_rank == 0:
                 if not os.path.isdir(args.weights):
                     os.makedirs(args.weights)
-                    paddle.save(video_model.state_dict(), args.weights + "/final")
+                    paddle.save(video_model.state_dict(),
+                                args.weights + "/final")
         else:
             if paddle.distributed.parallel.Env().local_rank == 0:
                 if not os.path.isdir(args.weights):
                     os.makedirs(args.weights)
-                paddle.save(video_model.state_dict(),args.weights + "/final")
+                paddle.save(video_model.state_dict(), args.weights + "/final")
         logger.info('[TRAIN] training finished')
...