Unverified commit 08232bbb, authored by wuzewu, committed by GitHub

Merge pull request #300 from wuyefeilin/dygraph

Add training resume, best-model saving, VisualDL (VDL) logging, and remaining-time (ETA) computation
train.py

@@ -26,6 +26,8 @@ import models
 import utils.logging as logging
 from utils import get_environ_info
 from utils import load_pretrained_model
+from utils import resume
+from utils import Timer, calculate_eta
 from val import evaluate
@@ -78,7 +80,13 @@ def parse_args():
     parser.add_argument(
         '--pretrained_model',
         dest='pretrained_model',
-        help='The path of pretrained weight',
+        help='The path of pretrained model',
         type=str,
         default=None)
+    parser.add_argument(
+        '--resume_model',
+        dest='resume_model',
+        help='The path of resume model',
+        type=str,
+        default=None)
     parser.add_argument(
@@ -104,6 +112,17 @@ def parse_args():
         dest='do_eval',
         help='Eval while training',
         action='store_true')
+    parser.add_argument(
+        '--log_steps',
+        dest='log_steps',
+        help='Display logging information at every log_steps',
+        default=10,
+        type=int)
+    parser.add_argument(
+        '--use_vdl',
+        dest='use_vdl',
+        help='Whether to record the data to VisualDL during training',
+        action='store_true')
     return parser.parse_args()
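A quick sketch of how the parsed options surface in code; the flag names, defaults, and `store_true` behavior below mirror the additions above, while the sample command in the comment is purely illustrative:

```python
# e.g. python train.py --resume_model output/epoch_10 --log_steps 20 --use_vdl
args = parse_args()
print(args.resume_model)  # checkpoint directory to resume from, or None
print(args.log_steps)     # logging interval in steps, 10 by default
print(args.use_vdl)       # True only when --use_vdl is passed
```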
@@ -117,13 +136,20 @@ def train(model,
           num_epochs=100,
           batch_size=2,
           pretrained_model=None,
+          resume_model=None,
           save_interval_epochs=1,
+          log_steps=10,
           num_classes=None,
-          num_workers=8):
+          num_workers=8,
+          use_vdl=False):
     ignore_index = model.ignore_index
     nranks = ParallelEnv().nranks

-    load_pretrained_model(model, pretrained_model)
+    start_epoch = 0
+    if resume_model is not None:
+        start_epoch = resume(model, optimizer, resume_model)
+    elif pretrained_model is not None:
+        load_pretrained_model(model, pretrained_model)

     if not os.path.isdir(save_dir):
         if os.path.exists(save_dir):
@@ -144,9 +170,19 @@ def train(model,
         return_list=True,
     )
-    num_steps_each_epoch = len(train_dataset) // batch_size
+    if use_vdl:
+        from visualdl import LogWriter
+        log_writer = LogWriter(save_dir)

-    for epoch in range(num_epochs):
+    timer = Timer()
+    timer.start()
+    avg_loss = 0.0
+    steps_per_epoch = len(batch_sampler)
+    total_steps = steps_per_epoch * (num_epochs - start_epoch)
+    num_steps = 0
+    best_mean_iou = -1.0
+    best_model_epoch = 1
+    for epoch in range(start_epoch, num_epochs):
         for step, data in enumerate(loader):
             images = data[0]
             labels = data[1].astype('int64')
@@ -160,22 +196,37 @@ def train(model,
             loss.backward()
             optimizer.minimize(loss)
             model.clear_gradients()
-            logging.info("[TRAIN] Epoch={}/{}, Step={}/{}, loss={}".format(
-                epoch + 1, num_epochs, step + 1, len(batch_sampler),
-                loss.numpy()))
+            avg_loss += loss.numpy()[0]
+            lr = optimizer.current_step_lr()
+            num_steps += 1
+            if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0:
+                avg_loss /= log_steps
+                time_step = timer.elapsed_time() / log_steps
+                remain_steps = total_steps - num_steps
+                logging.info(
+                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, sec/step={:.4f} | ETA {}"
+                    .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
+                            avg_loss, lr, time_step,
+                            calculate_eta(remain_steps, time_step)))
+                if use_vdl:
+                    log_writer.add_scalar('Train/loss', avg_loss, num_steps)
+                    log_writer.add_scalar('Train/lr', lr, num_steps)
+                avg_loss = 0.0
+                timer.restart()

         if ((epoch + 1) % save_interval_epochs == 0
-                or num_steps_each_epoch == num_epochs - 1
-            ) and ParallelEnv().local_rank == 0:
+                or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0:
             current_save_dir = os.path.join(save_dir,
                                             "epoch_{}".format(epoch + 1))
             if not os.path.isdir(current_save_dir):
                 os.makedirs(current_save_dir)
             fluid.save_dygraph(model.state_dict(),
                                os.path.join(current_save_dir, 'model'))
+            fluid.save_dygraph(optimizer.state_dict(),
+                               os.path.join(current_save_dir, 'model'))

             if eval_dataset is not None:
-                evaluate(
+                mean_iou, mean_acc = evaluate(
                     model,
                     eval_dataset,
                     places=places,
@@ -184,7 +235,24 @@ def train(model,
                     batch_size=batch_size,
                     ignore_index=ignore_index,
                     epoch_id=epoch + 1)
+                if mean_iou > best_mean_iou:
+                    best_mean_iou = mean_iou
+                    best_model_epoch = epoch + 1
+                    best_model_dir = os.path.join(save_dir, "best_model")
+                    fluid.save_dygraph(model.state_dict(),
+                                       os.path.join(best_model_dir, 'model'))
+                logging.info(
+                    'Current evaluated best model in eval_dataset is epoch_{}, miou={:4f}'
+                    .format(best_model_epoch, best_mean_iou))
+                if use_vdl:
+                    log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
+                                          epoch + 1)
+                    log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
+                                          epoch + 1)
                 model.train()
+
+    if use_vdl:
+        log_writer.close()


 def main(args):
@@ -223,7 +291,9 @@ def main(args):
         num_classes=train_dataset.num_classes, ignore_index=255)

     # Creat optimizer
-    num_steps_each_epoch = len(train_dataset) // args.batch_size
+    # todo, may less one than len(loader)
+    num_steps_each_epoch = len(train_dataset) // (
+        args.batch_size * ParallelEnv().nranks)
     decay_step = args.num_epochs * num_steps_each_epoch
     lr_decay = fluid.layers.polynomial_decay(
         args.learning_rate, decay_step, end_learning_rate=0, power=0.9)
@@ -243,9 +313,12 @@ def main(args):
         num_epochs=args.num_epochs,
         batch_size=args.batch_size,
         pretrained_model=args.pretrained_model,
+        resume_model=args.resume_model,
         save_interval_epochs=args.save_interval_epochs,
+        log_steps=args.log_steps,
         num_classes=train_dataset.num_classes,
-        num_workers=args.num_workers)
+        num_workers=args.num_workers,
+        use_vdl=args.use_vdl)


 if __name__ == '__main__':
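For readers new to VisualDL: a minimal, self-contained sketch of the logging pattern train() now uses when --use_vdl is set. The './output' directory and the toy loop are illustrative; only `LogWriter`, `add_scalar`, and `close` come from the diff above.

```python
from visualdl import LogWriter  # pip install visualdl

log_writer = LogWriter('./output')  # plays the role of LogWriter(save_dir)
for num_steps in range(1, 101):
    avg_loss = 1.0 / num_steps  # stand-in for the real averaged loss
    log_writer.add_scalar('Train/loss', avg_loss, num_steps)
log_writer.close()
```

The recorded scalars can then be browsed by pointing the VisualDL board at the same log directory.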
utils/__init__.py

@@ -16,3 +16,4 @@ from . import logging
 from . import download
 from .metrics import ConfusionMatrix
 from .utils import *
+from .timer import Timer, calculate_eta
utils/timer.py (new file)

+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import time
+
+
+class Timer(object):
+    """ Simple timer class for measuring elapsed time """
+
+    def __init__(self):
+        self._start_time = 0.0
+        self._end_time = 0.0
+        self._elapsed_time = 0.0
+        self._is_running = False
+
+    def start(self):
+        self._is_running = True
+        self._start_time = time.time()
+
+    def restart(self):
+        self.start()
+
+    def stop(self):
+        self._is_running = False
+        self._end_time = time.time()
+
+    def elapsed_time(self):
+        if not self.is_running:
+            return 0.0
+        self._end_time = time.time()
+        self._elapsed_time = self._end_time - self._start_time
+        return self._elapsed_time
+
+    @property
+    def is_running(self):
+        return self._is_running
+
+
+def calculate_eta(remaining_step, speed):
+    if remaining_step < 0:
+        remaining_step = 0
+    remaining_time = int(remaining_step * speed)
+    result = "{:0>2}:{:0>2}:{:0>2}"
+    arr = []
+    for i in range(2, -1, -1):
+        arr.append(int(remaining_time / 60**i))
+        remaining_time %= 60**i
+    return result.format(*arr)
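A small usage sketch of the two helpers above, assuming it runs somewhere the utils package is importable; the sleep stands in for one training step and the numbers are made up to show the HH:MM:SS formatting:

```python
import time

from utils import Timer, calculate_eta  # re-exported by utils/__init__.py

timer = Timer()
timer.start()
time.sleep(0.1)  # stand-in for one training step
sec_per_step = timer.elapsed_time()  # seconds since start()
timer.restart()  # reset the reference point for the next step
print(calculate_eta(1000, 0.1))  # 1000 steps at 0.1 s/step -> '00:01:40'
```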
utils/utils.py

@@ -49,7 +49,7 @@ def get_environ_info():
 def load_pretrained_model(model, pretrained_model):
     if pretrained_model is not None:
-        logging.info('Load pretrained model!')
+        logging.info('Load pretrained model from {}'.format(pretrained_model))
         if os.path.exists(pretrained_model):
             ckpt_path = os.path.join(pretrained_model, 'model')
             para_state_dict, _ = fluid.load_dygraph(ckpt_path)
@@ -74,8 +74,30 @@ def load_pretrained_model(model, pretrained_model):
         else:
             raise ValueError(
-                'The pretrained model directory is not Found: {}'.formnat(
+                'The pretrained model directory is not Found: {}'.format(
                     pretrained_model))
     else:
         logging.info('No pretrained model to load, train from scratch')


+def resume(model, optimizer, resume_model):
+    if resume_model is not None:
+        logging.info('Resume model from {}'.format(resume_model))
+        if os.path.exists(resume_model):
+            ckpt_path = os.path.join(resume_model, 'model')
+            para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path)
+            model.set_dict(para_state_dict)
+            optimizer.set_dict(opti_state_dict)
+            epoch = resume_model.split('_')[-1]
+            if epoch.isdigit():
+                epoch = int(epoch)
+            return epoch
+        else:
+            raise ValueError(
+                'The resume model directory is not Found: {}'.format(
+                    resume_model))
+    else:
+        logging.info('No model need to resume')
+
+
 def visualize(image, result, save_dir=None, weight=0.6):
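Note that resume() recovers the starting epoch from the checkpoint directory name rather than from saved metadata; a minimal sketch with a hypothetical path:

```python
resume_model = 'output/epoch_12'  # hypothetical checkpoint directory
epoch = resume_model.split('_')[-1]  # -> '12'
if epoch.isdigit():
    epoch = int(epoch)  # train() then resumes from epoch 12
```

Directories saved by this PR follow the epoch_{N} pattern, so the parse succeeds; any other directory name would leave epoch as a string.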
val.py

@@ -29,6 +29,7 @@ import models
 import utils.logging as logging
 from utils import get_environ_info
 from utils import ConfusionMatrix
+from utils import Timer, calculate_eta


 def parse_args():

@@ -96,12 +97,14 @@ def evaluate(model,
         places=places,
         return_list=True,
     )
-    total_steps = math.ceil(len(eval_dataset) * 1.0 / batch_size)
+    total_steps = len(batch_sampler)
     conf_mat = ConfusionMatrix(num_classes, streaming=True)
     logging.info(
         "Start to evaluating(total_samples={}, total_steps={})...".format(
             len(eval_dataset), total_steps))
+    timer = Timer()
+    timer.start()
     for step, data in enumerate(loader):
         images = data[0]
         labels = data[1].astype('int64')

@@ -113,8 +116,13 @@ def evaluate(model,
         conf_mat.calculate(pred=pred, label=labels, ignore=mask)
         _, iou = conf_mat.mean_iou()
-        logging.info("[EVAL] Epoch={}, Step={}/{}, iou={}".format(
-            epoch_id, step + 1, total_steps, iou))
+        time_step = timer.elapsed_time()
+        remain_step = total_steps - step - 1
+        logging.info(
+            "[EVAL] Epoch={}, Step={}/{}, iou={:4f}, sec/step={:.4f} | ETA {}".
+            format(epoch_id, step + 1, total_steps, iou, time_step,
+                   calculate_eta(remain_step, time_step)))
+        timer.restart()

     category_iou, miou = conf_mat.mean_iou()
     category_acc, macc = conf_mat.accuracy()

@@ -123,6 +131,7 @@ def evaluate(model,
     logging.info("[EVAL] Category IoU: " + str(category_iou))
     logging.info("[EVAL] Category Acc: " + str(category_acc))
     logging.info("[EVAL] Kappa:{:.4f} ".format(conf_mat.kappa()))
+    return miou, macc


 def main(args):