Unverified commit 08232bbb, authored by wuzewu, committed by GitHub

Merge pull request #300 from wuyefeilin/dygraph

add training resume, best-model saving, VisualDL logging, and remaining-time (ETA) computation
@@ -26,6 +26,8 @@ import models
 import utils.logging as logging
 from utils import get_environ_info
 from utils import load_pretrained_model
+from utils import resume
+from utils import Timer, calculate_eta
 from val import evaluate
@@ -78,7 +80,13 @@ def parse_args():
     parser.add_argument(
         '--pretrained_model',
         dest='pretrained_model',
-        help='The path of pretrained weight',
+        help='The path of pretrained model',
+        type=str,
+        default=None)
+    parser.add_argument(
+        '--resume_model',
+        dest='resume_model',
+        help='The path of the model to resume training from',
         type=str,
         default=None)
     parser.add_argument(
@@ -104,6 +112,17 @@ def parse_args():
         dest='do_eval',
         help='Eval while training',
         action='store_true')
+    parser.add_argument(
+        '--log_steps',
+        dest='log_steps',
+        help='Display logging information every log_steps steps',
+        default=10,
+        type=int)
+    parser.add_argument(
+        '--use_vdl',
+        dest='use_vdl',
+        help='Whether to record data to VisualDL during training',
+        action='store_true')
     return parser.parse_args()
@@ -117,12 +136,19 @@ def train(model,
           num_epochs=100,
           batch_size=2,
           pretrained_model=None,
+          resume_model=None,
           save_interval_epochs=1,
+          log_steps=10,
           num_classes=None,
-          num_workers=8):
+          num_workers=8,
+          use_vdl=False):
     ignore_index = model.ignore_index
     nranks = ParallelEnv().nranks
+    start_epoch = 0
+    if resume_model is not None:
+        start_epoch = resume(model, optimizer, resume_model)
+    elif pretrained_model is not None:
         load_pretrained_model(model, pretrained_model)
     if not os.path.isdir(save_dir):
@@ -144,9 +170,19 @@ def train(model,
         return_list=True,
     )
-    num_steps_each_epoch = len(train_dataset) // batch_size
+    if use_vdl:
+        from visualdl import LogWriter
+        log_writer = LogWriter(save_dir)
-    for epoch in range(num_epochs):
+    timer = Timer()
+    timer.start()
+    avg_loss = 0.0
+    steps_per_epoch = len(batch_sampler)
+    total_steps = steps_per_epoch * (num_epochs - start_epoch)
+    num_steps = 0
+    best_mean_iou = -1.0
+    best_model_epoch = 1
+    for epoch in range(start_epoch, num_epochs):
         for step, data in enumerate(loader):
             images = data[0]
             labels = data[1].astype('int64')
@@ -160,22 +196,37 @@ def train(model,
             loss.backward()
             optimizer.minimize(loss)
             model.clear_gradients()
-            logging.info("[TRAIN] Epoch={}/{}, Step={}/{}, loss={}".format(
-                epoch + 1, num_epochs, step + 1, len(batch_sampler),
-                loss.numpy()))
+            avg_loss += loss.numpy()[0]
+            lr = optimizer.current_step_lr()
+            num_steps += 1
+            if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0:
+                avg_loss /= log_steps
+                time_step = timer.elapsed_time() / log_steps
+                remain_steps = total_steps - num_steps
+                logging.info(
+                    "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, sec/step={:.4f} | ETA {}"
+                    .format(epoch + 1, num_epochs, step + 1, steps_per_epoch,
+                            avg_loss, lr, time_step,
+                            calculate_eta(remain_steps, time_step)))
+                if use_vdl:
+                    log_writer.add_scalar('Train/loss', avg_loss, num_steps)
+                    log_writer.add_scalar('Train/lr', lr, num_steps)
+                avg_loss = 0.0
+                timer.restart()
         if ((epoch + 1) % save_interval_epochs == 0
-                or num_steps_each_epoch == num_epochs - 1
-            ) and ParallelEnv().local_rank == 0:
+                or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0:
             current_save_dir = os.path.join(save_dir,
                                             "epoch_{}".format(epoch + 1))
             if not os.path.isdir(current_save_dir):
                 os.makedirs(current_save_dir)
             fluid.save_dygraph(model.state_dict(),
                                os.path.join(current_save_dir, 'model'))
+            fluid.save_dygraph(optimizer.state_dict(),
+                               os.path.join(current_save_dir, 'model'))
         if eval_dataset is not None:
-            evaluate(
+            mean_iou, mean_acc = evaluate(
                 model,
                 eval_dataset,
                 places=places,
@@ -184,7 +235,24 @@ def train(model,
                 batch_size=batch_size,
                 ignore_index=ignore_index,
                 epoch_id=epoch + 1)
+            if mean_iou > best_mean_iou:
+                best_mean_iou = mean_iou
+                best_model_epoch = epoch + 1
+                best_model_dir = os.path.join(save_dir, "best_model")
+                fluid.save_dygraph(model.state_dict(),
+                                   os.path.join(best_model_dir, 'model'))
+            logging.info(
+                'Current best model on eval_dataset is epoch_{}, miou={:.4f}'
+                .format(best_model_epoch, best_mean_iou))
+            if use_vdl:
+                log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
+                                      epoch + 1)
+                log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
+                                      epoch + 1)
             model.train()
+    if use_vdl:
+        log_writer.close()
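For reference, the VisualDL usage introduced above reduces to a small, self-contained pattern. This is a minimal sketch, assuming the visualdl package is installed; the log directory and loss values are placeholders:

    # Minimal VisualDL sketch (hypothetical logdir; tags mirror 'Train/loss' above).
    from visualdl import LogWriter

    with LogWriter('output/vdl_log') as log_writer:
        for step in range(1, 101):
            loss = 1.0 / step  # stand-in for the real training loss
            log_writer.add_scalar('Train/loss', loss, step)

The recorded scalars can then be inspected in the VisualDL web UI pointed at the same directory.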
 def main(args):
@@ -223,7 +291,9 @@ def main(args):
         num_classes=train_dataset.num_classes, ignore_index=255)

     # Create optimizer
-    num_steps_each_epoch = len(train_dataset) // args.batch_size
+    # TODO: this may be one less than len(loader)
+    num_steps_each_epoch = len(train_dataset) // (
+        args.batch_size * ParallelEnv().nranks)
     decay_step = args.num_epochs * num_steps_each_epoch
     lr_decay = fluid.layers.polynomial_decay(
         args.learning_rate, decay_step, end_learning_rate=0, power=0.9)
@@ -243,9 +313,12 @@ def main(args):
         num_epochs=args.num_epochs,
         batch_size=args.batch_size,
         pretrained_model=args.pretrained_model,
+        resume_model=args.resume_model,
         save_interval_epochs=args.save_interval_epochs,
+        log_steps=args.log_steps,
         num_classes=train_dataset.num_classes,
-        num_workers=args.num_workers)
+        num_workers=args.num_workers,
+        use_vdl=args.use_vdl)

 if __name__ == '__main__':
...
@@ -16,3 +16,4 @@ from . import logging
 from . import download
 from .metrics import ConfusionMatrix
 from .utils import *
+from .timer import Timer, calculate_eta
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
class Timer(object):
    """A simple timer for measuring elapsed time."""

    def __init__(self):
        self._start_time = 0.0
        self._end_time = 0.0
        self._elapsed_time = 0.0
        self._is_running = False

    def start(self):
        self._is_running = True
        self._start_time = time.time()

    def restart(self):
        self.start()

    def stop(self):
        self._is_running = False
        self._end_time = time.time()

    def elapsed_time(self):
        if not self.is_running:
            return 0.0
        self._end_time = time.time()
        self._elapsed_time = self._end_time - self._start_time
        return self._elapsed_time

    @property
    def is_running(self):
        return self._is_running
def calculate_eta(remaining_step, speed):
    if remaining_step < 0:
        remaining_step = 0
    remaining_time = int(remaining_step * speed)
    result = "{:0>2}:{:0>2}:{:0>2}"
    arr = []
    for i in range(2, -1, -1):
        arr.append(int(remaining_time / 60**i))
        remaining_time %= 60**i
    return result.format(*arr)
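A quick usage sketch of the two helpers above, mirroring how the training loop consumes them (the window size, step count, and speed are illustrative values):

    # Average the elapsed time over a logging window, then estimate the ETA.
    timer = Timer()
    timer.start()
    # ... run log_steps (here 10) training steps ...
    time_step = timer.elapsed_time() / 10  # average seconds per step
    timer.restart()                        # start timing the next window
    print(calculate_eta(5000, 0.5))        # 5000 steps at 0.5 s/step -> '00:41:40'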
@@ -49,7 +49,7 @@ def get_environ_info():
 def load_pretrained_model(model, pretrained_model):
     if pretrained_model is not None:
-        logging.info('Load pretrained model!')
+        logging.info('Load pretrained model from {}'.format(pretrained_model))
         if os.path.exists(pretrained_model):
             ckpt_path = os.path.join(pretrained_model, 'model')
             para_state_dict, _ = fluid.load_dygraph(ckpt_path)
@@ -74,8 +74,30 @@ def load_pretrained_model(model, pretrained_model):
         else:
             raise ValueError(
-                'The pretrained model directory is not Found: {}'.formnat(
+                'The pretrained model directory is not found: {}'.format(
                     pretrained_model))
+    else:
+        logging.info('No pretrained model to load, training from scratch')
+def resume(model, optimizer, resume_model):
+    if resume_model is not None:
+        logging.info('Resume model from {}'.format(resume_model))
+        if os.path.exists(resume_model):
+            ckpt_path = os.path.join(resume_model, 'model')
+            para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path)
+            model.set_dict(para_state_dict)
+            optimizer.set_dict(opti_state_dict)
+            epoch = resume_model.split('_')[-1]
+            if epoch.isdigit():
+                epoch = int(epoch)
+            return epoch
+        else:
+            raise ValueError(
+                'The resume model directory is not found: {}'.format(
+                    resume_model))
+    else:
+        logging.info('No model to resume')
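As a usage note, resume() derives the starting epoch from the trailing '_<N>' suffix of the checkpoint directory, so it pairs with the 'epoch_{}' directories written by train(); fluid.load_dygraph reads the model.pdparams/model.pdopt pair produced by the two save_dygraph calls there. A hypothetical call:

    # 'output/epoch_10' is a placeholder directory from an earlier run;
    # the trailing '10' is parsed from the name and returned as the start epoch.
    start_epoch = resume(model, optimizer, 'output/epoch_10')  # -> 10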
 def visualize(image, result, save_dir=None, weight=0.6):
...

@@ -29,6 +29,7 @@ import models
 import utils.logging as logging
 from utils import get_environ_info
 from utils import ConfusionMatrix
+from utils import Timer, calculate_eta
 def parse_args():
@@ -96,12 +97,14 @@ def evaluate(model,
         places=places,
         return_list=True,
     )
-    total_steps = math.ceil(len(eval_dataset) * 1.0 / batch_size)
+    total_steps = len(batch_sampler)
     conf_mat = ConfusionMatrix(num_classes, streaming=True)
     logging.info(
         "Start evaluating (total_samples={}, total_steps={})...".format(
             len(eval_dataset), total_steps))
+    timer = Timer()
+    timer.start()
     for step, data in enumerate(loader):
         images = data[0]
         labels = data[1].astype('int64')
@@ -113,8 +116,13 @@ def evaluate(model,
         conf_mat.calculate(pred=pred, label=labels, ignore=mask)
         _, iou = conf_mat.mean_iou()
-        logging.info("[EVAL] Epoch={}, Step={}/{}, iou={}".format(
-            epoch_id, step + 1, total_steps, iou))
+        time_step = timer.elapsed_time()
+        remain_step = total_steps - step - 1
+        logging.info(
+            "[EVAL] Epoch={}, Step={}/{}, iou={:.4f}, sec/step={:.4f} | ETA {}"
+            .format(epoch_id, step + 1, total_steps, iou, time_step,
+                    calculate_eta(remain_step, time_step)))
+        timer.restart()
     category_iou, miou = conf_mat.mean_iou()
     category_acc, macc = conf_mat.accuracy()
@@ -123,6 +131,7 @@ def evaluate(model,
     logging.info("[EVAL] Category IoU: " + str(category_iou))
     logging.info("[EVAL] Category Acc: " + str(category_acc))
     logging.info("[EVAL] Kappa:{:.4f} ".format(conf_mat.kappa()))
+    return miou, macc

 def main(args):
...