From d1067776fdaa7597017dd20c1b53f82e603816f2 Mon Sep 17 00:00:00 2001 From: chenguowei01 Date: Thu, 30 Jul 2020 14:25:49 +0800 Subject: [PATCH] put train, evaluate and infer functions in core --- dygraph/core/__init__.py | 19 +++++ dygraph/core/infer.py | 72 ++++++++++++++++++ dygraph/core/train.py | 154 +++++++++++++++++++++++++++++++++++++++ dygraph/core/val.py | 84 +++++++++++++++++++++ dygraph/infer.py | 49 +------------ dygraph/train.py | 130 +-------------------------------- dygraph/val.py | 60 +-------------- 7 files changed, 332 insertions(+), 236 deletions(-) create mode 100644 dygraph/core/__init__.py create mode 100644 dygraph/core/infer.py create mode 100644 dygraph/core/train.py create mode 100644 dygraph/core/val.py diff --git a/dygraph/core/__init__.py b/dygraph/core/__init__.py new file mode 100644 index 00000000..202629f5 --- /dev/null +++ b/dygraph/core/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .train import train +from .val import evaluate +from .infer import infer + +__all__ = ['train', 'evaluate', 'infer'] diff --git a/dygraph/core/infer.py b/dygraph/core/infer.py new file mode 100644 index 00000000..fa390625 --- /dev/null +++ b/dygraph/core/infer.py @@ -0,0 +1,72 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +from paddle.fluid.dygraph.base import to_variable +import numpy as np +import paddle.fluid as fluid +import cv2 +import tqdm + +import utils +import utils.logging as logging + + +def mkdir(path): + sub_dir = os.path.dirname(path) + if not os.path.exists(sub_dir): + os.makedirs(sub_dir) + + +def infer(model, test_dataset=None, model_dir=None, save_dir='output'): + ckpt_path = os.path.join(model_dir, 'model') + para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path) + model.set_dict(para_state_dict) + model.eval() + + added_saved_dir = os.path.join(save_dir, 'added') + pred_saved_dir = os.path.join(save_dir, 'prediction') + + logging.info("Start to predict...") + for im, im_info, im_path in tqdm.tqdm(test_dataset): + im = to_variable(im) + pred, _ = model(im) + pred = pred.numpy() + pred = np.squeeze(pred).astype('uint8') + for info in im_info[::-1]: + if info[0] == 'resize': + h, w = info[1][0], info[1][1] + pred = cv2.resize(pred, (w, h), cv2.INTER_NEAREST) + elif info[0] == 'padding': + h, w = info[1][0], info[1][1] + pred = pred[0:h, 0:w] + else: + raise Exception("Unexpected info '{}' in im_info".format( + info[0])) + + im_file = im_path.replace(test_dataset.data_dir, '') + if im_file[0] == '/': + im_file = im_file[1:] + # save added image + added_image = utils.visualize(im_path, pred, weight=0.6) + added_image_path = os.path.join(added_saved_dir, im_file) + mkdir(added_image_path) + cv2.imwrite(added_image_path, added_image) + + # save prediction + pred_im = utils.visualize(im_path, pred, weight=0.0) + pred_saved_path = os.path.join(pred_saved_dir, im_file) + mkdir(pred_saved_path) + cv2.imwrite(pred_saved_path, pred_im) diff --git a/dygraph/core/train.py b/dygraph/core/train.py new file mode 100644 index 00000000..af74c42c --- /dev/null +++ b/dygraph/core/train.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle.fluid as fluid +from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.fluid.io import DataLoader +from paddle.incubate.hapi.distributed import DistributedBatchSampler + +import utils.logging as logging +from utils import load_pretrained_model +from utils import resume +from utils import Timer, calculate_eta +from .val import evaluate + + +def train(model, + train_dataset, + places=None, + eval_dataset=None, + optimizer=None, + save_dir='output', + num_epochs=100, + batch_size=2, + pretrained_model=None, + resume_model=None, + save_interval_epochs=1, + log_steps=10, + num_classes=None, + num_workers=8, + use_vdl=False): + ignore_index = model.ignore_index + nranks = ParallelEnv().nranks + + start_epoch = 0 + if resume_model is not None: + start_epoch = resume(model, optimizer, resume_model) + elif pretrained_model is not None: + load_pretrained_model(model, pretrained_model) + + if not os.path.isdir(save_dir): + if os.path.exists(save_dir): + os.remove(save_dir) + os.makedirs(save_dir) + + if nranks > 1: + strategy = fluid.dygraph.prepare_context() + model_parallel = fluid.dygraph.DataParallel(model, strategy) + + batch_sampler = DistributedBatchSampler( + train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) + loader = DataLoader( + train_dataset, + batch_sampler=batch_sampler, + places=places, + num_workers=num_workers, + return_list=True, + ) + + if use_vdl: + from visualdl import LogWriter + log_writer = LogWriter(save_dir) + + timer = Timer() + timer.start() + avg_loss = 0.0 + steps_per_epoch = len(batch_sampler) + total_steps = steps_per_epoch * (num_epochs - start_epoch) + num_steps = 0 + best_mean_iou = -1.0 + best_model_epoch = -1 + for epoch in range(start_epoch, num_epochs): + for step, data in enumerate(loader): + images = data[0] + labels = data[1].astype('int64') + if nranks > 1: + loss = model_parallel(images, labels) + loss = model_parallel.scale_loss(loss) + loss.backward() + model_parallel.apply_collective_grads() + else: + loss = model(images, labels) + loss.backward() + optimizer.minimize(loss) + model.clear_gradients() + avg_loss += loss.numpy()[0] + lr = optimizer.current_step_lr() + num_steps += 1 + if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0: + avg_loss /= log_steps + time_step = timer.elapsed_time() / log_steps + remain_steps = total_steps - num_steps + logging.info( + "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, sec/step={:.4f} | ETA {}" + .format(epoch + 1, num_epochs, step + 1, steps_per_epoch, + avg_loss * nranks, lr, time_step, + calculate_eta(remain_steps, time_step))) + if use_vdl: + log_writer.add_scalar('Train/loss', avg_loss, num_steps) + log_writer.add_scalar('Train/lr', lr, num_steps) + log_writer.add_scalar('Train/time_step', time_step, + num_steps) + avg_loss = 0.0 + timer.restart() + + if ((epoch + 1) % save_interval_epochs == 0 + or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0: + current_save_dir = os.path.join(save_dir, + "epoch_{}".format(epoch + 1)) + if not os.path.isdir(current_save_dir): + os.makedirs(current_save_dir) + fluid.save_dygraph(model.state_dict(), + os.path.join(current_save_dir, 'model')) + fluid.save_dygraph(optimizer.state_dict(), + os.path.join(current_save_dir, 'model')) + + if eval_dataset is not None: + mean_iou, mean_acc = evaluate( + model, + eval_dataset, + model_dir=current_save_dir, + num_classes=num_classes, + ignore_index=ignore_index, + epoch_id=epoch + 1) + if mean_iou > best_mean_iou: + best_mean_iou = mean_iou + best_model_epoch = epoch + 1 + best_model_dir = os.path.join(save_dir, "best_model") + fluid.save_dygraph(model.state_dict(), + os.path.join(best_model_dir, 'model')) + logging.info( + 'Current evaluated best model in eval_dataset is epoch_{}, miou={:4f}' + .format(best_model_epoch, best_mean_iou)) + + if use_vdl: + log_writer.add_scalar('Evaluate/mean_iou', mean_iou, + epoch + 1) + log_writer.add_scalar('Evaluate/mean_acc', mean_acc, + epoch + 1) + model.train() + if use_vdl: + log_writer.close() diff --git a/dygraph/core/val.py b/dygraph/core/val.py new file mode 100644 index 00000000..2ec5f3b8 --- /dev/null +++ b/dygraph/core/val.py @@ -0,0 +1,84 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import tqdm +import cv2 +from paddle.fluid.dygraph.base import to_variable +import paddle.fluid as fluid + +import utils.logging as logging +from utils import ConfusionMatrix +from utils import Timer, calculate_eta + + +def evaluate(model, + eval_dataset=None, + model_dir=None, + num_classes=None, + ignore_index=255, + epoch_id=None): + ckpt_path = os.path.join(model_dir, 'model') + para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path) + model.set_dict(para_state_dict) + model.eval() + + total_steps = len(eval_dataset) + conf_mat = ConfusionMatrix(num_classes, streaming=True) + + logging.info( + "Start to evaluating(total_samples={}, total_steps={})...".format( + len(eval_dataset), total_steps)) + timer = Timer() + timer.start() + for step, (im, im_info, label) in enumerate(eval_dataset): + im = to_variable(im) + pred, _ = model(im) + pred = pred.numpy().astype('float32') + pred = np.squeeze(pred) + for info in im_info[::-1]: + if info[0] == 'resize': + h, w = info[1][0], info[1][1] + pred = cv2.resize(pred, (w, h), cv2.INTER_NEAREST) + elif info[0] == 'padding': + h, w = info[1][0], info[1][1] + pred = pred[0:h, 0:w] + else: + raise Exception("Unexpected info '{}' in im_info".format( + info[0])) + pred = pred[np.newaxis, :, :, np.newaxis] + pred = pred.astype('int64') + mask = label != ignore_index + + conf_mat.calculate(pred=pred, label=label, ignore=mask) + _, iou = conf_mat.mean_iou() + + time_step = timer.elapsed_time() + remain_step = total_steps - step - 1 + logging.info( + "[EVAL] Epoch={}, Step={}/{}, iou={:4f}, sec/step={:.4f} | ETA {}". + format(epoch_id, step + 1, total_steps, iou, time_step, + calculate_eta(remain_step, time_step))) + timer.restart() + + category_iou, miou = conf_mat.mean_iou() + category_acc, macc = conf_mat.accuracy() + logging.info("[EVAL] #image={} acc={:.4f} IoU={:.4f}".format( + len(eval_dataset), macc, miou)) + logging.info("[EVAL] Category IoU: " + str(category_iou)) + logging.info("[EVAL] Category Acc: " + str(category_acc)) + logging.info("[EVAL] Kappa:{:.4f} ".format(conf_mat.kappa())) + return miou, macc diff --git a/dygraph/infer.py b/dygraph/infer.py index 3d4c5d5d..691fe1a1 100644 --- a/dygraph/infer.py +++ b/dygraph/infer.py @@ -28,6 +28,7 @@ from models import MODELS import utils import utils.logging as logging from utils import get_environ_info +from core import infer def parse_args(): @@ -81,54 +82,6 @@ def parse_args(): return parser.parse_args() -def mkdir(path): - sub_dir = os.path.dirname(path) - if not os.path.exists(sub_dir): - os.makedirs(sub_dir) - - -def infer(model, test_dataset=None, model_dir=None, save_dir='output'): - ckpt_path = os.path.join(model_dir, 'model') - para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path) - model.set_dict(para_state_dict) - model.eval() - - added_saved_dir = os.path.join(save_dir, 'added') - pred_saved_dir = os.path.join(save_dir, 'prediction') - - logging.info("Start to predict...") - for im, im_info, im_path in tqdm.tqdm(test_dataset): - im = to_variable(im) - pred, _ = model(im) - pred = pred.numpy() - pred = np.squeeze(pred).astype('uint8') - for info in im_info[::-1]: - if info[0] == 'resize': - h, w = info[1][0], info[1][1] - pred = cv2.resize(pred, (w, h), cv2.INTER_NEAREST) - elif info[0] == 'padding': - h, w = info[1][0], info[1][1] - pred = pred[0:h, 0:w] - else: - raise Exception("Unexpected info '{}' in im_info".format( - info[0])) - - im_file = im_path.replace(test_dataset.data_dir, '') - if im_file[0] == '/': - im_file = im_file[1:] - # save added image - added_image = utils.visualize(im_path, pred, weight=0.6) - added_image_path = os.path.join(added_saved_dir, im_file) - mkdir(added_image_path) - cv2.imwrite(added_image_path, added_image) - - # save prediction - pred_im = utils.visualize(im_path, pred, weight=0.0) - pred_saved_path = os.path.join(pred_saved_dir, im_file) - mkdir(pred_saved_path) - cv2.imwrite(pred_saved_path, pred_im) - - def main(args): env_info = get_environ_info() places = fluid.CUDAPlace(ParallelEnv().dev_id) \ diff --git a/dygraph/train.py b/dygraph/train.py index a95fd090..f665be25 100644 --- a/dygraph/train.py +++ b/dygraph/train.py @@ -28,7 +28,7 @@ from utils import get_environ_info from utils import load_pretrained_model from utils import resume from utils import Timer, calculate_eta -from val import evaluate +from core import train def parse_args(): @@ -128,134 +128,6 @@ def parse_args(): return parser.parse_args() -def train(model, - train_dataset, - places=None, - eval_dataset=None, - optimizer=None, - save_dir='output', - num_epochs=100, - batch_size=2, - pretrained_model=None, - resume_model=None, - save_interval_epochs=1, - log_steps=10, - num_classes=None, - num_workers=8, - use_vdl=False): - ignore_index = model.ignore_index - nranks = ParallelEnv().nranks - - start_epoch = 0 - if resume_model is not None: - start_epoch = resume(model, optimizer, resume_model) - elif pretrained_model is not None: - load_pretrained_model(model, pretrained_model) - - if not os.path.isdir(save_dir): - if os.path.exists(save_dir): - os.remove(save_dir) - os.makedirs(save_dir) - - if nranks > 1: - strategy = fluid.dygraph.prepare_context() - model_parallel = fluid.dygraph.DataParallel(model, strategy) - - batch_sampler = DistributedBatchSampler( - train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) - loader = DataLoader( - train_dataset, - batch_sampler=batch_sampler, - places=places, - num_workers=num_workers, - return_list=True, - ) - - if use_vdl: - from visualdl import LogWriter - log_writer = LogWriter(save_dir) - - timer = Timer() - timer.start() - avg_loss = 0.0 - steps_per_epoch = len(batch_sampler) - total_steps = steps_per_epoch * (num_epochs - start_epoch) - num_steps = 0 - best_mean_iou = -1.0 - best_model_epoch = -1 - for epoch in range(start_epoch, num_epochs): - for step, data in enumerate(loader): - images = data[0] - labels = data[1].astype('int64') - if nranks > 1: - loss = model_parallel(images, labels) - loss = model_parallel.scale_loss(loss) - loss.backward() - model_parallel.apply_collective_grads() - else: - loss = model(images, labels) - loss.backward() - optimizer.minimize(loss) - model.clear_gradients() - avg_loss += loss.numpy()[0] - lr = optimizer.current_step_lr() - num_steps += 1 - if num_steps % log_steps == 0 and ParallelEnv().local_rank == 0: - avg_loss /= log_steps - time_step = timer.elapsed_time() / log_steps - remain_steps = total_steps - num_steps - logging.info( - "[TRAIN] Epoch={}/{}, Step={}/{}, loss={:.4f}, lr={:.6f}, sec/step={:.4f} | ETA {}" - .format(epoch + 1, num_epochs, step + 1, steps_per_epoch, - avg_loss, lr, time_step, - calculate_eta(remain_steps, time_step))) - if use_vdl: - log_writer.add_scalar('Train/loss', avg_loss, num_steps) - log_writer.add_scalar('Train/lr', lr, num_steps) - log_writer.add_scalar('Train/time_step', time_step, - num_steps) - avg_loss = 0.0 - timer.restart() - - if ((epoch + 1) % save_interval_epochs == 0 - or epoch + 1 == num_epochs) and ParallelEnv().local_rank == 0: - current_save_dir = os.path.join(save_dir, - "epoch_{}".format(epoch + 1)) - if not os.path.isdir(current_save_dir): - os.makedirs(current_save_dir) - fluid.save_dygraph(model.state_dict(), - os.path.join(current_save_dir, 'model')) - fluid.save_dygraph(optimizer.state_dict(), - os.path.join(current_save_dir, 'model')) - - if eval_dataset is not None: - mean_iou, mean_acc = evaluate( - model, - eval_dataset, - model_dir=current_save_dir, - num_classes=num_classes, - ignore_index=ignore_index, - epoch_id=epoch + 1) - if mean_iou > best_mean_iou: - best_mean_iou = mean_iou - best_model_epoch = epoch + 1 - best_model_dir = os.path.join(save_dir, "best_model") - fluid.save_dygraph(model.state_dict(), - os.path.join(best_model_dir, 'model')) - logging.info( - 'Current evaluated best model in eval_dataset is epoch_{}, miou={:4f}' - .format(best_model_epoch, best_mean_iou)) - - if use_vdl: - log_writer.add_scalar('Evaluate/mean_iou', mean_iou, - epoch + 1) - log_writer.add_scalar('Evaluate/mean_acc', mean_acc, - epoch + 1) - model.train() - if use_vdl: - log_writer.close() - - def main(args): env_info = get_environ_info() places = fluid.CUDAPlace(ParallelEnv().dev_id) \ diff --git a/dygraph/val.py b/dygraph/val.py index 60fbd17e..abe2126b 100644 --- a/dygraph/val.py +++ b/dygraph/val.py @@ -32,6 +32,7 @@ import utils.logging as logging from utils import get_environ_info from utils import ConfusionMatrix from utils import Timer, calculate_eta +from core import evaluate def parse_args(): @@ -73,65 +74,6 @@ def parse_args(): return parser.parse_args() -def evaluate(model, - eval_dataset=None, - model_dir=None, - num_classes=None, - ignore_index=255, - epoch_id=None): - ckpt_path = os.path.join(model_dir, 'model') - para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path) - model.set_dict(para_state_dict) - model.eval() - - total_steps = len(eval_dataset) - conf_mat = ConfusionMatrix(num_classes, streaming=True) - - logging.info( - "Start to evaluating(total_samples={}, total_steps={})...".format( - len(eval_dataset), total_steps)) - timer = Timer() - timer.start() - for step, (im, im_info, label) in enumerate(eval_dataset): - im = to_variable(im) - pred, _ = model(im) - pred = pred.numpy().astype('float32') - pred = np.squeeze(pred) - for info in im_info[::-1]: - if info[0] == 'resize': - h, w = info[1][0], info[1][1] - pred = cv2.resize(pred, (w, h), cv2.INTER_NEAREST) - elif info[0] == 'padding': - h, w = info[1][0], info[1][1] - pred = pred[0:h, 0:w] - else: - raise Exception("Unexpected info '{}' in im_info".format( - info[0])) - pred = pred[np.newaxis, :, :, np.newaxis] - pred = pred.astype('int64') - mask = label != ignore_index - - conf_mat.calculate(pred=pred, label=label, ignore=mask) - _, iou = conf_mat.mean_iou() - - time_step = timer.elapsed_time() - remain_step = total_steps - step - 1 - logging.info( - "[EVAL] Epoch={}, Step={}/{}, iou={:4f}, sec/step={:.4f} | ETA {}". - format(epoch_id, step + 1, total_steps, iou, time_step, - calculate_eta(remain_step, time_step))) - timer.restart() - - category_iou, miou = conf_mat.mean_iou() - category_acc, macc = conf_mat.accuracy() - logging.info("[EVAL] #image={} acc={:.4f} IoU={:.4f}".format( - len(eval_dataset), macc, miou)) - logging.info("[EVAL] Category IoU: " + str(category_iou)) - logging.info("[EVAL] Category Acc: " + str(category_acc)) - logging.info("[EVAL] Kappa:{:.4f} ".format(conf_mat.kappa())) - return miou, macc - - def main(args): env_info = get_environ_info() places = fluid.CUDAPlace(ParallelEnv().dev_id) \ -- GitLab