diff --git a/dygraph/core/__init__.py b/dygraph/core/__init__.py
deleted file mode 100644
index 202629f542f40a2741cb12022adb10d7a56861b5..0000000000000000000000000000000000000000
--- a/dygraph/core/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .train import train
-from .val import evaluate
-from .infer import infer
-
-__all__ = ['train', 'evaluate', 'infer']
diff --git a/dygraph/core/train.py b/dygraph/core/train.py
deleted file mode 100644
index e7d33a1f0cbb59b39aeabc1fbeb1a4225ea2db33..0000000000000000000000000000000000000000
--- a/dygraph/core/train.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-import paddle
-import paddle.fluid as fluid
-from paddle.fluid.dygraph.parallel import ParallelEnv
-from paddle.fluid.io import DataLoader
-# from paddle.incubate.hapi.distributed import DistributedBatchSampler
-from paddle.io import DistributedBatchSampler
-import paddle.nn.functional as F
-
-import dygraph.utils.logger as logger
-from dygraph.utils import load_pretrained_model
-from dygraph.utils import resume
-from dygraph.utils import Timer, calculate_eta
-from .val import evaluate
-
-
-def check_logits_losses(logits, losses):
-    len_logits = len(logits)
-    len_losses = len(losses['types'])
-    if len_logits != len_losses:
-        raise RuntimeError(
-            'The length of logits should equal to the types of loss config: {} != {}.'
-            .format(len_logits, len_losses))
-
-
-def loss_computation(logits, label, losses):
-    check_logits_losses(logits, losses)
-    loss = 0
-    for i in range(len(logits)):
-        logit = logits[i]
-        if logit.shape[-2:] != label.shape[-2:]:
-            logit = F.resize_bilinear(logit, label.shape[-2:])
-        loss_i = losses['types'][i](logit, label)
-        loss += losses['coef'][i] * loss_i
-    return loss
-
-
-def train(model,
-          train_dataset,
-          places=None,
-          eval_dataset=None,
-          optimizer=None,
-          save_dir='output',
-          iters=10000,
-          batch_size=2,
-          resume_model=None,
-          save_interval_iters=1000,
-          log_iters=10,
-          num_classes=None,
-          num_workers=8,
-          use_vdl=False,
-          losses=None,
-          ignore_index=255):
-    nranks = ParallelEnv().nranks
-
-    start_iter = 0
-    if resume_model is not None:
-        start_iter = resume(model, optimizer, resume_model)
-
-    if not os.path.isdir(save_dir):
-        if os.path.exists(save_dir):
-            os.remove(save_dir)
-        os.makedirs(save_dir)
-
-    if nranks > 1:
-        strategy = fluid.dygraph.prepare_context()
-        ddp_model = fluid.dygraph.DataParallel(model, strategy)
-
-    batch_sampler = DistributedBatchSampler(
-        train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
-    loader = DataLoader(
-        train_dataset,
-        batch_sampler=batch_sampler,
-        places=places,
-        num_workers=num_workers,
-        return_list=True,
-    )
-
-    if use_vdl:
-        from visualdl import LogWriter
-        log_writer = LogWriter(save_dir)
-
-    timer = Timer()
-    avg_loss = 0.0
-    iters_per_epoch = len(batch_sampler)
-    best_mean_iou = -1.0
-    best_model_iter = -1
-    train_reader_cost = 0.0
-    train_batch_cost = 0.0
-    timer.start()
-
-    iter = start_iter
-    while iter < iters:
-        for data in loader:
-            iter += 1
-            if iter > iters:
-                break
-            train_reader_cost += timer.elapsed_time()
-            images = data[0]
-            labels = data[1].astype('int64')
-            if nranks > 1:
-                logits = ddp_model(images)
-                loss = loss_computation(logits, labels, losses)
-                # loss = ddp_model(images, labels)
-                # apply_collective_grads sum grads over multiple gpus.
-                loss = ddp_model.scale_loss(loss)
-                loss.backward()
-                ddp_model.apply_collective_grads()
-            else:
-                logits = model(images)
-                loss = loss_computation(logits, labels, losses)
-                # loss = model(images, labels)
-                loss.backward()
-            optimizer.minimize(loss)
-            model.clear_gradients()
-            avg_loss += loss.numpy()[0]
-            lr = optimizer.current_step_lr()
-            train_batch_cost += timer.elapsed_time()
-            if (iter) % log_iters == 0 and ParallelEnv().local_rank == 0:
-                avg_loss /= log_iters
-                avg_train_reader_cost = train_reader_cost / log_iters
-                avg_train_batch_cost = train_batch_cost / log_iters
-                train_reader_cost = 0.0
-                train_batch_cost = 0.0
-                remain_iters = iters - iter
-                eta = calculate_eta(remain_iters, avg_train_batch_cost)
-                logger.info(
-                    "[TRAIN] epoch={}, iter={}/{}, loss={:.4f}, lr={:.6f}, batch_cost={:.4f}, reader_cost={:.4f} | ETA {}"
-                    .format((iter - 1) // iters_per_epoch + 1, iter, iters,
-                            avg_loss * nranks, lr, avg_train_batch_cost,
-                            avg_train_reader_cost, eta))
-                if use_vdl:
-                    log_writer.add_scalar('Train/loss', avg_loss * nranks, iter)
-                    log_writer.add_scalar('Train/lr', lr, iter)
-                    log_writer.add_scalar('Train/batch_cost',
-                                          avg_train_batch_cost, iter)
-                    log_writer.add_scalar('Train/reader_cost',
-                                          avg_train_reader_cost, iter)
-                avg_loss = 0.0
-
-            if (iter % save_interval_iters == 0
-                    or iter == iters) and ParallelEnv().local_rank == 0:
-                current_save_dir = os.path.join(save_dir,
-                                                "iter_{}".format(iter))
-                if not os.path.isdir(current_save_dir):
-                    os.makedirs(current_save_dir)
-                fluid.save_dygraph(model.state_dict(),
-                                   os.path.join(current_save_dir, 'model'))
-                fluid.save_dygraph(optimizer.state_dict(),
-                                   os.path.join(current_save_dir, 'model'))
-
-                if eval_dataset is not None:
-                    mean_iou, avg_acc = evaluate(
-                        model,
-                        eval_dataset,
-                        model_dir=current_save_dir,
-                        num_classes=num_classes,
-                        ignore_index=ignore_index,
-                        iter_id=iter)
-                    if mean_iou > best_mean_iou:
-                        best_mean_iou = mean_iou
-                        best_model_iter = iter
-                        best_model_dir = os.path.join(save_dir, "best_model")
-                        fluid.save_dygraph(
-                            model.state_dict(),
-                            os.path.join(best_model_dir, 'model'))
-                    logger.info(
-                        'Current evaluated best model in eval_dataset is iter_{}, miou={:4f}'
-                        .format(best_model_iter, best_mean_iou))
-
-                    if use_vdl:
-                        log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter)
-                        log_writer.add_scalar('Evaluate/aAcc', avg_acc, iter)
-                model.train()
-            timer.restart()
-    if use_vdl:
-        log_writer.close()
diff --git a/dygraph/core/val.py b/dygraph/core/val.py
deleted file mode 100644
index 22e84a314cd4ffe8093f81dad724f3d7d12a05fe..0000000000000000000000000000000000000000
--- a/dygraph/core/val.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-import numpy as np
-import tqdm
-import cv2
-from paddle.fluid.dygraph.base import to_variable
-import paddle.fluid as fluid
-import paddle.nn.functional as F
-import paddle
-
-import dygraph.utils.logger as logger
-from dygraph.utils import ConfusionMatrix
-from dygraph.utils import Timer, calculate_eta
-
-
-def evaluate(model,
-             eval_dataset=None,
-             model_dir=None,
-             num_classes=None,
-             ignore_index=255,
-             iter_id=None):
-    ckpt_path = os.path.join(model_dir, 'model')
-    para_state_dict, opti_state_dict = fluid.load_dygraph(ckpt_path)
-    model.set_dict(para_state_dict)
-    model.eval()
-
-    total_iters = len(eval_dataset)
-    conf_mat = ConfusionMatrix(num_classes, streaming=True)
-
-    logger.info(
-        "Start to evaluating(total_samples={}, total_iters={})...".format(
-            len(eval_dataset), total_iters))
-    timer = Timer()
-    timer.start()
-    for iter, (im, im_info, label) in tqdm.tqdm(
-            enumerate(eval_dataset), total=total_iters):
-        im = to_variable(im)
-        # pred, _ = model(im)
-        logits = model(im)
-        pred = paddle.argmax(logits[0], axis=1)
-        pred = pred.numpy().astype('float32')
-        pred = np.squeeze(pred)
-        for info in im_info[::-1]:
-            if info[0] == 'resize':
-                h, w = info[1][0], info[1][1]
-                pred = cv2.resize(pred, (w, h), cv2.INTER_NEAREST)
-            elif info[0] == 'padding':
-                h, w = info[1][0], info[1][1]
-                pred = pred[0:h, 0:w]
-            else:
-                raise Exception("Unexpected info '{}' in im_info".format(
-                    info[0]))
-        pred = pred[np.newaxis, :, :, np.newaxis]
-        pred = pred.astype('int64')
-        mask = label != ignore_index
-
-        conf_mat.calculate(pred=pred, label=label, ignore=mask)
-        _, iou = conf_mat.mean_iou()
-
-        time_iter = timer.elapsed_time()
-        remain_iter = total_iters - iter - 1
-        logger.debug(
-            "[EVAL] iter_id={}, iter={}/{}, iou={:4f}, sec/iter={:.4f} | ETA {}"
-            .format(iter_id, iter + 1, total_iters, iou, time_iter,
                    calculate_eta(remain_iter, time_iter)))
-        timer.restart()
-
-    category_iou, miou = conf_mat.mean_iou()
-    category_acc, macc = conf_mat.accuracy()
-    logger.info("[EVAL] #Images={} mAcc={:.4f} mIoU={:.4f}".format(
-        len(eval_dataset), macc, miou))
-    logger.info("[EVAL] Category IoU: " + str(category_iou))
-    logger.info("[EVAL] Category Acc: " + str(category_acc))
-    logger.info("[EVAL] Kappa:{:.4f} ".format(conf_mat.kappa()))
-    return miou, macc
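
Notes on the removed code (illustrative sketches, not part of the patch):

The deleted loss_computation helper resizes each logit head to the label's spatial size before applying the matching loss, then accumulates a coefficient-weighted sum. A minimal sketch of the same idea against the Paddle 2.x API, assuming paddle.nn.functional.interpolate in place of the removed fluid-era F.resize_bilinear; the function name is hypothetical, and the losses dict layout ('types' and 'coef' keys) is taken from the deleted file:

    import paddle
    import paddle.nn.functional as F

    def weighted_multi_head_loss(logits, label, losses):
        # One loss entry is expected per logit head, mirroring the
        # check_logits_losses() guard in the deleted file.
        assert len(logits) == len(losses['types'])
        total = 0
        for i, logit in enumerate(logits):
            if logit.shape[-2:] != label.shape[-2:]:
                # Bilinearly upsample the prediction to the label size.
                logit = F.interpolate(
                    logit, size=label.shape[-2:], mode='bilinear')
            total += losses['coef'][i] * losses['types'][i](logit, label)
        return total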
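The deleted val.py leans on a streaming ConfusionMatrix from dygraph.utils for its mIoU, accuracy, and kappa numbers. Those metrics reduce to standard confusion-matrix arithmetic; the following is a self-contained NumPy sketch with hypothetical helper names, not the project's implementation (classes that never occur score IoU 0 here, a simplification):

    import numpy as np

    def miou_from_confusion(cm):
        # cm[i, j] counts pixels with ground truth i predicted as j.
        tp = np.diag(cm).astype(np.float64)
        fp = cm.sum(axis=0) - tp  # predicted as class c, truth elsewhere
        fn = cm.sum(axis=1) - tp  # truth is class c, predicted elsewhere
        iou = tp / np.maximum(tp + fp + fn, 1)
        return iou, iou.mean()

    def kappa_from_confusion(cm):
        # Cohen's kappa: observed agreement corrected for chance agreement.
        n = cm.sum()
        po = np.diag(cm).sum() / n
        pe = (cm.sum(axis=0) * cm.sum(axis=1)).sum() / (n * n)
        return (po - pe) / (1 - pe)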
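Finally, the evaluation loop maps each prediction back to the original image geometry by replaying im_info in reverse: cropping undoes a recorded 'padding' step, and a nearest-neighbour resize undoes a 'resize' step. One detail worth flagging: the deleted code passes cv2.INTER_NEAREST as the third positional argument of cv2.resize, which is the dst output slot rather than interpolation. The hypothetical sketch below uses the keyword form, which appears to be the intent:

    import cv2

    def restore_pred(pred, im_info):
        # Undo preprocessing ops in reverse order of application.
        # im_info entries look like ('resize', [h, w]) per the deleted code.
        for op, shape in reversed(im_info):
            h, w = shape[0], shape[1]
            if op == 'resize':
                # cv2.resize takes dsize as (width, height).
                pred = cv2.resize(
                    pred, (w, h), interpolation=cv2.INTER_NEAREST)
            elif op == 'padding':
                pred = pred[0:h, 0:w]
            else:
                raise ValueError(
                    "Unexpected info '{}' in im_info".format(op))
        return pred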