未验证 提交 523b0b41 编写于 作者: 走神的阿圆's avatar 走神的阿圆 提交者: GitHub

Switch TensorBoard to VisualDL 2.0 (#242)

* add_vdl

* Update docs for visualdl.
上级 6e54823f
......@@ -2,8 +2,6 @@ pre-commit
yapf == 0.26.0
flake8
pyyaml >= 5.1
tb-paddle
tensorboard >= 1.15.0
Pillow
numpy
six
......@@ -11,3 +9,4 @@ opencv-python
tqdm
requests
sklearn
visualdl == 2.0.0-alpha.2
......@@ -78,14 +78,14 @@ def parse_args():
help='debug mode, display detail information of training',
action='store_true')
parser.add_argument(
'--use_tb',
dest='use_tb',
help='whether to record the data during training to Tensorboard',
'--use_vdl',
dest='use_vdl',
help='whether to record the data during training to VisualDL',
action='store_true')
parser.add_argument(
'--tb_log_dir',
dest='tb_log_dir',
help='Tensorboard logging directory',
'--vdl_log_dir',
dest='vdl_log_dir',
help='VisualDL logging directory',
default=None,
type=str)
parser.add_argument(
......@@ -327,17 +327,17 @@ def train(cfg):
fetch_list.extend([pred.name, grts.name, masks.name])
# cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)
if args.use_tb:
if not args.tb_log_dir:
print_info("Please specify the log directory by --tb_log_dir.")
if args.use_vdl:
if not args.vdl_log_dir:
print_info("Please specify the log directory by --vdl_log_dir.")
exit(1)
from tb_paddle import SummaryWriter
log_writer = SummaryWriter(args.tb_log_dir)
from visualdl import LogWriter
log_writer = LogWriter(args.vdl_log_dir)
# trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
# num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
global_step = 0
step = 0
all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
all_step += 1
......@@ -377,9 +377,9 @@ def train(cfg):
avg_acc += np.mean(out_acc)
avg_fp += np.mean(out_fp)
avg_fn += np.mean(out_fn)
global_step += 1
step += 1
if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
avg_loss /= args.log_steps
avg_seg_loss /= args.log_steps
avg_emb_loss /= args.log_steps
......@@ -389,14 +389,14 @@ def train(cfg):
speed = args.log_steps / timer.elapsed_time()
print((
"epoch={} step={} lr={:.5f} loss={:.4f} seg_loss={:.4f} emb_loss={:.4f} accuracy={:.4} fp={:.4} fn={:.4} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, avg_seg_loss,
).format(epoch, step, lr[0], avg_loss, avg_seg_loss,
avg_emb_loss, avg_acc, avg_fp, avg_fn, speed,
calculate_eta(all_step - global_step, speed)))
if args.use_tb:
calculate_eta(all_step - step, speed)))
if args.use_vdl:
log_writer.add_scalar('Train/loss', avg_loss,
global_step)
log_writer.add_scalar('Train/lr', lr[0], global_step)
log_writer.add_scalar('Train/speed', speed, global_step)
step)
log_writer.add_scalar('Train/lr', lr[0], step)
log_writer.add_scalar('Train/speed', speed, step)
sys.stdout.flush()
avg_loss = 0.0
avg_seg_loss = 0.0
......@@ -422,14 +422,14 @@ def train(cfg):
ckpt_dir=ckpt_dir,
use_gpu=args.use_gpu,
use_mpio=args.use_mpio)
if args.use_tb:
if args.use_vdl:
log_writer.add_scalar('Evaluate/accuracy', accuracy,
global_step)
log_writer.add_scalar('Evaluate/fp', fp, global_step)
log_writer.add_scalar('Evaluate/fn', fn, global_step)
step)
log_writer.add_scalar('Evaluate/fp', fp, step)
log_writer.add_scalar('Evaluate/fn', fn, step)
# Use Tensorboard to visualize results
if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None:
# Use VisualDL to visualize results
if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
visualize(
cfg=cfg,
use_gpu=args.use_gpu,
......
......@@ -68,7 +68,7 @@ cfg.DATASET.VAL_TOTAL_IMAGES = 500
cfg.DATASET.TEST_FILE_LIST = './dataset/cityscapes/test.list'
# 测试数据数量
cfg.DATASET.TEST_TOTAL_IMAGES = 500
# Tensorboard 可视化的数据集
# VisualDL 可视化的数据集
cfg.DATASET.VIS_FILE_LIST = None
# 类别数(需包括背景类)
cfg.DATASET.NUM_CLASSES = 19
......
......@@ -27,10 +27,10 @@ python pdseg/train.py BATCH_SIZE 1 --cfg configs/unet_optic.yaml
|--cfg|配置文件路径|ALL|None||
|--use_gpu|是否使用GPU进行训练|train/eval/vis|False||
|--use_mpio|是否使用多进程进行IO处理|train/eval|False|打开该开关会占用一定量的CPU内存,但是可以提高训练速度。</br> **NOTE:** windows平台下不支持该功能, 建议使用自定义数据初次训练时不打开,打开会导致数据读取异常不可见。 |
|--use_tb|是否使用TensorBoard记录训练数据|train|False||
|--use_vdl|是否使用VisualDL记录训练数据|train|False||
|--log_steps|训练日志的打印周期(单位为step)|train|10||
|--debug|是否打印debug信息|train|False|IOU等指标涉及到混淆矩阵的计算,会降低训练速度|
|--tb_log_dir &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|TensorBoard的日志路径|train|None||
|--vdl_log_dir &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|VisualDL的日志路径|train|None||
|--do_eval|是否在保存模型时进行效果评估 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|train|False||
|--vis_dir|保存可视化图片的路径|vis|"visual"||
......@@ -80,7 +80,7 @@ DATASET:
VAL_FILE_LIST: './dataset/cityscapes/val.list'
# 测试数据列表
TEST_FILE_LIST: './dataset/cityscapes/test.list'
# Tensorboard 可视化的数据集
# VisualDL 可视化的数据集
VIS_FILE_LIST: None
# 类别数(需包括背景类)
NUM_CLASSES: 19
......
......@@ -62,7 +62,7 @@ DATASET Group存放所有与数据集相关的配置
## `VIS_FILE_LIST`
可视化列表,调用`pdseg/train.py`进行训练时,如果打开了--use_tb开关,则在每次模型保存的时候,会读取该列表中的图片进行可视化
可视化列表,调用`pdseg/train.py`进行训练时,如果打开了--use_vdl开关,则在每次模型保存的时候,会读取该列表中的图片进行可视化
文件列表由多行组成,每一行的格式为
```
......
......@@ -49,8 +49,8 @@ export CUDA_VISIBLE_DEVICES=0
python pdseg/train.py --cfg configs/unet_optic.yaml \
--use_gpu \
--do_eval \
--use_tb \
--tb_log_dir train_log \
--use_vdl \
--vdl_log_dir train_log \
BATCH_SIZE 4 \
SOLVER.LR 0.001
......@@ -70,22 +70,22 @@ export CUDA_VISIBLE_DEVICES=0,1,2
## 5.训练过程可视化
当打开do_eval和use_tb两个开关后,我们可以通过TensorBoard查看边训练边评估的效果。
当打开do_eval和use_vdl两个开关后,我们可以通过VisualDL查看边训练边评估的效果。
```shell
tensorboard --logdir train_log --host {$HOST_IP} --port {$PORT}
visualdl --logdir train_log --host {$HOST_IP} --port {$PORT}
```
NOTE:
1. 上述示例中,$HOST\_IP为机器IP地址,请替换为实际IP,$PORT请替换为可访问的端口。
2. 数据量较大时,前端加载速度会比较慢,请耐心等待。
启动TensorBoard命令后,我们可以在浏览器中查看对应的训练数据。
启动VisualDL命令后,我们可以在浏览器中查看对应的训练数据。
`SCALAR`这个tab中,查看训练loss、iou、acc的变化趋势。
![](./imgs/tensorboard_scalar.JPG)
![](./imgs/visualdl_scalar.png)
`IMAGE`这个tab中,查看样本图片。
![](./imgs/tensorboard_image.JPG)
![](./imgs/visualdl_image.png)
## 6.模型评估
训练完成后,我们可以通过eval.py来评估模型效果。由于我们设置的训练EPOCH数量为10,保存间隔为5,因此一共会产生2个定期保存的模型,加上最终保存的final模型,一共有3个模型。我们选择最后保存的模型进行效果的评估:
......
......@@ -77,14 +77,14 @@ def parse_args():
help='debug mode, display detail information of training',
action='store_true')
parser.add_argument(
'--use_tb',
dest='use_tb',
help='whether to record the data during training to Tensorboard',
'--use_vdl',
dest='use_vdl',
help='whether to record the data during training to VisualDL',
action='store_true')
parser.add_argument(
'--tb_log_dir',
dest='tb_log_dir',
help='Tensorboard logging directory',
'--vdl_log_dir',
dest='vdl_log_dir',
help='VisualDL logging directory',
default=None,
type=str)
parser.add_argument(
......@@ -354,17 +354,17 @@ def train(cfg):
fetch_list.extend([pred.name, grts.name, masks.name])
cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)
if args.use_tb:
if not args.tb_log_dir:
print_info("Please specify the log directory by --tb_log_dir.")
if args.use_vdl:
if not args.vdl_log_dir:
print_info("Please specify the log directory by --vdl_log_dir.")
exit(1)
from tb_paddle import SummaryWriter
log_writer = SummaryWriter(args.tb_log_dir)
from visualdl import LogWriter
log_writer = LogWriter(args.vdl_log_dir)
# trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
# num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
global_step = 0
step = 0
all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
all_step += 1
......@@ -398,9 +398,9 @@ def train(cfg):
return_numpy=True)
cm.calculate(pred, grts, masks)
avg_loss += np.mean(np.array(loss))
global_step += 1
step += 1
if global_step % args.log_steps == 0:
if step % args.log_steps == 0:
speed = args.log_steps / timer.elapsed_time()
avg_loss /= args.log_steps
category_acc, mean_acc = cm.accuracy()
......@@ -408,22 +408,22 @@ def train(cfg):
print_info((
"epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, mean_acc,
).format(epoch, step, lr[0], avg_loss, mean_acc,
mean_iou, speed,
calculate_eta(all_step - global_step, speed)))
calculate_eta(all_step - step, speed)))
print_info("Category IoU: ", category_iou)
print_info("Category Acc: ", category_acc)
if args.use_tb:
if args.use_vdl:
log_writer.add_scalar('Train/mean_iou', mean_iou,
global_step)
step)
log_writer.add_scalar('Train/mean_acc', mean_acc,
global_step)
step)
log_writer.add_scalar('Train/loss', avg_loss,
global_step)
step)
log_writer.add_scalar('Train/lr', lr[0],
global_step)
step)
log_writer.add_scalar('Train/step/sec', speed,
global_step)
step)
sys.stdout.flush()
avg_loss = 0.0
cm.zero_matrix()
......@@ -435,30 +435,30 @@ def train(cfg):
fetch_list=fetch_list,
return_numpy=True)
avg_loss += np.mean(np.array(loss))
global_step += 1
step += 1
if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
avg_loss /= args.log_steps
speed = args.log_steps / timer.elapsed_time()
print((
"epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, speed,
calculate_eta(all_step - global_step, speed)))
if args.use_tb:
).format(epoch, step, lr[0], avg_loss, speed,
calculate_eta(all_step - step, speed)))
if args.use_vdl:
log_writer.add_scalar('Train/loss', avg_loss,
global_step)
step)
log_writer.add_scalar('Train/lr', lr[0],
global_step)
step)
log_writer.add_scalar('Train/speed', speed,
global_step)
step)
sys.stdout.flush()
avg_loss = 0.0
timer.restart()
# NOTE : used for benchmark, profiler tools
if args.is_profiler and epoch == 1 and global_step == args.log_steps:
if args.is_profiler and epoch == 1 and step == args.log_steps:
profiler.start_profiler("All")
elif args.is_profiler and epoch == 1 and global_step == args.log_steps + 5:
elif args.is_profiler and epoch == 1 and step == args.log_steps + 5:
profiler.stop_profiler("total", args.profiler_path)
return
......@@ -479,11 +479,11 @@ def train(cfg):
ckpt_dir=ckpt_dir,
use_gpu=args.use_gpu,
use_mpio=args.use_mpio)
if args.use_tb:
if args.use_vdl:
log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
global_step)
step)
log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
global_step)
step)
if mean_iou > best_mIoU:
best_mIoU = mean_iou
......@@ -493,8 +493,8 @@ def train(cfg):
os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
mean_iou))
# Use Tensorboard to visualize results
if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None:
# Use VisualDL to visualize results
if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
visualize(
cfg=cfg,
use_gpu=args.use_gpu,
......
......@@ -56,7 +56,7 @@ cfg.DATASET.VAL_TOTAL_IMAGES = 500
cfg.DATASET.TEST_FILE_LIST = './dataset/cityscapes/test.list'
# 测试数据数量
cfg.DATASET.TEST_TOTAL_IMAGES = 500
# Tensorboard 可视化的数据集
# VisualDL 可视化的数据集
cfg.DATASET.VIS_FILE_LIST = None
# 类别数(需包括背景类)
cfg.DATASET.NUM_CLASSES = 19
......
......@@ -162,18 +162,17 @@ def visualize(cfg,
img_cnt += 1
print("#{} visualize image path: {}".format(img_cnt, vis_fn))
# Use Tensorboard to visualize image
# Use VisualDL to visualize image
if log_writer is not None:
# Calulate epoch from ckpt_dir folder name
epoch = int(os.path.split(ckpt_dir)[-1])
print("Tensorboard visualization epoch", epoch)
print("VisualDL visualization epoch", epoch)
pred_mask_np = np.array(pred_mask.convert("RGB"))
log_writer.add_image(
"Predict/{}".format(img_name),
pred_mask_np,
epoch,
dataformats='HWC')
epoch)
# Original image
# BGR->RGB
img = cv2.imread(
......@@ -181,8 +180,7 @@ def visualize(cfg,
log_writer.add_image(
"Images/{}".format(img_name),
img,
epoch,
dataformats='HWC')
epoch)
# add ground truth (label) images
grt = grts[i]
if grt is not None:
......@@ -194,8 +192,7 @@ def visualize(cfg,
log_writer.add_image(
"Label/{}".format(img_name),
grt,
epoch,
dataformats='HWC')
epoch)
# If in local_test mode, only visualize 5 images just for testing
# procedure
......
......@@ -2,11 +2,10 @@ pre-commit
yapf == 0.26.0
flake8
pyyaml >= 5.1
tb-paddle
tensorboard >= 1.15.0
Pillow
numpy
six
opencv-python
tqdm
requests
visualdl == 2.0.0-alpha.2
......@@ -87,14 +87,14 @@ def parse_args():
help='debug mode, display detail information of training',
action='store_true')
parser.add_argument(
'--use_tb',
dest='use_tb',
help='whether to record the data during training to Tensorboard',
'--use_vdl',
dest='use_vdl',
help='whether to record the data during training to VisualDL',
action='store_true')
parser.add_argument(
'--tb_log_dir',
dest='tb_log_dir',
help='Tensorboard logging directory',
'--vdl_log_dir',
        dest='vdl_log_dir',
help='VisualDL logging directory',
default=None,
type=str)
parser.add_argument(
......@@ -409,17 +409,17 @@ def train(cfg):
fetch_list.extend([pred.name, grts.name, masks.name])
cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)
if args.use_tb:
if not args.tb_log_dir:
print_info("Please specify the log directory by --tb_log_dir.")
if args.use_vdl:
if not args.vdl_log_dir:
print_info("Please specify the log directory by --vdl_log_dir.")
exit(1)
from tb_paddle import SummaryWriter
log_writer = SummaryWriter(args.tb_log_dir)
from visualdl import LogWriter
log_writer = LogWriter(args.vdl_log_dir)
# trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
# num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
global_step = 0
step = 0
all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
all_step += 1
......@@ -455,9 +455,9 @@ def train(cfg):
return_numpy=True)
cm.calculate(pred, grts, masks)
avg_loss += np.mean(np.array(loss))
global_step += 1
step += 1
if global_step % args.log_steps == 0:
if step % args.log_steps == 0:
speed = args.log_steps / timer.elapsed_time()
avg_loss /= args.log_steps
category_acc, mean_acc = cm.accuracy()
......@@ -465,22 +465,22 @@ def train(cfg):
print_info((
"epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, mean_acc,
).format(epoch, step, lr[0], avg_loss, mean_acc,
mean_iou, speed,
calculate_eta(all_step - global_step, speed)))
calculate_eta(all_step - step, speed)))
print_info("Category IoU: ", category_iou)
print_info("Category Acc: ", category_acc)
if args.use_tb:
if args.use_vdl:
log_writer.add_scalar('Train/mean_iou', mean_iou,
global_step)
step)
log_writer.add_scalar('Train/mean_acc', mean_acc,
global_step)
step)
log_writer.add_scalar('Train/loss', avg_loss,
global_step)
step)
log_writer.add_scalar('Train/lr', lr[0],
global_step)
step)
log_writer.add_scalar('Train/step/sec', speed,
global_step)
step)
sys.stdout.flush()
avg_loss = 0.0
cm.zero_matrix()
......@@ -494,25 +494,25 @@ def train(cfg):
avg_loss += np.mean(np.array(loss))
avg_t_loss += np.mean(np.array(t_loss))
avg_d_loss += np.mean(np.array(d_loss))
global_step += 1
step += 1
if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
avg_loss /= args.log_steps
avg_t_loss /= args.log_steps
avg_d_loss /= args.log_steps
speed = args.log_steps / timer.elapsed_time()
print((
"epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss,
).format(epoch, step, lr[0], avg_loss,
avg_t_loss, avg_d_loss, speed,
calculate_eta(all_step - global_step, speed)))
if args.use_tb:
calculate_eta(all_step - step, speed)))
if args.use_vdl:
log_writer.add_scalar('Train/loss', avg_loss,
global_step)
step)
log_writer.add_scalar('Train/lr', lr[0],
global_step)
step)
log_writer.add_scalar('Train/speed', speed,
global_step)
step)
sys.stdout.flush()
avg_loss = 0.0
avg_t_loss = 0.0
......@@ -536,11 +536,11 @@ def train(cfg):
ckpt_dir=ckpt_dir,
use_gpu=args.use_gpu,
use_mpio=args.use_mpio)
if args.use_tb:
if args.use_vdl:
log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
global_step)
step)
log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
global_step)
step)
if mean_iou > best_mIoU:
best_mIoU = mean_iou
......@@ -550,8 +550,8 @@ def train(cfg):
os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
mean_iou))
# Use Tensorboard to visualize results
if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None:
# Use VisualDL to visualize results
if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
visualize(
cfg=cfg,
use_gpu=args.use_gpu,
......
......@@ -87,14 +87,14 @@ def parse_args():
help='debug mode, display detail information of training',
action='store_true')
parser.add_argument(
'--use_tb',
dest='use_tb',
help='whether to record the data during training to Tensorboard',
'--use_vdl',
dest='use_vdl',
help='whether to record the data during training to VisualDL',
action='store_true')
parser.add_argument(
'--tb_log_dir',
dest='tb_log_dir',
help='Tensorboard logging directory',
'--vdl_log_dir',
dest='vdl_log_dir',
help='VisualDL logging directory',
default=None,
type=str)
parser.add_argument(
......
......@@ -83,14 +83,14 @@ def parse_args():
help='debug mode, display detail information of training',
action='store_true')
parser.add_argument(
'--use_tb',
dest='use_tb',
help='whether to record the data during training to Tensorboard',
'--use_vdl',
dest='use_vdl',
help='whether to record the data during training to VisualDL',
action='store_true')
parser.add_argument(
'--tb_log_dir',
dest='tb_log_dir',
help='Tensorboard logging directory',
'--vdl_log_dir',
dest='vdl_log_dir',
help='VisualDL logging directory',
default=None,
type=str)
parser.add_argument(
......@@ -335,13 +335,13 @@ def train(cfg):
fetch_list.extend([pred.name, grts.name, masks.name])
cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)
if args.use_tb:
if not args.tb_log_dir:
print_info("Please specify the log directory by --tb_log_dir.")
if args.use_vdl:
if not args.vdl_log_dir:
print_info("Please specify the log directory by --vdl_log_dir.")
exit(1)
from tb_paddle import SummaryWriter
log_writer = SummaryWriter(args.tb_log_dir)
from visualdl import LogWriter
log_writer = LogWriter(args.vdl_log_dir)
pruner = Pruner()
train_prog = pruner.prune(
......@@ -357,7 +357,7 @@ def train(cfg):
exec_strategy=exec_strategy,
build_strategy=build_strategy)
global_step = 0
step = 0
all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
all_step += 1
......@@ -389,9 +389,9 @@ def train(cfg):
return_numpy=True)
cm.calculate(pred, grts, masks)
avg_loss += np.mean(np.array(loss))
global_step += 1
step += 1
if global_step % args.log_steps == 0:
if step % args.log_steps == 0:
speed = args.log_steps / timer.elapsed_time()
avg_loss /= args.log_steps
category_acc, mean_acc = cm.accuracy()
......@@ -399,22 +399,22 @@ def train(cfg):
print_info((
"epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, mean_acc,
).format(epoch, step, lr[0], avg_loss, mean_acc,
mean_iou, speed,
calculate_eta(all_step - global_step, speed)))
calculate_eta(all_step - step, speed)))
print_info("Category IoU: ", category_iou)
print_info("Category Acc: ", category_acc)
if args.use_tb:
if args.use_vdl:
log_writer.add_scalar('Train/mean_iou', mean_iou,
global_step)
step)
log_writer.add_scalar('Train/mean_acc', mean_acc,
global_step)
step)
log_writer.add_scalar('Train/loss', avg_loss,
global_step)
step)
log_writer.add_scalar('Train/lr', lr[0],
global_step)
step)
log_writer.add_scalar('Train/step/sec', speed,
global_step)
step)
sys.stdout.flush()
avg_loss = 0.0
cm.zero_matrix()
......@@ -426,22 +426,22 @@ def train(cfg):
fetch_list=fetch_list,
return_numpy=True)
avg_loss += np.mean(np.array(loss))
global_step += 1
step += 1
if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
avg_loss /= args.log_steps
speed = args.log_steps / timer.elapsed_time()
print((
"epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, speed,
calculate_eta(all_step - global_step, speed)))
if args.use_tb:
).format(epoch, step, lr[0], avg_loss, speed,
calculate_eta(all_step - step, speed)))
if args.use_vdl:
log_writer.add_scalar('Train/loss', avg_loss,
global_step)
step)
log_writer.add_scalar('Train/lr', lr[0],
global_step)
step)
log_writer.add_scalar('Train/speed', speed,
global_step)
step)
sys.stdout.flush()
avg_loss = 0.0
timer.restart()
......@@ -463,14 +463,14 @@ def train(cfg):
ckpt_dir=ckpt_dir,
use_gpu=args.use_gpu,
use_mpio=args.use_mpio)
if args.use_tb:
if args.use_vdl:
log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
global_step)
step)
log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
global_step)
step)
# Use Tensorboard to visualize results
if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None:
# Use VisualDL to visualize results
if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
visualize(
cfg=cfg,
use_gpu=args.use_gpu,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册