未验证 提交 523b0b41 编写于 作者: 走神的阿圆's avatar 走神的阿圆 提交者: GitHub

Switch TensorBoard to VisualDL 2.0 (#242)

* add_vdl

* Update docs for visualdl.
上级 6e54823f
...@@ -2,8 +2,6 @@ pre-commit ...@@ -2,8 +2,6 @@ pre-commit
yapf == 0.26.0 yapf == 0.26.0
flake8 flake8
pyyaml >= 5.1 pyyaml >= 5.1
tb-paddle
tensorboard >= 1.15.0
Pillow Pillow
numpy numpy
six six
...@@ -11,3 +9,4 @@ opencv-python ...@@ -11,3 +9,4 @@ opencv-python
tqdm tqdm
requests requests
sklearn sklearn
visualdl == 2.0.0-alpha.2
...@@ -78,14 +78,14 @@ def parse_args(): ...@@ -78,14 +78,14 @@ def parse_args():
help='debug mode, display detail information of training', help='debug mode, display detail information of training',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--use_tb', '--use_vdl',
dest='use_tb', dest='use_vdl',
help='whether to record the data during training to Tensorboard', help='whether to record the data during training to VisualDL',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--tb_log_dir', '--vdl_log_dir',
dest='tb_log_dir', dest='vdl_log_dir',
help='Tensorboard logging directory', help='VisualDL logging directory',
default=None, default=None,
type=str) type=str)
parser.add_argument( parser.add_argument(
...@@ -327,17 +327,17 @@ def train(cfg): ...@@ -327,17 +327,17 @@ def train(cfg):
fetch_list.extend([pred.name, grts.name, masks.name]) fetch_list.extend([pred.name, grts.name, masks.name])
# cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) # cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)
if args.use_tb: if args.use_vdl:
if not args.tb_log_dir: if not args.vdl_log_dir:
print_info("Please specify the log directory by --tb_log_dir.") print_info("Please specify the log directory by --vdl_log_dir.")
exit(1) exit(1)
from tb_paddle import SummaryWriter from visualdl import LogWriter
log_writer = SummaryWriter(args.tb_log_dir) log_writer = LogWriter(args.vdl_log_dir)
# trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
# num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
global_step = 0 step = 0
all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
all_step += 1 all_step += 1
...@@ -377,9 +377,9 @@ def train(cfg): ...@@ -377,9 +377,9 @@ def train(cfg):
avg_acc += np.mean(out_acc) avg_acc += np.mean(out_acc)
avg_fp += np.mean(out_fp) avg_fp += np.mean(out_fp)
avg_fn += np.mean(out_fn) avg_fn += np.mean(out_fn)
global_step += 1 step += 1
if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0: if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
avg_loss /= args.log_steps avg_loss /= args.log_steps
avg_seg_loss /= args.log_steps avg_seg_loss /= args.log_steps
avg_emb_loss /= args.log_steps avg_emb_loss /= args.log_steps
...@@ -389,14 +389,14 @@ def train(cfg): ...@@ -389,14 +389,14 @@ def train(cfg):
speed = args.log_steps / timer.elapsed_time() speed = args.log_steps / timer.elapsed_time()
print(( print((
"epoch={} step={} lr={:.5f} loss={:.4f} seg_loss={:.4f} emb_loss={:.4f} accuracy={:.4} fp={:.4} fn={:.4} step/sec={:.3f} | ETA {}" "epoch={} step={} lr={:.5f} loss={:.4f} seg_loss={:.4f} emb_loss={:.4f} accuracy={:.4} fp={:.4} fn={:.4} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, avg_seg_loss, ).format(epoch, step, lr[0], avg_loss, avg_seg_loss,
avg_emb_loss, avg_acc, avg_fp, avg_fn, speed, avg_emb_loss, avg_acc, avg_fp, avg_fn, speed,
calculate_eta(all_step - global_step, speed))) calculate_eta(all_step - step, speed)))
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Train/loss', avg_loss, log_writer.add_scalar('Train/loss', avg_loss,
global_step) step)
log_writer.add_scalar('Train/lr', lr[0], global_step) log_writer.add_scalar('Train/lr', lr[0], step)
log_writer.add_scalar('Train/speed', speed, global_step) log_writer.add_scalar('Train/speed', speed, step)
sys.stdout.flush() sys.stdout.flush()
avg_loss = 0.0 avg_loss = 0.0
avg_seg_loss = 0.0 avg_seg_loss = 0.0
...@@ -422,14 +422,14 @@ def train(cfg): ...@@ -422,14 +422,14 @@ def train(cfg):
ckpt_dir=ckpt_dir, ckpt_dir=ckpt_dir,
use_gpu=args.use_gpu, use_gpu=args.use_gpu,
use_mpio=args.use_mpio) use_mpio=args.use_mpio)
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Evaluate/accuracy', accuracy, log_writer.add_scalar('Evaluate/accuracy', accuracy,
global_step) step)
log_writer.add_scalar('Evaluate/fp', fp, global_step) log_writer.add_scalar('Evaluate/fp', fp, step)
log_writer.add_scalar('Evaluate/fn', fn, global_step) log_writer.add_scalar('Evaluate/fn', fn, step)
# Use Tensorboard to visualize results # Use VisualDL to visualize results
if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None: if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
visualize( visualize(
cfg=cfg, cfg=cfg,
use_gpu=args.use_gpu, use_gpu=args.use_gpu,
......
...@@ -68,7 +68,7 @@ cfg.DATASET.VAL_TOTAL_IMAGES = 500 ...@@ -68,7 +68,7 @@ cfg.DATASET.VAL_TOTAL_IMAGES = 500
cfg.DATASET.TEST_FILE_LIST = './dataset/cityscapes/test.list' cfg.DATASET.TEST_FILE_LIST = './dataset/cityscapes/test.list'
# 测试数据数量 # 测试数据数量
cfg.DATASET.TEST_TOTAL_IMAGES = 500 cfg.DATASET.TEST_TOTAL_IMAGES = 500
# Tensorboard 可视化的数据集 # VisualDL 可视化的数据集
cfg.DATASET.VIS_FILE_LIST = None cfg.DATASET.VIS_FILE_LIST = None
# 类别数(需包括背景类) # 类别数(需包括背景类)
cfg.DATASET.NUM_CLASSES = 19 cfg.DATASET.NUM_CLASSES = 19
......
...@@ -27,10 +27,10 @@ python pdseg/train.py BATCH_SIZE 1 --cfg configs/unet_optic.yaml ...@@ -27,10 +27,10 @@ python pdseg/train.py BATCH_SIZE 1 --cfg configs/unet_optic.yaml
|--cfg|配置文件路径|ALL|None|| |--cfg|配置文件路径|ALL|None||
|--use_gpu|是否使用GPU进行训练|train/eval/vis|False|| |--use_gpu|是否使用GPU进行训练|train/eval/vis|False||
|--use_mpio|是否使用多进程进行IO处理|train/eval|False|打开该开关会占用一定量的CPU内存,但是可以提高训练速度。</br> **NOTE:** windows平台下不支持该功能, 建议使用自定义数据初次训练时不打开,打开会导致数据读取异常不可见。 | |--use_mpio|是否使用多进程进行IO处理|train/eval|False|打开该开关会占用一定量的CPU内存,但是可以提高训练速度。</br> **NOTE:** windows平台下不支持该功能, 建议使用自定义数据初次训练时不打开,打开会导致数据读取异常不可见。 |
|--use_tb|是否使用TensorBoard记录训练数据|train|False|| |--use_vdl|是否使用VisualDL记录训练数据|train|False||
|--log_steps|训练日志的打印周期(单位为step)|train|10|| |--log_steps|训练日志的打印周期(单位为step)|train|10||
|--debug|是否打印debug信息|train|False|IOU等指标涉及到混淆矩阵的计算,会降低训练速度| |--debug|是否打印debug信息|train|False|IOU等指标涉及到混淆矩阵的计算,会降低训练速度|
|--tb_log_dir &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|TensorBoard的日志路径|train|None|| |--vdl_log_dir &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|VisualDL的日志路径|train|None||
|--do_eval|是否在保存模型时进行效果评估 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|train|False|| |--do_eval|是否在保存模型时进行效果评估 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|train|False||
|--vis_dir|保存可视化图片的路径|vis|"visual"|| |--vis_dir|保存可视化图片的路径|vis|"visual"||
...@@ -80,7 +80,7 @@ DATASET: ...@@ -80,7 +80,7 @@ DATASET:
VAL_FILE_LIST: './dataset/cityscapes/val.list' VAL_FILE_LIST: './dataset/cityscapes/val.list'
# 测试数据列表 # 测试数据列表
TEST_FILE_LIST: './dataset/cityscapes/test.list' TEST_FILE_LIST: './dataset/cityscapes/test.list'
# Tensorboard 可视化的数据集 # VisualDL 可视化的数据集
VIS_FILE_LIST: None VIS_FILE_LIST: None
# 类别数(需包括背景类) # 类别数(需包括背景类)
NUM_CLASSES: 19 NUM_CLASSES: 19
......
...@@ -62,7 +62,7 @@ DATASET Group存放所有与数据集相关的配置 ...@@ -62,7 +62,7 @@ DATASET Group存放所有与数据集相关的配置
## `VIS_FILE_LIST` ## `VIS_FILE_LIST`
可视化列表,调用`pdseg/train.py`进行训练时,如果打开了--use_tb开关,则在每次模型保存的时候,会读取该列表中的图片进行可视化 可视化列表,调用`pdseg/train.py`进行训练时,如果打开了--use_vdl开关,则在每次模型保存的时候,会读取该列表中的图片进行可视化
文件列表由多行组成,每一行的格式为 文件列表由多行组成,每一行的格式为
``` ```
......
...@@ -49,8 +49,8 @@ export CUDA_VISIBLE_DEVICES=0 ...@@ -49,8 +49,8 @@ export CUDA_VISIBLE_DEVICES=0
python pdseg/train.py --cfg configs/unet_optic.yaml \ python pdseg/train.py --cfg configs/unet_optic.yaml \
--use_gpu \ --use_gpu \
--do_eval \ --do_eval \
--use_tb \ --use_vdl \
--tb_log_dir train_log \ --vdl_log_dir train_log \
BATCH_SIZE 4 \ BATCH_SIZE 4 \
SOLVER.LR 0.001 SOLVER.LR 0.001
...@@ -70,22 +70,22 @@ export CUDA_VISIBLE_DEVICES=0,1,2 ...@@ -70,22 +70,22 @@ export CUDA_VISIBLE_DEVICES=0,1,2
## 5.训练过程可视化 ## 5.训练过程可视化
当打开do_eval和use_tb两个开关后,我们可以通过TensorBoard查看边训练边评估的效果。 当打开do_eval和use_vdl两个开关后,我们可以通过VisualDL查看边训练边评估的效果。
```shell ```shell
tensorboard --logdir train_log --host {$HOST_IP} --port {$PORT} visualdl --logdir train_log --host {$HOST_IP} --port {$PORT}
``` ```
NOTE: NOTE:
1. 上述示例中,$HOST\_IP为机器IP地址,请替换为实际IP,$PORT请替换为可访问的端口。 1. 上述示例中,$HOST\_IP为机器IP地址,请替换为实际IP,$PORT请替换为可访问的端口。
2. 数据量较大时,前端加载速度会比较慢,请耐心等待。 2. 数据量较大时,前端加载速度会比较慢,请耐心等待。
启动TensorBoard命令后,我们可以在浏览器中查看对应的训练数据。 启动VisualDL命令后,我们可以在浏览器中查看对应的训练数据。
`SCALAR`这个tab中,查看训练loss、iou、acc的变化趋势。 `SCALAR`这个tab中,查看训练loss、iou、acc的变化趋势。
![](./imgs/tensorboard_scalar.JPG) ![](./imgs/visualdl_scalar.png)
`IMAGE`这个tab中,查看样本图片。 `IMAGE`这个tab中,查看样本图片。
![](./imgs/tensorboard_image.JPG) ![](./imgs/visualdl_image.png)
## 6.模型评估 ## 6.模型评估
训练完成后,我们可以通过eval.py来评估模型效果。由于我们设置的训练EPOCH数量为10,保存间隔为5,因此一共会产生2个定期保存的模型,加上最终保存的final模型,一共有3个模型。我们选择最后保存的模型进行效果的评估: 训练完成后,我们可以通过eval.py来评估模型效果。由于我们设置的训练EPOCH数量为10,保存间隔为5,因此一共会产生2个定期保存的模型,加上最终保存的final模型,一共有3个模型。我们选择最后保存的模型进行效果的评估:
......
...@@ -77,14 +77,14 @@ def parse_args(): ...@@ -77,14 +77,14 @@ def parse_args():
help='debug mode, display detail information of training', help='debug mode, display detail information of training',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--use_tb', '--use_vdl',
dest='use_tb', dest='use_vdl',
help='whether to record the data during training to Tensorboard', help='whether to record the data during training to VisualDL',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--tb_log_dir', '--vdl_log_dir',
dest='tb_log_dir', dest='vdl_log_dir',
help='Tensorboard logging directory', help='VisualDL logging directory',
default=None, default=None,
type=str) type=str)
parser.add_argument( parser.add_argument(
...@@ -354,17 +354,17 @@ def train(cfg): ...@@ -354,17 +354,17 @@ def train(cfg):
fetch_list.extend([pred.name, grts.name, masks.name]) fetch_list.extend([pred.name, grts.name, masks.name])
cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)
if args.use_tb: if args.use_vdl:
if not args.tb_log_dir: if not args.vdl_log_dir:
print_info("Please specify the log directory by --tb_log_dir.") print_info("Please specify the log directory by --vdl_log_dir.")
exit(1) exit(1)
from tb_paddle import SummaryWriter from visualdl import LogWriter
log_writer = SummaryWriter(args.tb_log_dir) log_writer = LogWriter(args.vdl_log_dir)
# trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
# num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
global_step = 0 step = 0
all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
all_step += 1 all_step += 1
...@@ -398,9 +398,9 @@ def train(cfg): ...@@ -398,9 +398,9 @@ def train(cfg):
return_numpy=True) return_numpy=True)
cm.calculate(pred, grts, masks) cm.calculate(pred, grts, masks)
avg_loss += np.mean(np.array(loss)) avg_loss += np.mean(np.array(loss))
global_step += 1 step += 1
if global_step % args.log_steps == 0: if step % args.log_steps == 0:
speed = args.log_steps / timer.elapsed_time() speed = args.log_steps / timer.elapsed_time()
avg_loss /= args.log_steps avg_loss /= args.log_steps
category_acc, mean_acc = cm.accuracy() category_acc, mean_acc = cm.accuracy()
...@@ -408,22 +408,22 @@ def train(cfg): ...@@ -408,22 +408,22 @@ def train(cfg):
print_info(( print_info((
"epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, mean_acc, ).format(epoch, step, lr[0], avg_loss, mean_acc,
mean_iou, speed, mean_iou, speed,
calculate_eta(all_step - global_step, speed))) calculate_eta(all_step - step, speed)))
print_info("Category IoU: ", category_iou) print_info("Category IoU: ", category_iou)
print_info("Category Acc: ", category_acc) print_info("Category Acc: ", category_acc)
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Train/mean_iou', mean_iou, log_writer.add_scalar('Train/mean_iou', mean_iou,
global_step) step)
log_writer.add_scalar('Train/mean_acc', mean_acc, log_writer.add_scalar('Train/mean_acc', mean_acc,
global_step) step)
log_writer.add_scalar('Train/loss', avg_loss, log_writer.add_scalar('Train/loss', avg_loss,
global_step) step)
log_writer.add_scalar('Train/lr', lr[0], log_writer.add_scalar('Train/lr', lr[0],
global_step) step)
log_writer.add_scalar('Train/step/sec', speed, log_writer.add_scalar('Train/step/sec', speed,
global_step) step)
sys.stdout.flush() sys.stdout.flush()
avg_loss = 0.0 avg_loss = 0.0
cm.zero_matrix() cm.zero_matrix()
...@@ -435,30 +435,30 @@ def train(cfg): ...@@ -435,30 +435,30 @@ def train(cfg):
fetch_list=fetch_list, fetch_list=fetch_list,
return_numpy=True) return_numpy=True)
avg_loss += np.mean(np.array(loss)) avg_loss += np.mean(np.array(loss))
global_step += 1 step += 1
if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0: if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
avg_loss /= args.log_steps avg_loss /= args.log_steps
speed = args.log_steps / timer.elapsed_time() speed = args.log_steps / timer.elapsed_time()
print(( print((
"epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}" "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, speed, ).format(epoch, step, lr[0], avg_loss, speed,
calculate_eta(all_step - global_step, speed))) calculate_eta(all_step - step, speed)))
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Train/loss', avg_loss, log_writer.add_scalar('Train/loss', avg_loss,
global_step) step)
log_writer.add_scalar('Train/lr', lr[0], log_writer.add_scalar('Train/lr', lr[0],
global_step) step)
log_writer.add_scalar('Train/speed', speed, log_writer.add_scalar('Train/speed', speed,
global_step) step)
sys.stdout.flush() sys.stdout.flush()
avg_loss = 0.0 avg_loss = 0.0
timer.restart() timer.restart()
# NOTE : used for benchmark, profiler tools # NOTE : used for benchmark, profiler tools
if args.is_profiler and epoch == 1 and global_step == args.log_steps: if args.is_profiler and epoch == 1 and step == args.log_steps:
profiler.start_profiler("All") profiler.start_profiler("All")
elif args.is_profiler and epoch == 1 and global_step == args.log_steps + 5: elif args.is_profiler and epoch == 1 and step == args.log_steps + 5:
profiler.stop_profiler("total", args.profiler_path) profiler.stop_profiler("total", args.profiler_path)
return return
...@@ -479,11 +479,11 @@ def train(cfg): ...@@ -479,11 +479,11 @@ def train(cfg):
ckpt_dir=ckpt_dir, ckpt_dir=ckpt_dir,
use_gpu=args.use_gpu, use_gpu=args.use_gpu,
use_mpio=args.use_mpio) use_mpio=args.use_mpio)
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Evaluate/mean_iou', mean_iou, log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
global_step) step)
log_writer.add_scalar('Evaluate/mean_acc', mean_acc, log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
global_step) step)
if mean_iou > best_mIoU: if mean_iou > best_mIoU:
best_mIoU = mean_iou best_mIoU = mean_iou
...@@ -493,8 +493,8 @@ def train(cfg): ...@@ -493,8 +493,8 @@ def train(cfg):
os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'), os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
mean_iou)) mean_iou))
# Use Tensorboard to visualize results # Use VisualDL to visualize results
if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None: if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
visualize( visualize(
cfg=cfg, cfg=cfg,
use_gpu=args.use_gpu, use_gpu=args.use_gpu,
......
...@@ -56,7 +56,7 @@ cfg.DATASET.VAL_TOTAL_IMAGES = 500 ...@@ -56,7 +56,7 @@ cfg.DATASET.VAL_TOTAL_IMAGES = 500
cfg.DATASET.TEST_FILE_LIST = './dataset/cityscapes/test.list' cfg.DATASET.TEST_FILE_LIST = './dataset/cityscapes/test.list'
# 测试数据数量 # 测试数据数量
cfg.DATASET.TEST_TOTAL_IMAGES = 500 cfg.DATASET.TEST_TOTAL_IMAGES = 500
# Tensorboard 可视化的数据集 # VisualDL 可视化的数据集
cfg.DATASET.VIS_FILE_LIST = None cfg.DATASET.VIS_FILE_LIST = None
# 类别数(需包括背景类) # 类别数(需包括背景类)
cfg.DATASET.NUM_CLASSES = 19 cfg.DATASET.NUM_CLASSES = 19
......
...@@ -162,18 +162,17 @@ def visualize(cfg, ...@@ -162,18 +162,17 @@ def visualize(cfg,
img_cnt += 1 img_cnt += 1
print("#{} visualize image path: {}".format(img_cnt, vis_fn)) print("#{} visualize image path: {}".format(img_cnt, vis_fn))
# Use Tensorboard to visualize image # Use VisualDL to visualize image
if log_writer is not None: if log_writer is not None:
# Calulate epoch from ckpt_dir folder name # Calulate epoch from ckpt_dir folder name
epoch = int(os.path.split(ckpt_dir)[-1]) epoch = int(os.path.split(ckpt_dir)[-1])
print("Tensorboard visualization epoch", epoch) print("VisualDL visualization epoch", epoch)
pred_mask_np = np.array(pred_mask.convert("RGB")) pred_mask_np = np.array(pred_mask.convert("RGB"))
log_writer.add_image( log_writer.add_image(
"Predict/{}".format(img_name), "Predict/{}".format(img_name),
pred_mask_np, pred_mask_np,
epoch, epoch)
dataformats='HWC')
# Original image # Original image
# BGR->RGB # BGR->RGB
img = cv2.imread( img = cv2.imread(
...@@ -181,8 +180,7 @@ def visualize(cfg, ...@@ -181,8 +180,7 @@ def visualize(cfg,
log_writer.add_image( log_writer.add_image(
"Images/{}".format(img_name), "Images/{}".format(img_name),
img, img,
epoch, epoch)
dataformats='HWC')
# add ground truth (label) images # add ground truth (label) images
grt = grts[i] grt = grts[i]
if grt is not None: if grt is not None:
...@@ -194,8 +192,7 @@ def visualize(cfg, ...@@ -194,8 +192,7 @@ def visualize(cfg,
log_writer.add_image( log_writer.add_image(
"Label/{}".format(img_name), "Label/{}".format(img_name),
grt, grt,
epoch, epoch)
dataformats='HWC')
# If in local_test mode, only visualize 5 images just for testing # If in local_test mode, only visualize 5 images just for testing
# procedure # procedure
......
...@@ -2,11 +2,10 @@ pre-commit ...@@ -2,11 +2,10 @@ pre-commit
yapf == 0.26.0 yapf == 0.26.0
flake8 flake8
pyyaml >= 5.1 pyyaml >= 5.1
tb-paddle
tensorboard >= 1.15.0
Pillow Pillow
numpy numpy
six six
opencv-python opencv-python
tqdm tqdm
requests requests
visualdl == 2.0.0-alpha.2
...@@ -87,14 +87,14 @@ def parse_args(): ...@@ -87,14 +87,14 @@ def parse_args():
help='debug mode, display detail information of training', help='debug mode, display detail information of training',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--use_tb', '--use_vdl',
dest='use_tb', dest='use_vdl',
help='whether to record the data during training to Tensorboard', help='whether to record the data during training to VisualDL',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--tb_log_dir', '--vdl_log_dir',
    dest='tb_log_dir', dest='vdl_log_dir',
help='Tensorboard logging directory', help='VisualDL logging directory',
default=None, default=None,
type=str) type=str)
parser.add_argument( parser.add_argument(
...@@ -409,17 +409,17 @@ def train(cfg): ...@@ -409,17 +409,17 @@ def train(cfg):
fetch_list.extend([pred.name, grts.name, masks.name]) fetch_list.extend([pred.name, grts.name, masks.name])
cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)
if args.use_tb: if args.use_vdl:
if not args.tb_log_dir: if not args.vdl_log_dir:
print_info("Please specify the log directory by --tb_log_dir.") print_info("Please specify the log directory by --vdl_log_dir.")
exit(1) exit(1)
from tb_paddle import SummaryWriter from visualdl import LogWriter
log_writer = SummaryWriter(args.tb_log_dir) log_writer = LogWriter(args.vdl_log_dir)
# trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))
# num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
global_step = 0 step = 0
all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
all_step += 1 all_step += 1
...@@ -455,9 +455,9 @@ def train(cfg): ...@@ -455,9 +455,9 @@ def train(cfg):
return_numpy=True) return_numpy=True)
cm.calculate(pred, grts, masks) cm.calculate(pred, grts, masks)
avg_loss += np.mean(np.array(loss)) avg_loss += np.mean(np.array(loss))
global_step += 1 step += 1
if global_step % args.log_steps == 0: if step % args.log_steps == 0:
speed = args.log_steps / timer.elapsed_time() speed = args.log_steps / timer.elapsed_time()
avg_loss /= args.log_steps avg_loss /= args.log_steps
category_acc, mean_acc = cm.accuracy() category_acc, mean_acc = cm.accuracy()
...@@ -465,22 +465,22 @@ def train(cfg): ...@@ -465,22 +465,22 @@ def train(cfg):
print_info(( print_info((
"epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, mean_acc, ).format(epoch, step, lr[0], avg_loss, mean_acc,
mean_iou, speed, mean_iou, speed,
calculate_eta(all_step - global_step, speed))) calculate_eta(all_step - step, speed)))
print_info("Category IoU: ", category_iou) print_info("Category IoU: ", category_iou)
print_info("Category Acc: ", category_acc) print_info("Category Acc: ", category_acc)
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Train/mean_iou', mean_iou, log_writer.add_scalar('Train/mean_iou', mean_iou,
global_step) step)
log_writer.add_scalar('Train/mean_acc', mean_acc, log_writer.add_scalar('Train/mean_acc', mean_acc,
global_step) step)
log_writer.add_scalar('Train/loss', avg_loss, log_writer.add_scalar('Train/loss', avg_loss,
global_step) step)
log_writer.add_scalar('Train/lr', lr[0], log_writer.add_scalar('Train/lr', lr[0],
global_step) step)
log_writer.add_scalar('Train/step/sec', speed, log_writer.add_scalar('Train/step/sec', speed,
global_step) step)
sys.stdout.flush() sys.stdout.flush()
avg_loss = 0.0 avg_loss = 0.0
cm.zero_matrix() cm.zero_matrix()
...@@ -494,25 +494,25 @@ def train(cfg): ...@@ -494,25 +494,25 @@ def train(cfg):
avg_loss += np.mean(np.array(loss)) avg_loss += np.mean(np.array(loss))
avg_t_loss += np.mean(np.array(t_loss)) avg_t_loss += np.mean(np.array(t_loss))
avg_d_loss += np.mean(np.array(d_loss)) avg_d_loss += np.mean(np.array(d_loss))
global_step += 1 step += 1
if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0: if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
avg_loss /= args.log_steps avg_loss /= args.log_steps
avg_t_loss /= args.log_steps avg_t_loss /= args.log_steps
avg_d_loss /= args.log_steps avg_d_loss /= args.log_steps
speed = args.log_steps / timer.elapsed_time() speed = args.log_steps / timer.elapsed_time()
print(( print((
"epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}" "epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, ).format(epoch, step, lr[0], avg_loss,
avg_t_loss, avg_d_loss, speed, avg_t_loss, avg_d_loss, speed,
calculate_eta(all_step - global_step, speed))) calculate_eta(all_step - step, speed)))
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Train/loss', avg_loss, log_writer.add_scalar('Train/loss', avg_loss,
global_step) step)
log_writer.add_scalar('Train/lr', lr[0], log_writer.add_scalar('Train/lr', lr[0],
global_step) step)
log_writer.add_scalar('Train/speed', speed, log_writer.add_scalar('Train/speed', speed,
global_step) step)
sys.stdout.flush() sys.stdout.flush()
avg_loss = 0.0 avg_loss = 0.0
avg_t_loss = 0.0 avg_t_loss = 0.0
...@@ -536,11 +536,11 @@ def train(cfg): ...@@ -536,11 +536,11 @@ def train(cfg):
ckpt_dir=ckpt_dir, ckpt_dir=ckpt_dir,
use_gpu=args.use_gpu, use_gpu=args.use_gpu,
use_mpio=args.use_mpio) use_mpio=args.use_mpio)
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Evaluate/mean_iou', mean_iou, log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
global_step) step)
log_writer.add_scalar('Evaluate/mean_acc', mean_acc, log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
global_step) step)
if mean_iou > best_mIoU: if mean_iou > best_mIoU:
best_mIoU = mean_iou best_mIoU = mean_iou
...@@ -550,8 +550,8 @@ def train(cfg): ...@@ -550,8 +550,8 @@ def train(cfg):
os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'), os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'),
mean_iou)) mean_iou))
# Use Tensorboard to visualize results # Use VisualDL to visualize results
if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None: if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
visualize( visualize(
cfg=cfg, cfg=cfg,
use_gpu=args.use_gpu, use_gpu=args.use_gpu,
......
...@@ -87,14 +87,14 @@ def parse_args(): ...@@ -87,14 +87,14 @@ def parse_args():
help='debug mode, display detail information of training', help='debug mode, display detail information of training',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--use_tb', '--use_vdl',
dest='use_tb', dest='use_vdl',
help='whether to record the data during training to Tensorboard', help='whether to record the data during training to VisualDL',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--tb_log_dir', '--vdl_log_dir',
dest='tb_log_dir', dest='vdl_log_dir',
help='Tensorboard logging directory', help='VisualDL logging directory',
default=None, default=None,
type=str) type=str)
parser.add_argument( parser.add_argument(
......
...@@ -83,14 +83,14 @@ def parse_args(): ...@@ -83,14 +83,14 @@ def parse_args():
help='debug mode, display detail information of training', help='debug mode, display detail information of training',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--use_tb', '--use_vdl',
dest='use_tb', dest='use_vdl',
help='whether to record the data during training to Tensorboard', help='whether to record the data during training to VisualDL',
action='store_true') action='store_true')
parser.add_argument( parser.add_argument(
'--tb_log_dir', '--vdl_log_dir',
dest='tb_log_dir', dest='vdl_log_dir',
help='Tensorboard logging directory', help='VisualDL logging directory',
default=None, default=None,
type=str) type=str)
parser.add_argument( parser.add_argument(
...@@ -335,13 +335,13 @@ def train(cfg): ...@@ -335,13 +335,13 @@ def train(cfg):
fetch_list.extend([pred.name, grts.name, masks.name]) fetch_list.extend([pred.name, grts.name, masks.name])
cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True)
if args.use_tb: if args.use_vdl:
if not args.tb_log_dir: if not args.vdl_log_dir:
print_info("Please specify the log directory by --tb_log_dir.") print_info("Please specify the log directory by --vdl_log_dir.")
exit(1) exit(1)
from tb_paddle import SummaryWriter from visualdl import LogWriter
log_writer = SummaryWriter(args.tb_log_dir) log_writer = LogWriter(args.vdl_log_dir)
pruner = Pruner() pruner = Pruner()
train_prog = pruner.prune( train_prog = pruner.prune(
...@@ -357,7 +357,7 @@ def train(cfg): ...@@ -357,7 +357,7 @@ def train(cfg):
exec_strategy=exec_strategy, exec_strategy=exec_strategy,
build_strategy=build_strategy) build_strategy=build_strategy)
global_step = 0 step = 0
all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE
if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True:
all_step += 1 all_step += 1
...@@ -389,9 +389,9 @@ def train(cfg): ...@@ -389,9 +389,9 @@ def train(cfg):
return_numpy=True) return_numpy=True)
cm.calculate(pred, grts, masks) cm.calculate(pred, grts, masks)
avg_loss += np.mean(np.array(loss)) avg_loss += np.mean(np.array(loss))
global_step += 1 step += 1
if global_step % args.log_steps == 0: if step % args.log_steps == 0:
speed = args.log_steps / timer.elapsed_time() speed = args.log_steps / timer.elapsed_time()
avg_loss /= args.log_steps avg_loss /= args.log_steps
category_acc, mean_acc = cm.accuracy() category_acc, mean_acc = cm.accuracy()
...@@ -399,22 +399,22 @@ def train(cfg): ...@@ -399,22 +399,22 @@ def train(cfg):
print_info(( print_info((
"epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, mean_acc, ).format(epoch, step, lr[0], avg_loss, mean_acc,
mean_iou, speed, mean_iou, speed,
calculate_eta(all_step - global_step, speed))) calculate_eta(all_step - step, speed)))
print_info("Category IoU: ", category_iou) print_info("Category IoU: ", category_iou)
print_info("Category Acc: ", category_acc) print_info("Category Acc: ", category_acc)
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Train/mean_iou', mean_iou, log_writer.add_scalar('Train/mean_iou', mean_iou,
global_step) step)
log_writer.add_scalar('Train/mean_acc', mean_acc, log_writer.add_scalar('Train/mean_acc', mean_acc,
global_step) step)
log_writer.add_scalar('Train/loss', avg_loss, log_writer.add_scalar('Train/loss', avg_loss,
global_step) step)
log_writer.add_scalar('Train/lr', lr[0], log_writer.add_scalar('Train/lr', lr[0],
global_step) step)
log_writer.add_scalar('Train/step/sec', speed, log_writer.add_scalar('Train/step/sec', speed,
global_step) step)
sys.stdout.flush() sys.stdout.flush()
avg_loss = 0.0 avg_loss = 0.0
cm.zero_matrix() cm.zero_matrix()
...@@ -426,22 +426,22 @@ def train(cfg): ...@@ -426,22 +426,22 @@ def train(cfg):
fetch_list=fetch_list, fetch_list=fetch_list,
return_numpy=True) return_numpy=True)
avg_loss += np.mean(np.array(loss)) avg_loss += np.mean(np.array(loss))
global_step += 1 step += 1
if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0: if step % args.log_steps == 0 and cfg.TRAINER_ID == 0:
avg_loss /= args.log_steps avg_loss /= args.log_steps
speed = args.log_steps / timer.elapsed_time() speed = args.log_steps / timer.elapsed_time()
print(( print((
"epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}" "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}"
).format(epoch, global_step, lr[0], avg_loss, speed, ).format(epoch, step, lr[0], avg_loss, speed,
calculate_eta(all_step - global_step, speed))) calculate_eta(all_step - step, speed)))
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Train/loss', avg_loss, log_writer.add_scalar('Train/loss', avg_loss,
global_step) step)
log_writer.add_scalar('Train/lr', lr[0], log_writer.add_scalar('Train/lr', lr[0],
global_step) step)
log_writer.add_scalar('Train/speed', speed, log_writer.add_scalar('Train/speed', speed,
global_step) step)
sys.stdout.flush() sys.stdout.flush()
avg_loss = 0.0 avg_loss = 0.0
timer.restart() timer.restart()
...@@ -463,14 +463,14 @@ def train(cfg): ...@@ -463,14 +463,14 @@ def train(cfg):
ckpt_dir=ckpt_dir, ckpt_dir=ckpt_dir,
use_gpu=args.use_gpu, use_gpu=args.use_gpu,
use_mpio=args.use_mpio) use_mpio=args.use_mpio)
if args.use_tb: if args.use_vdl:
log_writer.add_scalar('Evaluate/mean_iou', mean_iou, log_writer.add_scalar('Evaluate/mean_iou', mean_iou,
global_step) step)
log_writer.add_scalar('Evaluate/mean_acc', mean_acc, log_writer.add_scalar('Evaluate/mean_acc', mean_acc,
global_step) step)
# Use Tensorboard to visualize results # Use VisualDL to visualize results
if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None: if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None:
visualize( visualize(
cfg=cfg, cfg=cfg,
use_gpu=args.use_gpu, use_gpu=args.use_gpu,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册