diff --git a/contrib/LaneNet/requirements.txt b/contrib/LaneNet/requirements.txt index 2b5eb8643803e1177297d2a766227e274dcdc29d..b084ca5748e061d31190b9e29bdb932f0a2c9ec8 100644 --- a/contrib/LaneNet/requirements.txt +++ b/contrib/LaneNet/requirements.txt @@ -2,8 +2,6 @@ pre-commit yapf == 0.26.0 flake8 pyyaml >= 5.1 -tb-paddle -tensorboard >= 1.15.0 Pillow numpy six @@ -11,3 +9,4 @@ opencv-python tqdm requests sklearn +visualdl == 2.0.0-alpha.2 diff --git a/contrib/LaneNet/train.py b/contrib/LaneNet/train.py index c2f5bee7547eabe9ef5c998b197fbaf59130d679..d9d22ba999cbbc3a9252f258e973612c68fe4ee4 100644 --- a/contrib/LaneNet/train.py +++ b/contrib/LaneNet/train.py @@ -78,14 +78,14 @@ def parse_args(): help='debug mode, display detail information of training', action='store_true') parser.add_argument( - '--use_tb', - dest='use_tb', - help='whether to record the data during training to Tensorboard', + '--use_vdl', + dest='use_vdl', + help='whether to record the data during training to VisualDL', action='store_true') parser.add_argument( - '--tb_log_dir', - dest='tb_log_dir', - help='Tensorboard logging directory', + '--vdl_log_dir', + dest='vdl_log_dir', + help='VisualDL logging directory', default=None, type=str) parser.add_argument( @@ -327,17 +327,17 @@ def train(cfg): fetch_list.extend([pred.name, grts.name, masks.name]) # cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) - if args.use_tb: - if not args.tb_log_dir: - print_info("Please specify the log directory by --tb_log_dir.") + if args.use_vdl: + if not args.vdl_log_dir: + print_info("Please specify the log directory by --vdl_log_dir.") exit(1) - from tb_paddle import SummaryWriter - log_writer = SummaryWriter(args.tb_log_dir) + from visualdl import LogWriter + log_writer = LogWriter(args.vdl_log_dir) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) - global_step = 0 + step = 0 all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // 
cfg.BATCH_SIZE if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: all_step += 1 @@ -377,9 +377,9 @@ def train(cfg): avg_acc += np.mean(out_acc) avg_fp += np.mean(out_fp) avg_fn += np.mean(out_fn) - global_step += 1 + step += 1 - if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0: + if step % args.log_steps == 0 and cfg.TRAINER_ID == 0: avg_loss /= args.log_steps avg_seg_loss /= args.log_steps avg_emb_loss /= args.log_steps @@ -389,14 +389,14 @@ def train(cfg): speed = args.log_steps / timer.elapsed_time() print(( "epoch={} step={} lr={:.5f} loss={:.4f} seg_loss={:.4f} emb_loss={:.4f} accuracy={:.4} fp={:.4} fn={:.4} step/sec={:.3f} | ETA {}" - ).format(epoch, global_step, lr[0], avg_loss, avg_seg_loss, + ).format(epoch, step, lr[0], avg_loss, avg_seg_loss, avg_emb_loss, avg_acc, avg_fp, avg_fn, speed, - calculate_eta(all_step - global_step, speed))) - if args.use_tb: + calculate_eta(all_step - step, speed))) + if args.use_vdl: log_writer.add_scalar('Train/loss', avg_loss, - global_step) - log_writer.add_scalar('Train/lr', lr[0], global_step) - log_writer.add_scalar('Train/speed', speed, global_step) + step) + log_writer.add_scalar('Train/lr', lr[0], step) + log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() avg_loss = 0.0 avg_seg_loss = 0.0 @@ -422,14 +422,14 @@ def train(cfg): ckpt_dir=ckpt_dir, use_gpu=args.use_gpu, use_mpio=args.use_mpio) - if args.use_tb: + if args.use_vdl: log_writer.add_scalar('Evaluate/accuracy', accuracy, - global_step) - log_writer.add_scalar('Evaluate/fp', fp, global_step) - log_writer.add_scalar('Evaluate/fn', fn, global_step) + step) + log_writer.add_scalar('Evaluate/fp', fp, step) + log_writer.add_scalar('Evaluate/fn', fn, step) - # Use Tensorboard to visualize results - if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None: + # Use VisualDL to visualize results + if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: visualize( cfg=cfg, use_gpu=args.use_gpu, diff --git 
a/contrib/LaneNet/utils/config.py b/contrib/LaneNet/utils/config.py index d1186636c7d2b8004756bdfbaaca74aa47d32b7f..7c2019d44a100b033520138632fc0e7b56d65676 100644 --- a/contrib/LaneNet/utils/config.py +++ b/contrib/LaneNet/utils/config.py @@ -68,7 +68,7 @@ cfg.DATASET.VAL_TOTAL_IMAGES = 500 cfg.DATASET.TEST_FILE_LIST = './dataset/cityscapes/test.list' # 测试数据数量 cfg.DATASET.TEST_TOTAL_IMAGES = 500 -# Tensorboard 可视化的数据集 +# VisualDL 可视化的数据集 cfg.DATASET.VIS_FILE_LIST = None # 类别数(需包括背景类) cfg.DATASET.NUM_CLASSES = 19 diff --git a/docs/config.md b/docs/config.md index 67e1353a7d88994b584d5bd3da4dd36d81430a59..24d11bd4ced6d53a961ffb5d8bbd379e821def01 100644 --- a/docs/config.md +++ b/docs/config.md @@ -27,10 +27,10 @@ python pdseg/train.py BATCH_SIZE 1 --cfg configs/unet_optic.yaml |--cfg|配置文件路径|ALL|None|| |--use_gpu|是否使用GPU进行训练|train/eval/vis|False|| |--use_mpio|是否使用多进程进行IO处理|train/eval|False|打开该开关会占用一定量的CPU内存,但是可以提高训练速度。
**NOTE:** windows平台下不支持该功能, 建议使用自定义数据初次训练时不打开,打开会导致数据读取异常不可见。 | -|--use_tb|是否使用TensorBoard记录训练数据|train|False|| +|--use_vdl|是否使用VisualDL记录训练数据|train|False|| |--log_steps|训练日志的打印周期(单位为step)|train|10|| |--debug|是否打印debug信息|train|False|IOU等指标涉及到混淆矩阵的计算,会降低训练速度| -|--tb_log_dir                      |TensorBoard的日志路径|train|None|| +|--vdl_log_dir                      |VisualDL的日志路径|train|None|| |--do_eval|是否在保存模型时进行效果评估                                                        |train|False|| |--vis_dir|保存可视化图片的路径|vis|"visual"|| @@ -80,7 +80,7 @@ DATASET: VAL_FILE_LIST: './dataset/cityscapes/val.list' # 测试数据列表 TEST_FILE_LIST: './dataset/cityscapes/test.list' - # Tensorboard 可视化的数据集 + # VisualDL 可视化的数据集 VIS_FILE_LIST: None # 类别数(需包括背景类) NUM_CLASSES: 19 diff --git a/docs/configs/dataset_group.md b/docs/configs/dataset_group.md index 917f01ade91598916a9399417a6c7ee62337dff5..7623c4f199db49571e15b7efef997d144c36301b 100644 --- a/docs/configs/dataset_group.md +++ b/docs/configs/dataset_group.md @@ -62,7 +62,7 @@ DATASET Group存放所有与数据集相关的配置 ## `VIS_FILE_LIST` -可视化列表,调用`pdseg/train.py`进行训练时,如果打开了--use_tb开关,则在每次模型保存的时候,会读取该列表中的图片进行可视化 +可视化列表,调用`pdseg/train.py`进行训练时,如果打开了--use_vdl开关,则在每次模型保存的时候,会读取该列表中的图片进行可视化 文件列表由多行组成,每一行的格式为 ``` diff --git a/docs/imgs/tensorboard_image.JPG b/docs/imgs/tensorboard_image.JPG deleted file mode 100644 index 140aa2a0ed6a9b1a2d0a98477685b9e6d434a113..0000000000000000000000000000000000000000 Binary files a/docs/imgs/tensorboard_image.JPG and /dev/null differ diff --git a/docs/imgs/tensorboard_scalar.JPG b/docs/imgs/tensorboard_scalar.JPG deleted file mode 100644 index 322c98dc8ba7e5ca96477f3dbe193a70a8cf4609..0000000000000000000000000000000000000000 Binary files a/docs/imgs/tensorboard_scalar.JPG and /dev/null differ diff --git a/docs/imgs/visualdl_image.png b/docs/imgs/visualdl_image.png new file mode 100644 index 0000000000000000000000000000000000000000..49ecc661739139e896413611f8daa1a7875b8dd2 Binary files /dev/null and b/docs/imgs/visualdl_image.png 
differ diff --git a/docs/imgs/visualdl_scalar.png b/docs/imgs/visualdl_scalar.png new file mode 100644 index 0000000000000000000000000000000000000000..196d0ab728f859b2d32960ba8f50df4eb6361556 Binary files /dev/null and b/docs/imgs/visualdl_scalar.png differ diff --git a/docs/usage.md b/docs/usage.md index 6da85a2de7b8be220e955a9e20a351c2d306b489..e088bfd39cee24545a958745a3e6266074049f8a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -49,8 +49,8 @@ export CUDA_VISIBLE_DEVICES=0 python pdseg/train.py --cfg configs/unet_optic.yaml \ --use_gpu \ --do_eval \ - --use_tb \ - --tb_log_dir train_log \ + --use_vdl \ + --vdl_log_dir train_log \ BATCH_SIZE 4 \ SOLVER.LR 0.001 @@ -70,22 +70,22 @@ export CUDA_VISIBLE_DEVICES=0,1,2 ## 5.训练过程可视化 -当打开do_eval和use_tb两个开关后,我们可以通过TensorBoard查看边训练边评估的效果。 +当打开do_eval和use_vdl两个开关后,我们可以通过VisualDL查看边训练边评估的效果。 ```shell -tensorboard --logdir train_log --host {$HOST_IP} --port {$PORT} +visualdl --logdir train_log --host {$HOST_IP} --port {$PORT} ``` NOTE: 1. 上述示例中,$HOST\_IP为机器IP地址,请替换为实际IP,$PORT请替换为可访问的端口。 2. 
数据量较大时,前端加载速度会比较慢,请耐心等待。 -启动TensorBoard命令后,我们可以在浏览器中查看对应的训练数据。 +启动VisualDL命令后,我们可以在浏览器中查看对应的训练数据。 在`SCALAR`这个tab中,查看训练loss、iou、acc的变化趋势。 -![](./imgs/tensorboard_scalar.JPG) +![](./imgs/visualdl_scalar.png) 在`IMAGE`这个tab中,查看样本图片。 -![](./imgs/tensorboard_image.JPG) +![](./imgs/visualdl_image.png) ## 6.模型评估 训练完成后,我们可以通过eval.py来评估模型效果。由于我们设置的训练EPOCH数量为10,保存间隔为5,因此一共会产生2个定期保存的模型,加上最终保存的final模型,一共有3个模型。我们选择最后保存的模型进行效果的评估: diff --git a/pdseg/train.py b/pdseg/train.py index 9e30c0f2050bd4987d84675985a86922e1c993c3..e1c498a4355950af155efc79e69a6788ad86e0ba 100644 --- a/pdseg/train.py +++ b/pdseg/train.py @@ -77,14 +77,14 @@ def parse_args(): help='debug mode, display detail information of training', action='store_true') parser.add_argument( - '--use_tb', - dest='use_tb', - help='whether to record the data during training to Tensorboard', + '--use_vdl', + dest='use_vdl', + help='whether to record the data during training to VisualDL', action='store_true') parser.add_argument( - '--tb_log_dir', - dest='tb_log_dir', - help='Tensorboard logging directory', + '--vdl_log_dir', + dest='vdl_log_dir', + help='VisualDL logging directory', default=None, type=str) parser.add_argument( @@ -354,17 +354,17 @@ def train(cfg): fetch_list.extend([pred.name, grts.name, masks.name]) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) - if args.use_tb: - if not args.tb_log_dir: - print_info("Please specify the log directory by --tb_log_dir.") + if args.use_vdl: + if not args.vdl_log_dir: + print_info("Please specify the log directory by --vdl_log_dir.") exit(1) - from tb_paddle import SummaryWriter - log_writer = SummaryWriter(args.tb_log_dir) + from visualdl import LogWriter + log_writer = LogWriter(args.vdl_log_dir) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) - global_step = 0 + step = 0 all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last 
!= True: all_step += 1 @@ -398,9 +398,9 @@ def train(cfg): return_numpy=True) cm.calculate(pred, grts, masks) avg_loss += np.mean(np.array(loss)) - global_step += 1 + step += 1 - if global_step % args.log_steps == 0: + if step % args.log_steps == 0: speed = args.log_steps / timer.elapsed_time() avg_loss /= args.log_steps category_acc, mean_acc = cm.accuracy() @@ -408,22 +408,22 @@ def train(cfg): print_info(( "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" - ).format(epoch, global_step, lr[0], avg_loss, mean_acc, + ).format(epoch, step, lr[0], avg_loss, mean_acc, mean_iou, speed, - calculate_eta(all_step - global_step, speed))) + calculate_eta(all_step - step, speed))) print_info("Category IoU: ", category_iou) print_info("Category Acc: ", category_acc) - if args.use_tb: + if args.use_vdl: log_writer.add_scalar('Train/mean_iou', mean_iou, - global_step) + step) log_writer.add_scalar('Train/mean_acc', mean_acc, - global_step) + step) log_writer.add_scalar('Train/loss', avg_loss, - global_step) + step) log_writer.add_scalar('Train/lr', lr[0], - global_step) + step) log_writer.add_scalar('Train/step/sec', speed, - global_step) + step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() @@ -435,30 +435,30 @@ def train(cfg): fetch_list=fetch_list, return_numpy=True) avg_loss += np.mean(np.array(loss)) - global_step += 1 + step += 1 - if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0: + if step % args.log_steps == 0 and cfg.TRAINER_ID == 0: avg_loss /= args.log_steps speed = args.log_steps / timer.elapsed_time() print(( "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}" - ).format(epoch, global_step, lr[0], avg_loss, speed, - calculate_eta(all_step - global_step, speed))) - if args.use_tb: + ).format(epoch, step, lr[0], avg_loss, speed, + calculate_eta(all_step - step, speed))) + if args.use_vdl: log_writer.add_scalar('Train/loss', avg_loss, - global_step) + step) log_writer.add_scalar('Train/lr', lr[0], - 
global_step) + step) log_writer.add_scalar('Train/speed', speed, - global_step) + step) sys.stdout.flush() avg_loss = 0.0 timer.restart() # NOTE : used for benchmark, profiler tools - if args.is_profiler and epoch == 1 and global_step == args.log_steps: + if args.is_profiler and epoch == 1 and step == args.log_steps: profiler.start_profiler("All") - elif args.is_profiler and epoch == 1 and global_step == args.log_steps + 5: + elif args.is_profiler and epoch == 1 and step == args.log_steps + 5: profiler.stop_profiler("total", args.profiler_path) return @@ -479,11 +479,11 @@ def train(cfg): ckpt_dir=ckpt_dir, use_gpu=args.use_gpu, use_mpio=args.use_mpio) - if args.use_tb: + if args.use_vdl: log_writer.add_scalar('Evaluate/mean_iou', mean_iou, - global_step) + step) log_writer.add_scalar('Evaluate/mean_acc', mean_acc, - global_step) + step) if mean_iou > best_mIoU: best_mIoU = mean_iou @@ -493,8 +493,8 @@ def train(cfg): os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'), mean_iou)) - # Use Tensorboard to visualize results - if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None: + # Use VisualDL to visualize results + if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: visualize( cfg=cfg, use_gpu=args.use_gpu, diff --git a/pdseg/utils/config.py b/pdseg/utils/config.py index 141b17ce24df1f78310975ef236290011ebffb56..e58bc39695f62c733c39dde9270d1d7fdd96a677 100644 --- a/pdseg/utils/config.py +++ b/pdseg/utils/config.py @@ -56,7 +56,7 @@ cfg.DATASET.VAL_TOTAL_IMAGES = 500 cfg.DATASET.TEST_FILE_LIST = './dataset/cityscapes/test.list' # 测试数据数量 cfg.DATASET.TEST_TOTAL_IMAGES = 500 -# Tensorboard 可视化的数据集 +# VisualDL 可视化的数据集 cfg.DATASET.VIS_FILE_LIST = None # 类别数(需包括背景类) cfg.DATASET.NUM_CLASSES = 19 diff --git a/pdseg/vis.py b/pdseg/vis.py index d94221c0be1a0b4abe241e75966215863d8fd35d..0dc30273b8bf8e7c61ffeb09336959e09949ac8d 100644 --- a/pdseg/vis.py +++ b/pdseg/vis.py @@ -162,18 +162,17 @@ def visualize(cfg, img_cnt += 1 print("#{} visualize image path: 
{}".format(img_cnt, vis_fn)) - # Use Tensorboard to visualize image + # Use VisualDL to visualize image if log_writer is not None: # Calulate epoch from ckpt_dir folder name epoch = int(os.path.split(ckpt_dir)[-1]) - print("Tensorboard visualization epoch", epoch) + print("VisualDL visualization epoch", epoch) pred_mask_np = np.array(pred_mask.convert("RGB")) log_writer.add_image( "Predict/{}".format(img_name), pred_mask_np, - epoch, - dataformats='HWC') + epoch) # Original image # BGR->RGB img = cv2.imread( @@ -181,8 +180,7 @@ def visualize(cfg, log_writer.add_image( "Images/{}".format(img_name), img, - epoch, - dataformats='HWC') + epoch) # add ground truth (label) images grt = grts[i] if grt is not None: @@ -194,8 +192,7 @@ def visualize(cfg, log_writer.add_image( "Label/{}".format(img_name), grt, - epoch, - dataformats='HWC') + epoch) # If in local_test mode, only visualize 5 images just for testing # procedure diff --git a/requirements.txt b/requirements.txt index 5a04fa523ced707663c197b6a51467552692ede5..04105e18c6e036ddb802e5f280026fe0e80ca609 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,11 +2,10 @@ pre-commit yapf == 0.26.0 flake8 pyyaml >= 5.1 -tb-paddle -tensorboard >= 1.15.0 Pillow numpy six opencv-python tqdm requests +visualdl == 2.0.0-alpha.2 diff --git a/slim/distillation/train_distill.py b/slim/distillation/train_distill.py index e354107f173eea203d9df3f01f93fae62f41eabc..995cab1f11a8f6d88d19a7b10f9f768f4d6ccbf1 100644 --- a/slim/distillation/train_distill.py +++ b/slim/distillation/train_distill.py @@ -87,14 +87,14 @@ def parse_args(): help='debug mode, display detail information of training', action='store_true') parser.add_argument( - '--use_tb', - dest='use_tb', - help='whether to record the data during training to Tensorboard', + '--use_vdl', + dest='use_vdl', + help='whether to record the data during training to VisualDL', action='store_true') parser.add_argument( - '--tb_log_dir', - dest='tb_log_dir', - help='Tensorboard logging 
directory', + '--vdl_log_dir', + dest='vdl_log_dir', + help='VisualDL logging directory', default=None, type=str) parser.add_argument( @@ -409,17 +409,17 @@ def train(cfg): fetch_list.extend([pred.name, grts.name, masks.name]) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) - if args.use_tb: - if not args.tb_log_dir: - print_info("Please specify the log directory by --tb_log_dir.") + if args.use_vdl: + if not args.vdl_log_dir: + print_info("Please specify the log directory by --vdl_log_dir.") exit(1) - from tb_paddle import SummaryWriter - log_writer = SummaryWriter(args.tb_log_dir) + from visualdl import LogWriter + log_writer = LogWriter(args.vdl_log_dir) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) - global_step = 0 + step = 0 all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: all_step += 1 @@ -455,9 +455,9 @@ def train(cfg): return_numpy=True) cm.calculate(pred, grts, masks) avg_loss += np.mean(np.array(loss)) - global_step += 1 + step += 1 - if global_step % args.log_steps == 0: + if step % args.log_steps == 0: speed = args.log_steps / timer.elapsed_time() avg_loss /= args.log_steps category_acc, mean_acc = cm.accuracy() @@ -465,22 +465,22 @@ def train(cfg): print_info(( "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" - ).format(epoch, global_step, lr[0], avg_loss, mean_acc, + ).format(epoch, step, lr[0], avg_loss, mean_acc, mean_iou, speed, - calculate_eta(all_step - global_step, speed))) + calculate_eta(all_step - step, speed))) print_info("Category IoU: ", category_iou) print_info("Category Acc: ", category_acc) - if args.use_tb: + if args.use_vdl: log_writer.add_scalar('Train/mean_iou', mean_iou, - global_step) + step) log_writer.add_scalar('Train/mean_acc', mean_acc, - global_step) + step) log_writer.add_scalar('Train/loss', avg_loss, - global_step) 
+ step) log_writer.add_scalar('Train/lr', lr[0], - global_step) + step) log_writer.add_scalar('Train/step/sec', speed, - global_step) + step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() @@ -494,25 +494,25 @@ def train(cfg): avg_loss += np.mean(np.array(loss)) avg_t_loss += np.mean(np.array(t_loss)) avg_d_loss += np.mean(np.array(d_loss)) - global_step += 1 + step += 1 - if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0: + if step % args.log_steps == 0 and cfg.TRAINER_ID == 0: avg_loss /= args.log_steps avg_t_loss /= args.log_steps avg_d_loss /= args.log_steps speed = args.log_steps / timer.elapsed_time() print(( "epoch={} step={} lr={:.5f} loss={:.4f} teacher loss={:.4f} distill loss={:.4f} step/sec={:.3f} | ETA {}" - ).format(epoch, global_step, lr[0], avg_loss, + ).format(epoch, step, lr[0], avg_loss, avg_t_loss, avg_d_loss, speed, - calculate_eta(all_step - global_step, speed))) - if args.use_tb: + calculate_eta(all_step - step, speed))) + if args.use_vdl: log_writer.add_scalar('Train/loss', avg_loss, - global_step) + step) log_writer.add_scalar('Train/lr', lr[0], - global_step) + step) log_writer.add_scalar('Train/speed', speed, - global_step) + step) sys.stdout.flush() avg_loss = 0.0 avg_t_loss = 0.0 @@ -536,11 +536,11 @@ def train(cfg): ckpt_dir=ckpt_dir, use_gpu=args.use_gpu, use_mpio=args.use_mpio) - if args.use_tb: + if args.use_vdl: log_writer.add_scalar('Evaluate/mean_iou', mean_iou, - global_step) + step) log_writer.add_scalar('Evaluate/mean_acc', mean_acc, - global_step) + step) if mean_iou > best_mIoU: best_mIoU = mean_iou @@ -550,8 +550,8 @@ def train(cfg): os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'), mean_iou)) - # Use Tensorboard to visualize results - if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None: + # Use VisualDL to visualize results + if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: visualize( cfg=cfg, use_gpu=args.use_gpu, diff --git a/slim/nas/train_nas.py b/slim/nas/train_nas.py index 
6ab4d899dc2406275daf3fecd3738fb4b3b82c49..f4cd8f81b1f73b7d30ce948be700beba9932c314 100644 --- a/slim/nas/train_nas.py +++ b/slim/nas/train_nas.py @@ -87,14 +87,14 @@ def parse_args(): help='debug mode, display detail information of training', action='store_true') parser.add_argument( - '--use_tb', - dest='use_tb', - help='whether to record the data during training to Tensorboard', + '--use_vdl', + dest='use_vdl', + help='whether to record the data during training to VisualDL', action='store_true') parser.add_argument( - '--tb_log_dir', - dest='tb_log_dir', - help='Tensorboard logging directory', + '--vdl_log_dir', + dest='vdl_log_dir', + help='VisualDL logging directory', default=None, type=str) parser.add_argument( diff --git a/slim/prune/train_prune.py b/slim/prune/train_prune.py index 05c599e3327728ee1ef5e3f2dea359ab9dab5834..6c41e74beb62423354445b45d250bb0f2f75b2d3 100644 --- a/slim/prune/train_prune.py +++ b/slim/prune/train_prune.py @@ -83,14 +83,14 @@ def parse_args(): help='debug mode, display detail information of training', action='store_true') parser.add_argument( - '--use_tb', - dest='use_tb', - help='whether to record the data during training to Tensorboard', + '--use_vdl', + dest='use_vdl', + help='whether to record the data during training to VisualDL', action='store_true') parser.add_argument( - '--tb_log_dir', - dest='tb_log_dir', - help='Tensorboard logging directory', + '--vdl_log_dir', + dest='vdl_log_dir', + help='VisualDL logging directory', default=None, type=str) parser.add_argument( @@ -335,13 +335,13 @@ def train(cfg): fetch_list.extend([pred.name, grts.name, masks.name]) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) - if args.use_tb: - if not args.tb_log_dir: - print_info("Please specify the log directory by --tb_log_dir.") + if args.use_vdl: + if not args.vdl_log_dir: + print_info("Please specify the log directory by --vdl_log_dir.") exit(1) - from tb_paddle import SummaryWriter - log_writer = 
SummaryWriter(args.tb_log_dir) + from visualdl import LogWriter + log_writer = LogWriter(args.vdl_log_dir) pruner = Pruner() train_prog = pruner.prune( @@ -357,7 +357,7 @@ def train(cfg): exec_strategy=exec_strategy, build_strategy=build_strategy) - global_step = 0 + step = 0 all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: all_step += 1 @@ -389,9 +389,9 @@ def train(cfg): return_numpy=True) cm.calculate(pred, grts, masks) avg_loss += np.mean(np.array(loss)) - global_step += 1 + step += 1 - if global_step % args.log_steps == 0: + if step % args.log_steps == 0: speed = args.log_steps / timer.elapsed_time() avg_loss /= args.log_steps category_acc, mean_acc = cm.accuracy() @@ -399,22 +399,22 @@ def train(cfg): print_info(( "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" - ).format(epoch, global_step, lr[0], avg_loss, mean_acc, + ).format(epoch, step, lr[0], avg_loss, mean_acc, mean_iou, speed, - calculate_eta(all_step - global_step, speed))) + calculate_eta(all_step - step, speed))) print_info("Category IoU: ", category_iou) print_info("Category Acc: ", category_acc) - if args.use_tb: + if args.use_vdl: log_writer.add_scalar('Train/mean_iou', mean_iou, - global_step) + step) log_writer.add_scalar('Train/mean_acc', mean_acc, - global_step) + step) log_writer.add_scalar('Train/loss', avg_loss, - global_step) + step) log_writer.add_scalar('Train/lr', lr[0], - global_step) + step) log_writer.add_scalar('Train/step/sec', speed, - global_step) + step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() @@ -426,22 +426,22 @@ def train(cfg): fetch_list=fetch_list, return_numpy=True) avg_loss += np.mean(np.array(loss)) - global_step += 1 + step += 1 - if global_step % args.log_steps == 0 and cfg.TRAINER_ID == 0: + if step % args.log_steps == 0 and cfg.TRAINER_ID == 0: avg_loss /= args.log_steps speed = args.log_steps / timer.elapsed_time() print(( 
"epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}" - ).format(epoch, global_step, lr[0], avg_loss, speed, - calculate_eta(all_step - global_step, speed))) - if args.use_tb: + ).format(epoch, step, lr[0], avg_loss, speed, + calculate_eta(all_step - step, speed))) + if args.use_vdl: log_writer.add_scalar('Train/loss', avg_loss, - global_step) + step) log_writer.add_scalar('Train/lr', lr[0], - global_step) + step) log_writer.add_scalar('Train/speed', speed, - global_step) + step) sys.stdout.flush() avg_loss = 0.0 timer.restart() @@ -463,14 +463,14 @@ def train(cfg): ckpt_dir=ckpt_dir, use_gpu=args.use_gpu, use_mpio=args.use_mpio) - if args.use_tb: + if args.use_vdl: log_writer.add_scalar('Evaluate/mean_iou', mean_iou, - global_step) + step) log_writer.add_scalar('Evaluate/mean_acc', mean_acc, - global_step) + step) - # Use Tensorboard to visualize results - if args.use_tb and cfg.DATASET.VIS_FILE_LIST is not None: + # Use VisualDL to visualize results + if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: visualize( cfg=cfg, use_gpu=args.use_gpu,