From 49f010a1c6311972eebff9016c0ed0744cba1871 Mon Sep 17 00:00:00 2001 From: dongshuilong Date: Wed, 15 Jun 2022 14:33:17 +0000 Subject: [PATCH] add r50 dynamic fp16 train for benchmark --- ppcls/data/dataloader/dali.py | 1 - ...> ResNet50_train_ampfp16_infer_python.txt} | 6 +- .../ResNet50_train_purefp16_infer_python.txt | 56 +++++++++++++++++++ test_tipc/docs/benchmark_train.md | 6 +- 4 files changed, 64 insertions(+), 5 deletions(-) rename test_tipc/config/ResNet/{ResNet50_train_amp_infer_python.txt => ResNet50_train_ampfp16_infer_python.txt} (82%) create mode 100644 test_tipc/config/ResNet/ResNet50_train_purefp16_infer_python.txt diff --git a/ppcls/data/dataloader/dali.py b/ppcls/data/dataloader/dali.py index a340a946..faef45e2 100644 --- a/ppcls/data/dataloader/dali.py +++ b/ppcls/data/dataloader/dali.py @@ -23,7 +23,6 @@ import nvidia.dali.types as types import paddle from nvidia.dali import fn from nvidia.dali.pipeline import Pipeline -from nvidia.dali.plugin.base_iterator import LastBatchPolicy from nvidia.dali.plugin.paddle import DALIGenericIterator diff --git a/test_tipc/config/ResNet/ResNet50_train_amp_infer_python.txt b/test_tipc/config/ResNet/ResNet50_train_ampfp16_infer_python.txt similarity index 82% rename from test_tipc/config/ResNet/ResNet50_train_amp_infer_python.txt rename to test_tipc/config/ResNet/ResNet50_train_ampfp16_infer_python.txt index a398086a..8256c2a3 100644 --- a/test_tipc/config/ResNet/ResNet50_train_amp_infer_python.txt +++ b/test_tipc/config/ResNet/ResNet50_train_ampfp16_infer_python.txt @@ -13,7 +13,7 @@ train_infer_img_dir:./dataset/ILSVRC2012/val null:null ## trainer:amp_train -amp_train:tools/train.py -c ppcls/configs/ImageNet/ResNet/ResNet50.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o AMP.scale_loss=128 -o AMP.use_dynamic_loss_scaling=True -o AMP.level=O2 +amp_train:tools/train.py -c ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o AMP.scale_loss=128 -o AMP.use_dynamic_loss_scaling=True -o AMP.level=O1 pact_train:null fpgm_train:null distill_train:null @@ -50,3 +50,7 @@ inference:python/predict_cls.py -c configs/inference_cls.yaml -o Global.benchmark:True null:null null:null +===========================train_benchmark_params========================== +batch_size:128 +fp_items:amp_fp16 +epoch:1 diff --git a/test_tipc/config/ResNet/ResNet50_train_purefp16_infer_python.txt b/test_tipc/config/ResNet/ResNet50_train_purefp16_infer_python.txt new file mode 100644 index 00000000..b48e5e1c --- /dev/null +++ b/test_tipc/config/ResNet/ResNet50_train_purefp16_infer_python.txt @@ -0,0 +1,56 @@ +===========================train_params=========================== +model_name:ResNet50 +python:python3.7 +gpu_list:0|0,1 +-o Global.device:gpu +-o Global.auto_cast:null +-o Global.epochs:lite_train_lite_infer=2|whole_train_whole_infer=120 +-o Global.output_dir:./output/ +-o DataLoader.Train.sampler.batch_size:8 +-o Global.pretrained_model:null +train_model_name:latest +train_infer_img_dir:./dataset/ILSVRC2012/val +null:null +## +trainer:amp_train +amp_train:tools/train.py -c ppcls/configs/ImageNet/ResNet/ResNet50_amp_O1.yaml -o Global.seed=1234 -o DataLoader.Train.sampler.shuffle=False -o DataLoader.Train.loader.num_workers=0 -o DataLoader.Train.loader.use_shared_memory=False -o AMP.scale_loss=128 -o AMP.use_dynamic_loss_scaling=True -o AMP.level=O2 +pact_train:null +fpgm_train:null +distill_train:null +to_static_train:-o Global.to_static=True +null:null +## +===========================eval_params=========================== +eval:tools/eval.py -c ppcls/configs/ImageNet/ResNet/ResNet50.yaml +null:null +## +===========================infer_params========================== +-o Global.save_inference_dir:./inference +-o Global.pretrained_model: +norm_export:tools/export_model.py -c ppcls/configs/ImageNet/ResNet/ResNet50.yaml +quant_export:null +fpgm_export:null +distill_export:null +kl_quant:null +export2:null +pretrained_model_url:https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams +infer_model:../inference/ +infer_export:True +infer_quant:Fasle +inference:python/predict_cls.py -c configs/inference_cls.yaml +-o Global.use_gpu:True|False +-o Global.enable_mkldnn:True|False +-o Global.cpu_num_threads:1|6 +-o Global.batch_size:1|16 +-o Global.use_tensorrt:True|False +-o Global.use_fp16:True|False +-o Global.inference_model_dir:../inference +-o Global.infer_imgs:../dataset/ILSVRC2012/val +-o Global.save_log_path:null +-o Global.benchmark:True +null:null +null:null +===========================train_benchmark_params========================== +batch_size:128 +fp_items:pure_fp16 +epoch:1 diff --git a/test_tipc/docs/benchmark_train.md b/test_tipc/docs/benchmark_train.md index 20cf9287..a2b01ed9 100644 --- a/test_tipc/docs/benchmark_train.md +++ b/test_tipc/docs/benchmark_train.md @@ -9,7 +9,7 @@ ```shell # 运行格式:bash test_tipc/prepare.sh train_benchmark.txt mode -bash test_tipc/prepare.sh test_tipc/configs/MobileNetV2/MobileNetV2_train_infer_python.txt benchmark_train +bash test_tipc/prepare.sh test_tipc/config/MobileNetV2/MobileNetV2_train_infer_python.txt benchmark_train ``` ## 1.2 功能测试 @@ -24,7 +24,7 @@ bash test_tipc/benchmark_train.sh test_tipc/config/MobileNetV2/MobileNetV2_train `test_tipc/benchmark_train.sh`支持根据传入的第三个参数实现只运行某一个训练配置,如下: ```shell # 运行格式:bash test_tipc/benchmark_train.sh train_benchmark.txt mode params -bash test_tipc/benchmark_train.sh test_tipc/configs/MobileNetV2/MobileNetV2_train_infer_python.txt benchmark_train dynamic_bs8_fp32_DP_N1C1 +bash test_tipc/benchmark_train.sh test_tipc/config/MobileNetV2/MobileNetV2_train_infer_python.txt benchmark_train dynamic_bs8_fp32_DP_N1C1 ``` dynamic_bs8_fp32_DP_N1C1为test_tipc/benchmark_train.sh传入的参数,格式如下: `${modeltype}_${batch_size}_${fp_item}_${run_mode}_${device_num}` @@ -33,7 +33,7 @@ dynamic_bs8_fp32_DP_N1C1为test_tipc/benchmark_train.sh传入的参数,格式 ## 2. 日志输出 -运行后将保存模型的训练日志和解析日志,使用 `test_tipc/configs/MobileNetV2/MobileNetV2_train_infer_python.txt` 参数文件的训练日志解析结果是: +运行后将保存模型的训练日志和解析日志,使用 `test_tipc/config/MobileNetV2/MobileNetV2_train_infer_python.txt` 参数文件的训练日志解析结果是: ``` {"model_branch": "dygaph", "model_commit": "7c39a1996b19087737c05d883fd346d2f39dbcc0", "model_name": "cls_MobileNetV2_bs8_fp32_SingleP_DP", "batch_size": 8, "fp_item": "fp32", "run_process_type": "SingleP", "run_mode": "DP", "convergence_value": "5.413110", "convergence_key": "loss:", "ips": 19.333, "speed_unit": "samples/s", "device_num": "N1C1", "model_run_time": "0", "frame_commit": "8cc09552473b842c651ead3b9848d41827a3dbab", "frame_version": "0.0.0"} -- GitLab