未验证 提交 d8e29086 编写于 作者: W wangxinxin08 提交者: GitHub

[Dygraph]add fp16 and fleet api (#2185)

* add fp16 and fleet api

* add instructions for fleet api and fp16

* modify example of fleet api

* modify docs of fleet api
上级 9bf7d041
...@@ -22,6 +22,8 @@ PaddleDetection在[tools](https://github.com/PaddlePaddle/PaddleDetection/tree/m ...@@ -22,6 +22,8 @@ PaddleDetection在[tools](https://github.com/PaddlePaddle/PaddleDetection/tree/m
| :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: | | :----------------------: | :------------: | :---------------: | :--------------: | :-----------------: |
| -c | ALL | 指定配置文件 | None | **必选**,例如-c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml | | -c | ALL | 指定配置文件 | None | **必选**,例如-c configs/ppyolo/ppyolo_r50vd_dcn_1x_coco.yml |
| --eval | train | 是否边训练边测试 | False | 可选,如需指定,直接`--eval`即可 | | --eval | train | 是否边训练边测试 | False | 可选,如需指定,直接`--eval`即可 |
| --fleet | train | 是否使用fleet API训练 | False | 可选,如需指定,直接`--fleet`即可,用于通过fleet API进行多机训练 |
| --fp16 | train | 是否开启混合精度训练 | False | 可选,如需指定,直接`--fp16`即可,用于开启混合精度训练 |
| -o | ALL | 设置或更改配置文件里的参数内容 | None | 可选,例如:`-o use_gpu=False` | | -o | ALL | 设置或更改配置文件里的参数内容 | None | 可选,例如:`-o use_gpu=False` |
| --slim_config | ALL | 模型压缩策略配置文件 | None | 可选,例如`--slim_config configs/slim/prune/yolov3_prune_l1_norm.yml` | | --slim_config | ALL | 模型压缩策略配置文件 | None | 可选,例如`--slim_config configs/slim/prune/yolov3_prune_l1_norm.yml` |
| --output_dir | infer/export_model | 预测后结果或导出模型保存路径 | `./output` | 可选,例如`--output_dir=output` | | --output_dir | infer/export_model | 预测后结果或导出模型保存路径 | `./output` | 可选,例如`--output_dir=output` |
...@@ -29,7 +31,6 @@ PaddleDetection在[tools](https://github.com/PaddlePaddle/PaddleDetection/tree/m ...@@ -29,7 +31,6 @@ PaddleDetection在[tools](https://github.com/PaddlePaddle/PaddleDetection/tree/m
| --infer_dir | infer | 用于预测的图片文件夹路径 | None | 可选 | | --infer_dir | infer | 用于预测的图片文件夹路径 | None | 可选 |
| --infer_img | infer | 用于预测的图片路径 | None | 可选,`--infer_img`和`--infer_dir`必须至少设置一个 | | --infer_img | infer | 用于预测的图片路径 | None | 可选,`--infer_img`和`--infer_dir`必须至少设置一个 |
### 训练 ### 训练
- 单卡训练 - 单卡训练
...@@ -45,6 +46,20 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ...@@ -45,6 +46,20 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml python -m paddle.distributed.launch --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml
``` ```
- 混合精度训练
```bash
export CUDA_VISIBLE_DEVICES=0
python tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml --fp16
```
- fleet API训练
```bash
# fleet API用于多机训练,启动方式与单机多卡训练方式基本一致,只不过需要使用--ips指定ip列表以及--fleet开启多机训练
python -m paddle.distributed.launch --ips="xx.xx.xx.xx,yy.yy.yy.yy" --gpus 0,1,2,3,4,5,6,7 tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml --fleet
```
- 边训练边评估 - 边训练边评估
```bash ```bash
python tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml --eval python tools/train.py -c configs/faster_rcnn/faster_rcnn_r50_fpn_1x_coco.yml --eval
......
...@@ -21,9 +21,13 @@ import random ...@@ -21,9 +21,13 @@ import random
import numpy as np import numpy as np
import paddle import paddle
from paddle.distributed import ParallelEnv from paddle.distributed import ParallelEnv, fleet
__all__ = ['init_parallel_env', 'set_random_seed'] __all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env']
def init_fleet_env():
    """Initialize the fleet environment with ``is_collective=True``."""
    fleet.init(is_collective=True)
def init_parallel_env(): def init_parallel_env():
......
...@@ -24,7 +24,8 @@ import numpy as np ...@@ -24,7 +24,8 @@ import numpy as np
from PIL import Image from PIL import Image
import paddle import paddle
from paddle.distributed import ParallelEnv from paddle.distributed import ParallelEnv, fleet
from paddle import amp
from paddle.static import InputSpec from paddle.static import InputSpec
from ppdet.core.workspace import create from ppdet.core.workspace import create
...@@ -174,10 +175,17 @@ class Trainer(object): ...@@ -174,10 +175,17 @@ class Trainer(object):
self.load_weights(self.cfg.pretrain_weights) self.load_weights(self.cfg.pretrain_weights)
model = self.model model = self.model
if self._nranks > 1: if self.cfg.fleet:
model = fleet.distributed_model(model)
self.optimizer = fleet.distributed_optimizer(
self.optimizer).user_defined_optimizer
elif self._nranks > 1:
model = paddle.DataParallel(self.model) model = paddle.DataParallel(self.model)
else:
model = self.model # initial fp16
if self.cfg.fp16:
scaler = amp.GradScaler(
enable=self.cfg.use_gpu, init_loss_scaling=1024)
self.status.update({ self.status.update({
'epoch_id': self.start_epoch, 'epoch_id': self.start_epoch,
...@@ -203,13 +211,25 @@ class Trainer(object): ...@@ -203,13 +211,25 @@ class Trainer(object):
self.status['step_id'] = step_id self.status['step_id'] = step_id
self._compose_callback.on_step_begin(self.status) self._compose_callback.on_step_begin(self.status)
# model forward if self.cfg.fp16:
outputs = model(data) with amp.auto_cast(enable=self.cfg.use_gpu):
loss = outputs['loss'] # model forward
outputs = model(data)
loss = outputs['loss']
# model backward
scaled_loss = scaler.scale(loss)
scaled_loss.backward()
# in dygraph mode, optimizer.minimize is equal to optimizer.step
scaler.minimize(self.optimizer, scaled_loss)
else:
# model forward
outputs = model(data)
loss = outputs['loss']
# model backward
loss.backward()
self.optimizer.step()
# model backward
loss.backward()
self.optimizer.step()
curr_lr = self.optimizer.get_lr() curr_lr = self.optimizer.get_lr()
self.lr.step() self.lr.step()
self.optimizer.clear_grad() self.optimizer.clear_grad()
......
...@@ -33,7 +33,7 @@ from paddle.distributed import ParallelEnv ...@@ -33,7 +33,7 @@ from paddle.distributed import ParallelEnv
from ppdet.core.workspace import load_config, merge_config, create from ppdet.core.workspace import load_config, merge_config, create
from ppdet.utils.checkpoint import load_weight, load_pretrain_weight from ppdet.utils.checkpoint import load_weight, load_pretrain_weight
from ppdet.engine import Trainer, init_parallel_env, set_random_seed from ppdet.engine import Trainer, init_parallel_env, set_random_seed, init_fleet_env
import ppdet.utils.cli as cli import ppdet.utils.cli as cli
import ppdet.utils.check as check import ppdet.utils.check as check
...@@ -65,13 +65,24 @@ def parse_args(): ...@@ -65,13 +65,24 @@ def parse_args():
default=False, default=False,
help="If set True, enable continuous evaluation job." help="If set True, enable continuous evaluation job."
"This flag is only used for internal test.") "This flag is only used for internal test.")
parser.add_argument(
"--fp16",
action='store_true',
default=False,
help="Enable mixed precision training.")
parser.add_argument(
"--fleet", action='store_true', default=False, help="Use fleet or not")
args = parser.parse_args() args = parser.parse_args()
return args return args
def run(FLAGS, cfg): def run(FLAGS, cfg):
# init parallel environment if nranks > 1 # init fleet environment
init_parallel_env() if cfg.fleet:
init_fleet_env()
else:
# init parallel environment if nranks > 1
init_parallel_env()
if FLAGS.enable_ce: if FLAGS.enable_ce:
set_random_seed(0) set_random_seed(0)
...@@ -91,6 +102,8 @@ def main(): ...@@ -91,6 +102,8 @@ def main():
FLAGS = parse_args() FLAGS = parse_args()
cfg = load_config(FLAGS.config) cfg = load_config(FLAGS.config)
cfg['fp16'] = FLAGS.fp16
cfg['fleet'] = FLAGS.fleet
merge_config(FLAGS.opt) merge_config(FLAGS.opt)
if FLAGS.slim_config: if FLAGS.slim_config:
slim_cfg = load_config(FLAGS.slim_config) slim_cfg = load_config(FLAGS.slim_config)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册