From d2bd6ca8bc3507f1f88d73fa8874826ac99711bb Mon Sep 17 00:00:00 2001 From: liuyuhui <1029880267@qq.com> Date: Thu, 21 Jan 2021 12:46:39 +0800 Subject: [PATCH] add multi xpu train support for PaddleClas, multi card per process mode (#571) * add multixpu train support for PaddleClas, multi card per process mode * add kunlun ResNet50 config * fix ResNet50.yaml learning rate --- configs/kunlun/ResNet50.yaml | 76 +++++++++++++++++++ .../extension/train_on_multiplatform_xpu.md | 15 ++++ tools/train_multi_platform.py | 12 ++- 3 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 configs/kunlun/ResNet50.yaml create mode 100644 docs/zh_CN/extension/train_on_multiplatform_xpu.md diff --git a/configs/kunlun/ResNet50.yaml b/configs/kunlun/ResNet50.yaml new file mode 100644 index 00000000..496d7df1 --- /dev/null +++ b/configs/kunlun/ResNet50.yaml @@ -0,0 +1,76 @@ +mode: 'train' +ARCHITECTURE: + name: 'ResNet50' + +pretrained_model: "" +model_save_dir: "./output/" +classes_num: 1000 +total_images: 1281167 +save_interval: 1 +validate: True +valid_interval: 1 +epochs: 120 +topk: 5 +image_shape: [3, 224, 224] + +use_mix: False +ls_epsilon: -1 + +LEARNING_RATE: + function: 'Piecewise' + params: + lr: 0.0078125 + decay_epochs: [30, 60, 90] + gamma: 0.1 + +OPTIMIZER: + function: 'Momentum' + params: + momentum: 0.9 + regularizer: + function: 'L2' + factor: 0.000100 + +TRAIN: + batch_size: 20 + num_workers: 4 + file_list: "./dataset/ILSVRC2012/train_list.txt" + data_dir: "./dataset/ILSVRC2012/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + to_np: False + channel_first: False + - RandCropImage: + size: 224 + - RandFlipImage: + flip_code: 1 + - NormalizeImage: + scale: 1./255. 
+ mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: + +VALID: + batch_size: 20 + num_workers: 4 + file_list: "./dataset/ILSVRC2012/val_list.txt" + data_dir: "./dataset/ILSVRC2012/" + shuffle_seed: 0 + transforms: + - DecodeImage: + to_rgb: True + to_np: False + channel_first: False + - ResizeImage: + resize_short: 256 + - CropImage: + size: 224 + - NormalizeImage: + scale: 1.0/255.0 + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] + order: '' + - ToCHWImage: diff --git a/docs/zh_CN/extension/train_on_multiplatform_xpu.md b/docs/zh_CN/extension/train_on_multiplatform_xpu.md new file mode 100644 index 00000000..d8b8918b --- /dev/null +++ b/docs/zh_CN/extension/train_on_multiplatform_xpu.md @@ -0,0 +1,15 @@ +# 图像分类昆仑模型介绍(持续更新中) + +## 前言 + +* 文档介绍了目前昆仑支持的模型以及如何在昆仑设备上训练这些模型。支持昆仑的PaddlePaddle安装参考[install_kunlun](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/paddle/install/install_Kunlun_zh.md) + +## 昆仑训练 +* 数据来源参考[ImageNet1k](https://github.com/PaddlePaddle/PaddleClas/blob/dygraph/docs/en/tutorials/data_en.md)。昆仑训练效果与CPU/GPU对齐。 + +### ResNet50 +* 命令: + +```python3.7 tools/train_multi_platform.py -c configs/kunlun/ResNet50.yaml -o use_gpu=False -o use_xpu=True``` + +与cpu/gpu训练的区别是加上-o use_xpu=True, 表示执行在昆仑设备上。 diff --git a/tools/train_multi_platform.py b/tools/train_multi_platform.py index 6362d8be..b5e1bbce 100644 --- a/tools/train_multi_platform.py +++ b/tools/train_multi_platform.py @@ -63,7 +63,17 @@ def main(args): config = get_config(args.config, overrides=args.override, show=True) # assign the place use_gpu = config.get("use_gpu", True) - places = fluid.cuda_places() if use_gpu else fluid.cpu_places() + use_xpu = config.get("use_xpu", False) + assert ( + use_gpu and use_xpu + ) is not True, "gpu and xpu can not be true in the same time in static mode!" 
+ + if use_gpu: + places = fluid.cuda_places() + elif use_xpu: + places = fluid.xpu_places() + else: + places = fluid.cpu_places() # startup_prog is used to do some parameter init work, # and train prog is used to hold the network -- GitLab