From 82540aa2f45e4a538c1929b01590f94982e064d0 Mon Sep 17 00:00:00 2001
From: lichenever
Date: Fri, 31 Jul 2020 16:52:24 +0800
Subject: [PATCH] update distributed training gpu doc

---
 .../source_zh_cn/advanced_use/distributed_training_gpu.md | 6 +++++-
 tutorials/tutorial_code/distributed_training/run_gpu.sh   | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md b/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md
index bba26fa7..69c843b7 100644
--- a/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md
+++ b/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md
@@ -79,6 +79,10 @@ if __name__ == "__main__":
 
 On the GPU hardware platform, MindSpore uses OpenMPI's `mpirun` for distributed training. The following takes a distributed training script using 8 cards as an example to show how to run it:
 
+> You can find the running script for this example here:
+>
+> .
+
 ```bash
 #!/bin/bash
 
@@ -93,7 +97,7 @@ echo "start training"
 mpirun -n 8 pytest -s -v ./resnet50_distributed_training.py > train.log 2>&1 &
 ```
 
-The script needs the variable `DATA_PATH` passed in, which specifies the path of the dataset; resnet50_distributed_training.py is the Python file adapted for the GPU. The log files are saved under the `device` directory, and the loss results are saved in `train.log`. After `grep`-ing out the loss values, sample output is as follows:
+The script needs the variable `DATA_PATH` passed in, which specifies the path of the dataset. In addition, we need to modify the `resnet50_distributed_training.py` file, setting `device_target` to `GPU` and calling `init("nccl")` to enable NCCL. The log files are saved under the `device` directory, and the loss results are saved in `train.log`. After `grep`-ing out the loss values, sample output is as follows:
 
 ```
 epoch: 1 step: 1, loss is 2.3025854
diff --git a/tutorials/tutorial_code/distributed_training/run_gpu.sh b/tutorials/tutorial_code/distributed_training/run_gpu.sh
index a2a840d9..76650522 100644
--- a/tutorials/tutorial_code/distributed_training/run_gpu.sh
+++ b/tutorials/tutorial_code/distributed_training/run_gpu.sh
@@ -8,4 +8,4 @@ mkdir device
 cp ./resnet50_distributed_training.py ./resnet.py ./device
 cd ./device
 echo "start training"
-pytest -s -v ./resnet50_distributed_training.py > train.log 2>&1 &
+mpirun -n 8 pytest -s -v ./resnet50_distributed_training.py > train.log 2>&1 &
--
GitLab
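
For context, the GPU adaptation that the updated paragraph describes amounts to two calls near the top of `resnet50_distributed_training.py`. Below is a minimal sketch, assuming the `mindspore.context` and `mindspore.communication.management` APIs of this release; it is not the full training script from the tutorial:

```python
# Minimal sketch (not the full resnet50_distributed_training.py) of the GPU
# adaptation described in the patched paragraph: point MindSpore at the GPU
# backend and bring up NCCL-based collective communication.
from mindspore import context
from mindspore.communication.management import init

# Select graph mode and the GPU device target (the Ascend version of this
# tutorial sets device_target to "Ascend" here instead).
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

# Initialize collectives with the NCCL backend, so the eight processes
# launched by `mpirun -n 8` can exchange gradients.
init("nccl")
```

With this change in place, `run_gpu.sh` only needs the `mpirun -n 8 pytest ...` launch line that the second file's hunk adds.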