From 1ed177e6ca31f18eb669772428548fd071bc0eb5 Mon Sep 17 00:00:00 2001 From: lichenever Date: Wed, 5 Aug 2020 10:30:24 +0800 Subject: [PATCH] update_distributed_training_doc_r0.6 --- .../source_zh_cn/advanced_use/distributed_training_ascend.md | 2 +- .../source_zh_cn/advanced_use/distributed_training_gpu.md | 4 +++- .../distributed_training/resnet50_distributed_training.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tutorials/source_zh_cn/advanced_use/distributed_training_ascend.md b/tutorials/source_zh_cn/advanced_use/distributed_training_ascend.md index 4a311467..d451bf7f 100644 --- a/tutorials/source_zh_cn/advanced_use/distributed_training_ascend.md +++ b/tutorials/source_zh_cn/advanced_use/distributed_training_ascend.md @@ -182,7 +182,7 @@ class SoftmaxCrossEntropyExpand(nn.Cell): self.onehot = P.OneHot() self.on_value = Tensor(1.0, mstype.float32) self.off_value = Tensor(0.0, mstype.float32) - self.div = P.Div() + self.div = P.RealDiv() self.log = P.Log() self.sum_cross_entropy = P.ReduceSum(keep_dims=False) self.mul = P.Mul() diff --git a/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md b/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md index 69c843b7..ac84f2cf 100644 --- a/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md +++ b/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md @@ -82,6 +82,8 @@ if __name__ == "__main__": > 你可以在这里找到样例的运行脚本: > > 。 +> +> 如果通过root用户执行脚本,`mpirun`需要加上`--allow-run-as-root`参数。 ```bash #!/bin/bash @@ -97,7 +99,7 @@ echo "start training" mpirun -n 8 pytest -s -v ./resnet50_distributed_training.py > train.log 2>&1 & ``` -脚本需要传入变量`DATA_PATH`,表示数据集的路径。此外,我们需要修改下`resnet50_distributed_training.py`文件,将`device_target`设置为`GPU`,并调用`init("nccl")`来使能NCCL。日志文件保存`device`目录下,关于Loss部分结果保存在`train.log`中。将loss值 `grep`出来后,示例如下: 
+脚本需要传入变量`DATA_PATH`,表示数据集的路径。此外,我们需要修改下`resnet50_distributed_training.py`文件,由于在GPU上,我们无需设置`DEVICE_ID`环境变量,因此,在脚本中不需要调用`int(os.getenv('DEVICE_ID'))`来获取卡的物理序号,同时`context`中也无需传入`device_id`。我们需要将`device_target`设置为`GPU`,并调用`init("nccl")`来使能NCCL。日志文件保存到`device`目录下,关于Loss部分结果保存在`train.log`中。将loss值`grep`出来后,示例如下: ```