diff --git a/tutorials/source_zh_cn/advanced_use/distributed_training_ascend.md b/tutorials/source_zh_cn/advanced_use/distributed_training_ascend.md
index 4a311467794895b8bf64ef8d6f5d2799cd9ccc18..d451bf7f3c8f6688df43dc9b34a989bd522dc05d 100644
--- a/tutorials/source_zh_cn/advanced_use/distributed_training_ascend.md
+++ b/tutorials/source_zh_cn/advanced_use/distributed_training_ascend.md
@@ -182,7 +182,7 @@ class SoftmaxCrossEntropyExpand(nn.Cell):
         self.onehot = P.OneHot()
         self.on_value = Tensor(1.0, mstype.float32)
         self.off_value = Tensor(0.0, mstype.float32)
-        self.div = P.Div()
+        self.div = P.RealDiv()
         self.log = P.Log()
         self.sum_cross_entropy = P.ReduceSum(keep_dims=False)
         self.mul = P.Mul()
diff --git a/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md b/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md
index 69c843b78e636388ed14a3a5722f7a51ef8b46fc..ac84f2cf522c1ed3f15954036fd8c0596550b31f 100644
--- a/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md
+++ b/tutorials/source_zh_cn/advanced_use/distributed_training_gpu.md
@@ -82,6 +82,8 @@ if __name__ == "__main__":
 > You can find the sample run script here:
 >
 > .
+>
+> If you run the script as the root user, add the `--allow-run-as-root` option to `mpirun`.
 
 ```bash
 #!/bin/bash
@@ -97,7 +99,7 @@ echo "start training"
 mpirun -n 8 pytest -s -v ./resnet50_distributed_training.py > train.log 2>&1 &
 ```
 
-The script takes the variable `DATA_PATH`, which specifies the dataset path. In addition, we need to modify the `resnet50_distributed_training.py` file: set `device_target` to `GPU` and call `init("nccl")` to enable NCCL. Log files are saved under the `device` directory, and the loss results are saved in `train.log`. After the loss values are extracted with `grep`, the output looks as follows:
+The script takes the variable `DATA_PATH`, which specifies the dataset path. In addition, we need to modify the `resnet50_distributed_training.py` file. Since the `DEVICE_ID` environment variable is not needed on GPU, the script does not call `int(os.getenv('DEVICE_ID'))` to obtain the physical device ID, and no `device_id` is passed to `context`. We set `device_target` to `GPU` and call `init("nccl")` to enable NCCL. Log files are saved under the `device` directory, and the loss results are saved in `train.log`. After the loss values are extracted with `grep`, the output looks as follows:
 
 ```
 epoch: 1 step: 1, loss is 2.3025854
diff --git a/tutorials/tutorial_code/distributed_training/resnet50_distributed_training.py b/tutorials/tutorial_code/distributed_training/resnet50_distributed_training.py
index ec152dc17f8f9672f77196280b392954bfb83ee3..a8a42109f9d23e740323b5ad7cf9c59027145b54 100644
--- a/tutorials/tutorial_code/distributed_training/resnet50_distributed_training.py
+++ b/tutorials/tutorial_code/distributed_training/resnet50_distributed_training.py
@@ -89,7 +89,7 @@ class SoftmaxCrossEntropyExpand(nn.Cell):
         self.onehot = P.OneHot()
         self.on_value = Tensor(1.0, mstype.float32)
         self.off_value = Tensor(0.0, mstype.float32)
-        self.div = P.Div()
+        self.div = P.RealDiv()
         self.log = P.Log()
         self.sum_cross_entropy = P.ReduceSum(keep_dims=False)
         self.mul = P.Mul()
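
The `P.Div()` → `P.RealDiv()` swap above sits in the softmax normalization of the tutorial's `SoftmaxCrossEntropyExpand` cell. Below is a minimal sketch, not the tutorial's full cell: the class name `SoftmaxExpand` and the sample logits are made up for illustration, and it only isolates the step where `P.RealDiv` (elementwise real division) now does the normalizing divide.

```python
import numpy as np
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore import Tensor
from mindspore.ops import operations as P

class SoftmaxExpand(nn.Cell):
    """Numerically stable softmax built from primitive ops."""
    def __init__(self):
        super(SoftmaxExpand, self).__init__()
        self.max = P.ReduceMax(keep_dims=True)
        self.sub = P.Sub()
        self.exp = P.Exp()
        self.sum = P.ReduceSum(keep_dims=True)
        self.div = P.RealDiv()  # elementwise real division; was P.Div() before this patch

    def construct(self, logit):
        logit_max = self.max(logit, -1)             # subtract the row max for stability
        exp = self.exp(self.sub(logit, logit_max))
        exp_sum = self.sum(exp, -1)
        return self.div(exp, exp_sum)               # normalize to probabilities

logits = Tensor(np.array([[2.0, 1.0, 0.1]]), mstype.float32)
print(SoftmaxExpand()(logits))  # each row sums to 1
```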
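
Likewise, a minimal sketch of the GPU-side setup that the `distributed_training_gpu.md` change describes, assuming the same tutorial context (the `print` line is illustrative only): on GPU there is no `DEVICE_ID` environment variable to read and no `device_id` to pass to the context, since devices are assigned through the `mpirun` ranks.

```python
from mindspore import context
from mindspore.communication.management import init, get_rank

# Unlike the Ascend version, there is no int(os.getenv('DEVICE_ID')) lookup
# and no device_id argument in set_context; mpirun/NCCL assign the devices.
context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
init("nccl")  # enable NCCL collective communication
print("rank:", get_rank())
```

Launched with, for example, `mpirun -n 8 python script.py`, adding `--allow-run-as-root` when running as root.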