Commit 993a28bc authored by mindspore-ci-bot, committed by Gitee

!4275 add allreduce grouping for resnet gpu version

Merge pull request !4275 from yuchaojie/add_allreduce_group_for_resnet_gpu
@@ -275,7 +275,7 @@ class _AutoParallelContext:
         Args:
             indices (list): Indices list.
-            group (str): The hccl communication group.
+            group (str): The communication group of hccl/nccl.
         Raises:
             TypeError: If type of indices item is not int.
@@ -311,7 +311,7 @@
         Get allreduce fusion split indices.
         Args:
-            group (str): The hccl communication group.
+            group (str): The communication group of hccl/nccl.
         Returns:
             Return split sizes list according to the group.
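For readers skimming the diff: these are the setter/getter that the train.py change below drives. A minimal sketch of the call pattern, assuming the private _AutoParallelContext is reached through the auto_parallel_context() helper (as in this PR's train.py) and that the group argument has a default, since the call in the diff passes only the indices list:

    # Hedged sketch, not part of this PR: fuse gradient AllReduce into two
    # buffers, tensors [0, 85) and [85, 160), instead of one AllReduce per
    # gradient tensor. Default hccl/nccl communication group assumed.
    from mindspore.parallel._auto_parallel_context import auto_parallel_context

    ctx = auto_parallel_context()
    ctx.set_all_reduce_fusion_split_indices([85, 160])
    print(ctx.get_all_reduce_fusion_split_indices())  # expected: [85, 160]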
@@ -340,7 +340,7 @@
         Args:
             sizes (list): Sizes list.
-            group (str): The hccl communication group.
+            group (str): The communication group of hccl/nccl.
         Raises:
             TypeError: If type of sizes item is not int.
@@ -376,7 +376,7 @@
         Get allreduce fusion split sizes.
         Args:
-            group (str): The hccl communication group.
+            group (str): The communication group of hccl/nccl.
         Returns:
             Return split sizes list according to the group.
......
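The size-based pair mirrors the index-based one. The docstrings above say only "Sizes list", so the unit and semantics of each entry are not specified by this diff; the values below are placeholders, and the no-argument getter assumes the same group default as above:

    # Hedged sketch: size-based fusion split. The meaning/unit of each entry
    # is not documented in this diff; [64, 96] is purely illustrative.
    from mindspore.parallel._auto_parallel_context import auto_parallel_context

    ctx = auto_parallel_context()
    ctx.set_all_reduce_fusion_split_sizes([64, 96])
    print(ctx.get_all_reduce_fusion_split_sizes())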
@@ -44,7 +44,7 @@ ImageNet2012
         ├── run_distribute_train.sh            # launch distributed training(8 pcs)
         ├── run_parameter_server_train.sh      # launch Ascend parameter server training(8 pcs)
         ├── run_eval.sh                        # launch evaluation
         ├── run_standalone_train.sh            # launch standalone training(1 pcs)
         ├── run_distribute_train_gpu.sh        # launch gpu distributed training(8 pcs)
         ├── run_parameter_server_train_gpu.sh  # launch gpu parameter server training(8 pcs)
         ├── run_eval_gpu.sh                    # launch gpu evaluation
......
@@ -81,9 +81,11 @@ if __name__ == '__main__':
         init()
     # GPU target
     else:
-        init("nccl")
         context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
                                           mirror_mean=True)
+        if args_opt.net == "resnet50":
+            auto_parallel_context().set_all_reduce_fusion_split_indices([85, 160])
+        init("nccl")
     ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
     # create dataset
......
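The functional change is twofold: ResNet-50's gradient AllReduce is split into two fused groups at indices 85 and 160, and init("nccl") is moved after that configuration so the grouping is already registered when NCCL communication is initialized. Below is a self-contained sketch of the resulting GPU branch; the module paths follow MindSpore examples of this era and may differ by version, and args_opt, config, and the checkpoint path are hypothetical stand-ins for train.py's real argument parsing and config:

    # Hedged reconstruction of train.py's GPU branch after this PR.
    import argparse

    from mindspore import context
    from mindspore.communication.management import init, get_group_size, get_rank
    from mindspore.context import ParallelMode
    from mindspore.parallel._auto_parallel_context import auto_parallel_context

    parser = argparse.ArgumentParser()
    parser.add_argument('--net', default='resnet50')  # stand-in for train.py's args
    args_opt = parser.parse_args()

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    context.set_auto_parallel_context(device_num=get_group_size(),
                                      parallel_mode=ParallelMode.DATA_PARALLEL,
                                      mirror_mean=True)
    if args_opt.net == "resnet50":
        # Two fused AllReduce buffers instead of per-tensor reductions.
        auto_parallel_context().set_all_reduce_fusion_split_indices([85, 160])
    # Per the diff, NCCL init now follows the fusion configuration, so the
    # grouping is in place when communication groups are created.
    init("nccl")
    ckpt_save_dir = "./ckpt_" + str(get_rank()) + "/"  # per-rank checkpoint dir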